In [3]:
import gensim
from gensim import corpora
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis  # Import pyLDAvis


class Tokenizer:
    """Split raw documents into lowercase, whitespace-delimited tokens."""

    def __init__(self, documents: list[str]):
        # Only a reference is stored; the caller's list is not copied.
        self.documents = documents

    def tokenize(self) -> list[list[str]]:
        """Return one token list per stored document, in input order.

        Each document is lowercased and then split on runs of whitespace.
        No other normalization is applied, so punctuation stays attached to
        the neighbouring token.
        """
        token_lists = []
        for text in self.documents:
            lowered = text.lower()
            token_lists.append(lowered.split())
        return token_lists


class DictionaryCreator:
    """Build a gensim ``corpora.Dictionary`` from tokenized documents."""

    def __init__(self, tokenized_documents: list[list[str]]):
        # One inner list of tokens per document.
        self.tokenized_documents = tokenized_documents

    def create_dictionary(self) -> corpora.Dictionary:
        """Return a new ``corpora.Dictionary`` built from the stored token lists."""
        token_lists = self.tokenized_documents
        vocabulary = corpora.Dictionary(token_lists)
        return vocabulary


class CorpusCreator:
    """Convert tokenized documents into bag-of-words vectors via a dictionary."""

    def __init__(
        self, dictionary: corpora.Dictionary, tokenized_documents: list[list[str]]
    ):
        self.dictionary = dictionary
        self.tokenized_documents = tokenized_documents

    def create_corpus(self) -> list[list[tuple[int, int]]]:
        """Return one bag-of-words vector per document, in input order.

        Each vector is the result of ``Dictionary.doc2bow`` for that document,
        i.e. a list of ``(token_id, count)`` pairs rather than a flat list of
        ints.
        """
        doc2bow = self.dictionary.doc2bow
        return [doc2bow(tokens) for tokens in self.tokenized_documents]


class LDAModelTrainer:
    """Hold the inputs for, and train, a gensim ``LdaModel``."""

    def __init__(
        self,
        corpus: list[list[tuple[int, int]]],
        dictionary: corpora.Dictionary,
        num_topics: int,
        passes: int,
    ):
        # Bag-of-words corpus: one list of (token_id, count) pairs per document.
        self.corpus = corpus
        # Passed to the model as ``id2word`` to map token ids back to words.
        self.dictionary = dictionary
        self.num_topics = num_topics
        self.passes = passes

    def train_model(self) -> gensim.models.LdaModel:
        """Train and return a new ``LdaModel`` from the stored settings.

        No random seed is passed to the model, so repeated calls are not
        guaranteed to produce identical topics.
        """
        return gensim.models.LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            num_topics=self.num_topics,
            passes=self.passes,
        )


class TopicVisualizer:
    """Export a trained LDA model as a pyLDAvis HTML visualization."""

    def __init__(
        self,
        lda_model: gensim.models.LdaModel,
        corpus: list[list[tuple[int, int]]],
        dictionary: corpora.Dictionary,
    ):
        self.lda_model = lda_model
        # Bag-of-words corpus: one list of (token_id, count) pairs per document.
        self.corpus = corpus
        self.dictionary = dictionary

    def visualize_topics(self, output_path: str = "lda_visualization.html") -> None:
        """Prepare the pyLDAvis data and save it as an HTML file.

        Args:
            output_path: Destination file for the HTML page. Defaults to
                ``"lda_visualization.html"`` in the current working directory.

        A failure to open or write the output file is reported on stdout and
        not re-raised. Errors raised while preparing the visualization data
        propagate to the caller.
        """
        # sort_topics=False asks pyLDAvis to keep the model's own topic order
        # instead of re-sorting topics by their share of the corpus.
        lda_display = gensimvis.prepare(
            self.lda_model, self.corpus, self.dictionary, sort_topics=False
        )
        try:
            pyLDAvis.save_html(lda_display, output_path)
        except OSError as e:
            # save_html does the file I/O itself, so filesystem problems
            # (missing directory, permissions, full disk) surface as OSError.
            print(f"Error: {e}")


def load_data() -> list[str]:
    """Return the built-in sample documents.

    A fresh list of nine short English titles is built on every call, so
    callers may mutate the result freely.
    """
    sample_documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    return sample_documents


def preprocess_data(
    documents: list[str],
) -> tuple[list[list[str]], corpora.Dictionary, list[list[tuple[int, int]]]]:
    """Tokenize the documents and build their dictionary and corpus.

    Args:
        documents: Raw text documents.

    Returns:
        A ``(tokenized_documents, dictionary, corpus)`` tuple: the token list
        for each document, the dictionary built from those tokens, and the
        bag-of-words corpus (one list of ``(token_id, count)`` pairs per
        document) produced with that dictionary.
    """
    tokenized_documents = Tokenizer(documents).tokenize()
    dictionary = DictionaryCreator(tokenized_documents).create_dictionary()
    corpus = CorpusCreator(dictionary, tokenized_documents).create_corpus()
    return tokenized_documents, dictionary, corpus


def train_model(
    corpus: list[list[tuple[int, int]]],
    dictionary: corpora.Dictionary,
    num_topics: int,
    passes: int,
) -> gensim.models.LdaModel:
    """Train an LDA model by delegating to ``LDAModelTrainer``.

    Args:
        corpus: Bag-of-words corpus, one list of ``(token_id, count)`` pairs
            per document.
        dictionary: Dictionary the corpus was built with.
        num_topics: Number of topics to fit.
        passes: Number of training passes, forwarded to the model.

    Returns:
        The trained ``LdaModel``.
    """
    return LDAModelTrainer(corpus, dictionary, num_topics, passes).train_model()

[(0,
  '0.085*"of" + 0.065*"trees" + 0.065*"graph" + 0.046*"minors" + 0.046*"the" + '
  '0.026*"iv" + 0.026*"ordering" + 0.026*"quasi" + 0.026*"well" + '
  '0.026*"widths"'),
 (1,
  '0.083*"system" + 0.083*"of" + 0.058*"computer" + 0.058*"human" + 0.034*"a" '
  '+ 0.034*"survey" + 0.033*"time" + 0.033*"response" + 0.033*"and" + '
  '0.033*"opinion"'),
 (2,
  '0.067*"user" + 0.067*"the" + 0.067*"eps" + 0.067*"interface" + '
  '0.066*"management" + 0.066*"system" + 0.017*"a" + 0.017*"survey" + '
  '0.017*"response" + 0.017*"time"')]
