### Installing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
!pip install numpy gensim nltk pyLDAvis tqdm
!pip install pandas==1.4.0


INFO: pip is looking at multiple versions of pyldavis to determine which version is compatible with other requirements. This could take a while.


ERROR: Could not find a version that satisfies the requirement pandas>=2.0.0 (from pyldavis) (from versions: none)
ERROR: No matching distribution found for pandas>=2.0.0




### Importing Dataset

In [2]:
import pandas as pd
from tqdm import tqdm
data = pd.read_csv('./abcnews-date-text.csv', on_bad_lines='skip')
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [3]:
print(len(documents))
print(documents[:5])

1244184
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


### Importing Libraries

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

### Preprocessing the Raw data
    
    Steps :
    1) Lemmatization
    2) Stemming
    3) Removing Stopwords
    4) Removing words with low character counts

In [5]:
def lemmatize_stemming(text):
    return SnowballStemmer(language='english').stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


### Mapping preprocessing to the dataset

In [7]:
# import multiprocessing
# from multiprocessing import Pool
# num_partitions = 5
# num_cores = 1
# a,b,c,d,e = np.array_split( documents['headline_text'],num_partitions)
# pool = Pool(num_cores)
tqdm.pandas()
# df = pd.concat(pool.map(preprocess, [a,b,c,d,e]))
# pool.close()
# pool.join()
processed_docs = documents['headline_text'].progress_map(preprocess)
# processed_docs[:10]

100%|█████████████████████████████████████████████████████████████████████| 1244184/1244184 [01:49<00:00, 11382.38it/s]


In [8]:
dictionary = gensim.corpora.Dictionary(tqdm(processed_docs))
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

100%|████████████████████████████████████████████████████████████████████| 1244184/1244184 [00:08<00:00, 153071.49it/s]

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit





### Filter the data and keep 1000000 datapoints 

In [9]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Vectorizing the data - **Bag of Words**

In [10]:
bow_corpus = []
for i in tqdm(range(len(processed_docs))):
    bow_corpus.append(dictionary.doc2bow(processed_docs[i]))
bow_corpus[4310]

100%|████████████████████████████████████████████████████████████████████| 1244184/1244184 [00:08<00:00, 152215.06it/s]


[(162, 1), (240, 1), (292, 1), (589, 1), (838, 1), (3571, 1), (3572, 1)]

In [11]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 162 ("govt") appears 1 time.
Word 240 ("group") appears 1 time.
Word 292 ("vote") appears 1 time.
Word 589 ("local") appears 1 time.
Word 838 ("want") appears 1 time.
Word 3571 ("compulsori") appears 1 time.
Word 3572 ("ratepay") appears 1 time.


### Vectorizing the data - **TFIDF**

In [12]:
from gensim import corpora, models
tfidf = models.TfidfModel(tqdm(bow_corpus))
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

100%|███████████████████████████████████████████████████████████████████| 1244184/1244184 [00:00<00:00, 1248884.69it/s]


[(0, 0.5844216176085719),
 (1, 0.38716866963787633),
 (2, 0.5013820927104505),
 (3, 0.5071171375845095)]


### Training the model 

In [13]:
lda_model = gensim.models.LdaMulticore(tqdm(bow_corpus), num_topics=10, id2word=dictionary, passes=2, workers=7)

100%|█████████████████████████████████████████████████████████████████████| 1244184/1244184 [00:32<00:00, 38054.95it/s]


In [14]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.018*"donald" + 0.015*"minist" + 0.014*"hospit" + 0.010*"drive" + 0.009*"hobart" + 0.008*"hunter" + 0.008*"river" + 0.008*"northern" + 0.008*"award" + 0.008*"video"
Topic: 1 
Words: 0.022*"crash" + 0.016*"die" + 0.016*"miss" + 0.016*"home" + 0.014*"perth" + 0.013*"bushfir" + 0.012*"death" + 0.012*"woman" + 0.010*"polic" + 0.010*"search"
Topic: 2 
Words: 0.021*"interview" + 0.019*"open" + 0.018*"market" + 0.017*"record" + 0.014*"final" + 0.012*"australian" + 0.011*"australia" + 0.010*"fall" + 0.009*"share" + 0.009*"guilti"
Topic: 3 
Words: 0.021*"elect" + 0.020*"chang" + 0.014*"hous" + 0.013*"busi" + 0.013*"council" + 0.012*"say" + 0.012*"labor" + 0.011*"farmer" + 0.011*"plan" + 0.011*"speak"
Topic: 4 
Words: 0.031*"queensland" + 0.017*"world" + 0.016*"win" + 0.013*"time" + 0.011*"royal" + 0.010*"beat" + 0.010*"australia" + 0.009*"commiss" + 0.008*"andrew" + 0.008*"port"
Topic: 5 
Words: 0.031*"court" + 0.027*"charg" + 0.024*"case" + 0.024*"murder" + 0.022*"face" + 0.0

### Training the model using **TFIDF**

In [15]:
len(corpus_tfidf)

1244184

In [16]:
lda_model_tfidf = gensim.models.LdaMulticore(tqdm(corpus_tfidf), num_topics=10, id2word=dictionary, passes=2, workers=7)


100%|█████████████████████████████████████████████████████████████████████| 1244184/1244184 [00:38<00:00, 32521.61it/s]


In [17]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.020*"news" + 0.016*"interview" + 0.014*"rural" + 0.012*"covid" + 0.011*"drum" + 0.010*"speak" + 0.009*"coronavirus" + 0.008*"andrew" + 0.008*"lockdown" + 0.007*"tuesday"
Topic: 1 Word: 0.008*"morrison" + 0.008*"stori" + 0.008*"wednesday" + 0.007*"david" + 0.007*"farmer" + 0.006*"coronavirus" + 0.006*"drought" + 0.006*"quarantin" + 0.005*"australia" + 0.005*"novemb"
Topic: 2 Word: 0.011*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"plan" + 0.006*"council" + 0.005*"budget" + 0.005*"feder" + 0.005*"industri" + 0.005*"say" + 0.004*"sport"
Topic: 3 Word: 0.013*"polic" + 0.010*"crash" + 0.009*"death" + 0.008*"die" + 0.008*"climat" + 0.008*"woman" + 0.007*"coast" + 0.007*"fatal" + 0.006*"grandstand" + 0.006*"age"
Topic: 4 Word: 0.013*"donald" + 0.007*"thursday" + 0.007*"market" + 0.006*"australian" + 0.006*"share" + 0.006*"wall" + 0.006*"extend" + 0.006*"korea" + 0.005*"north" + 0.005*"alan"
Topic: 5 Word: 0.022*"trump" + 0.006*"scott" + 0.006*"christma" + 0.005*"coronavi

In [18]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.3972466289997101	 
Topic: 0.021*"elect" + 0.020*"chang" + 0.014*"hous" + 0.013*"busi" + 0.013*"council" + 0.012*"say" + 0.012*"labor" + 0.011*"farmer" + 0.011*"plan" + 0.011*"speak"

Score: 0.3528382480144501	 
Topic: 0.035*"govern" + 0.017*"worker" + 0.016*"return" + 0.012*"group" + 0.012*"live" + 0.009*"say" + 0.008*"work" + 0.008*"unit" + 0.008*"right" + 0.008*"sale"

Score: 0.162348210811615	 
Topic: 0.031*"queensland" + 0.017*"world" + 0.016*"win" + 0.013*"time" + 0.011*"royal" + 0.010*"beat" + 0.010*"australia" + 0.009*"commiss" + 0.008*"andrew" + 0.008*"port"

Score: 0.012511699460446835	 
Topic: 0.017*"coast" + 0.016*"school" + 0.015*"north" + 0.015*"rise" + 0.012*"flood" + 0.012*"price" + 0.011*"south" + 0.011*"gold" + 0.011*"west" + 0.011*"farm"

Score: 0.012509656138718128	 
Topic: 0.024*"covid" + 0.024*"trump" + 0.020*"coronavirus" + 0.020*"health" + 0.016*"rural" + 0.015*"victoria" + 0.014*"report" + 0.014*"news" + 0.012*"nation" + 0.011*"australia"

Score: 0.012

In [19]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6909939646720886	 
Topic: 0.011*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"plan" + 0.006*"council" + 0.005*"budget" + 0.005*"feder" + 0.005*"industri" + 0.005*"say" + 0.004*"sport"

Score: 0.2089741826057434	 
Topic: 0.022*"trump" + 0.006*"scott" + 0.006*"christma" + 0.005*"coronavirus" + 0.005*"june" + 0.005*"cyclon" + 0.005*"pandem" + 0.005*"search" + 0.005*"water" + 0.005*"rate"

Score: 0.012504386715590954	 
Topic: 0.013*"donald" + 0.007*"thursday" + 0.007*"market" + 0.006*"australian" + 0.006*"share" + 0.006*"wall" + 0.006*"extend" + 0.006*"korea" + 0.005*"north" + 0.005*"alan"

Score: 0.012504244223237038	 
Topic: 0.015*"countri" + 0.012*"hour" + 0.010*"weather" + 0.009*"royal" + 0.007*"live" + 0.007*"commiss" + 0.007*"queensland" + 0.007*"coronavirus" + 0.007*"victoria" + 0.006*"turnbul"

Score: 0.012504222802817822	 
Topic: 0.008*"morrison" + 0.008*"stori" + 0.008*"wednesday" + 0.007*"david" + 0.007*"farmer" + 0.006*"coronavirus" + 0.006*"drought" + 0.006*"quar

### Evaluating the model using **Coherence Score** and **Perplexity** 

In [20]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='u_mass', processes=7)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  -7.260419823058737


In [21]:

print("Preplexity : ",lda_model_tfidf.log_perplexity(corpus_tfidf))

Preplexity :  -9.977694535315248


### Testing the model with unseen document

In [22]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
tfidf_vector = tfidf[bow_vector]
for index, score in sorted(lda_model_tfidf[tfidf_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.2986299395561218	 Topic: 0.010*"kill" + 0.008*"violenc" + 0.006*"truck" + 0.006*"crash" + 0.006*"domest"
Score: 0.26904547214508057	 Topic: 0.016*"murder" + 0.014*"charg" + 0.013*"court" + 0.009*"sentenc" + 0.009*"jail"
Score: 0.21315154433250427	 Topic: 0.013*"polic" + 0.010*"crash" + 0.009*"death" + 0.008*"die" + 0.008*"climat"
Score: 0.0313185378909111	 Topic: 0.011*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"plan" + 0.006*"council"
Score: 0.03131619840860367	 Topic: 0.008*"morrison" + 0.008*"stori" + 0.008*"wednesday" + 0.007*"david" + 0.007*"farmer"
Score: 0.03131333738565445	 Topic: 0.013*"donald" + 0.007*"thursday" + 0.007*"market" + 0.006*"australian" + 0.006*"share"
Score: 0.03130798041820526	 Topic: 0.012*"australia" + 0.010*"world" + 0.009*"final" + 0.008*"leagu" + 0.006*"cricket"
Score: 0.03130712732672691	 Topic: 0.015*"countri" + 0.012*"hour" + 0.010*"weather" + 0.009*"royal" + 0.007*"live"
Score: 0.031306762248277664	 Topic: 0.022*"trump" + 0.006*"scott" +

In [23]:
from datetime import datetime

def hyperparameter_tuner():
    grid = {}
    grid['Validation_Set'] = {}

    # Topics range
    min_topics = 2
    max_topics = 11
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    # Validation sets
    num_of_docs = len(corpus_tfidf)
    corpus_sets = [gensim.utils.ClippedCorpus(corpus_tfidf, int(num_of_docs*0.75)), 
                   corpus_tfidf]

    corpus_title = ['75% Corpus', '100% Corpus']

    model_results = {'Validation_Set': [],
                     'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                    }

    # Can take a long time to run
    if 1 == 1:
        pbar = tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))

        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterare through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus_sets[i], 
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
        pd.DataFrame(model_results).to_csv(f'./result-tuner/lda_tuning-result-{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}.csv', index=False)
        pbar.close()

def compute_coherence_values(corpus,k, a, b):

    lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=k, passes=1, workers=7, alpha=a, eta=b)

    coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='u_mass',processes = 15)

    return coherence_model_lda.get_coherence()

hyperparameter_tuner()

100%|██████████████████████████████████████████████████████████████████████████████| 540/540 [6:02:34<00:00, 40.29s/it]


### Complete Python code for Topic Modelling

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2023)
import nltk
nltk.download('wordnet')
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook(local=True)
from gensim.models import LdaMulticore,CoherenceModel
from tqdm import tqdm
from datetime import datetime


class DataGetter:
    def __init__(self,dataset_path):
        self.dataset_path = dataset_path
    
    def get_data(self):
        data = pd.read_csv(self.dataset_path, on_bad_lines='skip')
        data_text = data[['headline_text']]
        data_text['index'] = data_text.index
        self.dataset = data_text
        return self.dataset
    
class Preprocessor:
    def __init__(self,raw_data,language = 'english'):
        self.raw_data = raw_data
        self.language = language
        self.preprocessed_data = []
        
        
    def stemmer(self,text):
        snowball_stemmer = SnowballStemmer(language=self.language)
        return snowball_stemmer.stem(text)
    
    def lemmatizer(self,text):
        wordnet_lemmatizer = WordNetLemmatizer()
        return wordnet_lemmatizer.lemmatize(text, pos='v')

    def text_preprocessor(self,text):
        preprocessed_text = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                preprocessed_text.append(self.stemmer(self.lemmatizer(text)))
        return preprocessed_text
    
    def preprocess(self):
        tqdm.pandas()
        self.preprocessed_data = self.raw_data['headline_text'].progress_map(self.text_preprocessor)
        return self.preprocessed_data

class LDAModel:
    
    def __init__(self,preprocessed_data,vectorization_method = 'tfidf'):
        self.preprocessed_data = preprocessed_data
        self.vectorization_method = vectorization_method
        self.dictionary = gensim.corpora.Dictionary(tqdm(self.preprocessed_data))
        self.dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
        self.corpus = []
        self.coherence_score = 0
    
    def fit(self):
        if self.vectorization_method == 'bow':
            print("\n\nVectorizing........\n")
            for i in tqdm(range(len(self.preprocessed_data))):
                self.corpus.append(self.dictionary.doc2bow(self.preprocessed_data[i]))
            print("\n\nTraining LDA Model.........\n")
            self.model = LdaMulticore( self.corpus, num_topics=10, id2word=self.dictionary, passes=2, random_state=100,chunksize=100, workers=7)
            
        elif self.vectorization_method == 'tfidf':
            print("\n\nVectorizing........\n")
            for i in tqdm(range(len(self.preprocessed_data))):
                self.corpus.append(self.dictionary.doc2bow(self.preprocessed_data[i]))
            self.vectorizer = models.TfidfModel(tqdm(self.corpus))
            self.corpus = self.vectorizer[self.corpus]
            print("\n\nTraining LDA Model.........\n")
            self.model = LdaMulticore((self.corpus), num_topics=10, id2word=self.dictionary, passes=2, random_state=100, chunksize=100, workers=7)
        
        return self.model
    
    def validate_model(self):
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.preprocessed_data, dictionary=self.dictionary, coherence='u_mass', processes=7)
        self.coherence_score = coherence_model_lda.get_coherence()
        print('Coherence Score: ', self.coherence_score)
        
        self.perplexity = self.model.log_perplexity(self.corpus)
        print("Preplexity : ",self.perplexity)
        return {"coherence score":self.coherence_score,"preplexity":self.perplexity}
    
    def get_topic_for_new_data(self,new_data):
        bow_vector = self.dictionary.doc2bow(preprocess(new_data))
        if self.vectorization_method == 'tfidf':
            tfidf_vector = self.vectorizer[bow_vector]
            for index, score in sorted(self.model[tfidf_vector], key=lambda tup: -1*tup[1]):
                print("Score: {}\n Topic: {}".format(score, self.model.print_topic(index, 5)))
        elif self.vectorization_method == 'bow':
            for index, score in sorted(self.model[bow_vector], key=lambda tup: -1*tup[1]):
                print("Score: {}\n Topic: {}".format(score, self.model.print_topic(index, 5)))
            
    
    def hyperparameter_tuner(self):
        grid = {}
        grid['Validation_Set'] = {}

        # Topics range
        min_topics = 2
        max_topics = 11
        step_size = 1
        topics_range = range(min_topics, max_topics, step_size)

        # Alpha parameter
        alpha = list(np.arange(0.01, 1, 0.3))
        alpha.append('symmetric')
        alpha.append('asymmetric')

        # Beta parameter
        beta = list(np.arange(0.01, 1, 0.3))
        beta.append('symmetric')
        
        # Validation sets
        num_of_docs = len(self.corpus)
        corpus_sets = [gensim.utils.ClippedCorpus(self.corpus, int(num_of_docs*0.75)), 
                       self.corpus]

        corpus_title = ['75% Corpus', '100% Corpus']

        self.model_results = {'Validation_Set': [],
                         'Topics': [],
                         'Alpha': [],
                         'Beta': [],
                         'Coherence': []
                        }

        # Can take a long time to run
        if 1 == 1:
            pbar = tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))

            # iterate through validation corpuses
            for i in range(len(corpus_sets)):
                # iterate through number of topics
                for k in topics_range:
                    # iterate through alpha values
                    for a in alpha:
                        # iterare through beta values
                        for b in beta:
                            # get the coherence score for the given parameters
                            cv = self.compute_coherence_values(corpus=corpus_sets[i], 
                                                          k=k, a=a, b=b)
                            # Save the model results
                            self.model_results['Validation_Set'].append(corpus_title[i])
                            self.model_results['Topics'].append(k)
                            self.model_results['Alpha'].append(a)
                            self.model_results['Beta'].append(b)
                            self.model_results['Coherence'].append(cv)

                            pbar.update(1)
            pd.DataFrame(self.model_results).to_csv(f'./result-tuner/lda_tuning-result-{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}.csv', index=False)
            pbar.close()
    
    def compute_coherence_values(self, corpus,k, a, b):
    
        lda_model = LdaMulticore(corpus=corpus, id2word=self.dictionary, num_topics=k, random_state=100, chunksize=100, passes=1, workers=7, alpha=a, eta=b)

        coherence_model_lda = CoherenceModel(model=self.model, texts=self.preprocessed_data, dictionary=self.dictionary, coherence='u_mass')

        return coherence_model_lda.get_coherence()
    
    
    def get_dictionary(self):
        return self.dictionary
    
    def get_corpus(self):
        return self.corpus
    
    def get_vectorizer(self):
        return self.vectorizer


class LDAVisualizer:
    
    def __init__(self,model,vectorized_data,vectorizer):
        self.model = model
        self.vectorizer = vectorizer
        self.vectorized_data = vectorized_data
    
    def visualize_model(self):
        self.visualizer = pyLDAvis.gensim_models.prepare(self.model, self.vectorized_data, self.vectorizer)
        return self.visualizer
    
    def save_visualization(self):
        pyLDAvis.save_html(self.visualizer, f'./result-vis/lda_result-{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}.html')
    

class Driver:
    
    def __init__(self):
        pass
    
    def run_model(self):
        print("Loading Dataset.............\n")
        self.dataset = DataGetter('./abcnews-date-text.csv').get_data()
        print("Dataset Loaded!\n\n")
        print("Preprocessing Dataset........\n")
        self.preprocessed_data = Preprocessor(self.dataset).preprocess()
        print("Preprocessed Data : \n",self.preprocessed_data,end="\n")
        self.model = LDAModel(self.preprocessed_data)
        self.model.fit()
        print("Model :",self.model.model,end="\n\n")
#         for idx, topic in self.model.model.print_topics(-1):
#             print('Topic: {} \nWords: {}'.format(idx, topic))
            
        self.model.validate_model()
            
        return self.model

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Paradox\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
code_driver = Driver()
lda_model = code_driver.run_model()

Loading Dataset.............

Dataset Loaded!


Preprocessing Dataset........



 37%|█████████████████████████▋                                            | 456661/1244184 [00:44<01:13, 10742.63it/s]

In [None]:
visualizer = LDAVisualizer(lda_model.model,lda_model.get_corpus(),lda_model.get_dictionary())
vis = visualizer.visualize_model()
visualizer.save_visualization()
# pyLDAvis.display(vis)