# **Latent Dirichlet Allocation (LDA)-based Topic Modeling and Clustering**

In [1]:
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
from gensim import corpora,models
import time
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Mounting google drive
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
#importing the CSV file of the master reports from google drive into a dataframe
master_reports = pd.read_csv('CSV/master_reports.csv')
master_reports = master_reports.drop(columns=['Unnamed: 0'])

In [4]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [5]:
master_reports['Description'] = master_reports['Description'].map(preprocess)

### **Creating Bag of Words (BoW)**

In [6]:
#Creating a dictionary using gensim library
dictionary = gensim.corpora.Dictionary(master_reports['Description'])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [7]:
#Print top 20 words from the dictionary
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 actually
1 builders
2 change
3 comment
4 compare
5 complete
6 consider
7 consistency
8 contain
9 default
10 derive
11 document
12 editor
13 effect
14 ensure
15 external
16 extremely
17 inconsistent
18 internal
19 maintain
20 mandatory


In [8]:
#Creating BoW using the the dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in master_reports['Description']]

In [9]:
#Printing the BoW for single document
bow_doc_8 = bow_corpus[8]
for i in range(len(bow_doc_8)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_8[i][0], 
                                               dictionary[bow_doc_8[i][0]], 
bow_doc_8[i][1]))

Word 2 ("change") appears 1 time.
Word 30 ("resource") appears 1 time.
Word 48 ("expect") appears 1 time.
Word 114 ("future") appears 1 time.
Word 181 ("implementation") appears 1 time.
Word 182 ("iresource") appears 1 time.
Word 183 ("obsolete") appears 1 time.


In [10]:
# open a file, where you stored the pickled data
f= open('CSV/bow_corpus.pickle', 'wb')

# dump information to that file
pickle.dump(bow_corpus, f)

In [11]:
# open a file, where you stored the pickled data
file = open('CSV/dictionary.pickle', 'wb')

# dump information to that file
pickle.dump(dictionary, file)

### **LDA-based Topic Modeling**

In [12]:
#Preparing the parameters for LDA model
corpus = bow_corpus
no_of_topics = 10
dictionary = dictionary
p = 20
k = 2
epochs = 100

#Training the LDA model on the BoW corpus
lda_model = gensim.models.LdaMulticore(corpus, num_topics=no_of_topics, id2word=dictionary, passes=p, workers=k, iterations=epochs)

In [13]:
# save model to disk (no need to use pickle module)
lda_model.save('CSV/lda_model.model')

In [14]:
# Printing the topics and the propability distributions of words in those topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.088*"thread" + 0.056*"javathread" + 0.055*"daemon" + 0.047*"threadblocked" + 0.026*"cwindowsdll" + 0.025*"worker" + 0.022*"native" + 0.020*"symbol" + 0.020*"monitor" + 0.019*"runnable"
Topic: 1 
Words: 0.059*"dialog" + 0.052*"button" + 0.044*"action" + 0.035*"select" + 0.027*"context" + 0.023*"filter" + 0.022*"disable" + 0.022*"enable" + 0.021*"toolbar" + 0.020*"command"
Topic: 2 
Words: 0.064*"editor" + 0.027*"window" + 0.021*"select" + 0.021*"change" + 0.015*"windows" + 0.015*"character" + 0.015*"reproduce" + 0.013*"editors" + 0.013*"eclipse" + 0.013*"problem"
Topic: 3 
Words: 0.092*"public" + 0.088*"import" + 0.065*"display" + 0.031*"static" + 0.023*"snippet" + 0.021*"libclientdylib" + 0.018*"button" + 0.017*"composite" + 0.017*"column" + 0.016*"private"
Topic: 4 
Words: 0.041*"return" + 0.041*"method" + 0.022*"change" + 0.021*"create" + 0.017*"object" + 0.016*"target" + 0.015*"property" + 0.013*"remove" + 0.012*"string" + 0.012*"implement"
Topic: 5 
Words: 0.092*

In [15]:
#Let's evaluate the model using Perplexity and Coherence Bag of words- Title
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=master_reports['Description'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.395842296219169

Coherence Score:  0.6235623994694964


In [16]:
# pip install pyldavis

In [17]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [18]:
# Visualize the topics for LDA model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis

  and should_run_async(code)


### **Clustering premised on Topic Modeling**

In [19]:
#Creating 10 empty clusters and pushing master reports in each of them based on topic modeling and saving them in individual csv file
for c in range(10):
    exec('topic_{} = pd.DataFrame()'.format(c))
    for i in range(len(master_reports)):
        topic=lda_model[dictionary.doc2bow(master_reports.Description[i])]
        topic= np.asarray(topic)
        if int(topic[np.argmax(topic[:,1]),0])== c:
            exec('topic_{} = topic_{}.append(master_reports.loc[[i]])'.format(c,c))
            exec('topic_{} = topic_{}.reset_index(drop=True)'.format(c,c))
            exec('topic_{}.to_csv("CSV/topic_{}.csv")'.format(c,c))

  and should_run_async(code)
