## Much of this code is credited to: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import re
import os
import sys
import numpy as np
import pandas as pd
from pprint import pprint
import pickle

from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import nltk

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary

from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

import spacy

import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [16]:
def iterator(index):
    """Return the cleaned transcript text for the ``index``-th collected record.

    A DataFrame is built from the module-level ``results`` list, the one-row
    slice ``index-1:index`` of column 4 ('scrubbedtext') is taken, and the
    string form of its values is lower-cased and stripped of quote and comma
    characters.

    Returns the list produced by ``str.splitlines`` on the cleaned string.
    """
    column_names = ['ID', 'Name', 'Date', 'topicName', 'scrubbedtext']
    frame = pd.DataFrame.from_records(results, columns=column_names)

    # Stringify the sliced values; lower-casing avoids duplicate tokens that
    # differ only in case.
    row_values = frame.iloc[index - 1:index, 4].values
    cleaned = str(row_values).lower()

    # Strip junk characters: double quotes, commas, then single quotes.
    for junk in ("\"", ",", "\'"):
        cleaned = cleaned.replace(junk, "")

    return cleaned.splitlines()

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each document in ``texts``.

    Returns a list holding one ``bigram_mod[doc]`` result per input document,
    in input order.
    """
    transformed = []
    for tokens in texts:
        transformed.append(bigram_mod[tokens])
    return transformed

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document in ``texts``.

    Both models are read from module scope. Returns a list holding one
    ``trigram_mod[bigram_mod[doc]]`` result per input document, in input order.
    """
    transformed = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        transformed.append(trigram_mod[with_bigrams])
    return transformed

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: container of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be accidentally
            mutated and shared between calls; any container supporting
            ``in`` (list, tuple, set) is accepted.

    Returns:
        A list with one entry per input document; each entry is the list of
        ``token.lemma_`` values whose ``token.pos_`` is in ``allowed_postags``.

    Relies on a module-level ``nlp`` callable being defined before the call.
    """
    texts_out = []
    for sent in texts:
        # Re-join the tokens so the pipeline sees the document as one string.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

    
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the resulting
    token list is yielded, one per input item.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

# Per-transcript topic-modelling driver.
#
# Walks the scrubbed-transcript folder and, for every "*.txt_scrubbed" pickle:
#   1. records (ID, Name, Date, topicName, text) in `results`,
#   2. tokenizes, builds bigram/trigram phrase models and lemmatizes the text,
#   3. fits a 3-topic LDA model and computes its c_v coherence,
#   4. saves a coherence-vs-topic-count plot (.png) and a pyLDAvis page (.html).
results = []
counter = 0
totalList = []

saveLocation = r'C:\Users\Jeremy\Documents\output\scrubbed transcripts'
# NOTE(review): saveLocation is not referenced below; the .png/.html outputs
# are written relative to the current working directory.

# Sweep parameters for the coherence plot, defined once so the
# compute_coherence_values call and the x axis cannot drift apart.
limit = 40
start = 2
step = 6

# Loop-invariant setup: load the spaCy pipeline and switch pyLDAvis to
# notebook mode once instead of once per transcript.
nlp = spacy.load('en', disable=['parser', 'ner'])
pyLDAvis.enable_notebook()

for folderName, subfolders, fileName in os.walk(r'C:\Users\Jeremy\Documents\output\scrubbed transcripts\gcsst\scrubbed'):

    for file in fileName:
        # The original wrapped this test in str(), which is always truthy
        # ("False" is a non-empty string), so every file was processed.
        if not file.endswith(".txt_scrubbed"):
            continue

        # SECURITY NOTE: pickle.load can execute arbitrary code; only run this
        # on transcript files produced by a trusted pipeline.
        with open(os.path.join(folderName, file), 'rb') as f:
            data = pickle.load(f)

        # The first four underscore-separated parts of the file name are the
        # record metadata; any remaining parts are ignored.
        value0, value1, value2, value3, *extraWords = file.split('_')
        value4 = data
        rows = (value0, value1, value2, value3, value4)
        results.append(rows)

        counter = counter + 1

        # Fetch the cleaned text of the record that was just appended.
        data = iterator(counter)

        data_words = list(sent_to_words(data))

        bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        data_words_bigrams = make_bigrams(data_words)

        data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        id2word = corpora.Dictionary(data_lemmatized)

        texts = data_lemmatized

        corpus = [id2word.doc2bow(text) for text in texts]

        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=3,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    alpha=75,
                                                    eval_every=5,
                                                    per_word_topics=True,
                                                    passes=20)

        doc_lda = lda_model[corpus]

        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # NOTE(review): compute_coherence_values is not defined in this
        # section of the notebook; confirm its signature accepts the model as
        # the first positional argument.
        model_list, coherence_values = compute_coherence_values(lda_model, dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)

        # Show graph
        x = range(start, limit, step)
        plt.plot(x, coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        # Trailing comma makes this a one-element tuple; a bare parenthesised
        # string is iterated character by character, labelling the line "c".
        plt.legend(("coherence_values",), loc='best')

        pngFileName = file.replace(".txt_scrubbed", ".png")
        plt.savefig(pngFileName)

        # Reset the figure so the next transcript starts from a clean plot.
        plt.clf()
        plt.cla()
        plt.close()

        lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, sort_topics=False)

        htmlFileName = file.replace(".txt_scrubbed", ".html")
        pyLDAvis.save_html(lda_display, htmlFileName)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


KeyboardInterrupt: 

In [None]:
# Prepare a pyLDAvis visualization from a model, corpus and dictionary, then
# hand it to pyLDAvis.display.
import pyLDAvis
# NOTE(review): `lsa_model` is not defined in the visible cells (the preceding
# cell builds `lda_model`) -- confirm which model is intended here.
lsa_display = pyLDAvis.gensim.prepare(lsa_model, corpus, id2word)
pyLDAvis.display(lsa_display)