## Much of code is credit to: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [2]:
import re
import os
import sys
import numpy as np
import pandas as pd
from pprint import pprint
import pickle

from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import nltk

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath
from gensim.models import CoherenceModel

from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import datapath

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import spacy

import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline
from nltk.metrics.spearman import *
from nltk.metrics import ContingencyMeasures
import collections

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [12]:
#gets a specified index and returns text data from dataframe 
def iterator(index):
    
    labels = ['ID','Name','Date','topicName','scrubbedtext']
    podKnow_Data = pd.DataFrame.from_records(results, columns = labels)
    
    #isolate scrubbed text values and convert to lowercase to avoid duplicates
    scrubbedData = str(podKnow_Data.iloc[index-1:index, 4].values).lower()
    
    #remove junk values
    scrubbedData = scrubbedData.replace("\"", "").replace(",", "").replace("\'",  "").splitlines()
            
    return scrubbedData

#lemmitizes words
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp("".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


#tokenizes words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        

#loads two models and computes hellinger distance betweeen them     
def computeHellingerDistance():
                
            m1 = LdaMulticore.load("model1")
            m2 = LdaMulticore.load("model2")

            mdiff, annotation =  m1.diff(m2, distance='hellinger', annotation = True)
            topic_diff = mdiff
            
            #prints hellinger distance between topics 
            print(topic_diff)
            
            #prints commonalities in words between a given topic and the topic it's being compared to 
            print(annotation)
    
        

#creates and formats data and saves models for later comparison 
def formatDataAndModel(finalBigrams, counter):
    
            #remove junk values 
            data_words_bigrams = finalBigrams.replace("(", "").replace(")", "").replace("'", " ").replace("," ,"")

            
            data_words_bigrams = data_words_bigrams.split()
            
            finalBigrams = (list(sorted((data_words_bigrams))))
            
            #separate bigrams by '_' character and combine them into a single value 
            finalBigrams = [i+ '_' + j for i,j in zip(finalBigrams[::2], finalBigrams[1::2])]
            
            data_lemmatized = lemmatization(finalBigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    
            #maps IDs to words
            id2word = corpora.Dictionary(data_lemmatized)
    
            #simply receives lemmitized text
            texts = data_lemmatized
    
            #maps new lemmitized data to IDs
            corpus = [id2word.doc2bow(text) for text in texts]
    
    
            #generates LDA model from input data 
            lda_model = LdaMulticore(
            corpus=corpus, num_topics=10, id2word=id2word,
            workers=4, eval_every=None, passes=10, batch=True)
        
            #saves LDA model with counter index 
            lda_model.save(r"model" + str(counter))
            
            if(counter == 2):
                computeHellingerDistance()
        
            
                                        

results = []
counter = 0
totalList = []


#driver code block that ideally computes distance between the only two transcripts in a folder
saveLocation = r'C:\Users\Frank Einstein\Podknow\data\transcripts\gcsst\scrubbed'
for folderName,subfolders,fileName in os.walk(r'C:\Users\Frank Einstein\Podknow\data\transcripts\gcsst\scrubbed'):
    
    
    try:   
          for file in fileName:
            if str(file.endswith(".txt_scrubbed")):
                f = open(os.path.join(folderName,file),'rb')
            data = pickle.load(f)
            
            value0, value1, value2,value3, *extraWords = file.split('_')
            value4 = data
            rows = (value0,value1,value2,value3, value4)
            results.append(rows)
            
            finalBigrams = ""
            
            counter = counter + 1
            
            #gets scrubbed data from a given counter index
            data = iterator(counter)
            
            #tokenizes data                                                
            tokens = nltk.wordpunct_tokenize(str(data))
            
            nlp = spacy.load('en', disable=['parser', 'ner'])
            
            #this block removes junk characters and only accepts words 3 letters or longer
            finder = BigramCollocationFinder.from_words(tokens)

            finder.apply_word_filter(lambda w: len(w) < 3)
            
            #find top 200 best bigrams    
            bigram_measures = nltk.collocations.BigramAssocMeasures()
            bigrams = list(sorted(finder.nbest(bigram_measures.likelihood_ratio, 20)))
            
            #convert bigrams to string for formatting
            for x in bigrams: 
                finalBigrams += str(x)
            

            formatDataAndModel(finalBigrams, counter)
            
            
    

    except:
        print("error")
              
        

[[6.00082696e-01 7.33392555e-08 1.00000000e+00 6.00082696e-01
  6.00082576e-01 6.00082636e-01 6.00082696e-01 6.00082726e-01
  6.00082636e-01 9.99999928e-01]
 [1.46678511e-07 6.00082787e-01 6.00082546e-01 1.46678511e-07
  2.07434740e-07 1.46678511e-07 6.55966243e-08 1.85535271e-07
  1.60678258e-07 6.00082666e-01]
 [1.60678258e-07 6.00082696e-01 6.00082576e-01 1.60678258e-07
  1.46678511e-07 9.27676357e-08 9.27676357e-08 2.17559390e-07
  1.46678511e-07 6.00082726e-01]
 [1.13616686e-07 6.00082696e-01 6.00082636e-01 1.13616686e-07
  1.31193249e-07 6.55966243e-08 6.55966243e-08 1.60678258e-07
  9.27676357e-08 6.00082696e-01]
 [6.00082606e-01 1.00000000e+00 7.33392555e-08 6.00082606e-01
  6.00082636e-01 6.00082606e-01 6.00082516e-01 6.00082636e-01
  6.00082636e-01 9.99999856e-01]
 [1.46678511e-07 6.00082787e-01 6.00082546e-01 1.46678511e-07
  2.07434740e-07 1.46678511e-07 6.55966243e-08 1.85535271e-07
  1.60678258e-07 6.00082666e-01]
 [1.46678511e-07 6.00082636e-01 6.00082696e-01 1.46678511e

  list([[], ['wait_winner', 'defenses_diver', 'quicken_reality', 'mix_move', 'college_condition', 'going_hand']])]]
error
