## This notebook combines tokenized data from the Natural Language Toolkit with Gensim LDA models to perform topic modeling using either the entire corpus or only a selection of bigrams. It procedurally loops through every file in a folder and outputs pyLDAvis results for each podcast. 

## Much of the code is credited to: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [68]:
import re
import os
import sys
import numpy as np
import pandas as pd
from pprint import pprint
import pickle

import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

import spacy

import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [69]:
#returns the cleaned text of the podcast at a 1-based position in the global results list
def iterator(index):
    """Return the scrubbed text stored for the ``index``-th record.

    The module-level ``results`` list is loaded into a DataFrame, the fifth
    column of row ``index - 1`` is taken as a one-row slice, and the string
    form of that slice's values is lower-cased and stripped of double
    quotes, commas and single quotes.

    Returns the cleaned string split into a list of lines.
    """
    column_names = ['ID','Name','Date','topicName','scrubbedtext']
    frame = pd.DataFrame.from_records(results, columns=column_names)

    #one-row slice of the text column; its values are stringified as a whole array
    text_cell = frame.iloc[index-1:index, 4]
    cleaned = str(text_cell.values).lower()

    #strip junk characters in the same order every time
    for junk in ("\"", ",", "\'"):
        cleaned = cleaned.replace(junk, "")

    return cleaned.splitlines()

In [3]:
#lemmatizes words
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document and keep only tokens with an allowed part of speech.

    Each entry of ``texts`` is either a plain string or a sequence of token
    strings. Every entry is run through the module-level spaCy pipeline
    ``nlp``; the lemma of each token whose ``pos_`` is in ``allowed_postags``
    is kept.

    Returns a list with one list of lemmas per entry of ``texts``.
    """
    texts_out = []
    for sent in texts:
        #a string is already a whole document; a token sequence needs spaces
        #between tokens, otherwise the words fuse into one
        text = sent if isinstance(sent, str) else " ".join(sent)
        doc = nlp(text)
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [4]:
#tokenizes words
def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each sentence is converted to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [5]:
#outputs an HTML file with pyLDAvis result
def printToHTML(ldaModel, corpus, id2word):
    """Prepare a pyLDAvis view of the model and save it as an HTML file.

    The output name comes from the module-level ``file`` variable (the file
    currently being processed by the driver loop), with ".txt_scrubbed"
    replaced by ".html". The name is handed to ``save_html`` as-is; no
    directory is prepended.
    """
    prepared_view = pyLDAvis.gensim.prepare(ldaModel, corpus, id2word, sort_topics=False)
    pyLDAvis.enable_notebook()

    pyLDAvis.save_html(prepared_view, file.replace(".txt_scrubbed", ".html"))

In [65]:
#lemmatizes the given terms, builds the dictionary and corpus, trains an LDA model and writes it to HTML
def formatDataAndModel(finalBigrams):
    """Train an LDA model on ``finalBigrams`` and export its pyLDAvis page.

    ``finalBigrams`` is any iterable of documents accepted by
    ``lemmatization`` (the driver passes either bigram strings or raw
    tokens). The result is written out by ``printToHTML``; nothing is
    returned.
    """
    #materialise the input so a generator can be passed as well
    documents = list(finalBigrams)

    data_lemmatized = lemmatization(documents, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    #maps IDs to words
    id2word = corpora.Dictionary(data_lemmatized)

    #converts each lemmatized document to its bag-of-words (id, count) form
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    #builds model; random_state is fixed so repeated runs use the same seed
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=10,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                alpha='auto',
                                                eval_every=5,
                                                per_word_topics=True,
                                                passes=20)

    printToHTML(lda_model, corpus, id2word)

In [72]:
#walks the transcript folder, loads every pickled transcript, extracts its top bigrams
#and builds one LDA model / pyLDAvis page per file
results = []
counter = 0
totalList = []


saveLocation = r'C:\Users\Jeremy\Podknow\data\transcripts\gcsst\scrubbed'

#loaded once up front: neither the spaCy pipeline nor the bigram scorer depends on the file
nlp = spacy.load('en', disable=['parser', 'ner'])
bigram_measures = nltk.collocations.BigramAssocMeasures()

for folderName,subfolders,fileName in os.walk(saveLocation):

    #the loop variable must stay named `file`: printToHTML reads it as a global
    for file in fileName:
        #skip anything that is not a pickled transcript
        if not file.endswith(".txt_scrubbed"):
            continue

        #NOTE: pickle.load can execute arbitrary code; only run this on trusted files
        with open(os.path.join(folderName,file),'rb') as f:
            data = pickle.load(f)

        #the first four underscore-separated parts of the file name are stored with the text
        value0, value1, value2,value3, *extraWords = file.split('_')
        value4 = data
        rows = (value0,value1,value2,value3, value4)
        results.append(rows)

        #counter tracks the 1-based position of the record just appended
        counter = counter + 1

        #get all scrubbed data from a given podcast index
        data = iterator(counter)

        #tokenized form of the text; not consumed further in this cell
        data_words = list(sent_to_words(data))

        tokens = nltk.wordpunct_tokenize(str(data))

        #this block removes junk characters and only accepts words 3 letters or longer
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_word_filter(lambda w: len(w) < 3)

        #find top 1000 best bigrams
        bigrams = sorted(finder.nbest(bigram_measures.raw_freq, 1000))

        finalBigrams = []

        for x in bigrams:
            #turn the tuple's string form into a single "word1_word2" term
            x = str(x).replace("')", "").replace("('", "").replace("\'",  "").replace(",", "").replace(" ", "_")
            finalBigrams.append(x)
            print(x)

        #simply comment / uncomment one of following lines to model using the whole corpus or bigrams

        #formatDataAndModel(tokens)

        formatDataAndModel(finalBigrams)
              
        

10000_month
1000000_even
107_day
1999_guys
200000_dude
2300_one
2500_calories
280000_position
80000_position
("[["_list
ability_resources
able_dude
able_took
absol_people
absolute_yeah
absolutely_dude
absolutely_true
accelerate_growth
action_secondly
actually_cut
actually_feel
actually_good
actually_hits
actually_like
actually_makes
actually_question
actually_really
actually_say
actually_think
adam_adams
adams_yet
add_something
adjustment_making
advice_like
ago_longer
ago_okay
ago_people
agree_100
agree_sure
agree_yeah
alabama_importantly
alarm_use
alison_nash
allison_martin
almost_willingly
along_said
alright_love
alternative_put
always_join
always_judged
always_nicknames
always_say
always_think
always_wants
amazing_handsome
amazing_things
amazing_would
ambition_success
amount_discipline
amounts_money
andy_company
andy_frisella
andy_keynote
anger_problem
announced_three
announcer_dude
announcer_quote
another_aspect
another_one
answered_question
anybody_guys
anybody_planet
anything_bas

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


0dc_acceleration
100_things
1000_packages
141_rejoin
14th_february
14th_may
1700_like
17000_square
194_echo
200_miles
200_people
2000_square
2018_atp
2020_soon
24000_year
298_million
298000298_million
2nd_decade
300_million
3000_ends
3rd_dimension
400_500
40s_leg
4th_5th
500_hours
500_pairs
5000_cans
5000_kansas
5000_name
5th_cindy
700_850
7000_leather
8000_bond
8000_say
850_australian
("[["_list
abandoned_actually
abandoned_factory
abandoned_know
abandoned_mills
abandoned_much
abandoned_people
abandoned_showing
ability_answer
able_apply
able_carry
able_compare
able_invested
able_make
able_phenomenal
able_really
able_right
able_say
able_see
able_sleep
abrasion_detector
abrasion_movement
abrasion_resistance
absolute_absolutely
absolute_hardcore
absolutely_awesome
absolutely_capitalist
absolutely_knew
absolutely_performance
absolutely_remember
absolutely_right
absorb_like
acceleration_girl
accepted_friends
access_west
accomplish_mission
achieve_know
achieve_next
across_board
across_broad

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
