## GENSIM LDA TOPIC MODELLING

> ### <font color=green>1. Install the necessary modules</font>

> ### <font color=green>2. Import the modules</font>

In [19]:
import logging
import re
import json
import os
from collections import namedtuple

import numpy as np
import pandas as pd
from pprint import pprint
from matplotlib import pyplot as plt
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from sklearn.model_selection import train_test_split
from timeit import default_timer

In [21]:
# Turn each preprocessed tweet string into a list of tokens by splitting on
# whitespace. Tokens are kept exactly as split; nothing is filtered here.
df['tweet_tokens'] = df['preprocessed_tweets'].map(lambda tweet: tweet.split())

df.head()

Unnamed: 0,user_id,preprocessed_tweets,tweet_tokens
0,932563433605812226,coworker tell ban live north dakota back go b...,"[coworker, tell, ban, live, north, dakota, bac..."
1,858771417382387713,today retweet second later reason summary cli...,"[today, retweet, second, later, reason, summar..."
2,936079485186822144,solve problem professor first class kid macbo...,"[solve, problem, professor, first, class, kid,..."
3,2248326669,create thiswarofmine knew affect people persp...,"[create, thiswarofmine, knew, affect, people, ..."
4,22557879,opinion first month office donald trump tell ...,"[opinion, first, month, office, donald, trump,..."


> ### <font color=green>5. Split the data set into 4 sets - 20%, 40%, 60%, 80%</font>

In [22]:
# Seed both splits so the subsets are identical after a kernel restart.
# Without a fixed seed the rows in df_80 / df_60 / df_40 change on every run,
# while create_dict_corpus() reuses dictionary/corpus files cached on disk from
# an earlier (different) split.
df_80, df_20 = train_test_split(df, test_size=0.2, random_state=42)
df_60, df_40 = train_test_split(df, test_size=0.4, random_state=42)

> ### <font color=green>6. Define a function to create directories</font>

In [23]:
def create_Directories(path):
  """Make sure ``path`` exists as a directory, creating parents as needed.

  An ``OSError`` raised by ``os.makedirs`` is ignored only when ``path`` is
  already a directory; any other failure is re-raised to the caller.
  """
  try:
    os.makedirs(path)
  except OSError:
    already_there = os.path.isdir(path)
    if already_there:
      return
    raise

In [35]:
#CONSTANTS
# Output Directory to save
#Delete Jupyter notebook root logger handler

OUTPUT_DIR = '././models'
SAVE_DIR = OUTPUT_DIR+'/df_40_p40_i50'
create_Directories(OUTPUT_DIR)
create_Directories(SAVE_DIR)
log_file = SAVE_DIR+'/lda.log'
%config Application.log_level="DEBUG"
#Create logger as usual
logger = logging.getLogger(__name__)
fhandler = logging.FileHandler(filename=log_file, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

logging.debug('test...')

In [25]:
# Use the named notebook logger (the one the file handler is attached to);
# logging.debug() targets the root logger, which drops DEBUG records by default.
logging.getLogger(__name__).debug('Running......')

> ### <font color=green>6. Create and save the dictionary and Mm_corpus (term-document frequency) model</font>

In [26]:
def create_dict_corpus(doc_list, fname, OUTPUT_DIR=OUTPUT_DIR):
  '''Build (or reload) the gensim dictionary and Matrix Market corpus for a token list.

  Parameters:
    doc_list   -- list of documents, each a list of string tokens. It is iterated
                  twice when the files are built, so it must be re-iterable.
    fname      -- base file name; '<fname>.dict' and '<fname>.mm' are used inside
                  OUTPUT_DIR.
    OUTPUT_DIR -- directory holding the two files.

  Returns:
    (dictionary, mm_corpus) -- the corpora.Dictionary and the corpora.MmCorpus
    opened from the serialized '.mm' file.

  The saved files are reused only when BOTH of them exist. Checking the '.dict'
  file alone would try to open a missing '.mm' file if an earlier run stopped
  between the two writes.
  NOTE(review): the cache is keyed by file name only; it is not invalidated when
  doc_list changes between runs.
  '''
  dict_path = OUTPUT_DIR + '/' + fname + '.dict'
  corpus_path = OUTPUT_DIR + '/' + fname + '.mm'

  if os.path.exists(dict_path) and os.path.exists(corpus_path):
      dictionary = corpora.Dictionary.load(dict_path)
  else:
      dictionary = corpora.Dictionary(doc_list)
      dictionary.save(dict_path)
      doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_list]
      corpora.MmCorpus.serialize(corpus_path, doc_term_matrix)

  # Always hand back the on-disk corpus, whether freshly written or cached.
  mm_corpus = corpora.MmCorpus(corpus_path)
  return (dictionary, mm_corpus)

> ### <font color=green>7. Define LDA Multicore on the corpus</font>

In [27]:
def run_lda(corpus, dictionary, texts, start_topic=10, end_topic=100, step_size_of_topic=10, passes=1, iterations=50,
            random_state=None):
  '''Train one LdaMulticore model per topic count and score each of them.

  Parameters:
    corpus             -- bag-of-words corpus passed to LdaMulticore, log_perplexity
                          and the u_mass coherence model.
    dictionary         -- id2word mapping for the models.
    texts              -- tokenized documents used by the c_uci and c_v coherence models.
    start_topic, end_topic, step_size_of_topic
                       -- topic counts tried are range(start_topic, end_topic+1, step).
    passes, iterations -- forwarded to LdaMulticore; also embedded in the column names.
    random_state       -- optional seed forwarded to LdaMulticore so a run can be
                          reproduced; None keeps the library default.

  Returns:
    A namedtuple 'models' with fields:
      lda_models -- dict mapping topic count -> trained model
      eval_frame -- DataFrame with one row per topic count and the columns
                    Num_Topics, Log_Perplexity, Topic_Coherence(u_mass|c_uci|c_v),
                    the metric columns being suffixed with '_P_<passes>_I_<iterations>'.
  '''
  suffix = 'P_{0}_I_{1}'.format(passes, iterations)
  columns = ['Num_Topics',
             'Log_Perplexity_' + suffix,
             'Topic_Coherence(u_mass)_' + suffix,
             'Topic_Coherence(c_uci)_' + suffix,
             'Topic_Coherence(c_v)_' + suffix]

  lda_model = dict()
  rows = []
  # Named logger instead of logging.debug(): the root logger drops DEBUG
  # records at its default WARNING level.
  logging.getLogger(__name__).debug('******* RUNNING LDA *************')
  for num_topics in range(start_topic, end_topic+1, step_size_of_topic):
    print('Running LDA for the number of topics: {}'.format(num_topics))
    model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, chunksize=2500, random_state=random_state)
    lda_model[num_topics] = model

    coh_umass = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coh_uci = CoherenceModel(model=model, texts=texts, coherence='c_uci')
    coh_cv = CoherenceModel(model=model, texts=texts, coherence='c_v')
    rows.append([num_topics, model.log_perplexity(corpus), coh_umass.get_coherence(),
                 coh_uci.get_coherence(), coh_cv.get_coherence()])

  # Build the frame once from the collected rows. Appending row by row with
  # .loc on an empty frame turned the integer topic count into a float (10.0).
  eval_frame = pd.DataFrame(rows, columns=columns)
  models = namedtuple('models',['lda_models', 'eval_frame'])
  return models(lda_model, eval_frame)

> ### <font color=green>7. Save the LDA models</font>

In [28]:
def save_model(DIR, lda_model):
  """Write every model in ``lda_model`` to ``DIR`` as '<num_topics>.lda'.

  ``lda_model`` is anything ``dict()`` accepts (a mapping or key/value pairs)
  from topic count to an object exposing ``save(path, separately=...)``.
  """
  models_by_k = dict(lda_model)
  for k in models_by_k:
    print("Saving LDA model with number of topics: {}".format(k))
    target = '/'.join([DIR, str(k) + '.lda'])
    models_by_k[k].save(target, separately=False)

> ### <font color=green>8. Main function of the program</font>

In [29]:
def main(dictionary, corpus, texts, SAVE_DIR=SAVE_DIR):
  '''Prompt for the LDA parameters, optionally run the topic sweep and save the models.

  Parameters:
    dictionary, corpus, texts -- forwarded unchanged to run_lda().
    SAVE_DIR                  -- directory passed to save_model() for the trained models.

  Returns:
    The namedtuple returned by run_lda() when the user answers 'y', otherwise
    None (nothing is trained, written or saved).

  Raises:
    ValueError if the parameter line is not five comma-separated integers.
  '''
  # raw_input only exists on Python 2; fall back to input on Python 3.
  try:
    read_line = raw_input
  except NameError:
    read_line = input

  # Defined up front so that declining the run returns None instead of
  # failing with UnboundLocalError at the return statement.
  models = None

  print('Enter the LDA parameters...\n')
  start_topic, end_topic, step_size_of_topic, passes, iterations = map(int,
                                                                       read_line('Start Topic, End_topic, step_size, passes, iterations : ').split(','))
  prompt = str(read_line('Do you wish to run LDA (y/n)? : ')).strip().lower()
  if prompt=='y':
    # Start the clock only now, so the time spent waiting at the prompts is
    # not counted as program time.
    start_time = default_timer()
    logging.getLogger(__name__).debug('Running LDA and Topic Coherence ...\n')
    models = run_lda(corpus=corpus, dictionary=dictionary, texts=texts, start_topic=start_topic, end_topic=end_topic,
                     step_size_of_topic=step_size_of_topic, passes=passes, iterations=iterations)
    end_time = default_timer()
    # Append: opening with 'w' would truncate the file that the logging
    # FileHandler is writing to and wipe the records logged so far.
    with open(log_file,'a') as f:
      f.write("############ PROGRAM EXECUTION SUMMARY ###########\n")
      f.write("----> Program start time: {0}\n".format(start_time))
      f.write("----> Program end time: {0}\n".format(end_time))
      # The value is (seconds / 3600), i.e. hours; the label now says so.
      f.write("----> Total program time taken: {0} hrs\n".format((end_time-start_time)/(60*60)))
      f.write("################################\n\n")
      f.write("---------------- PROGRAM ENDED ----------------------\n")

    #save the models
    save_model(lda_model=models.lda_models, DIR=SAVE_DIR)
  return models

> ### <font color=green>9. Define the plot function</font>

In [30]:
def plot(df, x1, x2, y1, y2, title1, title2, save=False, figsize=(20,5), OUTPUT_DIR=OUTPUT_DIR):
  '''Draw two side-by-side line plots from the columns of df.

  Parameters:
    df             -- DataFrame holding the columns to plot.
    x1, y1         -- column names for the left plot (drawn in blue).
    x2, y2         -- column names for the right plot (drawn in red).
    title1, title2 -- titles of the left / right plot. These were previously
                      accepted but ignored; an empty or None value falls back
                      to the former hard-coded title.
    save           -- when truthy, the figure is written to
                      OUTPUT_DIR + '/TopicsVsPerplexity&Coherence.png'.
    figsize        -- size of the whole figure.
    OUTPUT_DIR     -- directory for the saved image; created if missing.
  '''
  fig, axes = plt.subplots(nrows=1, ncols=2, figsize=figsize)
  df.plot(ax=axes[0], x=x1, y=y1, color='blue',
          title=title1 or 'Number of Topics(K) Vs Log Perplexity')
  df.plot(ax=axes[1], x=x2, y=y2, color='red',
          title=title2 or 'Number of Topics(K) Vs Topic Coherence')
  if save:
    create_Directories(OUTPUT_DIR)
    fig.savefig(OUTPUT_DIR+'/TopicsVsPerplexity&Coherence.png')

> ### <font color=green>10. Run LDA and plots graphs for 80% CORPUS - Passes=50, Iterations=50</font>

In [89]:
# 80% subset; results go to the 'df_80_p50_i50' folder. Passes and iterations
# are entered at the prompt shown by main().
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_80_p50_i50'])
create_Directories(OUTPUT_DIR)
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_80['tweet_tokens']), 'data_80',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_80['tweet_tokens']))
df_80_p50_i50 = models.eval_frame
print(df_80_p50_i50)
df_80_p50_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,50,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90


  diff = np.log(self.expElogbeta)


Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perplexity_P_50_I_50  Topic_Coherence(u_mass)_P_50_I_50  \
0        10.0                 -8.384001                          -0.864757   
1        20.0                 -8.335302                          -1.265985   
2        30.0                 -8.321912                          -1.895736   
3        40.0                 -8.309289                          -1.951398   
4        50.0                 -8.300436                          -1.748856   
5        60.0                 -8.303207                    

> ### <font color=green>11.Plot the graphs</font>

In [None]:
# Preview the first rows of the evaluation frame from the 80% / passes=50 run.
df_80_p50_i50.head()

> ### <font color=green>11.LDA 80% corpus for passes=40 , iterations=50</font>

In [15]:
# Give this run its own folder. Without resetting SAVE_DIR the models and
# measurements.csv of this passes=40 run are written into whatever folder the
# previously executed cell selected (the passes=50 folder in notebook order)
# and overwrite that run's results.
SAVE_DIR = OUTPUT_DIR+'/df_80_p40_i50'
create_Directories(SAVE_DIR)
(dictionary, corpus) = create_dict_corpus(list(df_80['tweet_tokens']), 'data_80', OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus, SAVE_DIR=SAVE_DIR, texts=list(df_80['tweet_tokens']))
df_80_p40_i50 = models.eval_frame
print(df_80_p40_i50)
df_80_p40_i50.to_csv(SAVE_DIR+'/measurements.csv', index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,40,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90


  diff = np.log(self.expElogbeta)


Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perplexity_P_40_I_50  Topic_Coherence(u_mass)_P_40_I_50  \
0        10.0                 -8.397411                          -0.221766   
1        20.0                 -8.330701                          -1.073522   
2        30.0                 -8.324421                          -1.381581   
3        40.0                 -8.298727                          -1.646607   
4        50.0                 -8.306759                          -1.783331   
5        60.0                 -8.296918                    

> ### <font color=green>11.LDA 60% corpus for passes=50 , iterations=50</font>

In [None]:
# 60% subset; results go to the 'df_60_p50_i50_1' folder.
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_60_p50_i50_1'])
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_60['tweet_tokens']), 'data_60',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_60['tweet_tokens']))
df_60_p50_i50 = models.eval_frame
print(df_60_p50_i50)
df_60_p50_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,50,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90
Running LDA for the number of topics: 100


> ### <font color=green>11.LDA 60% corpus for passes=40 , iterations=50</font>

In [17]:
# 60% subset; results go to the 'df_60_p40_i50' folder.
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_60_p40_i50'])
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_60['tweet_tokens']), 'data_60',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_60['tweet_tokens']))
df_60_p40_i50 = models.eval_frame
print(df_60_p40_i50)
df_60_p40_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,40,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90


  diff = np.log(self.expElogbeta)


Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perplexity_P_40_I_50  Topic_Coherence(u_mass)_P_40_I_50  \
0        10.0                 -8.377786                          -0.669523   
1        20.0                 -8.338646                          -0.925415   
2        30.0                 -8.342528                          -1.646460   
3        40.0                 -8.320786                          -1.713431   
4        50.0                 -8.325969                          -1.709168   
5        60.0                 -8.311528                    

> ### <font color=green>11.LDA 60% corpus for passes=30 , iterations=50</font>

In [31]:
# 60% subset; results go to the 'df_60_p30_i50' folder.
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_60_p30_i50'])
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_60['tweet_tokens']), 'data_60',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_60['tweet_tokens']))
df_60_p30_i50 = models.eval_frame
print(df_60_p30_i50)
df_60_p30_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,30,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90
Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perp

> ### <font color=green>11.LDA 40% corpus for passes=50 , iterations=50</font>

In [32]:
# 40% subset; results go to the 'df_40_p50_i50' folder.
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_40_p50_i50'])
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_40['tweet_tokens']), 'data_40',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_40['tweet_tokens']))
df_40_p50_i50 = models.eval_frame
print(df_40_p50_i50)
df_40_p50_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,50,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90
Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perp

In [37]:
# 40% subset; results go to the 'df_40_p40_i50' folder.
SAVE_DIR = '/'.join([OUTPUT_DIR, 'df_40_p40_i50'])
create_Directories(SAVE_DIR)
dictionary, corpus = create_dict_corpus(list(df_40['tweet_tokens']), 'data_40',
                                        OUTPUT_DIR=SAVE_DIR)
models = main(dictionary=dictionary, corpus=corpus,
              SAVE_DIR=SAVE_DIR, texts=list(df_40['tweet_tokens']))
df_40_p40_i50 = models.eval_frame
print(df_40_p40_i50)
df_40_p40_i50.to_csv('/'.join([SAVE_DIR, 'measurements.csv']), index=False)

Enter the LDA parameters...

Start Topic, End_topic, step_size, passes, iterations : 10,100,10,40,50
Do you wish to run LDA (y/n)? : y
Running LDA for the number of topics: 10
Running LDA for the number of topics: 20
Running LDA for the number of topics: 30
Running LDA for the number of topics: 40
Running LDA for the number of topics: 50
Running LDA for the number of topics: 60
Running LDA for the number of topics: 70
Running LDA for the number of topics: 80
Running LDA for the number of topics: 90
Running LDA for the number of topics: 100
Saving LDA model with number of topics: 100
Saving LDA model with number of topics: 70
Saving LDA model with number of topics: 40
Saving LDA model with number of topics: 10
Saving LDA model with number of topics: 80
Saving LDA model with number of topics: 50
Saving LDA model with number of topics: 20
Saving LDA model with number of topics: 90
Saving LDA model with number of topics: 60
Saving LDA model with number of topics: 30
   Num_Topics  Log_Perp

In [36]:
# Empty the log file: opening it for writing truncates it to zero length.
open(log_file, 'w').close()