In [134]:
import re
import pandas as pd
import numpy as np
import string
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

from datetime import datetime

import pickle 
import pyLDAvis
import pyLDAvis.gensim

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel

from wordcloud import WordCloud
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from pprint import pprint

from tqdm import tqdm_notebook as tqdm_note
import tqdm


import warnings
warnings.filterwarnings('ignore')

# Import the document list

In [175]:
#import the doc_list file
#load the pickle 
filename= 'doc_spacyall'
infile= open(filename,'rb')
load_doc_spacyall = pickle.load(infile)
infile.close()

In [176]:
#import the doc_list file
#load the pickle 
filename= 'doc_nltk_bitri'
infile= open(filename,'rb')
load_doc_nltk_bitri = pickle.load(infile)
infile.close()

In [177]:
#import the doc_list file
#load the pickle 
filename= 'doc_clean_tweet'
infile= open(filename,'rb')
load_doc_clean_tweet = pickle.load(infile)
infile.close()

#### Cautions : This notebook takes a long time to Run. Thus I will not restart the kernel   

## Before buliding the LDA Model,I am trying to search for the best K topics by refering to coherence score.

In [139]:
#Function for LDA model, 
#pass the corpus with different pre-proccessing, number of topics, and number of iterition(passes)

def lda_model_fuc (filename, k, n):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=filename,
                                               id2word=words,
                                               num_topics=k, 
                                               random_state=2,
                                               update_every=1,
                                               passes=n,
                                               alpha='auto',
                                               per_word_topics=True)
    pprint(lda_model.print_topics(num_words=10))


In [140]:
#Function for conference score
#help with determine the number of topics.

def coherence (k,n,load_doc_name):
    words = corpora.Dictionary(load_doc_name)
    corpus = [words.doc2bow(doc) for doc in load_doc_name]
    lda_model = LDA(corpus=corpus, id2word=words, num_topics=k, random_state=100,
                chunksize=200, passes=n, per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=load_doc_name, dictionary=words, coherence='c_v')
    coherence_lda = round(coherence_model_lda.get_coherence(),4)
    print(f'Coherence Score of {k} Topics: {(coherence_lda)}')

## Comparing the coherence score of pre-processing `SpaCy pipelines` for 3-10 number of topics 

#### caution: the comparison of the coherence score take long time to run

In [141]:
#set the number of iterition(passes) to 20 
for num_topic in tqdm_note(range(3,11)):
    coherence (num_topic,20,load_doc_spacyall)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Coherence Score of 3 Topics: 0.3442
Coherence Score of 4 Topics: 0.3519
Coherence Score of 5 Topics: 0.3295
Coherence Score of 6 Topics: 0.3491
Coherence Score of 7 Topics: 0.3194
Coherence Score of 8 Topics: 0.309
Coherence Score of 9 Topics: 0.2965
Coherence Score of 10 Topics: 0.315



In [142]:
#set the number of iterition(passes) to 10 
for num_topic in tqdm_note(range(3,11)):
    coherence (num_topic,10,load_doc_spacyall)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Coherence Score of 3 Topics: 0.3455
Coherence Score of 4 Topics: 0.3519
Coherence Score of 5 Topics: 0.3295
Coherence Score of 6 Topics: 0.3491
Coherence Score of 7 Topics: 0.3176
Coherence Score of 8 Topics: 0.3065
Coherence Score of 9 Topics: 0.2965
Coherence Score of 10 Topics: 0.3147



## Comparing the coherence score of pre-processing `NLTK &Bigram/Trigram` for 3-10 number of topics 

#### caution: the comparison of the coherence score take long time to run

In [None]:
##set the number of iterition(passes) to 20

#for num_topic in tqdm_note(range(3,11)):
 #   coherence (num_topic,20,load_doc_nltk_bitri)

Here I will only run for 10 passes as we can see that the score with passes 10 times and the score with passes 20 are almost the same.

In [143]:
#set the number of iterition(passes) to 10 

for num_topic in tqdm_note(range(3,11)):
    coherence (num_topic,10,load_doc_nltk_bitri)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Coherence Score of 3 Topics: 0.3515
Coherence Score of 4 Topics: 0.3405
Coherence Score of 5 Topics: 0.3369
Coherence Score of 6 Topics: 0.3549
Coherence Score of 7 Topics: 0.3201
Coherence Score of 8 Topics: 0.3251
Coherence Score of 9 Topics: 0.3117
Coherence Score of 10 Topics: 0.3254



## Comparing the coherence score of pre-processing `tweet cleaning` for 3-10 number of topics 

#### caution: the comparing of the coherence score take long time to run

In [61]:
#set the number of iterition(passes) to 20 

#for num_topic in tqdm_note(range(3,11)):
 #   coherence (num_topic,20,load_doc_clean_tweet)

Coherence Score of 3 Topics: 0.3568536642512174
Coherence Score of 4 Topics: 0.3133538450797512
Coherence Score of 5 Topics: 0.32682706598055083
Coherence Score of 6 Topics: 0.2956369565190991
Coherence Score of 7 Topics: 0.3442207300846051
Coherence Score of 8 Topics: 0.331717454078419
Coherence Score of 9 Topics: 0.32139580483673746
Coherence Score of 10 Topics: 0.3270063066711949


Here I will only run for 10 passes as we can see that the score with passes 10 times and the score with passes 20 are almost the same.

In [144]:
# #set the number of iterition(passes) to 10 

for num_topic in tqdm_note(range(3,11)):
    coherence (num_topic,10,load_doc_clean_tweet)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Coherence Score of 3 Topics: 0.3688
Coherence Score of 4 Topics: 0.3353
Coherence Score of 5 Topics: 0.336
Coherence Score of 6 Topics: 0.3142
Coherence Score of 7 Topics: 0.3207
Coherence Score of 8 Topics: 0.304
Coherence Score of 9 Topics: 0.3123
Coherence Score of 10 Topics: 0.296



__Short summary of the above coherence score__

Overall the coherence score are low. The highest score is 0.3688 tweet cleaning with 3 topics
The passess didn't affect the coherence score as we can see that the score with passes 10 times and the score with passes 20 are almost the same.

It's possible to tune for the parameter k , **number of topics**, though it's not necessarily a very straightforward way to improve the model, I will try to tune the hyperparameter aiming at getting a higher coherence score.

### Tuning hyperparameter : alpha and eta(beta) are hyperparameters that affect sparsity of the topics. 

__Tuning LDA model can be tough__

More legitimate implementations of LDA have hyperparameters such as alpha. Alpha is a scaler that helps minimize an error term. Most LDA models that are implented will set this automatically and it's usually, 95% of the time a fine solution. 

As the coherence scores above are not ideal with auto alpha, I will perform a series of sensitivity tests to help determine the following model hyperparameters:
- Number of Topics (K)
- Dirichlet hyperparameter alpha: Document-Topic Density
- Dirichlet hyperparameter beta: Word-Topic Density

Alpha parameter is Dirichlet prior concentration parameter that represents document-topic density — with a higher alpha, documents are assumed to be made up of more topics and result in more specific topic distribution per document.

Beta parameter is the same prior concentration parameter that represents topic-word density — with high beta, topics are assumed to made of up most of the words and result in a more specific word distribution per topic.

In [75]:
# function for tunning coherence score
# Here i will set the passes = 20 because the coherence score are 
def coherence_tuning(corpus, load_doc_name, k, a, b):
    words = corpora.Dictionary(load_doc_name)
    corpus = [words.doc2bow(doc) for doc in load_doc_name]
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=words,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=20,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics= True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=load_doc_name, dictionary=words, coherence='c_v')
    
    #return coherence_model_lda.get_coherence()
    coherence_lda = coherence_model_lda.get_coherence()
    print(f'Coherence Score of {k} Topics: {coherence_lda}')

In [83]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 3
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.1, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.1, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]

corpus_title = ['100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

#### Cautions: the cell below take 14 hours to run

In [84]:
# Can take a long time to run
#I run this cell only once and will not run it again as it takes a very long time
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = coherence_tuning(corpus,load_doc_spacyall, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()


  0%|          | 0/60 [00:00<?, ?it/s][A
  2%|▏         | 1/60 [12:40<12:27:51, 760.53s/it][A

Coherence Score of 3 Topics: 0.31233030049287835



  3%|▎         | 2/60 [25:13<12:12:57, 758.23s/it][A

Coherence Score of 3 Topics: 0.29071935193145293



  5%|▌         | 3/60 [37:58<12:02:22, 760.40s/it][A

Coherence Score of 3 Topics: 0.3338800741499322



  7%|▋         | 4/60 [50:26<11:46:07, 756.56s/it][A

Coherence Score of 3 Topics: 0.2959705337560576



  8%|▊         | 5/60 [1:03:52<11:47:14, 771.54s/it][A

Coherence Score of 3 Topics: 0.33485752940179964



 10%|█         | 6/60 [1:17:03<11:39:26, 777.15s/it][A

Coherence Score of 3 Topics: 0.3600810768710076



 12%|█▏        | 7/60 [1:29:40<11:21:17, 771.27s/it][A

Coherence Score of 3 Topics: 0.3658920207079472



 13%|█▎        | 8/60 [1:43:46<11:27:54, 793.73s/it][A

Coherence Score of 3 Topics: 0.33998467679111344



 15%|█▌        | 9/60 [2:01:30<12:23:31, 874.73s/it][A

Coherence Score of 3 Topics: 0.37193037543286595



 17%|█▋        | 10/60 [2:19:15<12:56:22, 931.65s/it][A

Coherence Score of 3 Topics: 0.3620268528794126



 18%|█▊        | 11/60 [2:32:43<12:10:32, 894.55s/it][A

Coherence Score of 3 Topics: 0.3529998159326832



 20%|██        | 12/60 [2:45:45<11:28:42, 860.88s/it][A

Coherence Score of 3 Topics: 0.35952763638169943



 22%|██▏       | 13/60 [2:58:26<10:50:50, 830.86s/it][A

Coherence Score of 3 Topics: 0.3276435949527317



 23%|██▎       | 14/60 [3:11:39<10:28:16, 819.49s/it][A

Coherence Score of 3 Topics: 0.36023143576578304



 25%|██▌       | 15/60 [3:25:36<10:18:44, 824.99s/it][A

Coherence Score of 3 Topics: 0.38650308858002597



 27%|██▋       | 16/60 [3:39:10<10:02:31, 821.61s/it][A

Coherence Score of 3 Topics: 0.3026075470132749



 28%|██▊       | 17/60 [3:51:08<9:26:28, 790.42s/it] [A

Coherence Score of 3 Topics: 0.35965685919413976



 30%|███       | 18/60 [4:04:17<9:12:57, 789.95s/it][A

Coherence Score of 3 Topics: 0.345564927516729



 32%|███▏      | 19/60 [4:17:06<8:55:31, 783.70s/it][A

Coherence Score of 3 Topics: 0.41316498511403105



 33%|███▎      | 20/60 [4:29:24<8:33:18, 769.97s/it][A

Coherence Score of 3 Topics: 0.3770002893034549



 35%|███▌      | 21/60 [4:42:42<8:26:02, 778.53s/it][A

Coherence Score of 4 Topics: 0.40761141006306223



 37%|███▋      | 22/60 [4:56:08<8:18:16, 786.76s/it][A

Coherence Score of 4 Topics: 0.3998717720580424



 38%|███▊      | 23/60 [5:10:22<8:17:39, 807.02s/it][A

Coherence Score of 4 Topics: 0.4085525499955733



 40%|████      | 24/60 [5:24:00<8:06:08, 810.24s/it][A

Coherence Score of 4 Topics: 0.34748535615131515



 42%|████▏     | 25/60 [5:36:49<7:45:24, 797.86s/it][A

Coherence Score of 4 Topics: 0.3199072590731923



 43%|████▎     | 26/60 [5:50:35<7:36:55, 806.34s/it][A

Coherence Score of 4 Topics: 0.35827971083910215



 45%|████▌     | 27/60 [6:03:54<7:22:09, 803.92s/it][A

Coherence Score of 4 Topics: 0.36715356267838417



 47%|████▋     | 28/60 [6:18:00<7:15:37, 816.78s/it][A

Coherence Score of 4 Topics: 0.3388160695074661



 48%|████▊     | 29/60 [6:32:10<7:07:01, 826.50s/it][A

Coherence Score of 4 Topics: 0.37602138800015283



 50%|█████     | 30/60 [6:46:08<6:55:06, 830.22s/it][A

Coherence Score of 4 Topics: 0.3351375183533646



 52%|█████▏    | 31/60 [6:59:44<6:39:08, 825.80s/it][A

Coherence Score of 4 Topics: 0.35719213558467233



 53%|█████▎    | 32/60 [7:13:50<6:28:11, 831.84s/it][A

Coherence Score of 4 Topics: 0.3468843763365209



 55%|█████▌    | 33/60 [7:27:12<6:10:18, 822.91s/it][A

Coherence Score of 4 Topics: 0.31373025648235225



 57%|█████▋    | 34/60 [7:40:41<5:54:47, 818.76s/it][A

Coherence Score of 4 Topics: 0.3595171749376418



 58%|█████▊    | 35/60 [7:54:32<5:42:42, 822.50s/it][A

Coherence Score of 4 Topics: 0.4469748529454709



 60%|██████    | 36/60 [8:08:05<5:27:48, 819.53s/it][A

Coherence Score of 4 Topics: 0.32080188278184363



 62%|██████▏   | 37/60 [8:21:05<5:09:40, 807.83s/it][A

Coherence Score of 4 Topics: 0.3600177784485105



 63%|██████▎   | 38/60 [8:34:29<4:55:42, 806.49s/it][A

Coherence Score of 4 Topics: 0.37165197179708725



 65%|██████▌   | 39/60 [8:48:10<4:43:48, 810.89s/it][A

Coherence Score of 4 Topics: 0.3959035522135009



 67%|██████▋   | 40/60 [9:01:46<4:30:49, 812.47s/it][A

Coherence Score of 4 Topics: 0.3274242626281167



 68%|██████▊   | 41/60 [9:16:12<4:22:21, 828.51s/it][A

Coherence Score of 5 Topics: 0.3571042485045842



 70%|███████   | 42/60 [9:31:04<4:14:17, 847.66s/it][A

Coherence Score of 5 Topics: 0.4360955816488737



 72%|███████▏  | 43/60 [9:45:44<4:02:52, 857.19s/it][A

Coherence Score of 5 Topics: 0.41735114020465514



 73%|███████▎  | 44/60 [10:02:47<4:01:50, 906.89s/it][A

Coherence Score of 5 Topics: 0.2952045817673471



 75%|███████▌  | 45/60 [10:16:58<3:42:34, 890.30s/it][A

Coherence Score of 5 Topics: 0.3491293644766922



 77%|███████▋  | 46/60 [10:33:38<3:35:25, 923.28s/it][A

Coherence Score of 5 Topics: 0.40891121939673536



 78%|███████▊  | 47/60 [10:49:11<3:20:37, 925.96s/it][A

Coherence Score of 5 Topics: 0.42332313949064115



 80%|████████  | 48/60 [11:04:38<3:05:16, 926.38s/it][A

Coherence Score of 5 Topics: 0.35650552112991607



 82%|████████▏ | 49/60 [11:18:45<2:45:27, 902.51s/it][A

Coherence Score of 5 Topics: 0.35340402810138627



 83%|████████▎ | 50/60 [11:33:04<2:28:14, 889.40s/it][A

Coherence Score of 5 Topics: 0.40426221696806036



 85%|████████▌ | 51/60 [11:47:26<2:12:12, 881.38s/it][A

Coherence Score of 5 Topics: 0.34638352997757205



 87%|████████▋ | 52/60 [12:00:48<1:54:20, 857.51s/it][A

Coherence Score of 5 Topics: 0.3671504545298053



 88%|████████▊ | 53/60 [12:14:57<1:39:44, 854.91s/it][A

Coherence Score of 5 Topics: 0.31269540208018026



 90%|█████████ | 54/60 [12:30:39<1:28:06, 881.16s/it][A

Coherence Score of 5 Topics: 0.39189973235088493



 92%|█████████▏| 55/60 [12:45:41<1:13:56, 887.40s/it][A

Coherence Score of 5 Topics: 0.39850885392992996



 93%|█████████▎| 56/60 [13:00:19<58:57, 884.50s/it]  [A

Coherence Score of 5 Topics: 0.3392961892665832



 95%|█████████▌| 57/60 [13:14:19<43:32, 870.99s/it][A

Coherence Score of 5 Topics: 0.37148273090921224



 97%|█████████▋| 58/60 [13:29:05<29:11, 875.53s/it][A

Coherence Score of 5 Topics: 0.4325252149550566



 98%|█████████▊| 59/60 [13:43:47<14:37, 877.70s/it][A

Coherence Score of 5 Topics: 0.4308405576536586



100%|██████████| 60/60 [13:58:22<00:00, 838.37s/it][A

Coherence Score of 5 Topics: 0.3283152261973334





In [88]:
#loading the saved csv as dataframe
lda_tunning = pd.read_csv('lda_tuning_results.csv')

In [147]:
#finding the best hyperparameters
lda_tunning.iloc[34]

Validation_Set           100% Corpus
Topics                             4
Alpha                      symmetric
Beta              0.7000000000000001
Coherence                        NaN
Name: 34, dtype: object

In [94]:
#checking the best with the best hyperparaters
best_score = coherence_tuning(corpus,load_doc_spacyall,k=4, a='symmetric', b=0.7000000000000001)

Coherence Score of 4 Topics: 0.4400192893615461


#### __Summary:__
From the tuning process , I make a mistake that I set the output of `function of coherence tunning `( as ` print(f'Coherence Score of {k} Topics: {coherence_lda}'` , this is a string and that is why the value of coherence score on cvs file named `lda_tuning_results` become NaN. 
However, I am not going to rerun the cell as it takes too long time to run.

I can find the hyperparameters (number of topic is 4, alpha is symmetric, beta is 0.7000000000000001) of best score by looking at the output below, the best score after tunning is around 0.45.
It is on the index 35 of the dataframe lda_tunning.

Coherence Score of 4 Topics: 0.4469748529454709
 58%|█████▊    | 35/60 [7:54:32<5:42:42, 822.50s/it]
 

This time I only run for one input `load_doc_spacyall` due to the long running time. Originally I plan to tune the other 2 inputs `load_doc_nltk_bitri` and `load_doc_clean_tweet` as well.

Since the running time takes long time, I will first set 4 topics, alpha is auto and beta is none for each input. I will determine which input to use with the best hyperparameter. 

