# Installations and Libraries

In [106]:
import time 
# Time the running of everything
start_of_notebook_time = time.time()

# To ignore warnings in output 
import warnings
warnings.filterwarnings('ignore', category = DeprecationWarning)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [2]:
# Installations - Please read the instructions carefully
# Uncomment below two lines if you are using Google Colab.
#import sys
#if 'google.colab' in sys.modules:
'''
!pip install emoji --upgrade
!pip install pandas-profiling==2.*
!pip install plotly==4.*
!python -m spacy download en_core_web_lg
!pip install pyldavis
!pip install gensim
!pip install chart_studio
'''
# Uncomment all the multi-line comments to install required packages

'\n!pip install emoji --upgrade\n!pip install pandas-profiling==2.*\n!pip install plotly==4.*\n!python -m spacy download en_core_web_lg\n!pip install pyldavis\n!pip install gensim\n!pip install chart_studio\n'

In [3]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [4]:
#Setting up chart studios to save visualizations
Username = 'so-me'
api_key = 'MnGv47xSLbpMq7mDjvLT'

chart_studio.tools.set_credentials_file(username=Username, api_key=api_key)

  and should_run_async(code)


In [135]:
response_folder = 'C:/Users/sidpa/1_PROJECT'

# Data Cleaning

In [5]:
# Loading the JSON file 
url_elon = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/elonmusk_followers_english.json'

df = requests.get(url_elon).json()

# Converting the dataset to pandas DataFrame and renaming the columns 
df = pd.DataFrame(df.values())
df = df.rename(columns={0:'original_tweets'})

#Removing emojies from text
#Reference 1 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Reference 2 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df

  and should_run_async(code)


Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.
...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...


In [6]:
# Execute this by uncommenting it to download spacy if there is some error, else IGNORE!!
#!python -m spacy download en_core_web_lg

  and should_run_async(code)


# Tokenizing

In [7]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab

nlp = spacy.load('en_core_web_lg')

  and should_run_async(code)


In [8]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

print(nlp.Defaults.stop_words)

# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# Timing End
program_end_time = time.time()

# View df
df

  and should_run_async(code)


{'after', 'call', 'ten', "'ll", 'again', 'and', 'it', 'except', 'everyone', '‘d', 'either', 'within', 'without', 'an', 'around', 'just', 'no', 'cannot', 'forty', 'himself', 'perhaps', 'each', 'therein', 'while', 'when', 'off', 'thereby', 'done', 'this', 'fifty', 'whither', "n't", 'even', 'along', 'across', 'did', 'though', 'first', 'put', 'herein', 'has', 'whence', 'front', 'six', 'than', 'with', 'somehow', 'therefore', 'somewhere', '‘ll', 'all', 'several', "'s", 'both', 'once', 'meanwhile', 'you', 'why', 'rather', 'of', 'eleven', 'everything', 'becomes', 'itself', 'full', 'to', 'hereafter', 'about', 'thence', 'namely', 'others', 'back', 'fifteen', 'her', 'might', 'could', 'beyond', 'n‘t', 'many', 'beside', 'was', 'sometimes', 'do', 'n’t', "'ve", 'none', 'same', 'regarding', 'further', 'whereupon', 'doing', 'moreover', 'that', 'the', 'indeed', 'nevertheless', 'on', 'thereupon', 'being', 'until', 'amongst', 'everywhere', 'hereupon', 'whom', 'although', 'us', 'anyhow', 'then', 'sixty', '

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂,"[kid, forever, legend, 😂]"
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","[truly, believe, lebrons, mindset,, competitiv..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,"[buttlicker!, prices, lower!!!]"
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️,"[@bhuvan_bam, ❤️❤️]"
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.,"[crying, you're, crying.]"
...,...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,"[@mirandasleeper, offerings, f-3, closed, prev..."
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,"[defining, segment, “the, dance.”]"
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","[@frank_miskelly, don’t, it,, love, it!, think..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,"[excited, brother, @shufly10, embarks, new, jo..."


In [9]:
# See how long it took
print(program_end_time - program_start_time, "seconds to finish")

2.3106327056884766 seconds to finish


  and should_run_async(code)


# Lemmatization

In [10]:
# Reference 4 : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column

# Timing Start
program_start_time = time.time()

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
#df[['original_tweet', 'lemmas_back_to_text']]

# Timing End
program_end_time = time.time()

  and should_run_async(code)


In [11]:
#Printing Lemmetization Time
print(program_end_time - program_start_time, "seconds to finish")

86.73705101013184 seconds to finish


  and should_run_async(code)


In [12]:
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# Timing End
program_end_time = time.time()

# View those tokens (the 4th column)
df

  and should_run_async(code)
  tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
  tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $


Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend 😂,"[kid, forever, legend, 😂]",kid forever legend 😂,"[kid, forever, legend, 😂]",kid forever legend 😂,"[kid, forever, legend, 😂]"
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","[truly, believe, lebrons, mindset,, competitiv...","truly believe lebrons mindset, competitive fir...","[truly, believe, lebrons, mindset, competitive...",truly believe lebrons mindset competitive fire...,"[truly, believe, lebrons, mindset, competitive..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,"[buttlicker!, prices, lower!!!]",buttlicker! prices lower!!!,"[buttlicker, price, lower]",buttlicker price lower,"[buttlicker, price, lower]"
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam ❤️❤️,"[@bhuvan_bam, ❤️❤️]",@bhuvan_bam ❤️❤️,"[@bhuvan_bam, ❤, ️, ❤, ️]",@bhuvan_bam ❤ ️ ❤ ️,"[bhuvan_bam, ❤, ️, ❤, ️]"
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.,"[crying, you're, crying.]",crying you're crying.,"[cry, cry]",cry cry,"[cry, cry]"
...,...,...,...,...,...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,"[@mirandasleeper, offerings, f-3, closed, prev...",@mirandasleeper offerings f-3 closed previous ...,"[@mirandasleeper, offering, f-3, close, previo...",@mirandasleeper offering f-3 close previous of...,"[mirandasleeper, offering, f-3, close, previou..."
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,"[defining, segment, “the, dance.”]",defining segment “the dance.”,"[define, segment, dance]",define segment dance,"[define, segment, dance]"
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","[@frank_miskelly, don’t, it,, love, it!, think...","@frank_miskelly don’t it, love it! think sixth...","[@frank_miskelly, love, think, sixth, old, run...",@frank_miskelly love think sixth old run middl...,"[frank_miskelly, love, think, sixth, old, run,..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,"[excited, brother, @shufly10, embarks, new, jo...","excited brother @shufly10 embarks new journey,...","[excited, brother, @shufly10, embark, new, jou...",excited brother @shufly10 embark new journey o...,"[excited, brother, shufly10, embark, new, jour..."


In [13]:
#Printing Tokenization Time
print(program_end_time - program_start_time, "seconds to finish")


0.6293168067932129 seconds to finish


  and should_run_async(code)


# Topic Modeling

## id2word

In [14]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

  and should_run_async(code)


22594


In [15]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

8594


  and should_run_async(code)


# Corpus Object & Generating Base Model Topics

In [16]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

  and should_run_async(code)


## Base Model

In [17]:
# Timing Start
base_model_program_start_time = time.time()

# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

# Timing End
base_model_program_end_time = time.time()


  and should_run_async(code)


In [18]:
#Printing First Model Time
base_model_runtime = round(base_model_program_end_time - base_model_program_start_time, 2)
print(base_model_runtime)


18.28


  and should_run_async(code)


In [19]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


  and should_run_async(code)


In [20]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]


  and should_run_async(code)


In [21]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
people good say love day need trump think time thank

------ Topic 1 ------
️ 🇺 follow new 🇸 good tweet ❤ giveaway bitcoin

------ Topic 2 ------
people know 😂 time obamagate trump president realdonaldtrump good say

------ Topic 3 ------
go 😭 come follow look know people new year love

------ Topic 4 ------
day time happy think btc work man bitcoin bad thing



  and should_run_async(code)


In [22]:
# Compute Perplexity - Perplexity = exp(-1. * log-likelihood per word)
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


  and should_run_async(code)



Perplexity:  -8.477491681779876

Coherence Score:  0.20655304514502393


# Base Model Topic Distance Visualization

In [23]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

  and should_run_async(code)


# Hyperparameter Tuning

## Grid Search

In [24]:
lemmas_df = df['lemmas_back_to_text']

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])


  and should_run_async(code)


In [25]:
gs_start_time = time.time()

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             iid=True, n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

gs_end_time = time.time()


  and should_run_async(code)


In [26]:
print(gs_end_time - gs_start_time, "seconds to finish")


415.52032923698425 seconds to finish


  and should_run_async(code)


In [27]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
# Best Model's Params:  {'learning_decay': 0.9, 'n_topics': 10}


  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -446509.2200088497
Model Perplexity:  10189.646842489858


# Hyperparameter Tuning

## Optimum number of topics

In [28]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of topics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic

  and should_run_async(code)


In [29]:
model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['lemma_tokens'],
                                                        start=2, limit=200, step=6)

  and should_run_async(code)


In [30]:
limit=200; start=2; step=6;
x_topic = range(start, limit, step)

topic_ts = {'coherence_value': coherence_values_topic,
            'number_of_topics': x_topic}

topic_chart = pd.DataFrame(data=topic_ts)

topic_fig = px.line(topic_chart, x="number_of_topics", y="coherence_value")
topic_fig.show()

  and should_run_async(code)

Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working



In [31]:
# Print the coherence scores
for m, cv in zip(x_topic, coherence_values_topic):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.1566
Num Topics = 8  has Coherence Value of 0.2015
Num Topics = 14  has Coherence Value of 0.266
Num Topics = 20  has Coherence Value of 0.2543
Num Topics = 26  has Coherence Value of 0.2508
Num Topics = 32  has Coherence Value of 0.2577
Num Topics = 38  has Coherence Value of 0.2695
Num Topics = 44  has Coherence Value of 0.2854
Num Topics = 50  has Coherence Value of 0.2856
Num Topics = 56  has Coherence Value of 0.2994
Num Topics = 62  has Coherence Value of 0.3193
Num Topics = 68  has Coherence Value of 0.3162
Num Topics = 74  has Coherence Value of 0.3332
Num Topics = 80  has Coherence Value of 0.3286
Num Topics = 86  has Coherence Value of 0.3325
Num Topics = 92  has Coherence Value of 0.34
Num Topics = 98  has Coherence Value of 0.3425
Num Topics = 104  has Coherence Value of 0.3568
Num Topics = 110  has Coherence Value of 0.3514
Num Topics = 116  has Coherence Value of 0.369
Num Topics = 122  has Coherence Value of 0.3772
Num Topics = 12


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Topic = 5

In [32]:
# Let's start with parameter tuning for the LDA model and,
# find an optimal number of topics to reach the best coherence score

# Define chunksize and passes
# Chunksize is Number of documents to be used in each training chunk
# Passes is Number of passes through the corpus during training

# Timing Start
model_topic_5_start_time = time.time()

model_topic_5 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=5,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_5_end_time = time.time()



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [33]:
#Printing First Model Time
model_topic_5_runtime = round(model_topic_5_end_time - model_topic_5_start_time, 2)
print(model_topic_5_runtime)

16.19



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [34]:
# Filtering for words 
words_topic_5 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_5.print_topics()]

# Create Topics
topics_5 = [' '.join(t[0:10]) for t in words_topic_5]

# Getting the topics
for id, t in enumerate(topics_5): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
people follow good know time retweet 😭 giveaway tweet love

------ Topic 1 ------
😂 day go come say 🇺 people 🇸 trump vote

------ Topic 2 ------
realdonaldtrump obamagate say new china time trump people look obama

------ Topic 3 ------
️ need president state know new health time trump world

------ Topic 4 ------
bitcoin work people day go think time good 🔥 new




`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [35]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_5_perplexity = model_topic_5.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_5_perplexity) 

# Compute Coherence Score
coherence_model_topic_5 = CoherenceModel(model=model_topic_5, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_5 = coherence_model_topic_5.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_5)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -8.41698108806467

Coherence Score:  0.27536694082688146


## Topic = 10

In [36]:
#Increasing number of topics to 10
#Timing Start
model_topic_10_start_time = time.time()

model_topic_10 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=10,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_10_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [37]:
#Printing First Model Time
model_topic_10_runtime = round(model_topic_10_end_time - model_topic_10_start_time, 2)
print(model_topic_10_runtime)

15.62



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [38]:
# Filtering for words 
words_topic_10 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_10.print_topics()]

# Create Topics
topics_10 = [' '.join(t[0:10]) for t in words_topic_10]

# Getting the topics
for id, t in enumerate(topics_10): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
follow people 😭 retweet giveaway good know tweet time friend

------ Topic 1 ------
come go love say think 👇 trump fuck let 😂

------ Topic 2 ------
flynn say obama case realdonaldtrump 😍 know trump new bitcoin

------ Topic 3 ------
️ need president know trump obama money health time state

------ Topic 4 ------
bitcoin work people go think day time video new bill

------ Topic 5 ------
time ️ realdonaldtrump ❤ people day great die need year

------ Topic 6 ------
🇺 🇸 🔥 coronavirus time china new know right watch

------ Topic 7 ------
thank happy good obamagate today day win people mom birthday

------ Topic 8 ------
😂 🤣 mask say get go 💪 people wear day

------ Topic 9 ------
day good bad people know look year news think time




`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [39]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_10_perplexity = model_topic_10.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_10_perplexity) 

# Compute Coherence Score
coherence_model_topic_10 = CoherenceModel(model=model_topic_10, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_10 = coherence_model_topic_10.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_10)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -8.590866552368194

Coherence Score:  0.29226481796203113


## Topic = 20

In [40]:
#Increasing number of topics to 20
#Timing Start
model_topic_20_start_time = time.time()

model_topic_20 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_20_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [41]:
#Printing First Model Time
model_topic_20_runtime = round(model_topic_20_end_time - model_topic_20_start_time, 2)
print(model_topic_20_runtime)

15.17



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [42]:
# Filtering for words 
words_20 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_20.print_topics()]

# Create Topics
topics_20 = [' '.join(t[0:10]) for t in words_20]

# Getting the topics
for id, t in enumerate(topics_20): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
follow giveaway retweet people hour ✅ good tag winner friend

------ Topic 1 ------
👇 go come think watch believe say people get biden

------ Topic 2 ------
say 😍 trump realdonaldtrump china bitcoin de president que pandemic

------ Topic 3 ------
️ president need order obama = hawzi trump ♀ ♂

------ Topic 4 ------
bitcoin btc work day block go 🔸 miss new flynn

------ Topic 5 ------
people time day need state realdonaldtrump god year lose great

------ Topic 6 ------
🇺 🇸 🔥 watch leave tell time right thank people

------ Topic 7 ------
thank happy birthday good today 👏 🚨 people video fuck

------ Topic 8 ------
😭 🤣 💪 comment 😂 people 🙏 trump ️ love

------ Topic 9 ------
bad 🎉 new people day know girl think judge good

------ Topic 10 ------
think guy year new help need people yes day time

------ Topic 11 ------
day mother great mom old far happy year office week

------ Topic 12 ------
time people go ️ week play work let see home

------ Topic 13 ------
know 


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [43]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_20_perplexity = model_topic_20.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_20_perplexity) 

# Compute Coherence Score
coherence_model_topic_20 = CoherenceModel(model=model_topic_20, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_20 = coherence_model_topic_20.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_20)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -8.827157719145498

Coherence Score:  0.32393582896626627


## Topic = 30

In [44]:
#We have had a reduction from .39 to .35 in coherence score 
#by going from 20 to 25. Let's try 30 topics and see what 
#coherence score we'll get. 

# Timing Start
model_topic_30_start_time = time.time()

model_topic_30 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=30,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_30_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [45]:
#Printing First Model Time
model_topic_30_runtime = round(model_topic_30_end_time - model_topic_30_start_time, 2)
print(model_topic_30_runtime)

15.12



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [46]:
# Filtering for words 
words_30 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_30.print_topics()]

# Create Topics
topics_30 = [' '.join(t[0:10]) for t in words_30]

# Getting the topics
for id, t in enumerate(topics_30): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
obamagate ️ ❤ people look dm control new time car

------ Topic 1 ------
drop mean lose oh learn say justice work week great

------ Topic 2 ------
😂 good 🥺 new come day need people look know

------ Topic 3 ------
bitcoin go btc work halving trump gold say day million

------ Topic 4 ------
case come people elon word end new state think way

------ Topic 5 ------
need day 👀 people time realdonaldtrump new 😈 go ‍

------ Topic 6 ------
yes people think guy year work time say new help

------ Topic 7 ------
tweet retweet go friend good reply enter time giveaway 👉

------ Topic 8 ------
know bill love 👏 good time life 🏻 bad trump

------ Topic 9 ------
️ know day let people go time good week lie

------ Topic 10 ------
win check black crossover amazon work come need white 5

------ Topic 11 ------
obama flynn biden 🔥 know unmask retweet judge follow trump

------ Topic 12 ------
👇 love come think people fuck go say player state

------ Topic 13 ------
💪 people say w


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [47]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_30_perplexity = model_topic_30.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_30_perplexity) 

# Compute Coherence Score
coherence_model_topic_30 = CoherenceModel(model=model_topic_30, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_30 = coherence_model_topic_30.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_30)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -8.989809372716431

Coherence Score:  0.3516199756784022


## Topic = 40

In [48]:
# Let's try 40 topics
# Timing Start
model_topic_40_start_time = time.time()

model_topic_40 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=40,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_40_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [49]:
#Printing First Model Time
model_topic_40_runtime = round(model_topic_40_end_time - model_topic_40_start_time, 2)
print(model_topic_40_runtime)

15.67



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [50]:
# Filtering for words 
words_40 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_40.print_topics()]

# Create Topics
topics_40 = [' '.join(t[0:10]) for t in words_40]

# Getting the topics
for id, t in enumerate(topics_40): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
️ ❤ follow good time go come ♂ ⚡ think

------ Topic 1 ------
💪 mask say people 💀 truth happen wear way covid19

------ Topic 2 ------
know love good time biden morning game think obamagate trump

------ Topic 3 ------
🎉 + birthday happy havanamayhem new go time people day

------ Topic 4 ------
😂 🥺 dm new people come trump time post angeles

------ Topic 5 ------
obamagate thank fuck 🚨 good love guy btc people happy

------ Topic 6 ------
bitcoin de que big o e go look come win

------ Topic 7 ------
people bitcoin halving today guess feel halve new pay help

------ Topic 8 ------
man money year work blah ♥ start 5 good block

------ Topic 9 ------
hawzi come dr case think la let good de tell

------ Topic 10 ------
time covid people work come open say music coronavirus police

------ Topic 11 ------
great realdonaldtrump people day thing bro time work go inittogether

------ Topic 12 ------
people new work think bill help house force day go

------ Topic 13 ----


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [51]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_40_perplexity = model_topic_40.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_40_perplexity) 

# Compute Coherence Score
coherence_model_topic_40 = CoherenceModel(model=model_topic_40, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_40 = coherence_model_topic_40.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_40)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.151189596336533

Coherence Score:  0.35579448931291824


## Topic = 50

In [52]:
# Timing Start
model_topic_50_start_time = time.time()

model_topic_50 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=50,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_50_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [53]:
#Printing First Model Time
model_topic_50_runtime = round(model_topic_50_end_time - model_topic_50_start_time, 2)
print(model_topic_50_runtime)

16.62



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [54]:
# Filtering for words 
words_50= [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_50.print_topics()]

# Create Topics
topics_50= [' '.join(t[0:10]) for t in words_50]

# Getting the topics
for id, t in enumerate(topics_50):
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
🤣 million people tell virus go rich man curve think

------ Topic 1 ------
real think talk time day new play 4 come u

------ Topic 2 ------
live ⚡ shit good 🖤 let ️ give look need

------ Topic 3 ------
follow giveaway retweet friend end ✅ 20 dr history 🔥

------ Topic 4 ------
😍 new good think year china state money go take

------ Topic 5 ------
time vaccine try watch trump people long think say covid-19

------ Topic 6 ------
better news save life house thank know contact play end

------ Topic 7 ------
death month go reopen live open tell people mask day

------ Topic 8 ------
😭 people love good thank 🇨 let 🇧 time 💗

------ Topic 9 ------
bill biden president realdonaldtrump name know social ready think new

------ Topic 10 ------
government people lockdown state let money obamagate control thing look

------ Topic 11 ------
vote time believe tweet person tara rt reade follow week

------ Topic 12 ------
woman old sorry look day deep prison coronavirus new sp


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [55]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_50_perplexity = model_topic_50.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_50_perplexity) 

# Compute Coherence Score
coherence_model_topic_50 = CoherenceModel(model=model_topic_50, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_50 = coherence_model_topic_50.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_50)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.289301062723158

Coherence Score:  0.3485958902415555


## Topic = 60

In [56]:
# Timing Start
model_topic_60_start_time = time.time()

model_topic_60 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=60,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_60_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [57]:
#Printing First Model Time
model_topic_60_runtime = round(model_topic_60_end_time - model_topic_60_start_time, 2)
print(model_topic_60_runtime)

17.36



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [58]:
# Filtering for words 
words_60 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_60.print_topics()]

# Create Topics
topics_60 = [' '.join(t[0:10]) for t in words_60]

# Getting the topics
for id, t in enumerate(topics_60): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
u state song sure deep think reopen 💗 control roast

------ Topic 1 ------
bitcoin btc block halve people halving step video coronavirus buy

------ Topic 2 ------
people crazy think look watch thing fortnite stop good ability

------ Topic 3 ------
find learn new 5 speak wake people love rt pick

------ Topic 4 ------
️ ❤ happy birthday 🎉 love havanamayhem ♀ 🤷 ♂

------ Topic 5 ------
great day fuck shit mother mom look hey weekend enjoy

------ Topic 6 ------
truth good votenakanelua people great thejtlewis lauramcf76 sophiavetare jtlewisct look

------ Topic 7 ------
youtube need account to delete video report man surprise go

------ Topic 8 ------
people 💀 stay bitcoin uk go true say way samydindane

------ Topic 9 ------
year work man word go people stay opportunity study course

------ Topic 10 ------
tweet fake hope retweet time person nice go minute people

------ Topic 11 ------
help work say love vaccine india year covid19 new dump

------ Topic 12 -----


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [59]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_60_perplexity = model_topic_60.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_60_perplexity) 

# Compute Coherence Score
coherence_model_topic_60 = CoherenceModel(model=model_topic_60, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_60 = coherence_model_topic_60.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_60)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.400630906327303

Coherence Score:  0.3897193625904848


## Topic = 70

In [60]:
# Timing Start
model_topic_70_start_time = time.time()

model_topic_70 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=70,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_topic_70_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [61]:
#Printing First Model Time
model_topic_70_runtime = round(model_topic_70_end_time - model_topic_70_start_time, 2)
print(model_topic_70_runtime)

19.53



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [62]:
# Filtering for words 
words_70 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_70.print_topics()]

# Create Topics
topics_70 = [' '.join(t[0:10]) for t in words_70]

# Getting the topics
for id, t in enumerate(topics_70): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
happy birthday 🎉 work people havanamayhem fucking day man oh

------ Topic 1 ------
china rt come tweet know time follow go look today

------ Topic 2 ------
tweet reply friend 💞 enter 👉 giveaway entry click 🔗

------ Topic 3 ------
time china new coronavirus illegal stimulus bill uk big house

------ Topic 4 ------
break 🎊 🎉 ass phone people day offer notice share

------ Topic 5 ------
🤣 d virus 🔸 people go patient covid datum florida

------ Topic 6 ------
follow retweet giveaway tag friend hour end enter winner ✅

------ Topic 7 ------
😍 💪 👇 🚨 🏻 btc 🏼 good work people

------ Topic 8 ------
😭 say get month way people new to laugh look

------ Topic 9 ------
| hold people learn donaldtrumpisthetypeofguy know watch account good play

------ Topic 10 ------
mask open wear america people right let wake good add

------ Topic 11 ------
+ people rich na probably heart rt look support 👏

------ Topic 12 ------
😈 💀 good 🔸 look state today pussy win forget

------ Topi


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [63]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_70_perplexity = model_topic_70.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_70_perplexity) 

# Compute Coherence Score
coherence_model_topic_70 = CoherenceModel(model=model_topic_70, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_70 = coherence_model_topic_70.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_70)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.582054836397157

Coherence Score:  0.3869959632327602


In [64]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of topics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [65]:
model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['lemma_tokens'],
                                                        start=2, limit=200, step=6)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [66]:
limit=200; start=2; step=6;
x_topic = range(start, limit, step)

topic_ts = {'coherence_value': coherence_values_topic,
            'number_of_topics': x_topic}

topic_chart = pd.DataFrame(data=topic_ts)

topic_fig = px.line(topic_chart, x="number_of_topics", y="coherence_value")
topic_fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [67]:
# Print the coherence scores
for m, cv in zip(x_topic, coherence_values_topic):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.1627
Num Topics = 8  has Coherence Value of 0.2206
Num Topics = 14  has Coherence Value of 0.2351
Num Topics = 20  has Coherence Value of 0.238
Num Topics = 26  has Coherence Value of 0.2564
Num Topics = 32  has Coherence Value of 0.2802
Num Topics = 38  has Coherence Value of 0.284
Num Topics = 44  has Coherence Value of 0.2784
Num Topics = 50  has Coherence Value of 0.2915
Num Topics = 56  has Coherence Value of 0.2981
Num Topics = 62  has Coherence Value of 0.2883
Num Topics = 68  has Coherence Value of 0.3065
Num Topics = 74  has Coherence Value of 0.3061
Num Topics = 80  has Coherence Value of 0.3268
Num Topics = 86  has Coherence Value of 0.3339
Num Topics = 92  has Coherence Value of 0.3324
Num Topics = 98  has Coherence Value of 0.3413
Num Topics = 104  has Coherence Value of 0.3544
Num Topics = 110  has Coherence Value of 0.3559
Num Topics = 116  has Coherence Value of 0.3746
Num Topics = 122  has Coherence Value of 0.3655
Num Topics = 


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [68]:
# Let's keep track of our progress

topic_ts = {'model_iteration':[1,1,1,1,1,1,1,1,1],
            'model': [base_model,model_topic_5,model_topic_10,model_topic_20, model_topic_30, model_topic_40, model_topic_50, model_topic_60, model_topic_70],
      'runtime_seconds': [base_model_runtime, model_topic_5_runtime,
                          model_topic_10_runtime, model_topic_20_runtime,
                          model_topic_30_runtime, model_topic_40_runtime,
                          model_topic_50_runtime, model_topic_60_runtime,
                          model_topic_70_runtime],
      'coherence_score': [coherence_lda_model_base, coherence_lda_model_topic_5,
                          coherence_lda_model_topic_10, coherence_lda_model_topic_20,
                          coherence_lda_model_topic_30, coherence_lda_model_topic_40,
                          coherence_lda_model_topic_50, coherence_lda_model_topic_60,
                          coherence_lda_model_topic_70],
      'perplexity': [base_perplexity, model_topic_5_perplexity, 
                     model_topic_10_perplexity, model_topic_20_perplexity,
                     model_topic_30_perplexity, model_topic_40_perplexity,
                     model_topic_50_perplexity, model_topic_60_perplexity,
                     model_topic_70_perplexity],
      'number_of_topics': [base_model.num_topics, model_topic_5.num_topics,
                           model_topic_10.num_topics, model_topic_20.num_topics,
                           model_topic_30.num_topics, model_topic_40.num_topics, 
                           model_topic_50.num_topics, model_topic_60.num_topics,
                           model_topic_70.num_topics],
            'passes': [base_model.passes, model_topic_5.passes,
                           model_topic_10.passes, model_topic_20.passes,
                           model_topic_30.passes, model_topic_40.passes, 
                           model_topic_50.passes, model_topic_60.passes,
                           model_topic_70.passes]}

topic_track_sheet = pd.DataFrame(data=topic_ts)

topic_track_sheet


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,model_iteration,model,runtime_seconds,coherence_score,perplexity,number_of_topics,passes
0,1,"LdaModel(num_terms=8594, num_topics=5, decay=0...",18.28,0.206553,-8.477492,5,5
1,1,"LdaModel(num_terms=8594, num_topics=5, decay=0...",16.19,0.275367,-8.416981,5,10
2,1,"LdaModel(num_terms=8594, num_topics=10, decay=...",15.62,0.292265,-8.590867,10,10
3,1,"LdaModel(num_terms=8594, num_topics=20, decay=...",15.17,0.323936,-8.827158,20,10
4,1,"LdaModel(num_terms=8594, num_topics=30, decay=...",15.12,0.35162,-8.989809,30,10
5,1,"LdaModel(num_terms=8594, num_topics=40, decay=...",15.67,0.355794,-9.15119,40,10
6,1,"LdaModel(num_terms=8594, num_topics=50, decay=...",16.62,0.348596,-9.289301,50,10
7,1,"LdaModel(num_terms=8594, num_topics=60, decay=...",17.36,0.389719,-9.400631,60,10
8,1,"LdaModel(num_terms=8594, num_topics=70, decay=...",19.53,0.386996,-9.582055,70,10


In [69]:
#Visualizing our progress
topic_fig_1 = px.line(topic_track_sheet, x="number_of_topics", y="coherence_score",
                    hover_name='perplexity', )
topic_fig_1.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Optimal number of passes

Passes = 10

In [80]:
# Topic = 68
# Passes = 10
model_topic_68_pass_10_start_time = time.time()

model_topic_68_pass_10 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

model_topic_68_pass_10_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [81]:
model_topic_68_pass_10_runtime = round(model_topic_68_pass_10_end_time - model_topic_68_pass_10_start_time, 2)
print(model_topic_68_pass_10_runtime, "seconds to finish")

17.57 seconds to finish



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [82]:
words_pass_10 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_68_pass_10.print_topics()]

# Create Topics
topics_pass_10 = [' '.join(t[0:10]) for t in words_pass_10]

# Getting the topics
for id, t in enumerate(topics_pass_10): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
people lot love go trump time new come 3 god

------ Topic 1 ------
obama flynn biden unmask official yes political unmasking request barack

------ Topic 2 ------
🎉 shit hawzi ass good bad day happy follow havanamayhem

------ Topic 3 ------
😂 mask think wear story 🤣 tell guess go know

------ Topic 4 ------
people work help time say think realdonaldtrump put read year

------ Topic 5 ------
say biden joe leave trump mask know car realdonaldtrump wear

------ Topic 6 ------
know new real day go great look history hit china

------ Topic 7 ------
people state think learn say government look hit new thank

------ Topic 8 ------
know contact white app + life trace gates 🇧 🇮

------ Topic 9 ------
🇺 🇸 hire say game good son watch fight day

------ Topic 10 ------
tweet friend reply retweet giveaway enter miss live 💞 follow

------ Topic 11 ------
🥺 good people virus americans kill need bitcoin come say

------ Topic 12 ------
mother happy 💔 day 😊 🔥 adam mufc schiff m


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [83]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_pass_10_perplexity = model_topic_68_pass_10.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_pass_10_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_pass_10 = CoherenceModel(model=model_topic_68_pass_10, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_pass_10 = coherence_model_topic_68_pass_10.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_pass_10)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.530502058915735

Coherence Score:  0.39924068263179113


Passes = 15

In [84]:
# Topic = 68
# Passes = 15
model_topic_68_pass_15_start_time = time.time()

model_topic_68_pass_15 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=15)

model_topic_68_pass_15_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [85]:
model_topic_68_pass_15_runtime = round(model_topic_68_pass_15_end_time - model_topic_68_pass_15_start_time, 2)
print(model_topic_68_pass_15_runtime, "seconds to finish")

22.46 seconds to finish



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [86]:
words_pass_15 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_68_pass_15.print_topics()]

# Create Topics
topics_pass_15 = [' '.join(t[0:10]) for t in words_pass_15]

# Getting the topics
for id, t in enumerate(topics_pass_15): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
kill have say 🤔 🎊 trump great truth mind people

------ Topic 1 ------
😭 💚 people forget 🍀 btc 6 ✅ today space

------ Topic 2 ------
long time world country internet run day self spread smta

------ Topic 3 ------
follow retweet giveaway rt ️ winner comment hour tag like

------ Topic 4 ------
votenakanelua 👀 good sophiavetare thejtlewis lauramcf76 jtlewisct live hapa_girl33 follow

------ Topic 5 ------
love happy birthday favorite wish day question work havanamayhem time

------ Topic 6 ------
know go bill watch let sound guy trump new old

------ Topic 7 ------
realdonaldtrump go pick morning tomorrow enjoy genflynn trend to let

------ Topic 8 ------
people state think learn say government look hit 😎 pay

------ Topic 9 ------
bitcoin paul jones say day game think tudor today thing

------ Topic 10 ------
bitcoin time halving people halve buy price party btc feel

------ Topic 11 ------
president investigate court have death governor let pay people think

---


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [87]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_pass_15_perplexity = model_topic_68_pass_15.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_pass_15_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_pass_15 = CoherenceModel(model=model_topic_68_pass_15, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_pass_15 = coherence_model_topic_68_pass_15.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_pass_15)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.430473979325637

Coherence Score:  0.39991417089969067


Passes = 20

In [88]:
# Topic = 68
# Passes = 20
model_topic_68_pass_20_start_time = time.time()

model_topic_68_pass_20 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=20)

model_topic_68_pass_20_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [89]:
model_topic_68_pass_20_runtime = round(model_topic_68_pass_20_end_time - model_topic_68_pass_20_start_time, 2)
print(model_topic_68_pass_20_runtime, "seconds to finish")

29.27 seconds to finish



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [90]:
words_pass_20 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_68_pass_20.print_topics()]

# Create Topics
topics_pass_20 = [' '.join(t[0:10]) for t in words_pass_20]

# Getting the topics
for id, t in enumerate(topics_pass_20): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
d 5 bezos jeff go care work 🔸 level pandemic

------ Topic 1 ------
fuck listen kind people wake day country come say small

------ Topic 2 ------
contact white app know gates trace + matic america 🇧

------ Topic 3 ------
ready elon musk paul rand xrp think need come janet

------ Topic 4 ------
read money book cry fortnite = question number people think

------ Topic 5 ------
sleep people day 500 give say 100 lie hope away

------ Topic 6 ------
🔥 vote obama democrats new voting time trump case pelosi

------ Topic 7 ------
agree not watch think ripple know stop new will money

------ Topic 8 ------
🤯 need na crazy game talk -&gt be elonmusk rn

------ Topic 9 ------
love happy birthday favorite wish day question work havanamayhem time

------ Topic 10 ------
people work help put think time read say realdonaldtrump _theaishamalik

------ Topic 11 ------
photo 📌 day 👏 people take time today 😁 good

------ Topic 12 ------
mother day happy 💔 mom 😊 🔥 mufc child adam


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [91]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_pass_20_perplexity = model_topic_68_pass_20.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_pass_20_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_pass_20 = CoherenceModel(model=model_topic_68_pass_20, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_pass_20 = coherence_model_topic_68_pass_20.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_pass_20)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.365832696934703

Coherence Score:  0.4058389790388323


Passes = 25

In [92]:
# Topic = 68
# Passes = 25
model_topic_68_pass_25_start_time = time.time()

model_topic_68_pass_25 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25)

model_topic_68_pass_25_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [93]:
model_topic_68_pass_25_runtime = round(model_topic_68_pass_25_end_time - model_topic_68_pass_25_start_time, 2)
print(model_topic_68_pass_25_runtime, "seconds to finish")

35.95 seconds to finish



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [94]:
words_pass_25 = [re.findall(r'"([^"]*)"',t[1]) for t in model_topic_68_pass_25.print_topics()]

# Create Topics
topics_pass_25 = [' '.join(t[0:10]) for t in words_pass_25]

# Getting the topics
for id, t in enumerate(topics_pass_25): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
🇺 🇸 hire son game better say lady fight xbox

------ Topic 1 ------
shit 🎉 hawzi girl ass good thread bad rock let

------ Topic 2 ------
😭 💚 people forget 🍀 space 6 force today btc

------ Topic 3 ------
ready elon musk paul rand xrp think reopen come janet

------ Topic 4 ------
president investigate restaurant court governor wow have death pay school

------ Topic 5 ------
twitter day bad come 😳 pladizow need hear look fucking

------ Topic 6 ------
⚡ head come go look tune ✨ time wait year

------ Topic 7 ------
🤣 lose covid-19 crisis support people fake job joebiden case

------ Topic 8 ------
home stay ago month social ok day order work people

------ Topic 9 ------
sleep people day 500 give say lie 100 hope reach

------ Topic 10 ------
oh bro time eye care day family go border say

------ Topic 11 ------
people work help put think time read say hard realdonaldtrump

------ Topic 12 ------
quarantine 🚨 feel good donaldtrumpisthetypeofguy time state tx usd t


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [95]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_pass_25_perplexity = model_topic_68_pass_25.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_pass_25_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_pass_25 = CoherenceModel(model=model_topic_68_pass_25, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_pass_25 = coherence_model_topic_68_pass_25.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_pass_25)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.320250737213605

Coherence Score:  0.41268954561973575


## Comparision Sheet

In [96]:
# Let's keep track of our progress

passes_ts = {'model_iteration':[2,2,2,2],
             'model': [model_topic_68_pass_10, model_topic_68_pass_15, model_topic_68_pass_20, model_topic_68_pass_25], 
      'runtime_seconds': [model_topic_68_pass_10_runtime, model_topic_68_pass_15_runtime,
                          model_topic_68_pass_20_runtime, model_topic_68_pass_25_runtime],
      'coherence_score': [coherence_lda_model_topic_68_pass_10, coherence_lda_model_topic_68_pass_15, 
                          coherence_lda_model_topic_68_pass_20, coherence_lda_model_topic_68_pass_25],
      'perplexity': [model_topic_68_pass_10_perplexity,model_topic_68_pass_15_perplexity, 
                     model_topic_68_pass_20_perplexity,model_topic_68_pass_25_perplexity],
      'number_of_topics': [model_topic_68_pass_10.num_topics,model_topic_68_pass_15.num_topics, 
                           model_topic_68_pass_20.num_topics,model_topic_68_pass_25.num_topics],
            'passes': [10,15,20,25]}

passes_track_sheet = pd.DataFrame(data=passes_ts)

passes_track_sheet


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,model_iteration,model,runtime_seconds,coherence_score,perplexity,number_of_topics,passes
0,2,"LdaModel(num_terms=8594, num_topics=68, decay=...",17.57,0.399241,-9.530502,68,10
1,2,"LdaModel(num_terms=8594, num_topics=68, decay=...",22.46,0.399914,-9.430474,68,15
2,2,"LdaModel(num_terms=8594, num_topics=68, decay=...",29.27,0.405839,-9.365833,68,20
3,2,"LdaModel(num_terms=8594, num_topics=68, decay=...",35.95,0.41269,-9.320251,68,25


In [97]:
#Visualizing our progress
passes_fig = px.line(passes_track_sheet, x="passes", y="coherence_score",
                    hover_name='perplexity')
passes_fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Check for Minimum Probability

In [98]:
#alpha = asymmetric
# For alpha = "symmetric", CHECK: model_topic_68_pass_25
model_topic_68_asymm_start_time = time.time()

model_topic_68_asymm = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       alpha = 'asymmetric')

model_topic_68_asymm_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [99]:
model_topic_68_asymm_runtime = round(model_topic_68_asymm_end_time - model_topic_68_asymm_start_time, 2)
print(model_topic_68_asymm_runtime, "seconds to finish")

38.33 seconds to finish



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [101]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_asymm_perplexity = model_topic_68_asymm.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_asymm_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_asymm = CoherenceModel(model=model_topic_68_asymm, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_asymm = coherence_model_topic_68_asymm.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_asymm)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Perplexity:  -9.187749243620516

Coherence Score:  0.4290958512381921


In [103]:
# Comparision sheet for Alpha type

alpha_ts = {'model_iteration':[3,3],
            'model': [model_topic_68_pass_25, model_topic_68_asymm], 
      'runtime_seconds': [model_topic_68_pass_25_runtime, model_topic_68_asymm_runtime],
      'coherence_score': [coherence_lda_model_topic_68_pass_25, coherence_lda_model_topic_68_asymm],
      'perplexity': [model_topic_68_pass_25_perplexity,model_topic_68_asymm_perplexity],
      'number_of_topics': [model_topic_68_pass_25.num_topics,model_topic_68_asymm.num_topics],
      'passes': [25,25],
      'alpha':['symmetric','asymmetric']}

alpha_track_sheet = pd.DataFrame(data=alpha_ts)

alpha_track_sheet


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,model_iteration,model,runtime_seconds,coherence_score,perplexity,number_of_topics,passes,alpha
0,3,"LdaModel(num_terms=8594, num_topics=68, decay=...",35.95,0.41269,-9.320251,68,25,symmetric
1,3,"LdaModel(num_terms=8594, num_topics=68, decay=...",38.33,0.429096,-9.187749,68,25,asymmetric


## Check for Decay
Decay = 0.7

In [104]:
# For decay, deafault value = 0.7, CHECK : model_topic_68_pass_25
model_topic_68_decay_start_time = time.time()

model_topic_68_decay = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       decay=0.7)

model_topic_68_decay_end_time = time.time()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [107]:
model_topic_68_decay_runtime = round(model_topic_68_decay_end_time - model_topic_68_decay_start_time, 2)
print(model_topic_68_decay_runtime, "seconds to finish")

36.95 seconds to finish


In [109]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_decay_perplexity = model_topic_68_decay.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_decay_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_decay = CoherenceModel(model=model_topic_68_decay, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_decay = coherence_model_topic_68_decay.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_decay)


Perplexity:  -9.471624985685802

Coherence Score:  0.3965738575668712


In [112]:
# Let's keep track of our progress

decay_ts = {'model_iteration':[4,4],
            'model': [model_topic_68_pass_25, model_topic_68_decay], 
      'runtime_seconds': [model_topic_68_pass_25_runtime, model_topic_68_decay_runtime],
      'coherence_score': [coherence_lda_model_topic_68_pass_25, coherence_lda_model_topic_68_decay],
      'perplexity': [model_topic_68_pass_25_perplexity,model_topic_68_decay_perplexity],
      'number_of_topics': [model_topic_68_pass_25.num_topics,model_topic_68_decay.num_topics],
      'passes': [model_topic_68_pass_25.passes, model_topic_68_decay.passes],
      'alpha':['symmetric','symmetric'],
      'decay':[model_topic_68_pass_25.decay, model_topic_68_decay.decay]}

decay_track_sheet = pd.DataFrame(data=decay_ts)

decay_track_sheet

Unnamed: 0,model_iteration,model,runtime_seconds,coherence_score,perplexity,number_of_topics,passes,alpha,decay
0,4,"LdaModel(num_terms=8594, num_topics=68, decay=...",35.95,0.41269,-9.320251,68,25,symmetric,0.5
1,4,"LdaModel(num_terms=8594, num_topics=68, decay=...",36.95,0.396574,-9.471625,68,25,symmetric,0.7


In [113]:
#Visualizing our progress
decay_fig = px.line(decay_track_sheet, x="decay", y="coherence_score",
                    hover_name='perplexity')
decay_fig.show()

## Optimal number of iterations
Iterations = 60

In [114]:
model_topic_68_iter_60_start_time = time.time()

model_topic_68_iter_60 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       decay=0.5,
                       iterations=60)

model_topic_68_iter_60_end_time = time.time()

In [116]:
model_topic_68_iter_60_runtime = round(model_topic_68_iter_60_end_time - model_topic_68_iter_60_start_time, 2)
print(model_topic_68_iter_60_runtime, "seconds to finish")

36.67 seconds to finish


In [117]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_iter_60_perplexity = model_topic_68_iter_60.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_iter_60_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_iter_60 = CoherenceModel(model=model_topic_68_iter_60, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_iter_60 = coherence_model_topic_68_iter_60.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_iter_60)


Perplexity:  -9.318638478131007

Coherence Score:  0.41447754827246847


Iterations = 70

In [118]:
model_topic_68_iter_70_start_time = time.time()

model_topic_68_iter_70 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       decay=0.5,
                       iterations=70)

model_topic_68_iter_70_end_time = time.time()

In [119]:
model_topic_68_iter_70_runtime = round(model_topic_68_iter_70_end_time - model_topic_68_iter_70_start_time, 2)
print(model_topic_68_iter_70_runtime, "seconds to finish")

37.3 seconds to finish


In [120]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_topic_68_iter_70_perplexity = model_topic_68_iter_70.log_perplexity(corpus)
print('\nPerplexity: ', model_topic_68_iter_70_perplexity) 

# Compute Coherence Score
coherence_model_topic_68_iter_70 = CoherenceModel(model=model_topic_68_iter_70, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_topic_68_iter_70 = coherence_model_topic_68_iter_70.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_topic_68_iter_70)


Perplexity:  -9.323792979027012

Coherence Score:  0.4131789134941591


In [121]:
# Comparision sheet for different iterations
iterations_ts = {'model_iteration':[5,5,5],
                 'model': [model_topic_68_pass_25, model_topic_68_iter_60, model_topic_68_iter_70], 
      'runtime_seconds': [model_topic_68_pass_25_runtime, model_topic_68_iter_60_runtime, 
                          model_topic_68_iter_70_runtime],
      'coherence_score': [coherence_lda_model_topic_68_pass_25, coherence_lda_model_topic_68_iter_60,
                          coherence_lda_model_topic_68_iter_70],
      'perplexity': [model_topic_68_pass_25_perplexity,model_topic_68_iter_60_perplexity,
                     model_topic_68_iter_70_perplexity],
      'number_of_topics': [model_topic_68_pass_25.num_topics,model_topic_68_iter_60.num_topics,
                           model_topic_68_iter_70.num_topics],
      'passes': [model_topic_68_pass_25.passes, model_topic_68_iter_60.passes, model_topic_68_iter_70.passes],
      'alpha':['symmetric','symmetric','symmetric'],
      'decay':[model_topic_68_pass_25.decay, model_topic_68_iter_60.decay, model_topic_68_iter_70.decay],
      'iterations':[model_topic_68_pass_25.iterations, model_topic_68_iter_60.iterations,
                    model_topic_68_iter_70.iterations]}

iterations_track_sheet = pd.DataFrame(data=iterations_ts)

iterations_track_sheet

Unnamed: 0,model_iteration,model,runtime_seconds,coherence_score,perplexity,number_of_topics,passes,alpha,decay,iterations
0,5,"LdaModel(num_terms=8594, num_topics=68, decay=...",35.95,0.41269,-9.320251,68,25,symmetric,0.5,50
1,5,"LdaModel(num_terms=8594, num_topics=68, decay=...",36.67,0.414478,-9.318638,68,25,symmetric,0.5,60
2,5,"LdaModel(num_terms=8594, num_topics=68, decay=...",37.3,0.413179,-9.323793,68,25,symmetric,0.5,70


In [122]:
#Visualizing our progress
iterations_fig = px.line(iterations_track_sheet, x="iterations", y="coherence_score",
                    hover_name='perplexity')
iterations_fig.show()

# Final Model

In [123]:
model_final = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       decay=0.5,
                       iterations=60)

In [124]:
# Filtering for words 
words = [re.findall('"([^"]*)"',t[1]) for t in model_final.print_topics()]

# Create Topics
topics = [' '.join(t[0:10]) for t in words]

# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
🇺 🇸 hire game better say fight series lady xbox

------ Topic 1 ------
shit 🎉 hawzi girl thread ass good bad let user

------ Topic 2 ------
😭 🤣 💚 people forget 🍀 6 space today btc

------ Topic 3 ------
ready elon paul rand musk xrp come tesla prediction think

------ Topic 4 ------
president investigate wow restaurant pay have court take need school

------ Topic 5 ------
twitter day come need 😳 great look pladizow hear place

------ Topic 6 ------
⚡ wait go come head time tune look tonight sport

------ Topic 7 ------
lose covid-19 job crisis people fake joebiden support case news

------ Topic 8 ------
home stay ago month social day ok order work 2

------ Topic 9 ------
sleep people day 500 say give lie 100 reach hope

------ Topic 10 ------
oh bro time eye day total go care border bad

------ Topic 11 ------
people work help put think time realdonaldtrump 🖤 start begin

------ Topic 12 ------
quarantine 🚨 feel good time finally donaldtrumpisthetypeofguy tx u

In [125]:
# Compute Perplexity
# a measure of how good the model is. lower the better
final_perplexity = model_final.log_perplexity(corpus)
print('\nPerplexity: ', final_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=model_final, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_final = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_final)



Perplexity:  -9.32136869599782

Coherence Score:  0.41447754827246847


# Final Model Topic Distance Visualization

In [131]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(model_final, corpus, id2word)

In [136]:
final_lda_model = open(os.path.join(response_folder, 'final_lda.html'), 'w')
pyLDAvis.save_html(data,final_lda_model)

# GENERATING DOMINANT TOPICS

In [73]:
def format_topics_sentences(ldamodel=model_final, corpus=corpus, texts=df['lemma_tokens']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=model_final, corpus=corpus, texts=df['lemma_tokens'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,63.0,0.388,"😂, mask, wear, think, heart, story, hey, los, ...","[kid, forever, legend, 😂]"
1,1,67.0,0.5014,"🤯, be, need, game, na, talk, -&gt, crazy, exac...","[truly, believe, lebrons, mindset, competitive..."
2,2,31.0,0.5073,"bitcoin, time, halving, halve, btc, price, buy...","[buttlicker, price, lower]"
3,3,52.0,0.8029,"❤, ️, good, song, u, 🙏, baby, ur, think, friend","[bhuvan_bam, ❤, ️, ❤, ️]"
4,4,39.0,0.6716,"read, book, money, cry, =, easy, number, fortn...","[cry, cry]"
5,5,18.0,0.3973,"people, curve, flatten, cure, usama__m, go, si...","[vintage, image, reimagine, 🎨, k_koi]"
6,6,50.0,0.8029,"money, block, laugh, blah, week, life, time, a...","[close, life, fall, apart]"
7,7,59.0,0.8768,"follow, retweet, giveaway, rt, comment, winner...","[4, million, sub, custom, tonight, 🤠, yay, nay]"
8,8,18.0,0.5154,"people, curve, flatten, cure, usama__m, go, si...","[country, club, foxy, know, gang, life]"
9,9,17.0,0.5736,"not, agree, watch, think, new, handle, money, ...","[stephen_kcco, thepodpmi, shahkgang, evanfoxy,..."


In [74]:
# Group top 5 sentences under each topic
sent_topics_sorted = pd.DataFrame()

sent_topics_output_grouped = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_output_grouped:
    sent_topics_sorted = pd.concat([sent_topics_sorted, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorted.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorted.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorted.head()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9875,"😭, 🤣, 💚, people, forget, 🍀, space, today, 6, btc","[libs, obama, administration, scandal, free, s..."
1,1.0,0.9635,"fuck, kill, have, say, come, 🤔, truth, cry, pe...","[fact, black, kill, 2x, man, white, white, kil..."
2,2.0,0.9682,"people, case, daily, pic, 😐, welcome, take, go...","[excited, 97, business, calc, completely, forg..."
3,3.0,0.9531,"obama, flynn, biden, unmask, official, general...","[break, federal, appeal, court, set, june, 2, ..."
4,4.0,0.9621,"lol, world, work, today, think, day, scandal, ...","[economic, mismanagement, extreme, current, re..."


In [75]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show dominant topics
df_dominant_topics


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,63.0,"😂, mask, wear, think, heart, story, hey, los, ...",304.0,0.0306
1.0,67.0,"🤯, be, need, game, na, talk, -&gt, crazy, exac...",169.0,0.0170
2.0,31.0,"bitcoin, time, halving, halve, btc, price, buy...",114.0,0.0115
3.0,52.0,"❤, ️, good, song, u, 🙏, baby, ur, think, friend",174.0,0.0175
4.0,39.0,"read, book, money, cry, =, easy, number, fortn...",128.0,0.0129
...,...,...,...,...
9941.0,46.0,"black, people, 💀, wait, criminal, fan, s, life...",,
9942.0,8.0,"listen, kind, wake, people, day, country, come...",,
9943.0,47.0,"work, pass, 🔸, issue, pay, think, say, state, ...",,
9944.0,61.0,"️, ♂, ‍, ♀, 🤷, 🤦, meme, new, season, coronavirus",,


# Final Model Topic Distance Visualization


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Using Mallet for LDA

In [77]:
import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:/mallet/mallet-2.0.8/'}) 
#You should update this path as per the path of Mallet directory on your system.
mallet_path = r'C:/mallet/mallet-2.0.8/bin/mallet.bat' 
#You should update this path as per the path of Mallet directory on your system.


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [78]:
ldamallet = gensim.models.wrappers.LdaMallet(
   mallet_path, corpus=corpus, num_topics=20, id2word=id2word
)
pprint(ldamallet.show_topics(formatted=False))


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



[(12,
  [('biden', 0.036557149467838966),
   ('question', 0.022674687644608976),
   ('joe', 0.02059231837112448),
   ('hear', 0.018741323461360482),
   ('turn', 0.018047200370198982),
   ('dr', 0.016196205460434984),
   ('issue', 0.015964831096714482),
   ('clear', 0.014345210550670985),
   ('answer', 0.014113836186950486),
   ('ass', 0.013651087459509487)]),
 (15,
  [('week', 0.03861256544502618),
   ('case', 0.03337696335078534),
   ('china', 0.028359511343804537),
   ('covid-19', 0.02574171029668412),
   ('high', 0.0231239092495637),
   ('test', 0.02181500872600349),
   ('month', 0.02137870855148342),
   ('covid19', 0.02137870855148342),
   ('number', 0.019197207678883072),
   ('bring', 0.01767015706806283)]),
 (4,
  [('day', 0.10351692103516921),
   ('today', 0.04821942048219421),
   ('happy', 0.035611590356115906),
   ('support', 0.027206370272063702),
   ('hope', 0.025215660252156602),
   ('mother', 0.0207918602079186),
   ('birthday', 0.017916390179163903),
   ('community', 0.01

In [79]:
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=df['lemma_tokens'], dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.




Coherence Score:  0.41858294943512303
