### Imports

In [1]:
#Base and Cleaning
import os
import tqdm
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
import advertools as adv
import stopwordsiso
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)

import logging

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
TWEETS_SAMPLE = '/mlodata1/raileanu/tweets_sample.txt'
TWEETS_PROCESSED = '/mlodata1/raileanu/tweets_sample_lda.csv'
MODEL_RESULTS = '/mlodata1/raileanu/lda_model_results.csv'
LDA_HTML_OUTPUT = '/mlodata1/raileanu/lda_results'
LDA_MODEL_OUTPUT = '/mlodata1/raileanu/lda_results/models'

#### If Preprocessed Tweets Exist, Skip to Modeling

In [3]:
if os.path.isfile(TWEETS_PROCESSED):
    sample_tweets_df = pd.read_csv(TWEETS_PROCESSED,
                                   dtype={"tweet": str, "string_token": str, "string_lemma": str,
                                          "lemma_token": str})
    sample_tweets_df["lemma_token"] = sample_tweets_df["lemma_token"].apply(
        lambda x: x[1:-1].split(','))
    print("Sample tweets loaded!")
    sample_tweets_df.head()

Sample tweets loaded!


### Read Sampled Tweets

In [27]:
lines = []
with open(TWEETS_SAMPLE) as file:
    lines = file.read().splitlines()

In [28]:
sample_tweets_df = pd.DataFrame(lines, columns=['tweet'])
sample_tweets_df

Unnamed: 0,tweet
0,and we thought 2021 was gon na be the end of c...
1,"user_placeholder como eu já tinha dito , a efi..."
2,user_placeholder user_placeholder this is less...
3,yep . expeditiously
4,( urgent ) 5 more soldiers test positive for c...
...,...
99728,not everything will be fun and games uh ?
99729,user_placeholder user_placeholder total embarr...
99730,user_placeholder user_placeholder imagine the ...
99731,24 કલાકમાં ગુજરાતમાં કોરોના કેસurl_placeholder


### Preprocess Tweets

In [29]:
def remove_emojis(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    return str(text).encode('ascii', 'ignore').decode('ascii')

In [30]:
%%time
# Remove emojis from the tweets
sample_tweets_df['tweet'] = sample_tweets_df['tweet'].apply(lambda x: remove_emojis(x))
sample_tweets_df

CPU times: user 90.1 ms, sys: 85 µs, total: 90.2 ms
Wall time: 86.2 ms


Unnamed: 0,tweet
0,and we thought 2021 was gon na be the end of c...
1,"user_placeholder como eu j tinha dito , a efic..."
2,user_placeholder user_placeholder this is less...
3,yep . expeditiously
4,( urgent ) 5 more soldiers test positive for c...
...,...
99728,not everything will be fun and games uh ?
99729,user_placeholder user_placeholder total embarr...
99730,user_placeholder user_placeholder imagine the ...
99731,24 url_placeholder


### Tokenization

In [31]:
%%time
nlp = spacy.load('en_core_web_lg')
multilingual_nlp = spacy.load('xx_ent_wiki_sm')
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Custom stopwords can be added
custom_stopwords = ['\n', '\n\n', '&amp;', ' ', '.', '-', '$', '@', '!', '?', '..',
                    '\'', '+', '=', '-', '~', '//', '/', ':', '+.', 'de', 'la', 'el',
                    'y', 'en', 'se', 'es', 'las', 'un', 'lo', 'si', 'ms',
                    'por', 'los', 'con', 'para', 'del', 'una', 'pero', 'todo',
                    'le', 'les', 'et', 'pas', 'des', 'pour', 'que', 'est',
                    'die', 'und', 'der', 'ist', 'das', 'nicht', 'ich', 'zu', 'den',
                    'al', 'q', 'o', 'sin', 'este', 'te', 'son', 'nos', 'c', 'ser',
                    'qu', 'esta', 'mi', 'hay', 'e', 'da', 'com', 'um', 'pra',
                    'na', 'em', 'uma', 'os', 'mais', 's', 'mas', 'tem', 't', 'j',
                    'ser', 'vai', 'ele', 'isso', 'sem', 'meu', 'foi', 'l', 'qui',
                    'd', 'ce', 'une', 'au', 'il', 'vous', 'sur', 'dans', 'par', 'p',
                    'n', 'avec', 's', ',', '.user_placeholder', 'vai', 'r', 'z',
                    'como', 'su', 'ya', 'user_placeholder', 'url_placeholder']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords).union(
    multilingual_nlp.Defaults.stop_words)
# ALL_STOP_WORDS = spacy (EN + XX) + gensim + wordcloud + advtools + stopwordsiso
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
for language, swords in adv.stopwords.items():
    ALL_STOP_WORDS.union(swords)
for lang in stopwordsiso.langs():
    ALL_STOP_WORDS.union(stopwordsiso.stopwords(lang))

tokens = []
for doc in tokenizer.pipe(sample_tweets_df['tweet'], batch_size=512):
    doc_tokens = []
    for token in doc:
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
    # Make tokens a string again
    tokens.append(doc_tokens)

# Makes tokens column
sample_tweets_df['string_token'] = [' '.join(map(str, l)) for l in tokens]
sample_tweets_df

CPU times: user 11.5 s, sys: 1.65 s, total: 13.1 s
Wall time: 13.3 s


Unnamed: 0,tweet,string_token
0,and we thought 2021 was gon na be the end of c...,thought 2021 gon end corona
1,"user_placeholder como eu j tinha dito , a efic...",eu tinha dito eficcia vachina cientficamente c...
2,user_placeholder user_placeholder this is less...,covid safe young people cars awful downplay ri...
3,yep . expeditiously,yep expeditiously
4,( urgent ) 5 more soldiers test positive for c...,( urgent ) 5 soldiers test positive coronaviru...
...,...,...
99728,not everything will be fun and games uh ?,fun games uh
99729,user_placeholder user_placeholder total embarr...,total embarrassment 55000 tested 13 doubt dict...
99730,user_placeholder user_placeholder imagine the ...,imagine look faces arrive canada told quaranti...
99731,24 url_placeholder,24


### Lemmatization

In [32]:
# Lemmatizer
def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    doc = nlp(text)
    for token in doc:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    return lemmas


# Tokenizer as a function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    tokens = re.sub(r'[^a-zA-Z 0-9]', '',
                    text)  # Remove text that doesn't contain letters or numbers
    tokens = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    tokens = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    tokens = tokens.lower().split()  # Make text lowercase and split it
    return tokens

In [33]:
%%time
# Lemmatize the tweets
sample_tweets_df['string_lemma'] = [' '.join(map(str, l)) for l in
                                    sample_tweets_df['string_token'].apply(get_lemmas)]
sample_tweets_df

CPU times: user 9min 13s, sys: 1.83 s, total: 9min 15s
Wall time: 9min 15s


Unnamed: 0,tweet,string_token,string_lemma
0,and we thought 2021 was gon na be the end of c...,thought 2021 gon end corona,think 2021 gon end corona
1,"user_placeholder como eu j tinha dito , a efic...",eu tinha dito eficcia vachina cientficamente c...,eu tinha dito eficcia vachina cientficamente c...
2,user_placeholder user_placeholder this is less...,covid safe young people cars awful downplay ri...,covid safe young people car awful downplay ris...
3,yep . expeditiously,yep expeditiously,yep expeditiously
4,( urgent ) 5 more soldiers test positive for c...,( urgent ) 5 soldiers test positive coronaviru...,urgent 5 soldier test positive coronavirus fro...
...,...,...,...
99728,not everything will be fun and games uh ?,fun games uh,fun game uh
99729,user_placeholder user_placeholder total embarr...,total embarrassment 55000 tested 13 doubt dict...,total embarrassment 55000 test 13 doubt dictat...
99730,user_placeholder user_placeholder imagine the ...,imagine look faces arrive canada told quaranti...,imagine look face arrive canada tell quarantin...
99731,24 url_placeholder,24,24


In [34]:
%%time
# Tokenize lemmatized tweets
sample_tweets_df['lemma_token'] = sample_tweets_df['string_lemma'].apply(tokenize)
sample_tweets_df

CPU times: user 2.03 s, sys: 72.1 ms, total: 2.1 s
Wall time: 2.1 s


Unnamed: 0,tweet,string_token,string_lemma,lemma_token
0,and we thought 2021 was gon na be the end of c...,thought 2021 gon end corona,think 2021 gon end corona,"[think, gon, end, corona]"
1,"user_placeholder como eu j tinha dito , a efic...",eu tinha dito eficcia vachina cientficamente c...,eu tinha dito eficcia vachina cientficamente c...,"[eu, tinha, dito, eficcia, vachina, cientficam..."
2,user_placeholder user_placeholder this is less...,covid safe young people cars awful downplay ri...,covid safe young people car awful downplay ris...,"[covid, safe, young, people, car, awful, downp..."
3,yep . expeditiously,yep expeditiously,yep expeditiously,"[yep, expeditiously]"
4,( urgent ) 5 more soldiers test positive for c...,( urgent ) 5 soldiers test positive coronaviru...,urgent 5 soldier test positive coronavirus fro...,"[urgent, soldier, test, positive, coronavirus,..."
...,...,...,...,...
99728,not everything will be fun and games uh ?,fun games uh,fun game uh,"[fun, game, uh]"
99729,user_placeholder user_placeholder total embarr...,total embarrassment 55000 tested 13 doubt dict...,total embarrassment 55000 test 13 doubt dictat...,"[total, embarrassment, test, doubt, dictator, ..."
99730,user_placeholder user_placeholder imagine the ...,imagine look faces arrive canada told quaranti...,imagine look face arrive canada tell quarantin...,"[imagine, look, face, arrive, canada, tell, qu..."
99731,24 url_placeholder,24,24,[]


In [35]:
sample_tweets_df.to_csv(TWEETS_PROCESSED, index=False)

### Modeling

In [4]:
# Create an id2word dictionary
id2word = Dictionary(sample_tweets_df['lemma_token'])
print(len(id2word))

140799


In [5]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

49113


In [6]:
# Creating a corpus
corpus = [id2word.doc2bow(d) for d in sample_tweets_df['lemma_token']]

In [7]:
%%time
# Instantiating the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         num_topics=100,
                         random_state=100,
                         chunksize=100,
                         passes=10,
                         alpha='asymmetric',
                         eta=0.31,
                         workers=8)

CPU times: user 10min, sys: 7min 35s, total: 17min 35s
Wall time: 17min 4s


In [8]:
# Filtering for words
words = [re.findall(r'"([^"]*)"', t[1]) for t in lda_model.print_topics()]
# Create Topics
topics = [' '.join(t[0:10]) for t in words]
# Getting the topics
for id, t in enumerate(topics):
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
 'mayor' 'live'  'management'  'officer'  'ja'  'quarentena'  'acabar'  'evento'  'fila'  'john'

------ Topic 1 ------
 'wuhan'  'chinese'  'yesterday'  'current' 'break'  'tomorrow'  'word'  'emerge'  'manipulate'  'kong'

------ Topic 2 ------
 'fall' 'omg'  'liar'  'eliminate'  'journalism' 'trust'  'iq'  'bang'  '.-'  'afirma'

------ Topic 3 ------
 'eu'  'vacina'  'covid'  'pandemia'  'mscara'  'ter' 'eu'  'essa'  'do'  'pq'

------ Topic 4 ------
 'polripresisi' 'insist'  'jogojawatimur'  'cosmetic'  'zi' 'tetapdirumahsaja'  'scoop'  'smuggle' 'enforcement'  'cleanliness'

------ Topic 5 ------
 'prevencin' 'corruption'  'variantes'  'macron'  'mtodo'  'garcia'  'passsanitaire'  'sentido'  'french'  'ataque'

------ Topic 6 ------
'look'  'truth'  'ass'  'worth'  'york'  'wearamask'  'poder'  'dar'  'property'  'random'

------ Topic 7 ------
 'mr'  'screw'  'petition'  'homeschoole'  'homeschool' 'meet'  'colorado'  'edtech'  'grammar'  'backfire'

------

In [9]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity)

# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=sample_tweets_df['lemma_token'],
                                 dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -9.696782523426908

Coherence Score:  0.5210033427993294


### Visualization

In [10]:
# Load the model if wanted
LOAD_MODEL = False
lda_model_fname = "index-266__n_topics-100__coherence-5358.bin"

if LOAD_MODEL:
    lda_model = LdaMulticore.load(os.path.join(LDA_MODEL_OUTPUT, lda_model_fname))

In [11]:
#Creating Topic Distance Visualization
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
lda_vis

  default_term_info = default_term_info.sort_values(


In [12]:
# Export visualization
lda_html_fname = "index-266__n_topics-100__coherence-5358.html"
pyLDAvis.save_html(lda_vis, os.path.join(LDA_HTML_OUTPUT, lda_html_fname))

In [13]:
# Export model
lda_model.save(os.path.join(LDA_MODEL_OUTPUT, lda_model_fname))

### Hyper Parameter Tuning

In [43]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(sample_tweets_df['string_lemma'])
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30, 50, 75, 100, 150, 200],
                 'learning_decay': [.5, .7, .9]}

In [70]:
def coherence_scorer(model, X, y):
    coherence_model = CoherenceModel(model=model, texts=sample_tweets_df['lemma_token'],
                                     dictionary=id2word, coherence='c_v')
    print(coherence_model.get_coherence())
    return coherence_model.get_coherence()


coherence_score = make_scorer(coherence_scorer, greater_is_better=True)

In [71]:
%%time
# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

Traceback (most recent call last):
  File "/mlodata1/raileanu/Topics_Visualization/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 760, in _score
    scores = scorer(estimator, X_test)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/mlodata1/raileanu/Topics_Visualization/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 760, in _score
    scores = scorer(estimator, X_test)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/mlodata1/raileanu/Topics_Visualization/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 760, in _score
    scores = scorer(estimator, X_test)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/mlodata1/raileanu/Topics_Visualization/env/lib/python3.8/site-packages/sklearn/model_selection/

CPU times: user 10min 55s, sys: 166 ms, total: 10min 55s
Wall time: 10min 55s


GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5], 'n_components': [10]},
             scoring=make_scorer(coherence_scorer))

In [72]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  nan
Model Perplexity:  11358.634970974872


### When Libraries Don't Work as Intended :)

#### # of Topics

In [76]:
# Defining a function to loop over number of topics to be used to find an
# optimal number of topics
def compute_coherence_values_num_topics(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values_topic = []
    model_list_topic = []
    total_num_topics = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word, workers=8,
                             passes=25)
        model_list_topic.append(model)
        total_num_topics.append(num_topics)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                                         coherence='c_v')
        coherence_values_topic.append(coherence_model.get_coherence())

    return model_list_topic, coherence_values_topic, total_num_topics

In [77]:
%%time
# Coherence Value Computation
model_list_topic, coherence_values_topic, total_num_topics = compute_coherence_values_num_topics(
    dictionary=id2word,
    corpus=corpus,
    texts=sample_tweets_df['lemma_token'],
    start=2, limit=200, step=6)

In [78]:
best_index = coherence_values_topic.index(max(coherence_values_topic))
print(max(coherence_values_topic), best_index, total_num_topics[best_index])

0.34860840260083875 32 194


#### All Hyper Params

In [87]:
def compute_coherence_values(corpus, dictionary, k, p, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=p,
                                           alpha=a,
                                           eta=b,
                                           workers=8)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=sample_tweets_df['lemma_token'],
                                         dictionary=id2word, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [103]:
grid = {'Validation_Set': {}}
# Topics range
min_topics = 5
max_topics = 100
step_size = 5
# topics_range = range(min_topics, max_topics, step_size)
topics_range = [10, 25, 50, 75, 100]
# Num of Passes
passes = range(10, 20, 5)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Passes': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                 }

In [105]:
%%time
pbar_total = len(corpus_sets) * len(topics_range) * len(passes) * len(alpha) * len(beta)
pbar = tqdm.tqdm(total=pbar_total)

# iterate through validation corpus
for i in range(len(corpus_sets)):
    # iterate through number of topics
    for k in topics_range:
        # iterate through passes
        for p in passes:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                                  k=k, p=p, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Passes'].append(p)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
pd.DataFrame(model_results).to_csv(MODEL_RESULTS, index=False)
pbar.close()

  0%|                                                                            | 0/600 [00:05<?, ?it/s]


KeyboardInterrupt: 