In [None]:
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation # LDA
from sklearn.model_selection import GridSearchCV

# Read data

In [None]:
# Load the vaccination tweets CSV into a DataFrame
tweets = pd.read_csv(
    "../input/pfizer-vaccine-tweets/vaccination_tweets.csv"
)

# Preview the first rows
tweets.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
tweets.shape

In [None]:
# Summary of the DataFrame's columns (dtypes and non-null counts)
tweets.info()

# Preprocessing

In [None]:
# Clean tweets['text'] in three steps:
#   1. strip punctuation and lowercase,
#   2. drop English stopwords,
#   3. strip digits.
stops = stopwords.words("english")
# Set copy for O(1) membership tests in the per-token filter below.
stop_set = set(stops)

# regex=True is passed explicitly: pandas >= 2.0 defaults Series.str.replace to
# regex=False, which would treat these patterns as literal text and silently
# leave the tweets unchanged. Raw strings avoid invalid-escape warnings.
tweets['text'] = tweets['text'].str.replace(r"[^\w\s]", "", regex=True).str.lower()
tweets['text'] = tweets['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)
# No second punctuation pass is needed: step 1 already removed every character
# that is neither a word character nor whitespace, and the stopword filter
# cannot introduce new ones.
tweets['text'] = tweets['text'].str.replace(r"\d+", "", regex=True)

In [None]:
# Inspect the text column after preprocessing
tweets['text'] 

# Create the document term matrix

We need to create a vocabulary of all the words in our data by using the CountVectorizer class from the sklearn.feature_extraction.text module to build a document-term matrix. We only include words that appear in at most 75% of the tweets (max_df=0.75) and in at least 10 tweets (min_df=10). We also remove English stop words, as they do not really contribute to topic modeling.

In [None]:
# Vectorizer settings: English stop words removed, terms kept only if their
# document frequency satisfies min_df=10 and max_df=0.75.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.75,
)

# Cast the text column to a unicode array before vectorizing
corpus = tweets['text'].values.astype('U')
doc_term_matrix = count_vect.fit_transform(corpus)

# Show the document-term matrix
doc_term_matrix

# LDA with GridSearchCV

Topic modeling involves counting words and grouping similar word patterns to describe topics within the data. If the model knows the word frequency, and which words often appear in the same document, it will discover patterns that can group different words together.



In [None]:
# Hyper-parameter grid: every (n_components, learning_decay) combination is tried
search_params = {
    'n_components': [5, 6, 7, 8, 10],
    'learning_decay': [.2, .5, .7, .9],
}
cv = 2

# Base LDA estimator for our document-term matrix.
# n_components (the number of categories, or topics, the text is divided into)
# and learning_decay are supplied by the grid search.
LDA = LatentDirichletAllocation(n_jobs=-1, evaluate_every=-1, random_state=0)

# Grid search over search_params with cv-fold cross-validation
model = GridSearchCV(LDA, param_grid=search_params, n_jobs=-1, cv=cv, verbose=5)

# Fit on the document-term matrix.
# This step might take a while: GridSearchCV() can use a large amount of CPU time.
model.fit(doc_term_matrix)

In [None]:
# Best Model
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)

# Model's perplexity and log-likelihood

In [None]:
# print log-likelihood
# best_score_ is the grid search's score for the best parameter combination.
# No `scoring` argument was given to GridSearchCV, so presumably this comes from
# the LDA estimator's own score method (an approximate log-likelihood) — verify.
print("Log likelihood: ", model.best_score_)

In [None]:
# print perplexity
# NOTE: perplexity is computed on doc_term_matrix, the same matrix the grid
# search was fitted on, so it is not a held-out estimate.
print("Perplexity: ", best_lda_model.perplexity(doc_term_matrix))

# Compare LDA Models Performance Scores

Plotting the log-likelihood scores against the number of topics clearly shows that 5 topics gives the best scores, while learning_decay doesn't seem to make much of a difference.

In [None]:
# Get log-likelihoods from the grid search output
means = model.cv_results_['mean_test_score']
params = model.cv_results_['params']

# Read the grid back from the fitted search object instead of re-typing it, so
# this cell cannot drift out of sync with the values that were actually searched.
n_topics = sorted(model.param_grid['n_components'])
learning_decays = sorted(model.param_grid['learning_decay'])

# learning_decay -> {n_components: rounded mean cross-validated score}.
# Each score is keyed by its own n_components, so the plot does not rely on the
# order in which cv_results_ lists the candidates.
scores_by_decay = {decay: {} for decay in learning_decays}
for mean_score, candidate in zip(means, params):
    decay = candidate['learning_decay']
    scores_by_decay[decay][candidate['n_components']] = round(mean_score)

# Show graph: one line per learning_decay value
plt.figure(figsize=(12, 8))
for decay in learning_decays:
    plt.plot(
        n_topics,
        [scores_by_decay[decay][k] for k in n_topics],
        label=str(decay),
    )
plt.title("Choosing The best LDA Model")
plt.xlabel("Number of Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()


# Plot LDA best model

Topic modeling is useful, but it’s difficult to understand it just by looking at a combination of words and numbers like above.

One of the most effective ways to understand data is through visualization. For that purpose I am going to use [pyLDAvis](https://pyldavis.readthedocs.io/en/latest/readme.html). The pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data.

Click on the image to interact with the visualization yourself. Here are some tips to help you understand the plot:

- Each bubble represents a topic. The larger the bubble, the higher the percentage of tweets in the corpus that are about that topic. 
- Blue bars represent the overall frequency of each word in the corpus. If no topic is selected, the blue bars of the most frequently used words will be displayed. 
- Red bars give the estimated number of times a given term was generated by a given topic. 


The further the bubbles are away from each other, the more different they are. A good topic model will have big and non-overlapping bubbles scattered throughout the chart.

In [None]:
import pyLDAvis

# pyLDAvis 3.4 renamed the scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new name first and fall back for older releases.
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

# Build the interactive panel from the best LDA model, the document-term matrix
# and the fitted vectorizer, using t-SNE for the topic layout.
panel = pyldavis_adapter.prepare(best_lda_model, doc_term_matrix, count_vect, mds='tsne')
pyLDAvis.display(panel)
