# Topic Modeling

### Import required libraries

In [1]:
# !pip install pyLDAvis # Uncomment and install this visualization library

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Data manipulation
import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint

# Data preprocessing & cleaning
import re
import string
from nltk.corpus import stopwords
from nltk import pos_tag, WordNetLemmatizer
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# Modeling
import gensim

# Model Evaluation
from gensim.models import CoherenceModel

# Plotting tools
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import seaborn as sns
from plotly.tools import mpl_to_plotly
import pyLDAvis
import pyLDAvis.gensim 

In [4]:
plt.style.use('ggplot')
stop_words=stopwords.words('english')

Extend the list of stop words

In [5]:
stop_words.extend(['from', 'subject', 're', 'edu', 'use','know','dont'])

### Load dataset

In [6]:
df=pd.read_csv("../datasets/tweets.csv")

In [7]:
df.shape

(3921, 19)

Check first 5 rows 

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,screen_name,name,user_verification,followers_count,friends_count,listed_count,retweet_count,favorite_count,retweeted,entities,favourites_count,location,created_at,text,clean_text,sentiment_polarity,sentiment_subjectivity,sentiment_polarity_color
0,0,nat___price,natalie price,False,62,732,0,2,0,False,"{'hashtags': [{'text': 'covid', 'indices': [45...",13671,"Yorkshire and The Humber, England",2021-01-30 09:09:05,RT @NickTriggle: The devastating toll of the #...,The devastating toll of the covid pandemic on...,-0.55,0.6,red
1,1,libertad717,Punto,False,1084,4760,468,11,0,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",208,,2021-01-30 09:09:00,RT @NATOBrazeB: Great to meet online @UN’s Und...,Great to meet online Undersecretary for Glob...,0.4,0.375,green
2,2,marylouisepearc,marylouise lady of leisure,False,124,244,3,7,0,False,"{'hashtags': [], 'symbols': [], 'user_mentions...",57448,,2021-01-30 09:08:58,RT @SiCarswell: That screeching noise to be he...,That screeching noise to be heard is the EU C...,-0.008333,0.55,red
3,3,JhSalford,JHSalford,False,177,177,0,336,0,False,"{'hashtags': [{'text': 'COVID', 'indices': [76...",435,"Salford, England",2021-01-30 09:08:56,RT @GabrielScally: Don't be silly. The UK gove...,Don't be silly. The UK government has complet...,-0.077778,0.751389,red
4,4,SarahEdmondsPhD,Fear is the MindKiller,False,817,1861,9,387,0,False,"{'hashtags': [{'text': 'Covid', 'indices': [12...",53062,"Flagstaff, AZ",2021-01-30 09:08:49,RT @AlexBerenson: So after four weeks more ser...,So after four weeks more serious adverse even...,0.083333,0.583333,green


Drop unnecessary columns

In [9]:
# Keep only the cleaned tweet text and the sentiment colour label.
# The explicit .copy() makes this an independent frame, so the later
# df['content'] = ... assignments do not write into a selection of the raw frame.
df=df[['clean_text','sentiment_polarity_color']].copy()

Rename columns

In [10]:
df.columns=['content','target']

In [11]:
df.head()

Unnamed: 0,content,target
0,The devastating toll of the covid pandemic on...,red
1,Great to meet online Undersecretary for Glob...,green
2,That screeching noise to be heard is the EU C...,red
3,Don't be silly. The UK government has complet...,red
4,So after four weeks more serious adverse even...,green


In [12]:
df.head()

Unnamed: 0,content,target
0,The devastating toll of the covid pandemic on...,red
1,Great to meet online Undersecretary for Glob...,green
2,That screeching noise to be heard is the EU C...,red
3,Don't be silly. The UK government has complet...,red
4,So after four weeks more serious adverse even...,green


Check rows and columns

In [13]:
df.shape

(3921, 2)

### Preprocess Data

Convert the text into lower case

In [14]:
def convert_to_lower_case(text):
    """Return ``text`` with every cased character converted to lower case.

    The whole string is lower-cased in one ``str.lower`` call instead of one
    character at a time, so context-sensitive mappings (for example the Greek
    final sigma) are applied and no intermediate list is built.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        The lower-cased string.
    """
    return text.lower()

In [15]:
# Lower-case every tweet; the helper is passed directly, no lambda wrapper needed.
df['content'] = df['content'].apply(convert_to_lower_case)

In [16]:
df.head()

Unnamed: 0,content,target
0,the devastating toll of the covid pandemic on...,red
1,great to meet online undersecretary for glob...,green
2,that screeching noise to be heard is the eu c...,red
3,don't be silly. the uk government has complet...,red
4,so after four weeks more serious adverse even...,green


Remove emails (any token containing `@`, which also removes @mentions)

In [17]:
def remove_emails(text):
    """Drop every whitespace-separated token that contains an ``@``.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace in the input is collapsed to one space.
    """
    kept_tokens = []
    for token in text.split():
        if '@' in token:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [18]:
# Strip every token containing '@' from each tweet.
df['content'] = df['content'].apply(remove_emails)

In [19]:
df.head()

Unnamed: 0,content,target
0,the devastating toll of the covid pandemic on ...,red
1,great to meet online undersecretary for global...,green
2,that screeching noise to be heard is the eu co...,red
3,don't be silly. the uk government has complete...,red
4,so after four weeks more serious adverse event...,green


Remove new line characters

In [20]:
def remove_line_character(text):
    """Replace line breaks with spaces and strip trailing whitespace.

    A plain ``rstrip()`` only trims the end of the string, leaving any
    embedded ``\\n`` / ``\\r`` in place. Each run of line-break characters is
    first replaced by a single space, then trailing whitespace is removed.
    Leading whitespace is left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without line-break characters or trailing whitespace.
    """
    data = re.sub(r'[\r\n]+', ' ', text).rstrip()
    return data

In [21]:
# Apply the line-character clean-up to each tweet.
df['content'] = df['content'].apply(remove_line_character)

In [22]:
df.head()

Unnamed: 0,content,target
0,the devastating toll of the covid pandemic on ...,red
1,great to meet online undersecretary for global...,green
2,that screeching noise to be heard is the eu co...,red
3,don't be silly. the uk government has complete...,red
4,so after four weeks more serious adverse event...,green


Remove single quotes

Alternatively can use punctuation function

In [23]:
def remove_single_quotes(text):
    """Return ``text`` with every ASCII apostrophe (``'``) removed."""
    pieces = text.split("'")
    return "".join(pieces)

In [24]:
# Remove apostrophes from each tweet (e.g. "don't" -> "dont").
df['content'] = df['content'].apply(remove_single_quotes)

In [25]:
df.head()

Unnamed: 0,content,target
0,the devastating toll of the covid pandemic on ...,red
1,great to meet online undersecretary for global...,green
2,that screeching noise to be heard is the eu co...,red
3,dont be silly. the uk government has completel...,red
4,so after four weeks more serious adverse event...,green


Remove Punctuations

In [26]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [28]:
# Delete ASCII punctuation from each tweet.
df['content'] = df['content'].apply(remove_punctuation)

In [29]:
df.head()

Unnamed: 0,content,target
0,the devastating toll of the covid pandemic on ...,red
1,great to meet online undersecretary for global...,green
2,that screeching noise to be heard is the eu co...,red
3,dont be silly the uk government has completely...,red
4,so after four weeks more serious adverse event...,green


Remove words with fewer than 3 characters

In [30]:
def words_less_than_three_chars(text):
    """Keep only the whitespace-separated words that are at least 3 characters long.

    The kept words are re-joined with single spaces.
    """
    long_enough = filter(lambda word: len(word) >= 3, text.split())
    return " ".join(long_enough)

In [31]:
# Drop words shorter than three characters from each tweet.
df['content'] = df['content'].apply(words_less_than_three_chars)

In [32]:
df.head()

Unnamed: 0,content,target
0,the devastating toll the covid pandemic childr...,red
1,great meet online undersecretary for global co...,green
2,that screeching noise heard the commission mov...,red
3,dont silly the government has completely botch...,red
4,after four weeks more serious adverse event re...,green


Remove digits in data

In [33]:
# Strip every run of digits from the text.
# The pattern is a raw string so that "\d" reaches the regex engine unchanged
# (a plain '\d+' literal triggers Python's "invalid escape sequence" warning),
# and the result is assigned back instead of relying on inplace=True applied to
# a column selection, which is not guaranteed to update the parent frame.
df['content'] = df['content'].replace(r'\d+', '', regex=True)


invalid escape sequence \d


invalid escape sequence \d


invalid escape sequence \d


invalid escape sequence \d



Remove rows without data

In [34]:
# Rows whose text is an empty string or a single space carry no tokens;
# locate them with one membership test and drop them from the frame in place.
blank_rows = df['content'].isin(['', ' '])
df.drop(index=df.loc[blank_rows].index, inplace=True)

In [35]:
df[df['content']=='']

Unnamed: 0,content,target


In [36]:
df.shape

(3915, 2)

Remove non-alphabetic words (keep only words made up entirely of letters)

In [37]:
def remove_non_alpha_numerics(text):
    """Keep only the words made up entirely of alphabetic characters.

    Each whitespace-separated word is tested with ``str.isalpha``; words that
    contain a digit, punctuation mark or any other non-letter are dropped.
    The kept words are re-joined with single spaces.
    """
    alphabetic_words = filter(str.isalpha, text.split())
    return ' '.join(alphabetic_words)

In [38]:
# Keep only purely alphabetic words in each tweet.
df['content'] = df['content'].apply(remove_non_alpha_numerics)

In [39]:
df.head()

Unnamed: 0,content,target
0,the devastating toll the covid pandemic childr...,red
1,great meet online undersecretary for global co...,green
2,that screeching noise heard the commission mov...,red
3,dont silly the government has completely botch...,red
4,after four weeks more serious adverse event re...,green


Tokenize the text

In [40]:
def text_tokenizaion(text):
    """Split ``text`` into a list of whitespace-separated tokens.

    ``str.split()`` with no argument is used instead of splitting on a single
    space so that an empty string yields ``[]`` (not ``['']``) and repeated,
    leading or trailing whitespace never produces empty-string tokens.

    Parameters
    ----------
    text : str
        The cleaned text to tokenize.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    return text.split()

In [41]:
# Turn each tweet string into a list of tokens.
df['content'] = df['content'].apply(text_tokenizaion)

In [42]:
df.head()

Unnamed: 0,content,target
0,"[the, devastating, toll, the, covid, pandemic,...",red
1,"[great, meet, online, undersecretary, for, glo...",green
2,"[that, screeching, noise, heard, the, commissi...",red
3,"[dont, silly, the, government, has, completely...",red
4,"[after, four, weeks, more, serious, adverse, e...",green


Remove stopwords
Comment out this section when using the "Extract nouns only" function

In [43]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [44]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stop_words``.

    ``text`` is iterated as a sequence of tokens; order is preserved.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [45]:
# Filter stop words out of each token list.
df['content'] = df['content'].apply(remove_stopwords)

In [46]:
df.head()

Unnamed: 0,content,target
0,"[devastating, toll, covid, pandemic, children,...",red
1,"[great, meet, online, undersecretary, global, ...",green
2,"[screeching, noise, heard, commission, moving,...",red
3,"[silly, government, completely, botched, covid...",red
4,"[four, weeks, serious, adverse, event, reports...",green


Extract nouns only (the commented-out helper below also keeps adverbs and adjectives)

In [47]:
# def extract_nouns(text):
#     is_noun = lambda pos: pos[:2] == 'NN' or  pos[:2] == 'RB' or  pos[:2] == 'JJ'
#     all_nouns = " ".join([word for (word, pos) in pos_tag(text) if is_noun(pos)])
#     return all_nouns

In [48]:
# df['content']=df['content'].apply(lambda x: extract_nouns(x))

In [49]:
df.head()

Unnamed: 0,content,target
0,"[devastating, toll, covid, pandemic, children,...",red
1,"[great, meet, online, undersecretary, global, ...",green
2,"[screeching, noise, heard, commission, moving,...",red
3,"[silly, government, completely, botched, covid...",red
4,"[four, weeks, serious, adverse, event, reports...",green


In [51]:
# df['content']=df['content'].apply(lambda x: text_tokenizaion(x))

Normalize text by Lemmatization

In [52]:
def text_lematization(text):
    """Lemmatize every token in ``text`` with NLTK's ``WordNetLemmatizer``.

    The lemmatizer is created once per call and reused for all tokens rather
    than being re-instantiated for every word.

    Parameters
    ----------
    text : iterable of str
        The tokens to lemmatize.

    Returns
    -------
    list of str
        The lemmatized tokens, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

In [53]:
# Lemmatize each token list.
df['content'] = df['content'].apply(text_lematization)

In [54]:
df.head()

Unnamed: 0,content,target
0,"[devastating, toll, covid, pandemic, child, me...",red
1,"[great, meet, online, undersecretary, global, ...",green
2,"[screeching, noise, heard, commission, moving,...",red
3,"[silly, government, completely, botched, covid...",red
4,"[four, week, serious, adverse, event, report, ...",green


Convert data to list for modeling

In [55]:
# Pull the per-document token lists out of the frame as a plain Python list.
tokenized_data = df['content'].tolist()
# `data` is a second list holding the same token lists.
data = list(tokenized_data)

In [56]:
print(tokenized_data[0:2])

[['devastating', 'toll', 'covid', 'pandemic', 'child', 'mental', 'health', 'baby', 'development', 'abuse', 'neglect'], ['great', 'meet', 'online', 'undersecretary', 'global', 'communication', 'shared', 'interest', 'counter']]


### Create Data Input to Model

1. Create Dictionary

In [None]:
id2word = corpora.Dictionary(tokenized_data)

In [None]:
id2word

2. Create Corpus (Term Document Frequency)

In [None]:
corpus = [id2word.doc2bow(text) for text in tokenized_data]

In [None]:
print(corpus[0:2])

Show corpus and frequency

In [None]:
# Translate the first document's (token id, count) pairs back to (word, count).
first_doc_terms = [
    [(id2word[term_id], count) for term_id, count in bow]
    for bow in corpus[:1]
]
print(first_doc_terms)

### Modeling LDA Topic model

#### Latent Dirichlet allocation (LDA)

Latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's presence is attributable to one of the document's topics. LDA is an example of a topic model and belongs to the machine learning toolbox and in wider sense to the artificial intelligence toolbox. Source: <a href='https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation'>wikipedia</a>

In the LDA model below we specify `chunksize`, which is the number of documents to use in each training chunk, and `passes`, which is the total number of training passes over the corpus.

In [None]:
# Train the LDA model on the bag-of-words corpus.
model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    random_state=100,      # fixed seed for reproducible topics
    update_every=1,
    chunksize=100,         # documents per training chunk
    passes=10,             # full passes over the corpus
    alpha='auto',
    per_word_topics=True,  # model[bow] then returns a 3-tuple, relied on further down
)

Show topics

Each keyword has a weighted importance value

In [None]:
pprint(model.print_topics())

### Model Evaluation

1. Model perplexity

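In information theory, perplexity is a measurement of how well a probability distribution or probability model predicts a sample. It may be used to compare probability models. A low perplexity indicates the probability distribution is good at predicting the sample. <a href='https://en.wikipedia.org/wiki/Perplexity'>wikipedia</a> Note that gensim's `log_perplexity` returns the per-word likelihood bound rather than the perplexity itself; the perplexity is $2^{-\text{bound}}$, so a higher (less negative) bound corresponds to a lower perplexity.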

In [None]:
model.log_perplexity(corpus)

2. Topic Coherence 

Topic coherence is a metric that returns the coherence score, which is a measure of the degree of semantic similarity between the high-scoring words in a topic.

In [None]:
model_coherence = CoherenceModel(model=model, texts=tokenized_data, dictionary=id2word, coherence='c_v')

In [None]:
model_coherence.get_coherence()

### Visualize the topics

In [None]:
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(model, corpus, id2word)

In [None]:
vis

##### Interpreting the Visual

Each bubble on the left graph represents a topic. The larger the bubble, the more prevalent that topic is. A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant. A model with too many topics will typically have many overlapping, small bubbles clustered in one region of the chart.

### 1. Dominant topic in each sentence

To get the dominant topic in each sentence we compute the percentage contribution of each topic.

In [None]:
def topics_in_sentences(model=None, corpus=corpus, texts=data):
    """Find the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    model
        A trained topic model that supports ``model[corpus]``, exposes a
        ``per_word_topics`` attribute and a ``show_topic(topic_num)`` method
        (the gensim ``LdaModel`` built above does). Must not be ``None``.
    corpus
        The bag-of-words corpus to score.
    texts
        The original documents, one per corpus entry; appended as the last
        column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Pct_Contribution`` and ``Topic_Keywords``, followed by an unnamed
        column (label ``0``) holding ``texts``.
    """
    records = []
    keywords_by_topic = {}  # cache: topic number -> "word1, word2, ..."

    for doc_topics in model[corpus]:
        # With per_word_topics enabled each item is a tuple whose first element
        # is the (topic, probability) list; otherwise the item is that list.
        topic_dist = doc_topics[0] if model.per_word_topics else doc_topics

        if not topic_dist:
            # No topic reported for this document: keep a placeholder row so
            # the rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # Highest-probability topic; on ties the first one listed wins.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in model.show_topic(topic_num)
            )

        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once from the collected rows (DataFrame.append was
    # removed in pandas 2.0 and row-by-row growth is quadratic).
    sentence_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Pct_Contribution', 'Topic_Keywords']
    )

    # Append original text to the end of the output
    contents = pd.Series(texts)
    sentence_topics_df = pd.concat([sentence_topics_df, contents], axis=1)
    return sentence_topics_df


sentence_topics_keywords_df = topics_in_sentences(model, corpus, tokenized_data)

# Expose the row position as an explicit document-number column and give the
# columns their final display names.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Pct_Contribution', 'Keywords', 'Text']
dominant_topic_df = sentence_topics_keywords_df.reset_index()
dominant_topic_df.columns = dominant_topic_columns

In [None]:
dominant_topic_df.head()

### 2. The most representative sentence for each topic

In [None]:
# pd.options.display.max_colwidth = 100

grouped_sentence_topics_df = sentence_topics_keywords_df.groupby('Dominant_Topic')

# For every topic keep the single document with the highest contribution.
# The per-topic rows are collected first and concatenated once, instead of
# growing a frame inside the loop starting from an empty DataFrame.
top_rows = [
    grp.sort_values(['Pct_Contribution'], ascending=False).head(1)
    for _, grp in grouped_sentence_topics_df
]
sorted_sentence_topics_df = pd.concat(top_rows, axis=0)

# Reset Index
sorted_sentence_topics_df = sorted_sentence_topics_df.reset_index(drop=True)

# Format columns
sorted_sentence_topics_df.columns = ['Topic_Num', "Topic_Pct_Contrib", "Keywords", "Representative Text"]

In [None]:
sorted_sentence_topics_df.head(10)

### 3. Topic distribution across documents

In [None]:
# Number of Documents for Each Topic (indexed by topic number)
topic_counts = sentence_topics_keywords_df['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic (indexed by topic number)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords: one row per topic, indexed by topic number.
# The per-document frame cannot be concatenated column-wise with the counts
# directly: its index is the document position while the counts are indexed by
# topic, so the rows would be paired up by unrelated labels.
topic_num_keywords = (
    sentence_topics_keywords_df[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates(subset='Dominant_Topic')
    .set_index('Dominant_Topic')
)

# Attach counts and shares; assign() aligns them on the topic-number index.
dominant_topics_df = (
    topic_num_keywords
    .assign(Num_Documents=topic_counts, Pct_Documents=topic_contribution)
    .sort_index()
    .reset_index()
)

# Column order: Dominant_Topic, Topic_Keywords, Num_Documents, Pct_Documents
dominant_topics_df = dominant_topics_df[['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Pct_Documents']]

In [None]:
dominant_topics_df.head()

### 4. Word Counts and Weights of Top N Keywords in Each Topic

In [None]:
topics = model.show_topics(formatted=False)

# Corpus-wide frequency of every token.
data_flat = [w for w_list in tokenized_data for w in w_list]
counter = Counter(data_flat)

# One row per (topic, keyword): the keyword, its topic, its weight in that
# topic and how often it occurs in the whole corpus.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic_terms in topics
    for word, weight in topic_terms
]

word_cloud_df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(3, 2, figsize=(16,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    topic_rows = word_cloud_df.loc[word_cloud_df.topic_id==i, :]
    colour = cols[i]

    # Wide translucent bars: corpus word count (left axis).
    ax.bar(x='word', height="word_count", data=topic_rows, color=colour, width=0.5, alpha=0.3, label='Word Count')
    # Narrow solid bars: keyword weight within the topic (right axis).
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=colour, width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=colour)
    ax_twin.set_ylim(0, 0.030)
    ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=colour, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
mpl_to_plotly(fig)
# plt.show()

### 5. Most discussed topics in the documents

In [None]:
def topics_per_document(model, corpus, start=0, end=1):
    """Return the dominant topic and topic distribution of selected documents.

    Parameters
    ----------
    model
        Object supporting ``model[bow]``. The lookup may return either a
        tuple whose first element is the list of ``(topic, probability)``
        pairs (the three-element form the original code unpacked), or that
        list on its own.
    corpus
        Sliceable sequence of bag-of-words documents.
    start, end
        Bounds of the slice ``corpus[start:end]``; ``end`` is exclusive.
        Pass ``end=None`` to process through the last document (``end=-1``
        leaves the last document out).

    Returns
    -------
    tuple
        ``(dominant_topics, topic_percentages)`` where ``dominant_topics`` is a
        list of ``(position_in_slice, topic)`` pairs and ``topic_percentages``
        is the list of per-document ``(topic, probability)`` lists.
    """
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus[start:end]):
        doc_result = model[corp]
        # Accept both return shapes instead of assuming a 3-tuple.
        topic_percs = doc_result[0] if isinstance(doc_result, tuple) else doc_result
        # Highest-probability topic; on ties the first one listed wins.
        dominant_topic = max(topic_percs, key=lambda x: x[1])[0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)

# end=None slices through the final document; end=-1 would silently leave the
# last document out of every statistic below.
dominant_topics, topic_percentages = topics_per_document(model=model, corpus=corpus, end=None)

# Distribution of Dominant Topics in Each Document
# (stored under its own name so the tweet DataFrame `df` is not overwritten)
dominant_topic_per_doc_df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = dominant_topic_per_doc_df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

# Top 3 Keywords for each Topic
topic_top3words = [
    (topic_id, word)
    for topic_id, terms in model.show_topics(formatted=False)
    for word, _weight in terms[:3]
]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)

In [None]:
# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)


def _topic_tick_label(x, pos):
    """Label a tick with the topic number and that topic's top three keywords."""
    top_words = df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0]
    return 'Topic ' + str(x)+ '\n' + top_words


tick_formatter = FuncFormatter(_topic_tick_label)

# Topic Distribution by Dominant Topics
n_dominant_topics = len(df_dominant_topic_in_each_doc.Dominant_Topic.unique())
ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_doc, width=.5, color='firebrick')
ax1.set_xticks(range(n_dominant_topics))
ax1.xaxis.set_major_formatter(tick_formatter)
ax1.set_title('Number of Documents by Dominant Topic', fontdict=dict(size=10))
ax1.set_ylabel('Number of Documents')
ax1.set_ylim(0, 1000)

# Topic Distribution by Topic Weights
n_weighted_topics = len(df_topic_weightage_by_doc.index.unique())
ax2.bar(x='index', height='count', data=df_topic_weightage_by_doc, width=.5, color='steelblue')
ax2.set_xticks(range(n_weighted_topics))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.set_title('Number of Documents by Topic Weightage', fontdict=dict(size=10))
mpl_to_plotly(fig)
# plt.show()

### Model Tuning

Finding the optimal number of topics

In [None]:
# Candidate topic counts are range(start, limit, step): 1, 7, 13, ..., 49.
limit, start, step = 50, 1, 6

In [None]:
def compute_optimal_topics(dictionary, corpus, texts, limit, start=2, step=3, random_state=100):
    """Train one LDA model per candidate topic count and score its coherence.

    Parameters
    ----------
    dictionary
        Token-id mapping passed to both the LDA model (as ``id2word``) and the
        coherence model.
    corpus
        Bag-of-words corpus to train on.
    texts
        Tokenized documents used by the ``c_v`` coherence measure.
    limit
        Exclusive upper bound of the topic counts tried.
    start, step
        First topic count and increment; the counts tried are
        ``range(start, limit, step)``.
    random_state
        Seed forwarded to every LDA model so repeated runs give the same
        scores. Defaults to 100, the seed used for the main model above.

    Returns
    -------
    tuple
        ``(model_list, coherence_values)``: two lists with one entry per
        topic count, in the order tried.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument rather than the notebook-level
        # `id2word`, so the function honours what the caller passes in.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_optimal_topics(dictionary=id2word, corpus=corpus, texts=tokenized_data, start=start, limit=limit, step=step)

Visualize

In [None]:
x = range(start, limit, step)

# Draw on a fresh figure and axes. Reusing `ax` / `fig` left over from an
# earlier plotting cell would draw on (and export) the wrong chart.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, coherence_values)
# Axes objects use set_xlabel / set_ylabel (xlabel / ylabel are pyplot functions).
ax.set_xlabel("Num Topics")
ax.set_ylabel("Coherence score")
# The labels must be a sequence; a bare string would be read character by character.
ax.legend(["coherence_values"], loc='best')
ax.grid(True)

mpl_to_plotly(fig)
# plt.show()

Show topics and coherence values

In [None]:
# Pair each candidate topic count with the coherence score of its model.
for num_topics, coherence in zip(x, coherence_values):
    rounded_score = round(coherence, 4)
    print("Num Topics =", num_topics, " is having Coherence Value of", rounded_score)

### Conclusion

How to improve the model:<hr>
1. Improve on text processing.
2. The variety of topics the text talks about.
3. Topic modeling algorithm to use.
4. The number of topics to be retrieved from the algorithm.
5. The Model hyperparameter tuning.