<a href="https://colab.research.google.com/github/PetyoKaratov/NLP-Task/blob/main/topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline

In [None]:
!pip install openpyxl==3.0.0
!pip install emot
!pip install pyldavis



In [None]:
# imports
from __future__ import absolute_import, division, print_function, unicode_literals
from google.colab import drive
import pandas as pd
import re                                  # library for regular expression operations
import string                              # for string operations
import pprint
import nltk 
import numpy as np
import tqdm
import emot 
# download the stopwords from NLTK
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import WordNetLemmatizer    # module for lemmatization
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [None]:
# mount the google drive root
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# read the excel file
# NOTE(review): the path is relative to the current working directory; the Drive is
# mounted at /content/drive/, so this presumably relies on the cwd being /content — confirm.
df = pd.read_excel('./drive/My Drive/NLP_Task_Data.xlsx')
# summary of columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9915 entries, 0 to 9914
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Row_id  9915 non-null   int64         
 1   Date    9915 non-null   datetime64[ns]
 2   Text    9915 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 232.5+ KB


In [None]:
df.head()

Unnamed: 0,Row_id,Date,Text
0,1,2021-01-18,"SelfCare is the BESTCare❗️💯, put yo self first..."
1,2,2021-01-21,Breaking workaholic thought patterns takes mor...
2,3,2021-01-26,Self Care is a must ..... Tarot Reading is hap...
3,4,2021-01-19,Self love and self care. Invest in yourself. ...
4,5,2021-01-24,So excited for self care Friday tomorrow!


In [None]:
tweet = df.Text[1]
tweet

'Breaking workaholic thought patterns takes more than just mindset work, it also takes reading. Here are the self-care books that helped @workbrighterco break their workaholism: https://t.co/Htk8AkXNRR #hustleculture https://t.co/YvdDPhrYuS'

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to lemmatize() for ``word``.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    _, tag_name = nltk.pos_tag([word])[0]
    initial = tag_name[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun
    return wordnet.NOUN

To preprocess a tweet we remove stock market tickers, old-style retweet text ("RT"), hyperlinks, the hash sign of hashtags and non-ASCII characters such as emoji. We then tokenize the tweet with TweetTokenizer, remove stopwords and punctuation, and lemmatize the remaining tokens. Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In [None]:
# Patterns are compiled once instead of on every call.
_TICKER_RE = re.compile(r'\$\w*')          # stock market tickers like $GE
_RETWEET_RE = re.compile(r'^RT[\s]+')      # old style retweet marker "RT" at the start
_URL_RE = re.compile(r'https?://\S+')      # one hyperlink, up to the next whitespace
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')  # emoji and any other non-ASCII characters
_PUNCTUATION = frozenset(string.punctuation)

# Lazily filled cache of the nltk helpers so they are built once, on first use.
_TWEET_TOOLS = {}


def _get_tweet_tools():
    """Create (once) and return the stopword set, tokenizer and lemmatizer."""
    if not _TWEET_TOOLS:
        _TWEET_TOOLS['stopwords'] = frozenset(stopwords.words('english'))
        _TWEET_TOOLS['tokenizer'] = TweetTokenizer(preserve_case=False,
                                                   strip_handles=True,
                                                   reduce_len=True)
        _TWEET_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TWEET_TOOLS


def process_tweet(tweet: str) -> list:
    """Clean, tokenize and lemmatize a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of lower-cased, lemmatized tokens; an empty list
            when ``tweet`` is not a string (e.g. a missing value)
    """
    if not isinstance(tweet, str):
        return []

    tools = _get_tweet_tools()
    stopwords_english = tools['stopwords']
    lemmatizer = tools['lemmatizer']

    tweet = _TICKER_RE.sub('', tweet)
    tweet = _RETWEET_RE.sub('', tweet)
    # Only the URL itself is removed; the previous pattern (`.*`) also deleted
    # every word and hashtag that followed the first link on the line.
    tweet = _URL_RE.sub('', tweet)
    # keep the hashtag word, drop only the '#' sign
    tweet = tweet.replace('#', '')
    tweet = _NON_ASCII_RE.sub('', tweet)

    tweets_clean = []
    for word in tools['tokenizer'].tokenize(tweet):
        if word in stopwords_english:
            continue
        # Drop tokens made up solely of punctuation. A substring test against
        # string.punctuation lets multi-character tokens such as '...' through.
        if all(char in _PUNCTUATION for char in word):
            continue
        tweets_clean.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

    return tweets_clean

# Demo on the first tweet of the dataset
tweet = df.Text[0]

# blank line, ANSI colour code 92, the raw tweet, then ANSI colour code 94
print('', '\033[92m', tweet, '\033[94m', sep='\n')

# run the preprocessing function defined above on the sample tweet
tweets_stem = process_tweet(tweet)

# show the resulting token list under its heading
print('preprocessed tweet:', tweets_stem, sep='\n')


[92m
SelfCare is the BESTCare❗️💯, put yo self first for once!
[94m
preprocessed tweet:
['selfcare', 'bestcare', 'put', 'yo', 'self', 'first']


Apply tweet preprocessing to all tweets:

In [None]:
df['text_lemmatized'] = df['Text'].apply(process_tweet)

In [None]:
df.head()

Unnamed: 0,Row_id,Date,Text,text_lemmatized
0,1,2021-01-18,"SelfCare is the BESTCare❗️💯, put yo self first...","[selfcare, bestcare, put, yo, self, first]"
1,2,2021-01-21,Breaking workaholic thought patterns takes mor...,"[break, workaholic, thought, pattern, take, mi..."
2,3,2021-01-26,Self Care is a must ..... Tarot Reading is hap...,"[self, care, must, ..., tarot, reading, happen..."
3,4,2021-01-19,Self love and self care. Invest in yourself. ...,"[self, love, self, care, invest, madewithripl,..."
4,5,2021-01-24,So excited for self care Friday tomorrow!,"[excite, self, care, friday, tomorrow]"


Create the Dictionary and Corpus needed for Topic Modeling

In [None]:
# Tokenised documents: one list of lemmas per tweet
texts = df['text_lemmatized']

# Dictionary built from every tokenised document
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: doc2bow is applied to each document in turn
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies that word id 0 occurs once in the first document. Likewise, word id 1 occurs once, and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

In [None]:
id2word[0]

'bestcare'

In [None]:
# Same bag-of-words view with ids replaced by their words (first document only)
[[(id2word[token_id], count) for token_id, count in doc] for doc in corpus[:1]]

[[('bestcare', 1),
  ('first', 1),
  ('put', 1),
  ('self', 1),
  ('selfcare', 1),
  ('yo', 1)]]

### Base Model

We have everything required to train the base Latent Dirichlet Allocation (LDA) model. In addition to the corpus and dictionary, you need to provide the number of topics as well. Apart from that, alpha and eta are hyperparameters that affect the sparsity of the topics. According to the Gensim docs, both default to a 1.0/num_topics prior (we’ll use the defaults for the base model).

    chunksize controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory.

    passes controls how often we train the model on the entire corpus (set to 10). Another word for passes might be “epochs”. iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of “passes” and “iterations” high enough.

In [None]:
# Baseline LDA model: 10 topics, fixed seed, alpha/eta not specified
base_lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(**base_lda_params)

### View the topics in LDA model

The above LDA model is built with 10 different topics, where each topic is a combination of keywords and each keyword contributes a certain weight to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown next.

In [None]:
# Pretty-print the keywords (with their weights) of the model's topics
topic_summaries = lda_model.print_topics()
pprint.pprint(topic_summaries)

# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.028*"watch" + 0.020*"second" + 0.019*"self-care" + 0.018*"skincare" + '
  '0.016*"share" + 0.016*"selfcare" + 0.014*"favorite" + 0.013*"social" + '
  '0.011*"thats" + 0.010*"new"'),
 (1,
  '0.059*"selfcare" + 0.022*"selflove" + 0.022*"mindfulness" + 0.018*"love" + '
  '0.018*"life" + 0.017*"inspiration" + 0.016*"2021" + 0.014*"..." + '
  '0.012*"happiness" + 0.012*"read"'),
 (2,
  '0.041*"selfcare" + 0.027*"2020" + 0.021*"break" + 0.017*"rest" + '
  '0.017*"month" + 0.015*"book" + 0.015*"still" + 0.014*"plan" + 0.013*"best" '
  '+ 0.012*"part"'),
 (3,
  '0.048*"self-care" + 0.027*"time" + 0.024*"help" + 0.021*"selfcare" + '
  '0.019*"day" + 0.017*"take" + 0.017*"health" + 0.016*"make" + 0.016*"tip" + '
  '0.013*"mental"'),
 (4,
  '0.032*"therapy" + 0.024*"bliss" + 0.024*"self" + 0.024*"care" + '
  '0.022*"call" + 0.022*"soul" + 0.022*"music" + 0.022*"soundhound" + '
  '0.020*"season" + 0.014*"christmas"'),
 (5,
  '0.025*"holiday" + 0.021*"self-care" + 0.019*"mask" + 0.016*"ca

### Compute Model Perplexity and Coherence Score

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. The bound is negative and HIGHER (closer to zero) is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)

# Perplexity is derived from the bound as 2 ** (-bound); LOWER is better.
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v) on the lemmatized texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['text_lemmatized'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.637500543373694

Coherence Score:  0.38615394180736784


### Hyperparameter Tuning

First, let’s differentiate between model hyperparameters and model parameters :

    Model hyperparameters can be thought of as settings for a machine learning algorithm that are tuned by the data scientist before training. Examples would be the number of trees in the random forest, or in our case, number of topics K

    Model parameters can be thought of as what the model learns during training, such as the weights for each word in a given topic

Now that we have the baseline coherence score for the default LDA model, let’s perform a series of sensitivity tests to help determine the following model hyperparameters:

    Number of Topics (K)
    Dirichlet hyperparameter alpha: Document-Topic Density
    Dirichlet hyperparameter beta: Word-Topic Density

We’ll perform these tests in sequence, one parameter at a time by keeping others constant. We’ll use C_v as our choice of metric for performance comparison

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model with the given hyperparameters and return its c_v coherence.

    Input:
        corpus: bag-of-words corpus the model is trained on
        dictionary: gensim Dictionary used both for training and for scoring
        k: number of topics
        a: alpha, passed through to LdaMulticore
        b: eta (beta), passed through to LdaMulticore
        texts: tokenised documents used for the coherence computation;
            defaults to the notebook's df['text_lemmatized']
    Output:
        the c_v coherence score of the trained model
    """
    if texts is None:
        texts = df['text_lemmatized']

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the dictionary that was passed in, not the global id2word,
    # so the function stays correct when called with a different dictionary.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

Let’s call the function, and iterate it over the range of topics, alpha, and beta parameter values

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]
corpus_title = ['100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# The results are saved to the mounted Drive, which is the location the
# "Final Model" cell reads them back from.
tuning_results_path = './drive/My Drive/lda_tuning_results.csv'

# Number of models to train, derived from the grid instead of being hard-coded.
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

# Can take a long time to run
with tqdm.tqdm(total=total_runs) as pbar:
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=int(k), a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

pd.DataFrame(model_results).to_csv(tuning_results_path, index=False)

## Final Model

Let’s train the final model using the above selected parameters

In [None]:
model_df = pd.read_csv('./drive/My Drive/lda_tuning_results.csv')
model_df.sort_values(by=['Coherence']).tail(10)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
143,100% Corpus,6,symmetric,0.91,0.510706
203,100% Corpus,8,symmetric,0.91,0.512474
153,100% Corpus,7,0.01,0.91,0.515802
182,100% Corpus,8,0.01,0.61,0.516026
177,100% Corpus,7,asymmetric,0.61,0.52093
122,100% Corpus,6,0.01,0.61,0.52229
223,100% Corpus,9,0.61,0.91,0.528819
188,100% Corpus,8,0.31,0.91,0.541064
28,100% Corpus,2,asymmetric,0.91,0.542925
93,100% Corpus,5,0.01,0.91,0.558495


The best model is the one with 5 topics, but I select the best one with 8 topics in order to work with more topics.

In [None]:
# Final model: hyperparameters of the best 8-topic run in the tuning table
# (Topics=8, Alpha=0.31, Beta=0.91). eta is gensim's name for beta.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=8,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       alpha=0.31,
                                       eta=0.91)

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  by='saliency', ascending=False).head(R).drop('saliency', 1)


So how do we interpret pyLDAvis’s output?

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

## Finding the dominant topic in each sentence

One of the practical applications of topic modeling is to determine what topic a given document is about.

To find that, we find the topic number that has the highest percentage contribution in that document.

The format_topics_sentences() function below nicely aggregates this information in a presentable table.

In [None]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table with the dominant topic of every document.

    Input:
        ldamodel: trained LDA model; defaults to the notebook's ``lda_model``
        corpus: bag-of-words corpus to classify; defaults to the notebook's ``corpus``
        texts: original documents, aligned with ``corpus``; defaults to ``df['Text']``
    Output:
        DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution' and
        'Topic_Keywords', followed by the column holding ``texts``. One row per
        document. Topic ids are integers; a document for which the model
        returns no topic gets NaN / NaN / '' so rows stay aligned with ``texts``.
    """
    # Defaults are looked up when the function is called, not when it is defined,
    # so a model that is retrained later is not silently replaced by a stale one.
    if ldamodel is None:
        ldamodel = lda_model
    if corpus is None:
        # the parameter shadows the notebook-level variable of the same name
        corpus = globals()['corpus']
    if texts is None:
        texts = df['Text']

    # Models built with per_word_topics=True yield a tuple per document whose
    # first element is the topic distribution.
    unwrap_row = getattr(ldamodel, 'per_word_topics', False)

    keyword_cache = {}  # topic id -> "kw1, kw2, ..." (computed once per topic)
    records = []
    for row in ldamodel[corpus]:
        if unwrap_row:
            row = row[0]
        if len(row) == 0:
            records.append((float('nan'), float('nan'), ''))
            continue
        # dominant topic = highest contribution; the first one wins on ties
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(float(prop_topic), 4), keyword_cache[topic_num]))

    # Build the frame in one go; growing it row by row with DataFrame.append is
    # quadratic and the method no longer exists in pandas 2.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic, its contribution and its keywords for every document
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus)

# Turn the index into a document-number column and give all columns readable names
column_labels = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = column_labels

# Preview the first ten documents
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5.0,0.705,"care, self, day, today, get, ..., take, like, ...","SelfCare is the BESTCare❗️💯, put yo self first..."
1,1,3.0,0.4147,"self-care, selfcare, time, help, health, take,...",Breaking workaholic thought patterns takes mor...
2,2,5.0,0.789,"care, self, day, today, get, ..., take, like, ...",Self Care is a must ..... Tarot Reading is hap...
3,3,5.0,0.6335,"care, self, day, today, get, ..., take, like, ...",Self love and self care. Invest in yourself. ...
4,4,5.0,0.707,"care, self, day, today, get, ..., take, like, ...",So excited for self care Friday tomorrow!
5,5,3.0,0.4659,"self-care, selfcare, time, help, health, take,...",Download EBOOK Saturday Night Pasta: Recipes a...
6,6,5.0,0.8578,"care, self, day, today, get, ..., take, like, ...",Happy Luna Full Moon in Leo everyone! Remember...
7,7,3.0,0.7006,"self-care, selfcare, time, help, health, take,...",Swipe.Match.Heal\nDownload #GinaApp https://t....
8,8,5.0,0.8054,"care, self, day, today, get, ..., take, like, ...",Love the feature on my phone that automaticall...
9,9,5.0,0.7956,"care, self, day, today, get, ..., take, like, ...",decided to treat myself and buy the bfish guid...


## Topic distribution across documents

Finally, we want to understand the volume and distribution of topics in order to judge how widely each one was discussed. The graphic below shows that information.

In [None]:
df_merge = pd.merge(df, df_dominant_topic, left_index=True, right_index=True)

In [None]:
df_merge.head()

Unnamed: 0,Row_id,Date,Text_x,text_lemmatized,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text_y
0,1,2021-01-18,"SelfCare is the BESTCare❗️💯, put yo self first...","[selfcare, bestcare, put, yo, self, first]",0,5.0,0.705,"care, self, day, today, get, ..., take, like, ...","SelfCare is the BESTCare❗️💯, put yo self first..."
1,2,2021-01-21,Breaking workaholic thought patterns takes mor...,"[break, workaholic, thought, pattern, take, mi...",1,3.0,0.4147,"self-care, selfcare, time, help, health, take,...",Breaking workaholic thought patterns takes mor...
2,3,2021-01-26,Self Care is a must ..... Tarot Reading is hap...,"[self, care, must, ..., tarot, reading, happen...",2,5.0,0.789,"care, self, day, today, get, ..., take, like, ...",Self Care is a must ..... Tarot Reading is hap...
3,4,2021-01-19,Self love and self care. Invest in yourself. ...,"[self, love, self, care, invest, madewithripl,...",3,5.0,0.6335,"care, self, day, today, get, ..., take, like, ...",Self love and self care. Invest in yourself. ...
4,5,2021-01-24,So excited for self care Friday tomorrow!,"[excite, self, care, friday, tomorrow]",4,5.0,0.707,"care, self, day, today, get, ..., take, like, ...",So excited for self care Friday tomorrow!


In [None]:
#all topics
df_merge['Keywords'].unique()

array(['care, self, day, today, get, ..., take, like, im, go',
       'self-care, selfcare, time, help, health, take, practice, new, tip, stress',
       'selfcare, selflove, love, therapy, bliss, motivation, heal, mentalhealth, inspiration, soul',
       'book, massage, appointment, doctor, spaday, blackownedbusiness, recommendation, spa, salon, reschedule',
       'skincare, beauty, skin, selfcare, product, fitness, beautiful, natural, tea, vitamin',
       'make, sure, saturday, priority, could, selfcaresunday, lovely, storm, selfish, selfnurture',
       'divorce, fall, theyre, clarity, youmatter, reputationintelligence, mediation, mirror, eliminatebullying, reflexology',
       'harmony, akafit, mylifestyle, 6miles, payattentiontothehiddengems, enjoythescenery, pastorlife, stoprushingeverywhere, allyouhaveistime, justdoit'],
      dtype=object)

In [None]:
# All topics and number of topic
df_merge[['Dominant_Topic', 'Keywords']].drop_duplicates()

Unnamed: 0,Dominant_Topic,Keywords
0,5.0,"care, self, day, today, get, ..., take, like, ..."
1,3.0,"self-care, selfcare, time, help, health, take,..."
18,4.0,"selfcare, selflove, love, therapy, bliss, moti..."
65,2.0,"book, massage, appointment, doctor, spaday, bl..."
92,6.0,"skincare, beauty, skin, selfcare, product, fit..."
177,0.0,"make, sure, saturday, priority, could, selfcar..."
188,1.0,"divorce, fall, theyre, clarity, youmatter, rep..."
249,7.0,"harmony, akafit, mylifestyle, 6miles, payatten..."


In [None]:
df_timeseries = df_merge[['Date', 'Dominant_Topic', 'Document_No']].groupby(['Date', 'Dominant_Topic']).count().reset_index()

In [None]:
df_timeseries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            165 non-null    datetime64[ns]
 1   Dominant_Topic  165 non-null    float64       
 2   Document_No     165 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 4.0 KB


In [None]:
import plotly.express as px

# One line per dominant topic: number of documents per date
fig = px.line(
    df_timeseries,
    x="Date",
    y='Document_No',
    color='Dominant_Topic',
    title='Topic over time',
)
# show full dates on the x axis ticks
fig.update_xaxes(tickformat="%Y-%m-%d")
fig.show()