<a href="https://colab.research.google.com/github/PetyoKaratov/NLP-Task/blob/main/topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline

In [2]:
!pip install openpyxl==3.0.0
!pip install emot
!pip install pyldavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openpyxl==3.0.0
  Downloading openpyxl-3.0.0.tar.gz (172 kB)
[K     |████████████████████████████████| 172 kB 7.3 MB/s 
[?25hCollecting jdcal
  Downloading jdcal-1.4.1-py2.py3-none-any.whl (9.5 kB)
Building wheels for collected packages: openpyxl
  Building wheel for openpyxl (setup.py) ... [?25l[?25hdone
  Created wheel for openpyxl: filename=openpyxl-3.0.0-py2.py3-none-any.whl size=241207 sha256=cee0a3ab9fb1514e4a17487798915ab9cdbde72622785f80d168eb3654fd5ec2
  Stored in directory: /root/.cache/pip/wheels/c7/64/ff/ce98f6e1d2701ae8e216c875da62feed2839ac8a3cae0ab8af
Successfully built openpyxl
Installing collected packages: jdcal, openpyxl
  Attempting uninstall: openpyxl
    Found existing installation: openpyxl 3.0.10
    Uninstalling openpyxl-3.0.10:
      Successfully uninstalled openpyxl-3.0.10
Successfully installed jdcal-1.4.1 openpyxl-3.0.0
Looking in indexes: https

In [3]:
# imports
from __future__ import absolute_import, division, print_function, unicode_literals
from google.colab import drive
import pandas as pd
import re                                  # library for regular expression operations
import string                              # for string operations
import pprint
import nltk 
import numpy as np
import tqdm
import emot 
# download the stopwords from NLTK
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import WordNetLemmatizer    # module for lemmatization
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
  from collections import Iterable


In [4]:
# mount the google drive root
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
# read the excel file
df = pd.read_excel('./drive/My Drive/NLP_Task_Data.xlsx')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56255 entries, 0 to 56254
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Row_id  9915 non-null   float64       
 1   Date    9915 non-null   datetime64[ns]
 2   Text    56255 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 1.3+ MB


In [6]:
df.head()

Unnamed: 0,Row_id,Date,Text
0,1.0,2021-01-18,is upset that he can't update his Facebook by ...
1,2.0,2021-01-21,@Kenichan I dived many times for the ball. Man...
2,3.0,2021-01-26,my whole body feels itchy and like its on fire
3,4.0,2021-01-19,"@nationwideclass no, it's not behaving at all...."
4,5.0,2021-01-24,@Kwesidei not the whole crew


In [7]:
tweet = df.Text[1]
tweet

'@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'

In [8]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

To preprocess the tweet we remove stock market tickers, old stype retweet text, hashtafs tokenize the weets with TweetTokenizer and then remove stopwords punctuation and lemmatizing. Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In [9]:
def process_tweet(tweet: str) -> str:
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    lemmatizer = WordNetLemmatizer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # remove emoticons
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation): # remove punctuation
            lemmatize_word = lemmatizer.lemmatize(word, get_wordnet_pos(word))  # lemmatizing word
            tweets_clean.append(lemmatize_word)

    return tweets_clean

# choose the same tweet
tweet = df.Text[0]

print()
print('\033[92m')
print(tweet)
print('\033[94m')

# call the imported function
tweets_stem = process_tweet(tweet); # Preprocess a given tweet

print('preprocessed tweet:')
print(tweets_stem) # Print the result


[92m
is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
[94m
preprocessed tweet:
['upset', "can't", 'update', 'facebook', 'texting', '...', 'might', 'cry', 'result', 'school', 'today', 'also', 'blah']


Apply tweet preprocessing to all tweets:

In [10]:
df['text_lemmatized'] = df['Text'].apply(process_tweet)

In [11]:
df.head()

Unnamed: 0,Row_id,Date,Text,text_lemmatized
0,1.0,2021-01-18,is upset that he can't update his Facebook by ...,"[upset, can't, update, facebook, texting, ...,..."
1,2.0,2021-01-21,@Kenichan I dived many times for the ball. Man...,"[dive, many, time, ball, manage, save, 50, res..."
2,3.0,2021-01-26,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]"
3,4.0,2021-01-19,"@nationwideclass no, it's not behaving at all....","[behaving, i'm, mad, can't, see]"
4,5.0,2021-01-24,@Kwesidei not the whole crew,"[whole, crew]"


Create the Dictionary and Corpus needed for Topic Modeling

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(df['text_lemmatized'])

# Create Corpus
texts = df['text_lemmatized']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]


Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

In [13]:
id2word[0]

'...'

In [14]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('...', 1),
  ('also', 1),
  ('blah', 1),
  ("can't", 1),
  ('cry', 1),
  ('facebook', 1),
  ('might', 1),
  ('result', 1),
  ('school', 1),
  ('texting', 1),
  ('today', 1),
  ('update', 1),
  ('upset', 1)]]

### Base Model

We have everything required to train the base Latent Dirichlet Allocation (LDA) model. In addition to the corpus and dictionary, you need to provide the number of topics as well. Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior (we’ll use default for the base model).

    chunksize controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory.

    passes controls how often we train the model on the entire corpus (set to 10). Another word for passes might be “epochs”. iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of “passes” and “iterations” high enough.

In [15]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

### View the topics in LDA model

The above LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown next.

In [16]:
# Print the Keyword in the 10 topics
pprint.pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.059*"work" + 0.055*"..." + 0.055*"go" + 0.047*"get" + 0.043*"day" + '
  '0.033*"wish" + 0.029*"back" + 0.026*"home" + 0.026*"i\'m" + 0.024*"want"'),
 (1,
  '0.052*"sad" + 0.038*"watch" + 0.026*"look" + 0.025*"hurt" + 0.025*"..." + '
  '0.022*"lose" + 0.019*"man" + 0.018*"make" + 0.016*"i\'m" + 0.014*"die"'),
 (2,
  '0.042*"night" + 0.033*"last" + 0.032*"..." + 0.018*"bore" + 0.017*"movie" + '
  '0.016*"time" + 0.016*"saturday" + 0.015*"start" + 0.014*"party" + '
  '0.014*"left"'),
 (3,
  '0.087*".." + 0.044*"im" + 0.028*"think" + 0.025*"take" + 0.020*"miss" + '
  '0.019*"come" + 0.018*"wanna" + 0.018*"gonna" + 0.018*"dont" + 0.016*"want"'),
 (4,
  '0.047*"miss" + 0.042*"..." + 0.042*"u" + 0.036*"i\'m" + 0.032*"really" + '
  '0.032*"sorry" + 0.026*"lol" + 0.018*"twitter" + 0.016*"hate" + 0.016*"go"'),
 (5,
  '0.038*"tonight" + 0.027*"new" + 0.024*"try" + 0.020*"phone" + '
  '0.017*"already" + 0.016*"house" + 0.012*"i\'ve" + 0.012*"seem" + '
  '0.012*"song" + 0.011*"forgot"'),


### Compute Model Perplexity and Coherence Score

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.

In [17]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['text_lemmatized'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.00950600941515

Coherence Score:  0.2602388475736583


### Hyperparameter Tuning

First, let’s differentiate between model hyperparameters and model parameters :

    Model hyperparameters can be thought of as settings for a machine learning algorithm that are tuned by the data scientist before training. Examples would be the number of trees in the random forest, or in our case, number of topics K

    Model parameters can be thought of as what the model learns during training, such as the weights for each word in a given topic

Now that we have the baseline coherence score for the default LDA model, let’s perform a series of sensitivity tests to help determine the following model hyperparameters:

    Number of Topics (K)
    Dirichlet hyperparameter alpha: Document-Topic Density
    Dirichlet hyperparameter beta: Word-Topic Density

We’ll perform these tests in sequence, one parameter at a time by keeping others constant. We’ll use C_v as our choice of metric for performance comparison

In [18]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['text_lemmatized'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

Let’s call the function, and iterate it over the range of topics, alpha, and beta parameter values

In [None]:
grid = {}
grid['Validation_Set'] = {}# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')# Validation sets
num_of_docs = len(corpus)
corpus_sets = [
               corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }# Can take a long time to run

pbar = tqdm.tqdm(total=50)

# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=int(k), a=a, b=b)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                
                pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

  diff = np.log(self.expElogbeta)
74it [6:19:56, 300.81s/it]

## Final Model

Let’s train the final model using the above selected parameters

In [None]:
model_df = pd.read_csv('./drive/My Drive/lda_tuning_results.csv')
model_df.sort_values(by=['Coherence']).tail(10)

The best model is with 5 topis but I select the one with 8 to work with more topics.

In [None]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.31,
                                           eta=0.90)

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

So how to infer pyLDAvis’s output?

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

## Finding the dominant topic in each sentence

One of the practical application of topic modeling is to determine what topic a given document is about.

To find that, we find the topic number that has the highest percentage contribution in that document.

The format_topics_sentences() function below nicely aggregates this information in a presentable table.

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df['Text']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

## Topic distribution across documents

Finally, we want to understand the volume and distribution of topics in order to judge how widely it was discussed. The below grahic exposes that information.

In [None]:
df_merge = pd.merge(df, df_dominant_topic, left_index=True, right_index=True)

In [None]:
df_merge.head()

In [None]:
#all topics
df_merge['Keywords'].unique()

In [None]:
# All topics and number of topic
df_merge[['Dominant_Topic', 'Keywords']].drop_duplicates()

In [None]:
df_timeseries = df_merge[['Date', 'Dominant_Topic', 'Document_No']].groupby(['Date', 'Dominant_Topic']).count().reset_index()

In [None]:
df_timeseries.info()

In [None]:
import plotly.express as px
fig = px.line(df_timeseries, x="Date", y='Document_No', color='Dominant_Topic',
              title='Topic over time')
fig.update_xaxes(
    tickformat="%Y-%m-%d"
    )
fig.show()