# Import Libraries

In [1]:
# Run in python console
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim (Topic Modeling Package)
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
from spacy.lang.en.examples import sentences 
spacy.cli.download("en_core_web_sm")

# Plotting tools
import pyLDAvis
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim # don't skip this
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK stop words: fetch the corpus (the recorded output shows it reports
# "already up-to-date" when present) and load the English list.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Base stop-word list; a later cell extends it with project-specific words and
# remove_stopwords() reads it.
stop_words = stopwords.words('english')
nltk.download('words')
# Set built from NLTK's `words` corpus.
# NOTE(review): `words` is not referenced anywhere else in the visible
# notebook -- confirm whether it is still needed.
words = set(nltk.corpus.words.words())

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


  def _figure_formats_changed(self, name, old, new):
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Project-specific stop words, added on top of the NLTK English list.
# Edit this list to change which words are ignored by the topic model.
stop_new = [
    'mrt', 'station', 'see', 'bus', 'buses', 'line', 'account', 'one', 'full',
    'get', 'min', 'class', 'year', 'pm', '', 'take', 'taxi', 'allow', 'last', 'almost',
    'post', 'start', 'cross', 'car_rental', 'comfortdelgro', 'singapore', 'grab',
    'night', 'find', 'need', 'build', 'photo', 'video', 'leave', 'car', 'train',
    'think', 'food', 'make', 'set', 'fresh', 'sbs', 'driver', 'delgro', 'comfort',
    'move', 'ever', 'blue', 'uber', 'go', 'look', 'use', 'also', 'give',
    'many', 'come', 'lot', 'seem', 'guess', 'definitely', 'sure', 'keep', 'much', 'already',
    'do', 'lol', 'people', 'well', 'back', 'week', 'u', 'want', 'day', 'will', 'know',
    'even', 'really', 'said', 'say', 'cab', 'public transport', 'taxis', 'public',
    'transport', 'thing', 'still', 'got', 'now', 's', 'stop', 'around', 'another',
    'smrt', 'next', 'us', 'may', 'person', 'years', 'going', 'trains', 'way',
    'seat', 'guy', 'https', 'always', 'riders', 'cabbie', 'sg', 'drivers', 'auntie', 'man',
    'uncle', 'stops', 'someone', 'something', 'andy', 'cabs', 'cabbies', 'order', 'delivery',
    'gojek', 'quite', 'fucking', 'every', 'getting', 'trying', 'told', 'something', 'singaporean',
    'feel', 'lta', 'fuck', 'without', 'let', 'made', 'getting', 'passenger', 'restaurant', 'grabfood',
    'merchant', 'don', 't', 'customer', 'cars', 'actually', 'senior', 'entrepreneur', 'rental', 'cars', 'current',
]

# Merge into the existing list in place, dropping repeated entries.
# dict.fromkeys() keeps first-seen order, so re-running this cell leaves
# stop_words unchanged instead of appending the same words again (a plain
# extend() grows the list on every run).
stop_words[:] = dict.fromkeys(stop_words + stop_new)

# Import DataFrame

In [3]:
# Load the three sentiment dataframes (before Covid, after Covid, all).
# The paths are relative to the working directory; edit them to point at
# wherever the CSV files were saved.
df_clean_cat_bcovid, df_clean_cat_acovid, df_clean_cat_all = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'df_clean_cat_bcovid_sent.csv',
        r'df_clean_cat_acovid_sent.csv',
        r'df_clean_cat_all_sent.csv',
    )
)

# Prepare DataFrame for Topic Modelling

In [4]:
# Pull the `text` column of each dataframe into a plain Python list.
# To model a single sentiment only, filter the rows first, e.g.
#   df.loc[df['sentiment'] == 'Negative', 'text'].values.tolist()
# (check how the label is actually spelled in the `sentiment` column).

df_bcovid_data = df_clean_cat_bcovid['text'].values.tolist()
df_acovid_data = df_clean_cat_acovid['text'].values.tolist()
df_all_data = df_clean_cat_all['text'].values.tolist()

# Tokenize Each Sentence

Let’s tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.

Gensim’s simple_preprocess() is great for this.
Punctuation is dropped by the tokenizer itself; additionally I have set deacc=True, which strips accent marks from the tokens.

### Create Function

In [5]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` with gensim's simple_preprocess.

    Every item is converted with ``str()`` first, so non-string values are
    accepted. Yields one token list per input item, in order.
    """
    for raw_text in sentences:
        # deacc=True strips accent marks; punctuation is dropped by the tokenizer itself
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

### Execute Function

In [6]:
# Materialise the token generators: one list of token lists per dataset.
df_bcovid_data_words = [tokens for tokens in sent_to_words(df_bcovid_data)]
df_acovid_data_words = [tokens for tokens in sent_to_words(df_acovid_data)]
df_all_data_words = [tokens for tokens in sent_to_words(df_all_data)]

# Build Bigram and Trigram Models

Bigrams are two words frequently occurring together in the document.
Trigrams are three words frequently occurring together.

Some examples of such phrases are: ‘front_bumper’, ‘oil_leak’, ‘maryland_college_park’ etc.

Gensim’s Phrases model can build and implement the bigrams, trigrams, 
quadgrams and more.

The two important arguments to Phrases are min_count and threshold.
The higher the values of these parameters, the harder it is for words to be combined into bigrams.

## Before Covid

In [7]:
# Bigram model learned from the raw tokens (a higher threshold gives fewer
# phrases), wrapped in a Phraser, the faster way to apply it to a sentence.
df_bcovid_data_bigram = gensim.models.Phrases(
    df_bcovid_data_words,
    min_count=5,
    threshold=100,
)
df_bcovid_data_bigram_mod = gensim.models.phrases.Phraser(df_bcovid_data_bigram)

# Trigram model learned on top of the bigram-merged tokens, wrapped the same way.
df_bcovid_data_trigram = gensim.models.Phrases(
    df_bcovid_data_bigram[df_bcovid_data_words],
    threshold=100,
)
df_bcovid_data_trigram_mod = gensim.models.phrases.Phraser(df_bcovid_data_trigram)

# Trigram example: the second document after bigram and trigram merging.
print(df_bcovid_data_trigram_mod[df_bcovid_data_bigram_mod[df_bcovid_data_words[1]]])

['so', 'on', 'my', 'way', 'back', 'home', 'my', 'taxi', 'broke', 'down', 'walked', 'the', 'highway', 'and', 'random', 'uncle', 'offered', 'to', 'send', 'me', 'back']


## After Covid

In [8]:
# Bigram model learned from the raw tokens (a higher threshold gives fewer
# phrases), wrapped in a Phraser, the faster way to apply it to a sentence.
df_acovid_data_bigram = gensim.models.Phrases(
    df_acovid_data_words,
    min_count=5,
    threshold=100,
)
df_acovid_data_bigram_mod = gensim.models.phrases.Phraser(df_acovid_data_bigram)

# Trigram model learned on top of the bigram-merged tokens, wrapped the same way.
df_acovid_data_trigram = gensim.models.Phrases(
    df_acovid_data_bigram[df_acovid_data_words],
    threshold=100,
)
df_acovid_data_trigram_mod = gensim.models.phrases.Phraser(df_acovid_data_trigram)

# Trigram example: the second document after bigram and trigram merging.
print(df_acovid_data_trigram_mod[df_acovid_data_bigram_mod[df_acovid_data_words[1]]])

['new', 'cabinet', 'announcement']


## All

In [None]:
# Bigram model learned from the raw tokens (a higher threshold gives fewer
# phrases), wrapped in a Phraser, the faster way to apply it to a sentence.
df_all_data_bigram = gensim.models.Phrases(
    df_all_data_words,
    min_count=5,
    threshold=100,
)
df_all_data_bigram_mod = gensim.models.phrases.Phraser(df_all_data_bigram)

# Trigram model learned on top of the bigram-merged tokens, wrapped the same way.
df_all_data_trigram = gensim.models.Phrases(
    df_all_data_bigram[df_all_data_words],
    threshold=100,
)
df_all_data_trigram_mod = gensim.models.phrases.Phraser(df_all_data_trigram)

# Trigram example: the second document after bigram and trigram merging.
print(df_all_data_trigram_mod[df_all_data_bigram_mod[df_all_data_words[1]]])

['new', 'cabinet', 'announcement']


# Define Functions

The bigram models are ready. Let’s define functions to remove stop words,
form bigrams and lemmatize, and then call them in sequence.

In [19]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is passed through ``str()`` and ``simple_preprocess`` before
    filtering, so both raw strings and already tokenised lists are accepted.

    Parameters
    ----------
    texts : iterable
        Documents (strings or lists of tokens).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in the same order.
    """
    # The module-level ``stop_words`` is a list, so ``word not in stop_words``
    # scans it for every token. Build a set once per call for constant-time
    # lookups; the filtering result is unchanged.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` to every document and return the results as a list."""
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts, trigram_mod, bigram_mod):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns one transformed document per input document, in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents; the tokens of each one are joined with spaces
        before being handed to spaCy.
    allowed_postags : iterable of str, optional
        Values of ``token.pos_`` to keep. The default is an immutable tuple
        rather than a shared mutable list.

    Returns
    -------
    list of list of str
        The lemmas of the kept tokens, one list per input document.
    """
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe() processes the texts as a stream in batches, which avoids the
    # per-call overhead of invoking nlp() once for every document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

# Call the Functions

## Before Covid

In [20]:
# Load the spaCy English pipeline with the parser and NER components disabled
# (the model itself is downloaded in the first cell).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Step 1: strip stop words. Step 2: merge frequent word pairs into bigrams.
df_bcovid_data_words_nostops = remove_stopwords(df_bcovid_data_words)
df_bcovid_data_words_bigrams = make_bigrams(
    texts=df_bcovid_data_words_nostops,
    bigram_mod=df_bcovid_data_bigram_mod,
)

# Step 3: lemmatise, keeping only nouns, adjectives, verbs and adverbs.
df_bcovid_data_lemmatized = lemmatization(
    df_bcovid_data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(df_bcovid_data_lemmatized[:1])

[['understand', 'find', 'picture', 'take', 'ago']]


## After Covid

In [21]:
# Load the spaCy English pipeline with the parser and NER components disabled
# (the model itself is downloaded in the first cell).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Step 1: strip stop words. Step 2: merge frequent word pairs into bigrams.
df_acovid_data_words_nostops = remove_stopwords(df_acovid_data_words)
df_acovid_data_words_bigrams = make_bigrams(
    texts=df_acovid_data_words_nostops,
    bigram_mod=df_acovid_data_bigram_mod,
)

# Step 3: lemmatise, keeping only nouns, adjectives, verbs and adverbs.
df_acovid_data_lemmatized = lemmatization(
    df_acovid_data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(df_acovid_data_lemmatized[:1])

[['old', 'crown', 'stack', 'scrap']]


## All

In [None]:
# Load the spaCy English pipeline with the parser and NER components disabled
# (the model itself is downloaded in the first cell).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Step 1: strip stop words. Step 2: merge frequent word pairs into bigrams.
df_all_data_words_nostops = remove_stopwords(df_all_data_words)
df_all_data_words_bigrams = make_bigrams(
    texts=df_all_data_words_nostops,
    bigram_mod=df_all_data_bigram_mod,
)

# Step 3: lemmatise, keeping only nouns, adjectives, verbs and adverbs.
df_all_data_lemmatized = lemmatization(
    df_all_data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

print(df_all_data_lemmatized[:1])

[['old', 'crown', 'stack', 'scrap']]


# The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus.
# Let’s create them.

## Before Covid

In [32]:
# Dictionary built from the lemmatised documents (id <-> word mapping).
df_bcovid_data_id2word = corpora.Dictionary(df_bcovid_data_lemmatized)

# The lemmatised documents are the texts that get encoded.
df_bcovid_data_texts = df_bcovid_data_lemmatized

# Bag-of-words corpus: doc2bow is applied to every document in turn.
df_bcovid_data_corpus = list(map(df_bcovid_data_id2word.doc2bow, df_bcovid_data_texts))

# Peek at the first encoded document.
print(df_bcovid_data_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


## After Covid

In [33]:
# Dictionary built from the lemmatised documents (id <-> word mapping).
df_acovid_data_id2word = corpora.Dictionary(df_acovid_data_lemmatized)

# The lemmatised documents are the texts that get encoded.
df_acovid_data_texts = df_acovid_data_lemmatized

# Bag-of-words corpus: doc2bow is applied to every document in turn.
df_acovid_data_corpus = list(map(df_acovid_data_id2word.doc2bow, df_acovid_data_texts))

# Peek at the first encoded document.
print(df_acovid_data_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


## All

In [None]:
# Dictionary built from the lemmatised documents (id <-> word mapping).
df_all_data_id2word = corpora.Dictionary(df_all_data_lemmatized)

# The lemmatised documents are the texts that get encoded.
df_all_data_texts = df_all_data_lemmatized

# Bag-of-words corpus: doc2bow is applied to every document in turn.
df_all_data_corpus = list(map(df_all_data_id2word.doc2bow, df_all_data_texts))

# Peek at the first encoded document.
print(df_all_data_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


# Build the Topic Model

Gensim creates a unique id for each word in the document.
The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, word id 0 occurs once in the first document.

Likewise, word id 1 also occurs once, and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

id2word[0]
'addition'

Or, you can see a human-readable form of the corpus itself.

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front_bumper', 1),
  ('maryland_college', 1),
  (..truncated..)]]


Alright, without digressing further let’s jump back on track with the next step:

Building the topic model.

We have everything required to train the LDA model.
In addition to the corpus and dictionary, you need to provide the number of topics as well.

Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics.
According to the Gensim docs, both default to a symmetric 1.0/num_topics prior; the model below passes alpha='auto' instead, so alpha is learned from the corpus.

chunksize is the number of documents to be used in each training chunk.

update_every determines how often the model parameters should be updated and 
passes is the total number of training passes.

In [44]:
# Build LDA model
def lda_model(corpus, id2word, n_topics):
    """Train and return a gensim LdaModel using the notebook's fixed settings.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to LdaModel as ``corpus``
    id2word : dictionary passed to LdaModel as ``id2word``
    n_topics : number of topics, passed as ``num_topics``
    """
    training_options = dict(
        corpus=corpus,
        id2word=id2word,
        num_topics=n_topics,
        random_state=100,      # fixed seed
        update_every=1,
        chunksize=100,         # documents per training chunk
        passes=10,             # total training passes
        alpha='auto',
        per_word_topics=True,
    )
    return gensim.models.ldamodel.LdaModel(**training_options)

The function above builds an LDA model with n_topics topics (15 or 20 in the calls further below), where each topic is a
combination of keywords and each keyword contributes a certain weight to the topic.

You can see the keywords for each topic and the weight (importance) of
each keyword by calling print_topics() on a trained model, e.g. df_bcovid_data_ldamodel.print_topics(), as shown next.

The output will look like this...


[(0,
  '0.016*"car" + 0.014*"power" + 0.010*"light" + 0.009*"drive" + 0.007*"mount" '
  '+ 0.007*"controller" + 0.007*"cool" + 0.007*"engine" + 0.007*"back" + '
  '0.006*"turn"'),
 (1,
  '0.072*"line" + 0.066*"organization" + 0.037*"write" + 0.032*"article" + '
  '0.028*"university" + 0.027*"nntp_post" + 0.026*"host" + 0.016*"reply" + '
  '0.014*"get" + 0.013*"thank"'),
 (2,
  '0.017*"patient" + 0.011*"study" + 0.010*"slave" + 0.009*"wing" + '
  '0.009*"disease" + 0.008*"food" + 0.008*"eat" + 0.008*"pain" + '
  '0.007*"treatment" + 0.007*"syndrome"'),
 (3,
  '0.013*"key" + 0.009*"use" + 0.009*"may" + 0.007*"public" + 0.007*"system" + '
  '0.007*"order" + 0.007*"government" + 0.006*"state" + 0.006*"provide" + '
  '0.006*"law"'),
  
  so on...



How to interpret this?

Topic 0 is represented as:

0.016*“car” + 0.014*“power” + 0.010*“light” + 0.009*“drive”
+ 0.007*“mount” + 0.007*“controller” + 0.007*“cool” + 0.007*“engine”
+ 0.007*“back” + 0.006*“turn”.

It means the top 10 keywords that contribute to this topic are:
‘car’, ‘power’, ‘light’.. and so on
and the weight of ‘car’ on topic 0 is 0.016.

--> The weights reflect how important a keyword is to that topic.

Looking at these keywords, can you guess what this topic could be?
You may summarise it as either ‘cars’ or ‘automobiles’.

Likewise, can you go through the remaining topic keywords and judge what the topic is?
https://www.machinelearningplus.com/wp-content/uploads/2018/03/Inferring-Topic-from-Keywords.png

# Calculate Model Perplexity and Topic Coherence

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. In my experience, topic coherence score, in particular, has been more helpful.

This can help determine the ideal number of topics to use. We can determine this by looking at the chart and picking the number of topics at which the coherence score spikes.

In [46]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=100):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics
    random_state : Seed passed to every LdaModel so that repeated runs produce
        the same coherence curve (defaults to the seed used by lda_model())

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Notes:
    -----
    Apart from the seed, the models trained here use LdaModel's default
    settings, not the passes/alpha/chunksize values set in lda_model(), so the
    scored models are not identical to the final ones.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Without a fixed seed every run trains a different model and the
        # coherence chart cannot be reproduced.
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                         random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

## Before Covid

In [None]:
# Topic-count sweep: defined once so the training call and the x-axis of the
# chart cannot drift apart.
start, limit, step = 2, 40, 6
model_list, coherence_values = compute_coherence_values(dictionary=df_bcovid_data_id2word, corpus=df_bcovid_data_corpus, texts=df_bcovid_data_lemmatized, start=start, limit=limit, step=step)

# Show graph (plt is imported in the first cell)
x = range(start, limit, step)
# Label the line itself: the previous plt.legend(("coherence_values")) passed a
# plain string (the parentheses do not make a tuple), which matplotlib reads as
# a sequence of one-character labels, so the legend showed only "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()


## After Covid

In [None]:
# Topic-count sweep: defined once so the training call and the x-axis of the
# chart cannot drift apart.
start, limit, step = 2, 40, 6
model_list, coherence_values = compute_coherence_values(dictionary=df_acovid_data_id2word, corpus=df_acovid_data_corpus, texts=df_acovid_data_lemmatized, start=start, limit=limit, step=step)

# Show graph (plt is imported in the first cell)
x = range(start, limit, step)
# Label the line itself: the previous plt.legend(("coherence_values")) passed a
# plain string (the parentheses do not make a tuple), which matplotlib reads as
# a sequence of one-character labels, so the legend showed only "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()


## All

In [None]:
# Topic-count sweep: defined once so the training call and the x-axis of the
# chart cannot drift apart.
start, limit, step = 2, 40, 6
model_list, coherence_values = compute_coherence_values(dictionary=df_all_data_id2word, corpus=df_all_data_corpus, texts=df_all_data_lemmatized, start=start, limit=limit, step=step)

# Show graph (plt is imported in the first cell)
x = range(start, limit, step)
# Label the line itself: the previous plt.legend(("coherence_values")) passed a
# plain string (the parentheses do not make a tuple), which matplotlib reads as
# a sequence of one-character labels, so the legend showed only "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()


# Visualize the topics

Once the LDA models are built, the next step is to examine the topics they
produce and the associated keywords. The pyLDAvis package’s interactive chart is an excellent
tool for this and is designed to work well with Jupyter notebooks.

In [59]:
# lda_model takes three arguments: the corpus, the id2word dictionary and the number of topics.
# Edit the number of topics based on the coherence charts plotted earlier to get the ideal value.

df_bcovid_data_ldamodel = lda_model(
    corpus=df_bcovid_data_corpus,
    id2word=df_bcovid_data_id2word,
    n_topics=15,
)
df_acovid_data_ldamodel = lda_model(
    corpus=df_acovid_data_corpus,
    id2word=df_acovid_data_id2word,
    n_topics=20,
)
df_all_data_ldamodel = lda_model(
    corpus=df_all_data_corpus,
    id2word=df_all_data_id2word,
    n_topics=20,
)

In [60]:
# Index each trained model with its own corpus.
# NOTE(review): these three results are not used again in the visible notebook.
df_bcovid_data_doc_lda, df_acovid_data_doc_lda, df_all_data_doc_lda = (
    trained_model[bow_corpus]
    for trained_model, bow_corpus in (
        (df_bcovid_data_ldamodel, df_bcovid_data_corpus),
        (df_acovid_data_ldamodel, df_acovid_data_corpus),
        (df_all_data_ldamodel, df_all_data_corpus),
    )
)

## Before Covid

In [73]:
# Interactive topic chart for the before-Covid model.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=df_bcovid_data_ldamodel,
    corpus=df_bcovid_data_corpus,
    dictionary=df_bcovid_data_id2word,
)
vis

## After Covid

In [62]:
# Interactive topic chart for the after-Covid model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=df_acovid_data_ldamodel,
    corpus=df_acovid_data_corpus,
    dictionary=df_acovid_data_id2word,
)
vis

## All

In [None]:
# Interactive topic chart for the model trained on all data.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=df_all_data_ldamodel,
    corpus=df_all_data_corpus,
    dictionary=df_all_data_id2word,
)
vis