# Covid-19 Topic Modeling with BERTopic

## Installing libaries

In [1]:
%%capture
!pip install bertopic

In [2]:
# Install necessary libraries
!pip install nltk emoji pandas

# Import necessary libraries
import nltk
import pandas as pd
import numpy as np
import re
import emoji
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download nltk data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

wn = nltk.WordNetLemmatizer()

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Read Data

In [3]:
# Load the dataset
from google.colab import drive
drive.mount('/mntDrive')

# load the dataset from drive
dataset_path = "/mntDrive/MyDrive/ICT606_ASSIGNMENT2/covid.csv"

import chardet
with open(dataset_path, 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']

data = pd.read_csv(dataset_path, encoding=encoding)
#data = pd.read_csv(dataset_path)

Mounted at /mntDrive


## Data Preprocessing

In [4]:
# Get the dataset information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [5]:
# checking for null values
data.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

### Get a subset of data

In [6]:
# Get a random subset of 5000 rows
subset_data = data.sample(n=5000, random_state=13)

In [8]:
# Data preprocessing
import string
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

def preprocess_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

    # Remove user @ references and '#' from hashtags
    tweet = re.sub(r'\@\w+|\#','', tweet)

    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    #remove Retweet
    tweet = re.sub(r'RT[\s]+', '', tweet)

    #remove emojis
    tweet = emoji_pattern.sub(r'', tweet)

    #convert to lower case
    tweet = tweet.lower()

    return tweet

# Apply the preprocess function to the tweets
subset_data['preprocessed_tweet'] = subset_data['OriginalTweet'].apply(preprocess_tweet)

### Removing stop words

In [9]:
# Remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
subset_data['tweets_without_stopwords'] = subset_data['preprocessed_tweet'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in stopwords]))
subset_data['tweets_without_stopwords']

7713     stores inspected ensure hand sanitisers antise...
26321    visited worked phoenix mall velachery march pl...
25726    new york state attorney general asking domain ...
35814    impact covid quarantine retail evaluated amp f...
852      morrisons launches ¬ìhardship fund¬î staff fac...
                               ...                        
16261    crude oil prices collapsed consumer demand low...
36233    consumer confidence plummeted levels seen rece...
18775    silver prices massively oversold opportunity a...
7835     dear customers shall temporarily closing mall ...
16353    government wont tell ufos aliensyou panic buy ...
Name: tweets_without_stopwords, Length: 5000, dtype: object

### Lemmatization

Lemmatization refers to changing words to their base form.


In [10]:
# Lemmatization
subset_data['tweets_lemmatized'] = subset_data['tweets_without_stopwords'].apply(lambda x: ' '.join([wn.lemmatize(w) for w in x.split() if w not in stopwords]))
subset_data['tweets_lemmatized'].head()

7713     store inspected ensure hand sanitisers antisep...
26321    visited worked phoenix mall velachery march pl...
25726    new york state attorney general asking domain ...
35814    impact covid quarantine retail evaluated amp f...
852      morrison launch ¬ìhardship fund¬î staff facing...
Name: tweets_lemmatized, dtype: object

In [11]:
# Extract the text data from the specific column
docs = subset_data['tweets_lemmatized'].tolist()

## Topic Modeling Using BERTopic

Instantiate the BERTopic model.

There are four key components used in BERTopic.


*   A transformer embedding model
*   UMAP dimensionality reduction
*   HDBSCAN clustering
*   Cluster tagging using c-TF-IDF


### Training

In [12]:
from bertopic import BERTopic

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

2024-06-02 12:48:09,464 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2024-06-02 12:49:41,468 - BERTopic - Embedding - Completed ✓
2024-06-02 12:49:41,470 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-02 12:50:23,579 - BERTopic - Dimensionality - Completed ✓
2024-06-02 12:50:23,584 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-02 12:50:25,300 - BERTopic - Cluster - Completed ✓
2024-06-02 12:50:25,316 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-02 12:50:25,722 - BERTopic - Representation - Completed ✓


### Extracting Topics
After fitting our model, we can start by looking at the results. Typically, we look at the most frequent topics first as they best represent the collection of documents.

We can see that the output gives us 41 rows in total.
Topic -1 should be ignored. It indicates that the tweets are not assigned to any specific topic. The count for topic -1 is 2075, meaning that there are 2075 tweets as outliers that do not belong to any topic.

The Name column lists the top terms for each topic. For example, the top 4 terms for Topic 0 are hand, mask, sanit, wear indicating that it is a topic related to hand sanitizing and wearing masks.

In [13]:
freq = topic_model.get_topic_info();
freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2175,-1_covid_supermarket_coronavirus_grocery,"[covid, supermarket, coronavirus, grocery, sto...",[know someone work grocery store told paper to...
1,0,459,0_sanitizer_hand_mask_glove,"[sanitizer, hand, mask, glove, coronavirus, so...",[critical condition wash hand frequently alcoh...
2,1,238,1_toiletpaper_toilet_paper_coronavirus,"[toiletpaper, toilet, paper, coronavirus, toil...",[remember toilet paper shortage here look back...
3,2,188,2_consumer_behavior_confidence_covid,"[consumer, behavior, confidence, covid, survey...",[survey u consumer sentiment coronavirus crisi...
4,3,154,3_oil_price_crude_cut,"[oil, price, crude, cut, russia, market, globa...",[opec russia oil producing nation agreed sunda...
5,4,144,4_supermarket_socialdistancing_distancing_social,"[supermarket, socialdistancing, distancing, so...",[social distancing good well need supply you¬í...
6,5,139,5_food_stock_supply_covid,"[food, stock, supply, covid, demand, coronavir...",[coronavirus update supply chain food shortage...
7,6,107,6_bank_food_demand_donation,"[bank, food, demand, donation, help, donate, l...",[local food bank need donation see link list p...
8,7,104,7_worker_employee_store_grocery,"[worker, employee, store, grocery, work, teste...",[grocery store worker deserve hazard pay coron...
9,8,82,8_scam_scammer_ftc_email,"[scam, scammer, ftc, email, fake, fear, consum...",[way hacker scammer exploiting coronavirus pan...


If more than 4 terms are needed for a topic, we can use get_topic and pass in the topic number. For example, get_topic(0) gives us the top 10 terms for topic 0 and their relative importance.

In [14]:
# Select the most frequent topic
topic_model.get_topic(0)

[('sanitizer', 0.0673722015697414),
 ('hand', 0.06652381286435934),
 ('mask', 0.053760958222734134),
 ('glove', 0.024675407121964803),
 ('coronavirus', 0.022480315646230407),
 ('soap', 0.02015987051765178),
 ('face', 0.019981049310910905),
 ('sanitizers', 0.017006926621973215),
 ('wash', 0.015824725859867966),
 ('covid', 0.01474888221349214)]

The predicted topics for the first 10 documents.

In [16]:
topic_model.topics_[:10]

[0, 7, 8, -1, -1, 3, -1, 20, 25, 5]

## Visualization

### Visualize Terms

We can visualize the top keywords using a bar chart. top_n_topics=12 means that we will create bar charts for the top 12 topics. The length of the bar represents the score of the keyword. A longer bar means higher importance for the topic.

In [17]:
topic_model.visualize_barchart(top_n_topics=12)

### Visualize Probabilities

To visualize the distributions to understand how confident BERTopic is that certain topics can be found in a document.



In [18]:
topic_model.visualize_distribution(probs[300], min_probability=0.015)

In [19]:
topic_model.visualize_distribution(probs[550], min_probability=0.015)

In [25]:
topic_model.visualize_distribution(probs[120], min_probability=0.015)

### Visualize Topics

 Further, systematically optimizing hyperparameters for the BERTopic model using techniques like grid search or Bayesian optimization could lead to more accurate and meaningful topic extraction.

In [26]:
topic_model.visualize_topics()

### Visualize Topic Hierarchy

Another way to see how the topics are connected is through a hierarchical clustering graph. We can control the number of topics in the graph by the top_n_topics parameter.

In this example, the top 10 topics are included in the hierarchical graph. We can see that the sound quality topic is closely connected to the headset topic, and both of them are connected to the earpiece comfortable topic.

In [29]:
topic_model.visualize_hierarchy(top_n_topics=50)

In [28]:
topic_model.visualize_hierarchy(top_n_topics=10)

### Visualize Topic Similarity

Heatmap can also be used to analyze the similarities between topics. The similarity score ranges from 0 to 1. A value close to 1 represents a higher similarity between the two topics, which is represented by darker blue color.

In [30]:
# Visualize similarity using heatmap
topic_model.visualize_heatmap()

### Visualize Term Score Decline

In [31]:
topic_model.visualize_term_rank()

In [35]:
topic_model.get_topic(44)

[('wuhan', 0.2814177967620223),
 ('competition', 0.2376126444099938),
 ('italy', 0.2364896841872313),
 ('giveawayalert', 0.23182055587587927),
 ('puzzle', 0.22555191547713505),
 ('contest', 0.2153405922733052),
 ('join', 0.1960510863476027),
 ('alert', 0.1791586062333017),
 ('de', 0.15036794365142336),
 ('sanitizer', 0.10091826187515873)]

### Visualize Documents and Topics

In [32]:
topic_model.visualize_documents(docs)

## Model Evaluation

In [36]:
# import library from gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(subset_data['OriginalTweet'])

# Extract vectorizer from BERTopic
vectorizer = topic_model.vectorizer_model

# Extract features for Topic Coherence evaluation
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topics = topic_model.get_topics()
topics.pop(-1, None)
topic_words = [
[word for word, _ in topic_model.get_topic(topic) if word != ""] for topic in topics
]
topic_words = [[words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics))-1)][:10]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                          texts=tokens,
                          corpus=corpus,
                          dictionary=dictionary,
                          coherence='c_v')
coherence = coherence_model.get_coherence()

print(coherence)

0.5284572090330437


In [45]:
# Get coherence scores for each topic
coherence_per_topic = coherence_model.get_coherence_per_topic()

# Sort the coherence scores in descending order and get the top 10
top_10_coherence_scores = sorted(coherence_per_topic, reverse=True)[:10]

# Print the top 10 coherence scores
top_10_coherence_scores


[0.6207772473733348,
 0.5042494775185182,
 0.43415975333292706,
 0.3906915459655057,
 0.3824476434107296,
 0.36416132000180207,
 0.3037959017517434,
 0.30164236124453264,
 0.25634567159039634,
 0.20009882685007813]

## **Topic Representation**


### Topic Reduction
We can reduce the number of topics after having trained a BERTopic model. The advantage of doing so, is that you can decide the number of topics after knowing how many are actually created. It is difficult to predict before training your model how many topics that are in your documents and how many will be extracted. Instead, we can decide afterwards how many topics seems realistic:

In [50]:
new_topics = topic_model.reduce_topics(docs, nr_topics=25)

2024-06-02 14:11:14,263 - BERTopic - Topic reduction - Reducing number of topics
2024-06-02 14:11:14,563 - BERTopic - Topic reduction - Reduced number of topics from 9 to 5


In [51]:
## Evaluate new coherence score

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(subset_data['OriginalTweet'])

# Extract vectorizer from BERTopic
vectorizer = topic_model.vectorizer_model

# Extract features for Topic Coherence evaluation
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topics = topic_model.get_topics()
topics.pop(-1, None)
topic_words = [
[word for word, _ in topic_model.get_topic(topic) if word != ""] for topic in topics
]
topic_words = [[words for words, _ in topic_model.get_topic(topic)]
        for topic in range(len(set(topics))-1)][:10]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                          texts=tokens,
                          corpus=corpus,
                          dictionary=dictionary,
                          coherence='c_v')
coherence = coherence_model.get_coherence()

print(coherence)

0.3720706539789818


In [39]:
freq = topic_model.get_topic_info();
freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2175,-1_covid_coronavirus_supermarket_store,"[covid, coronavirus, supermarket, store, groce...",[often going store grocery shopping due pandem...
1,0,459,0_sanitizer_hand_mask_coronavirus,"[sanitizer, hand, mask, coronavirus, glove, fa...",[critical condition wash hand frequently alcoh...
2,1,319,1_grocery_store_coronavirus_worker,"[grocery, store, coronavirus, worker, covid, s...",[covid coronavirus grocery store worker get ha...
3,2,270,2_food_demand_bank_stock,"[food, demand, bank, stock, covid, supply, hel...",[local food bank need donation see link list p...
4,3,244,3_oil_price_gas_market,"[oil, price, gas, market, global, low, russia,...",[u president trump say saudi arabia russia cut...
5,4,238,4_toiletpaper_toilet_paper_coronavirus,"[toiletpaper, toilet, paper, coronavirus, toil...","[part toilet paper, remember toilet paper shor..."
6,5,221,5_consumer_covid_behavior_crisis,"[consumer, covid, behavior, crisis, impact, ch...",[global survey consumer sentiment coronavirus ...
7,6,179,6_supermarket_socialdistancing_elderly_distancing,"[supermarket, socialdistancing, elderly, dista...",[left grocery store packed like sardine social...
8,7,152,7_price_covid_vegetable_taking,"[price, covid, vegetable, taking, advantage, c...",[going waiting taking advantage drop low price...
9,8,90,8_online_shopping_covid_order,"[online, shopping, covid, order, onlineshoppin...","[use online shopping help affected covid, covi..."


In [37]:
topic_model.visualize_barchart(top_n_topics=25)

### Update Topics

After having created the topic model, we can fine-tune the model. We can use the function update_topics to update the topic representation with new parameters for c-TF-IDF:

In [52]:
topic_model.update_topics(docs, n_gram_range=(1, 2))

In [53]:
topic_model.get_topic(1)

[('price', 0.10232973419536223),
 ('oil', 0.09928422879555455),
 ('oil price', 0.062154226446929056),
 ('coronavirus', 0.03768837356328525),
 ('covid', 0.03372900477485498),
 ('market', 0.0315124864526701),
 ('gas', 0.025246498979223864),
 ('gas price', 0.020760639827085536),
 ('global', 0.019375586036144355),
 ('pandemic', 0.019319508920134933)]

## **Search Topics**

After having trained our model, we can use find_topics to search for topics that are similar to an input search_term. Here, we are going to be searching for topics that closely relate the search term "vehicle". Then, we extract the most similar topic and check the results:

In [38]:
similar_topics, similarity = topic_model.find_topics("mask", top_n=10); similar_topics

[1, 14, 0, 15, 8, -1, 5, 9, 11, 2]

In [39]:
topic_model.get_topic(5)

[('worker', 0.06454033362487122),
 ('grocery', 0.0343734907843987),
 ('employee', 0.033165810600613355),
 ('store', 0.032475263143900826),
 ('grocery store', 0.03142042111474632),
 ('work', 0.02385171603365789),
 ('supermarket', 0.022609223363575383),
 ('staff', 0.021967190795536728),
 ('covid', 0.021041140156526347),
 ('thank', 0.019067199503358293)]

## **Model serialization**

The model and its internal settings can easily be saved. The documents and embeddings will not be saved. However, UMAP and HDBSCAN will be saved.

In [45]:
# Save model
topic_model.save("covid_model")



In [46]:
# Load model
my_model = BERTopic.load("covid_model")