### 1) Imports

In [11]:
# Other models
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic 
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
# NLTK for pre-processing
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Gensim for results
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
# Others
import pandas as pd
import numpy as np
import re

### 2) Pre processing

#### a) Loading datasets

In [12]:
dataset = "covid19_tweets.csv"
dataset_dir = "./datasets/"
text_field = "text"
date_field = "date"
values = 4000
df = pd.read_csv(dataset_dir + dataset)
df = df.head(values)

# Initialize NLTK stemmer and stopwords
stemmer = PorterStemmer()
nltk.download("stopwords")
stop = stopwords.words("english")
nltk.download('punkt')

# Sorting by date
df[date_field] = pd.to_datetime(df[date_field], format="mixed")
df.sort_values(by=date_field)

# Settings documents and dates
documents = df[text_field].values
dates = df[date_field].values

for i in range(len(dates)) :
    dates[i] = dates[i].astype('datetime64[D]')


Columns (5,6,7,12) have mixed types. Specify dtype option on import or set low_memory=False.

[nltk_data] Downloading package stopwords to /home/mateo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mateo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def remove_emoji(text):
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF"u"\U0001F1E0-\U0001F1FF"u"\U00002500-\U00002BEF"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251""]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_url(text) :
    text = re.sub(r'https?://\S+', '', text)
    return re.sub(r'http?://\S+', '', text)

def remove_tags(text) :
    return re.sub(r'@\w+', '', text)

def remove_whitespace(text) :
    return re.sub(r"\s+", " ", text)

def pre_processing(documents:np.ndarray) :
    L = []
    for text in documents:
        if text != "" and text and text == text:
            # Removing links
            text = remove_url(text)

            # Removing tags
            text = remove_tags(text)

            # Removing emojis
            text = remove_emoji(text)

            # Removing useless whitespaces
            text = remove_whitespace(text)

            # Tokenize the sentence
            tokens = nltk.word_tokenize(text)

            # Stemming
            stemmed_tokens = [stemmer.stem(token) for token in tokens]

            # Stop word removal
            filtered_tokens = [token for token in stemmed_tokens if token.lower() not in stop]

            # Punctuation removal
            filtered_tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in filtered_tokens if token]

            # Remove empty string
            filtered_tokens = [token for token in filtered_tokens if token != '' and len(token) > 2]

            # Join tokens back into a sentence
            text = ' '.join(filtered_tokens)
            L.append(text)
        else :
            L.append("")
    return L

tweets = pre_processing(documents)

### 3) Model selections and Training


In [14]:
# Step 1 - Extract embeddings
embedding = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding)

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model =  HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True, gen_min_span_tree=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(encoding="utf-8", stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - Fine-tune topic representations
representation_model = MaximalMarginalRelevance(diversity=0.2)

# All steps together
topic_model = BERTopic(
  language="english",
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,
  calculate_probabilities=True,        
  verbose=True,
)

topics, probs = topic_model.fit_transform(tweets)
topic_model.get_topic_info()

Batches: 100%|██████████| 125/125 [01:45<00:00,  1.19it/s]
2023-05-22 19:05:03,742 - BERTopic - Transformed documents to Embeddings
2023-05-22 19:05:37,932 - BERTopic - Reduced dimensionality
2023-05-22 19:05:38,685 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,36,-1_nepal_vocabulari_covidwarrior_coronavirusva...
1,0,2434,0_india_thi_covid19_covidvaccin
2,1,1198,1_russia_russianvaccin_vladimir_coronavirusvaccin
3,2,145,2_daughter_putin_vladimir_russia
4,3,50,3_vaccine_ministri_approv_russian
5,4,34,4_mythandfact_covidwarrior_word_covid19india
6,5,31,5_vodka_invent_vodkadiaries_polonium210
7,6,26,6_technic_readi_russianvaccin_bless
8,7,25,7_school_kid_flu_children
9,8,21,8_shik___


### Save model

In [15]:
models_dir = "./models/"
model = "best_model_4000"

In [16]:
topic_model.save(models_dir + model)

### 4) Interpretation and Visualization of the topics

#### a) Bar chart of the topic word scores

In [17]:
topic_model.load(models_dir + model)
topic_model.visualize_barchart(top_n_topics=10)

#### b) Intertopic distance map

In [18]:
topic_model.visualize_topics()

#### c) Hierarchical clustering

In [19]:
from scipy.cluster import hierarchy as sch
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(tweets, linkage_function=linkage_function)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 8/8 [00:03<00:00,  2.08it/s]


In [20]:
topic_over_time = topic_model.topics_over_time(tweets, dates)
topic_model.visualize_topics_over_time(topic_over_time, topics=topics)

18it [01:14,  4.15s/it]


### 5) Model Evaluation

#### a) Coherence metric

In [21]:
topic_model.load(models_dir + model)
cv = topic_model.vectorizer_model
X = cv.fit_transform(tweets)
doc_tokens = [text.split(" ") for text in tweets]

id2word = Dictionary(doc_tokens)
texts = doc_tokens
corpus = [id2word.doc2bow(text) for text in texts]

topic_words = []
for i in range(len(topic_model.get_topic_freq())-1):
  interim = []
  interim = [t[0] for t in topic_model.get_topic(i)]
  topic_words.append(interim)

coherence_model = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(dataset, ":", coherence_score)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av