In [2]:
#Install all libraries for BERTopic
!pip install bertopic
!pip install umap-learn hdbscan sentence-transformers



In [3]:
from google.colab import files
uploaded = files.upload()

Saving cleaned_data.csv to cleaned_data (1).csv


In [4]:
import pandas as pd
df = pd.read_csv('cleaned_data.csv')

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Custom vectorizer
vectorizer_model = CountVectorizer(min_df=1) # Will change min_df to 9 in real dataset

# BERTopic model with everything
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [6]:
docs = df['text_lemmatized'].tolist()
# Filter out any non-string values from the docs list
docs = [str(doc) for doc in docs if isinstance(doc, str)]
topics, probs = topic_model.fit_transform(docs)

2025-07-29 02:13:16,020 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-29 02:13:16,490 - BERTopic - Embedding - Completed ✓
2025-07-29 02:13:16,492 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-29 02:13:26,164 - BERTopic - Dimensionality - Completed ✓
2025-07-29 02:13:26,165 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-29 02:13:26,190 - BERTopic - Cluster - Completed ✓
2025-07-29 02:13:26,195 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-29 02:13:26,213 - BERTopic - Representation - Completed ✓


In [7]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,478,0_tariff_trump_trade_import,"[tariff, trump, trade, import, country, market...",[trump reciprocal tariff indias stock market d...
1,1,20,1_gt_island_penguin_mcdonald,"[gt, island, penguin, mcdonald, population, tr...",[morning news gt trump tariff handy list gt wa...


In [8]:
topic_model.get_topic(0)  # Select the most frequent topic

[('tariff', 0.1531815558789403),
 ('trump', 0.0840055948589083),
 ('trade', 0.04535287826708483),
 ('import', 0.040277807389787236),
 ('country', 0.03952599680088081),
 ('market', 0.03952599680088081),
 ('good', 0.03724139255693881),
 ('india', 0.036311151293618546),
 ('reciprocal', 0.030861523772805717),
 ('president', 0.026776587168252274)]

In [26]:
# topic_model.visualize_topics() *too small dataset for distance map

In [9]:
topic_model.visualize_barchart(top_n_topics=2)

In [34]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [10]:
from gensim.models import CoherenceModel
from gensim import corpora

# Tokenize
tokenized_text = [doc.split() for doc in docs]

# Extract topic words
topic_words = [ [word for word, _ in topic_model.get_topic(i)] for i in range(len(set(topics)) - 1) ]

# Dictionary and corpus for gensim
id2word = corpora.Dictionary(tokenized_text)
corpus = [id2word.doc2bow(text) for text in tokenized_text]

# Coherence model
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_text,
    dictionary=id2word,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"BERTopic c_v coherence score: {coherence_score:.4f}")

BERTopic c_v coherence score: 0.4559
