<a href="https://colab.research.google.com/github/Pathita-Nonthadid/Business_Project/blob/main/BERTopic_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install all libraries for BERTopic
!pip install bertopic
!pip install umap-learn hdbscan sentence-transformers



In [3]:
# Upload cleaned_data.csv file from computer
from google.colab import files
uploaded = files.upload()

Saving cleaned_data.csv to cleaned_data (1).csv


In [4]:
# Import data into pandas
import pandas as pd
df = pd.read_csv('cleaned_data.csv')

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Set all models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Custom vectorizer
vectorizer_model = CountVectorizer(min_df=1) # I will change min_df(ignoring words that appear less than n times) in real dataset because it's error in this 500 rows test dataset.
                                             # This error is happening because the CountVectorizer inside topic_model still has min_df set to a value higher than what works for your data.
# BERTopic model with everything
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [38]:
docs = df['text_lemmatized'].tolist()
# Filter out any non-string values from the docs list
docs = [str(doc) for doc in docs if isinstance(doc, str)]

# Print the number of documents and a sample
print(f"Number of documents after filtering: {len(docs)}")
if len(docs) > 0:
    print("Sample documents:")
    for i, doc in enumerate(docs[:5]): # Print the first 5 documents
        print(f"{i+1}: {doc}")
else:
    print("No documents remaining after filtering.")

Number of documents after filtering: 498
Sample documents:
1: tariff arent selftaxation theyre leverage vietnam export b buy just b asymmetric power tariff tool correct trade imbalance protect domestic industry force fairer term econ illiteracy knowing
2: veitnam audacity tariff good f
3: thought tariff debate tariff law equality dictate able tariff world change like like bound happen country survive tariff
4: grok total labor hour build car part hr thats laborcar like path build low value labor intensive part mexico pay tariff just build rest
5: india tough prime minister just leave great friend treat right charge cent charge president announces discount reciprocal tariff india


In [39]:
topics, probs = topic_model.fit_transform(docs)

2025-07-29 03:41:35,866 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2025-07-29 03:41:36,114 - BERTopic - Embedding - Completed ✓
2025-07-29 03:41:36,115 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-29 03:41:36,972 - BERTopic - Dimensionality - Completed ✓
2025-07-29 03:41:36,973 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-29 03:41:36,999 - BERTopic - Cluster - Completed ✓
2025-07-29 03:41:37,003 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-29 03:41:37,017 - BERTopic - Representation - Completed ✓


In [40]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,478,0_tariff_trump_trade_import,"[tariff, trump, trade, import, country, market...",[trump reciprocal tariff indias stock market d...
1,1,20,1_gt_island_penguin_mcdonald,"[gt, island, penguin, mcdonald, population, tr...",[morning news gt trump tariff handy list gt wa...


In [29]:
topic_model.get_topic(0)  # Select the most frequent topic

[('tariff', 0.3027524502260191),
 ('trump', 0.19368867168503776),
 ('trade', 0.11757807145827065),
 ('import', 0.10622998417131116),
 ('reciprocal', 0.08393413992124352),
 ('president', 0.07388050193547378),
 ('tax', 0.06989962877669717),
 ('amp', 0.0671740045204722),
 ('american', 0.058611927234580474),
 ('do', 0.058611927234580474)]

In [43]:
# topic_model.visualize_topics() #**too small dataset for distance map

In [32]:
topic_model.visualize_barchart()

In [34]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [41]:
from gensim.models import CoherenceModel
from gensim import corpora

# Tokenize
tokenized_text = [doc.split() for doc in docs]

# Extract topic words from BERTopic model
topic_words = [ [word for word, _ in topic_model.get_topic(i)] for i in range(len(set(topics)) - 1) ]

# Dictionary and corpus for gensim
id2word = corpora.Dictionary(tokenized_text)
corpus = [id2word.doc2bow(text) for text in tokenized_text]

# Coherence model
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_text,
    dictionary=id2word,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"BERTopic c_v coherence score: {coherence_score:.4f}")

BERTopic c_v coherence score: 0.4559
