In [1]:
!pip install bertopic
!pip install umap-learn hdbscan sentence-transformers



In [3]:
import pandas as pd
lemmatized = pd.read_csv("lemmatized_data.csv")
lemmatized.head()

Unnamed: 0,post_id,user_id,full_text,created_At,lang,repost_count,like_count,source,text_preprocessing1,text_preprocessing_no_punctuation,text_no_stopwords,text_lemmatized
0,1907578844636393679,1810361594586738692,"This is nasty \n\nNASDAQ futures down -4.4%, S...",Wed Apr 02 23:40:12 +0000 2025,en,0,0,Twitter Web App,"This is nasty \n\nNASDAQ futures down -.%, S&a...",this is nasty \n\nnasdaq futures down sampp f...,nasty nasdaq futures sampp futures tariff news...,nasty nasdaq future sampp future tariff news h...
1,1907578044492968060,2736639061,Investors Panic as U.S. Stock Market Plummets ...,Wed Apr 02 23:37:01 +0000 2025,en,1,0,Twitter Web App,Investors Panic as U.S. Stock Market Plummets ...,investors panic as us stock market plummets ov...,investors panic stock market plummets trumps t...,investor panic stock market plummet trump tari...
2,1907575218501198232,1850383929829945344,For all the people who have absolutely no conc...,Wed Apr 02 23:25:47 +0000 2025,en,0,2,Twitter for Android,For all the people who have absolutely no conc...,for all the people who have absolutely no conc...,people absolutely concept tariff amp differs g...,people absolutely concept tariff amp differs g...
3,1907574739964510255,449290925,As someone who is feeling JVL levels of schade...,Wed Apr 02 23:23:53 +0000 2025,en,0,0,Twitter for Android,As someone who is feeling JVL levels of schade...,as someone who is feeling jvl levels of schade...,feeling jvl levels schadenfreude let just say ...,feel jvl level schadenfreude let just say happ...
4,1907570533836701721,23709151,The fuzziest math is #TrumpTariff Math.,Wed Apr 02 23:14:59 +0000 2025,en,0,0,Twitter for Android,The fuzziest math is Math.,the fuzziest math is math,fuzziest math math,fuzzy math math


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Set all models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Custom vectorizer
vectorizer_model = CountVectorizer(min_df=10) # ignoring words that appear less than 10 times
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

In [5]:
docs = lemmatized['text_lemmatized'].tolist()
# Filter out any non-string values from the docs list
docs = [str(doc) for doc in docs if isinstance(doc, str)]

# Print the number of documents and a sample
print(f"Number of documents after filtering: {len(docs)}")
if len(docs) > 0:
    print("Sample documents:")
    for i, doc in enumerate(docs[:5]): # Print the first 5 documents
        print(f"{i+1}: {doc}")
else:
    print("No documents remaining after filtering.")

Number of documents after filtering: 22379
Sample documents:
1: nasty nasdaq future sampp future tariff news hold nvda puts deep money tomorrow close bear eat good week spy
2: investor panic stock market plummet trump tariff plan
3: people absolutely concept tariff amp differs gst mark simple understand
4: feel jvl level schadenfreude let just say happy birthday
5: fuzzy math math


In [6]:
topics, probs = topic_model.fit_transform(docs)

2025-08-20 03:10:01,519 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 700/700 [00:33<00:00, 21.05it/s]
2025-08-20 03:10:34,970 - BERTopic - Embedding - Completed ✓
2025-08-20 03:10:34,970 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-20 03:10:48,248 - BERTopic - Dimensionality - Completed ✓
2025-08-20 03:10:48,250 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_

In [7]:
freq = topic_model.get_topic_info(); freq.head(11)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9379,-1_trump_tariff_china_market,"[trump, tariff, china, market, war, like, trad...",[trump tariff kill market continue lie pay imp...
1,0,1552,0_pay_country_importer_consumer,"[pay, country, importer, consumer, product, co...",[bizarre he brag tariff american consumer pay ...
2,1,652,1_china_chinese_additional_beijing,"[china, chinese, additional, beijing, trump, w...","[china humiliates trump tariff war, wowww chin..."
3,2,457,2_china_chinese_pay_import,"[china, chinese, pay, import, war, good, produ...",[consumer pay tariff chinese good amp chinese ...
4,3,380,3_canada_canadian_liberal_conservative,"[canada, canadian, liberal, conservative, elec...",[ummm canada tariff import country unless cana...
5,4,355,4_india_indian_ceasefire_offer,"[india, indian, ceasefire, offer, deal, claim,...",[clear message hindutuva rascal india superpow...
6,5,346,5_movie_outside_produce_foreign,"[movie, outside, produce, foreign, industry, a...",[trump tariff foreign movie make movie thing g...
7,6,318,6_inflation_cpi_low_rate,"[inflation, cpi, low, rate, expect, fed, feed,...",[s inflation low feed aint look current inflat...
8,7,258,7_japan_japanese_car_negotiator,"[japan, japanese, car, negotiator, auto, negot...","[charge tariff go japan, japan zerotariff poli..."
9,8,258,8_trump_man_tariff_president,"[trump, man, tariff, president, orange, plane,...","[tariff trump, tariff trump, dumo tariff trump..."


In [8]:
topic_model.get_topic(0)  # Select the most frequent topic

[('pay', 0.013334295065807302),
 ('country', 0.012449803364349234),
 ('importer', 0.010778419366984298),
 ('consumer', 0.00995299587417649),
 ('product', 0.008776666830487244),
 ('cost', 0.008672520575505622),
 ('tax', 0.008111438751791059),
 ('import', 0.007230246265084042),
 ('citizen', 0.007158418979535831),
 ('dont', 0.007140108910286662)]

In [10]:
import plotly.io as pio
from IPython.display import HTML 

fig_umap = topic_model.visualize_topics(top_n_topics=10)
HTML(fig_umap.to_html(include_plotlyjs="cdn"))

In [11]:
fig_bar = topic_model.visualize_barchart(top_n_topics=10)
HTML(fig_bar.to_html(include_plotlyjs="cdn"))

In [12]:
from gensim.models import CoherenceModel
from gensim import corpora

# Tokenize
tokenized_text = [doc.split() for doc in docs]

# Extract topic words from BERTopic model
topic_words = [ [word for word, _ in topic_model.get_topic(i)] for i in range(len(set(topics)) - 1) ]

# Dictionary and corpus for gensim
id2word = corpora.Dictionary(tokenized_text)
corpus = [id2word.doc2bow(text) for text in tokenized_text]

# Coherence model
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_text,
    dictionary=id2word,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"BERTopic c_v coherence score: {coherence_score:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

BERTopic c_v coherence score: 0.5560


In [15]:
# Get the most 10 probable words for each topic
topics = topic_model.get_topic_info()

# Print a formatted table of topics
print("\nTopic ID | Count | Keywords")
print("-" * 80)
for _, row in topics.iterrows():
    # Get first 5 keywords from representation list
    keywords = ", ".join(row["Representation"])
    print(f"{row['Topic']:8d} | {row['Count']:5d} | {keywords}")


Topic ID | Count | Keywords
--------------------------------------------------------------------------------
      -1 |  9379 | trump, tariff, china, market, war, like, trade, economy, go, just
       0 |  1552 | pay, country, importer, consumer, product, cost, tax, import, citizen, dont
       1 |   652 | china, chinese, additional, beijing, trump, war, win, deal, good, xi
       2 |   457 | china, chinese, pay, import, war, good, product, export, sell, importer
       3 |   380 | canada, canadian, liberal, conservative, election, state, st, free, hurt, government
       4 |   355 | india, indian, ceasefire, offer, deal, claim, trade, reciprocal, nation, report
       5 |   346 | movie, outside, produce, foreign, industry, announces, production, president, die, land
       6 |   318 | inflation, cpi, low, rate, expect, fed, feed, price, impact, data
       7 |   258 | japan, japanese, car, negotiator, auto, negotiation, minister, giant, talk, ton
       8 |   258 | trump, man, tariff

In [None]:
# Group by Topic and sample 5 random documents from each group
topic_info = topic_model.get_topic_info()
valid_topics = topic_info.loc[topic_info["Topic"] != -1, "Topic"].tolist()
top_docs_rows = []
if "probs" not in locals() or probs is None:
    # fallback: representative docs only (no prob scores, no index mapping)
    for t in valid_topics:
        reps = topic_model.get_representative_docs(t) or []
        for rank, doc in enumerate(reps[:5], 1):
            top_docs_rows.append({
                "topic_id": t,
                "rank": rank,
                "prob": None,
                "doc_index": None,
                "document": doc,
                "full_text": None  # cannot map back reliably here
            })
else:
    probs = np.asarray(probs)
    for t in valid_topics:
        if t < probs.shape[1]:
            top_idx = np.argsort(probs[:, t])[::-1][:5]
            for rank, i in enumerate(top_idx, 1):
                top_docs_rows.append({
                    "topic_id": t,
                    "rank": rank,
                    "prob": float(probs[i, t]),
                    "doc_index": int(i),
                    "document": docs[i],         # preprocessed
                    "full_text": lemmatized.full_text[i]    # raw/original text
                })

df_top_docs_bt = pd.DataFrame(top_docs_rows)
df_top_docs_bt.to_csv("bertopic_top5_docs_per_topic.csv", index=False)

In [13]:
import numpy as np

# Clean, aligned inputs
df = lemmatized.copy()
df["txt"] = df["text_lemmatized"].astype(str).str.strip()

# Parse X timestamps like: Tue Jun 24 22:24:33 +0000 2025
df["ts"] = pd.to_datetime(
    df["created_At"],
    format="%a %b %d %H:%M:%S %z %Y",   # match your data
    errors="coerce",
    utc=True
).dt.tz_convert(None)                    # drop timezone

# keep valid rows only
df = df[df["ts"].notna() & df["txt"].ne("")]

docs       = df["txt"].tolist()
timestamps = df["ts"].tolist()           # keep full datetime

# Get per-document TOPIC IDS that match THESE docs 
assigned_topics, _ = topic_model.transform(docs)   # ints: [2, 7, 1, ...]

# Safety checks
print("lens:", len(docs), len(assigned_topics), len(timestamps))
assert len(docs) == len(assigned_topics) == len(timestamps)
assert all(isinstance(t, (int, np.integer)) for t in assigned_topics), "topics must be ints"

# Topics over time 
tot = topic_model.topics_over_time(
    docs=docs,
    topics=assigned_topics,              
    timestamps=timestamps,
    nr_bins=20                           
)


Batches: 100%|██████████| 700/700 [00:32<00:00, 21.86it/s]
2025-08-20 03:13:23,706 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

2025-08-20 03:13:33,775 - BERTopic - Dimensionality - Completed ✓
2025-08-20 03:13:33,776 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-08-20 03:13:34,522 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-08-20 03:14:12,506 - BERTopic - Probabilities - Completed ✓
2025-08-20 03:14:12,508 - BERTopic - Cluster - Completed ✓


lens: 22392 22392 22392


20it [00:01, 18.53it/s]


In [14]:
fig_overtime = topic_model.visualize_topics_over_time(tot, top_n_topics=10)
HTML(fig_overtime.to_html(include_plotlyjs="cdn"))