# BERTopic: 
- Preprocess (normalize text, filter for "yes" label)

- Embedding (convert text to number representation)

- Top Modeling (find different cluster setups)

In [5]:
#imports / installs

import pandas as pd
import re
import glob
import os
import itertools
import hdbscan


## Step 1: Preprocessing

In [7]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove non-letter characters (keep punctuation if needed)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [8]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings


In [9]:
INPUT_PATH = "../../data/climate_classified/" #Currently uses the 14 jsons * 100,000 posts 

json_pattern = os.path.join(INPUT_PATH,'*.json')
combined_paths = glob.glob(json_pattern)

dfs = []


for path in combined_paths:
    try:
        df = pd.read_json(path)
        dfs.append(df)
    except ValueError as e:
        print(f"Failed to read {path}: {e}")


In [10]:
if dfs:
    temp = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(temp)} posts from {len(dfs)} files.")
else:
    print("No data loaded.")

Loaded 1311270 posts from 14 files.


### NOTE: Currently filters for 60+ characters and score >= 0.99 !

In [11]:
# Step 1: Filter for climate-related posts only
climate_df = temp[temp["label"] == "yes"].copy()

# Step 2: Light preprocessing
climate_df["clean_text"] = climate_df["text"].astype(str).apply(preprocess_text)

# Step 3: Filter on character length and score
climate_df = climate_df[
    
    (climate_df["clean_text"].str.len() >= 60) &
    (climate_df["score"] >= 0.99)
].copy()

print(f"Remaining posts after full filtering: {len(climate_df)}")

Remaining posts after full filtering: 11467


In [12]:
# Save filtered climate_df

output_path = "../../data/filtered/above60chars_above99score.json"
climate_df.to_json(output_path)

print(f"Filtered dataset saved to: {output_path}")

Filtered dataset saved to: ../../data/filtered/above60chars_above99score.json


In [13]:
climate_df.columns

Index(['repo', 'seq', 'text', 'timestamp', 'cid', 'uri', 'label', 'score',
       'clean_text'],
      dtype='object')

In [14]:
climate_df.head()

Unnamed: 0,repo,seq,text,timestamp,cid,uri,label,score,clean_text
25,did:plc:uli2rqyfqasvuawksu2z5jkc,7778280581,Trump's executive order trying to block state ...,2025-04-09 21:10:45.855,bafyreihbjn7mnkbiytl4wc2jjhukux7xfncg772auwhhu...,at://did:plc:uli2rqyfqasvuawksu2z5jkc/app.bsky...,yes,0.997684,trumps executive order trying to block state c...
204,did:plc:4zh2idecxr5zudhn3oniodhw,7778286641,Spain and Canada signed agreements on renewabl...,2025-04-09 21:10:53.664,bafyreibvwoj6qzbffnpz4rkgxena26ejvpfqoznkbed7n...,at://did:plc:4zh2idecxr5zudhn3oniodhw/app.bsky...,yes,0.993054,spain and canada signed agreements on renewabl...
411,did:plc:m6ntt433rso3lp7dxaja3mue,7778293323,When did you bitch about what Republicans were...,2025-04-09 21:11:03.083,bafyreic4vipeqxz6mlm36qa7uw3qqjbxwo3qtvucl2fek...,at://did:plc:m6ntt433rso3lp7dxaja3mue/app.bsky...,yes,0.996543,when did you bitch about what republicans were...
441,did:plc:cm4nhw2xk43bczonk7mbfvrb,7778294643,"Hydrogen, as you know, is useful for decarboni...",2025-04-09 21:11:05.110,bafyreidsl6kwy2dx6rtvqkhjyafcffv6cwdud7aefrubf...,at://did:plc:cm4nhw2xk43bczonk7mbfvrb/app.bsky...,yes,0.997304,hydrogen as you know is useful for decarbonisi...
448,did:plc:ci5fsjcdjgoct5k3yllky4ud,7778294887,Either we end the Fossil Fuel Era or the Fossi...,2025-04-09 21:11:05.321,bafyreifegyoen4hku664cni3qqh3v6xbnoptueyb76dqm...,at://did:plc:ci5fsjcdjgoct5k3yllky4ud/app.bsky...,yes,0.99579,either we end the fossil fuel era or the fossi...


## Step 2: Embedding Generation

pip install -U sentence-transforme

In [15]:
pip install -U sentence-transformers -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from sentence_transformers import SentenceTransformer

In [17]:
model = SentenceTransformer('all-miniLM-L6-v2')
texts_to_embed = climate_df["clean_text"].tolist()
embeddings = model.encode(texts_to_embed, show_progress_bar=True) 

Batches: 100%|██████████| 359/359 [00:57<00:00,  6.22it/s]


In [64]:
pip install bertopic -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[K     |████████████████████████████████| 150 kB 261 kB/s eta 0:00:01
[?25hCollecting plotly>=4.7.0
  Downloading plotly-6.0.1-py3-none-any.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 167 kB/s eta 0:00:01
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.40.tar.gz (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 60 kB/s eta 0:00:016
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  distutils: /private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-build-env-ljns71ow/normal/lib/python3.9/site-packages
  sysconfig: /Library/Python/3.9/site-packages[0m
  user = False
  home = None
  root = None
  prefix = '/private/var/folders/f3/z043mppd07z6npdj2ch2t1mm0000gn/T/pip-b

In [65]:
from bertopic import BERTopic

In [66]:
topic_model = BERTopic(language="english", verbose=True)

In [68]:
topics, probs = topic_model.fit_transform(texts_to_embed, embeddings)

topic_model.get_topic_info().head(15)

2025-04-19 19:25:05,472 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 19:25:10,167 - BERTopic - Dimensionality - Completed ✓
2025-04-19 19:25:10,169 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 19:25:10,672 - BERTopic - Cluster - Completed ✓
2025-04-19 19:25:10,682 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 19:25:11,104 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4920,-1_the_and_to_of,"[the, and, to, of, in, for, is, on, that, we]",[understandable i know a lot of people essenti...
1,0,326,0_coal_plants_trump_clean,"[coal, plants, trump, clean, mine, beautiful, ...",[trump tried to bring back coal in his first t...
2,1,283,1_depth_snow_apr_precip,"[depth, snow, apr, precip, low, high, iembot, ...",[chanhassen mn apr climate report high low pre...
3,2,271,2_solar_electricity_panels_power,"[solar, electricity, panels, power, generation...",[renewables met of the growth in electricity d...
4,3,269,3_change_climate_denial_they,"[change, climate, denial, they, it, you, about...",[i think a lot about misinformation and how we...
5,4,246,4_apr_missing_iembot_additional,"[apr, missing, iembot, additional, details, vi...",[atlanta apr climate report high low precip sn...
6,5,146,5_forests_trees_logging_forest,"[forests, trees, logging, forest, national, tr...",[logging doesnt prevent wildfires but trump is...
7,6,138,6_ai_use_artists_crypto,"[ai, use, artists, crypto, environmental, ener...",[that is a good thing more ai more coal and ga...
8,7,107,7_laws_order_state_states,"[laws, order, state, states, executive, trump,...",[in a sweeping executive order signed late tue...
9,8,104,8_airport_apr_missing_iembot,"[airport, apr, missing, iembot, additional, de...",[fullerton airport ca apr climate report high ...


## Tuning

To improve the clustering of topics, we can improve the model in several ways:

- Change UDBSCAN settings (min_cluster_size, min_samples, metrics = euclidean, manhattan, cosine)
- Manual merging of topics (Two related fine-grained topics could be merged into a broader, more general topic)
- Change Sentence Transformer to a different model (ie. "all-mpnet-base-v2")



In [77]:
from hdbscan import HDBSCAN

In [80]:
# Define search grid
min_cluster_sizes = [10, 30]
min_samples_vals = [5, 10]
nr_topics_vals = [None, 30] 
embedding_models = ["all-MiniLM-L6-v2","all-mpnet-base-v2"]  # You can add more if needed

# Storage
results = []

for embed_model in embedding_models:
    model = SentenceTransformer(embed_model)

    for min_cluster_size, min_samples, nr_topics in itertools.product(min_cluster_sizes, min_samples_vals, nr_topics_vals):

        # HDBSCAN config
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom'
        )

        # BERTopic model
        topic_model = BERTopic(
            hdbscan_model=hdbscan_model,
            language="english",
            calculate_probabilities=False,
            verbose=False
        )

        # Fit model
        topics, _ = topic_model.fit_transform(texts_to_embed, embeddings)

        # Reduce topics if requested
        if nr_topics:
            topic_model.reduce_topics(texts_to_embed, nr_topics=nr_topics)
            topics = topic_model.topics_

        # Evaluate
        topic_info = topic_model.get_topic_info()
        n_topics = len(topic_info[topic_info.Topic != -1])
        n_outliers = topic_info[topic_info.Topic == -1].Count.values[0] if -1 in topic_info.Topic.values else 0
        n_total = sum(topic_info.Count)

        results.append({
            "embedding_model": embed_model,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "nr_topics": nr_topics if nr_topics else "None",
            "n_topics": n_topics,
            "outliers": n_outliers,
            "outlier_pct": round(n_outliers / n_total * 100, 2),
        })

        print(f"Finished: min_cluster_size={min_cluster_size}, min_samples={min_samples}, nr_topics={nr_topics}")


Finished: min_cluster_size=10, min_samples=5, nr_topics=None
Finished: min_cluster_size=10, min_samples=5, nr_topics=30
Finished: min_cluster_size=10, min_samples=10, nr_topics=None
Finished: min_cluster_size=10, min_samples=10, nr_topics=30
Finished: min_cluster_size=30, min_samples=5, nr_topics=None
Finished: min_cluster_size=30, min_samples=5, nr_topics=30
Finished: min_cluster_size=30, min_samples=10, nr_topics=None
Finished: min_cluster_size=30, min_samples=10, nr_topics=30


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Finished: min_cluster_size=10, min_samples=5, nr_topics=None
Finished: min_cluster_size=10, min_samples=5, nr_topics=30
Finished: min_cluster_size=10, min_samples=10, nr_topics=None
Finished: min_cluster_size=10, min_samples=10, nr_topics=30
Finished: min_cluster_size=30, min_samples=5, nr_topics=None
Finished: min_cluster_size=30, min_samples=5, nr_topics=30
Finished: min_cluster_size=30, min_samples=10, nr_topics=None
Finished: min_cluster_size=30, min_samples=10, nr_topics=30


In [82]:
results

[{'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 10,
  'min_samples': 5,
  'nr_topics': 'None',
  'n_topics': 188,
  'outliers': 4389,
  'outlier_pct': 38.28},
 {'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 10,
  'min_samples': 5,
  'nr_topics': 30,
  'n_topics': 29,
  'outliers': 4634,
  'outlier_pct': 40.41},
 {'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 10,
  'min_samples': 10,
  'nr_topics': 'None',
  'n_topics': 154,
  'outliers': 5177,
  'outlier_pct': 45.15},
 {'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 10,
  'min_samples': 10,
  'nr_topics': 30,
  'n_topics': 29,
  'outliers': 4831,
  'outlier_pct': 42.13},
 {'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 30,
  'min_samples': 5,
  'nr_topics': 'None',
  'n_topics': 72,
  'outliers': 4285,
  'outlier_pct': 37.37},
 {'embedding_model': 'all-MiniLM-L6-v2',
  'min_cluster_size': 30,
  'min_samples': 5,
  'nr_topics': 30,
  'n_topics': 29,
  'outliers': 4

## CURRENT!:

- steps of 2 for hyperparameters
- uses cosine through normalization as well - in case it doesnt work it goes back to euclidean
- logs and saves results in a csv

- 

In [18]:
import os
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize
import itertools
import pandas as pd
import time

#data
texts_to_embed = climate_df["clean_text"].tolist()

# Hyperparameter ranges
embedding_models = ["all-MiniLM-L6-v2", "all-mpnet-base-v2"]
min_cluster_sizes = list(range(10, 31, 2))     # 10 to 30, step 2
min_samples_vals = list(range(5, 11, 2))       # 5, 7, 9
nr_topics_vals = [None, 10, 20]
distance_metrics = ["euclidean", "manhattan", "cosine"]

# Logging setup
log_path = "../../data/BERTopic_Hyperparameters/bertopic_grid_log.csv"
log_columns = [
    "embedding_model", "metric", "min_cluster_size", "min_samples",
    "nr_topics", "n_topics", "outliers", "outlier_pct", "time_sec"
]

# Load or initialize log
if os.path.exists(log_path):
    log_df = pd.read_csv(log_path)
else:
    log_df = pd.DataFrame(columns=log_columns)
    log_df.to_csv(log_path, index=False)

# Main loop
for embed_model in embedding_models:
    print(f"\nEmbedding model: {embed_model}")
    model = SentenceTransformer(embed_model)
    embeddings_local = model.encode(texts_to_embed, show_progress_bar=True)

    for min_cluster_size, min_samples, metric, nr_topics in itertools.product(
        min_cluster_sizes, min_samples_vals, distance_metrics, nr_topics_vals
    ):
        # Check if already done
        run_key = {
            "embedding_model": embed_model,
            "metric": metric,
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "nr_topics": nr_topics if nr_topics else "None"
        }

        existing = log_df[
            (log_df.embedding_model == run_key["embedding_model"]) &
            (log_df.metric == run_key["metric"]) &
            (log_df.min_cluster_size == run_key["min_cluster_size"]) &
            (log_df.min_samples == run_key["min_samples"]) &
            (log_df.nr_topics == run_key["nr_topics"])
        ]

        if not existing.empty:
            print(f"Skipping already completed: {run_key}")
            continue

        print(f"\nRunning: {run_key}")
        start = time.time()

        # Normalize for cosine
        if metric == "cosine":
            embeddings_used = normalize(embeddings_local, norm="l2")
            hdbscan_metric = "euclidean"
        else:
            embeddings_used = embeddings_local
            hdbscan_metric = metric

        # HDBSCAN setup
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=hdbscan_metric,
            cluster_selection_method="eom"
        )

        # BERTopic setup
        topic_model = BERTopic(
            hdbscan_model=hdbscan_model,
            language="english",
            calculate_probabilities=False,
            verbose=False
        )

        try:
            topics, _ = topic_model.fit_transform(texts_to_embed, embeddings_used)

            if nr_topics:
                topic_model.reduce_topics(texts_to_embed, nr_topics=nr_topics)
                topics = topic_model.topics_

            topic_info = topic_model.get_topic_info()
            n_topics = len(topic_info[topic_info.Topic != -1])
            n_outliers = topic_info[topic_info.Topic == -1].Count.values[0] if -1 in topic_info.Topic.values else 0
            n_total = sum(topic_info.Count)
            duration = round(time.time() - start, 2)

            log_entry = {
                **run_key,
                "n_topics": n_topics,
                "outliers": n_outliers,
                "outlier_pct": round(n_outliers / n_total * 100, 2),
                "time_sec": duration
            }

            log_df = pd.concat([log_df, pd.DataFrame([log_entry])], ignore_index=True)
            log_df.to_csv(log_path, index=False)

            print(f"Done | Topics: {n_topics}, Outliers: {n_outliers} ({log_entry['outlier_pct']}%) | Time: {duration}s")

        except Exception as e:
            print(f"Failed for config: {run_key} — {e}")



Embedding model: all-MiniLM-L6-v2


Batches: 100%|██████████| 359/359 [00:37<00:00,  9.67it/s]
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.



Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 'None'}
Done | Topics: 193, Outliers: 4803 (41.89%) | Time: 19.89s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}
Done | Topics: 9, Outliers: 4433 (38.66%) | Time: 5.32s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'euclidean', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 20}
Done | Topics: 19, Outliers: 4492 (39.17%) | Time: 5.18s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 'None'}
Done | Topics: 195, Outliers: 4679 (40.8%) | Time: 5.27s

Running: {'embedding_model': 'all-MiniLM-L6-v2', 'metric': 'manhattan', 'min_cluster_size': 10, 'min_samples': 5, 'nr_topics': 10}
Done | Topics: 9, Outliers: 4624 (40.32%) | Time: 6.48s

Running: {'embedding_model': 'all-MiniLM-L6-v2

In [None]:
Parameter | Values | Count
embedding_model | ["all-MiniLM-L6-v2", "all-mpnet-base-v2"] | 2
min_cluster_size | range(10, 31, 2) → [10, 12, ..., 30] | 11
min_samples | [5, 7, 9] | 3
metric | ["euclidean", "manhattan", "cosine"] | 3
nr_topics | [None, 10, 20] | 3