# BERTopic Model on Far-Right Telegram Data

This notebook runs BERTopic on messages from a filtered subset of the channels in the dataset. It reads the chunked CSV files (`processed_part_*.csv`) from `data/`, keeps only messages from channels listed in `docs/network-channel-names.csv`, fits a topic model, and writes the results to `output/`.

## Setup

This project uses [uv](https://docs.astral.sh/uv/) to manage Python and dependencies. To run this notebook:

1. Install `uv` from https://docs.astral.sh/uv/getting-started/installation/
2. From the root of this repository, run: `uv run --with jupyter jupyter lab run-model.ipynb`

`uv` will automatically install the correct Python version and all required packages.

In [None]:
import os
# Cap parallelism at a single worker/thread for the whole run.
# LOKY_MAX_CPU_COUNT is the CPU count reported to loky (joblib's process
# backend); OMP_NUM_THREADS caps OpenMP threads in native code.
# NOTE(review): these are set before the remaining imports, presumably so that
# libraries which read them at import time pick them up -- keep this ordering.
os.environ["LOKY_MAX_CPU_COUNT"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

# Standard library, then third-party helpers used by the data-loading cells.
import re
import glob
import pandas as pd
from natsort import natsorted

In [None]:
# Load the list of channels ("clubs") whose messages should be kept.
clubnames_path = "./docs/network-channel-names.csv"
clubnames_df = pd.read_csv(clubnames_path)

# Only non-missing entries of the "clubname" column count as channel names.
clubname_column = clubnames_df["clubname"]
clubnames_list = clubname_column[clubname_column.notna()].tolist()

n_clubnames = len(clubnames_list)
print(f">>> Loaded {n_clubnames} club names.")

In [None]:
# Discover the chunked input files. natsorted orders them numerically
# (e.g. "part_2" before "part_10") instead of lexicographically.
datasets_chunked = natsorted(glob.glob("./data/processed_part_*.csv"))
print(f">>> Found {len(datasets_chunked)} data files (sorted numerically).")
if not datasets_chunked:
    # Fail here with a clear message rather than much later, when the topic
    # model is fitted on an empty corpus.
    raise FileNotFoundError(
        "No input files match ./data/processed_part_*.csv "
        "(run the notebook from the repository root)."
    )

# Collect every channel name that occurs in the data. Only the
# "channel_name" column is read, so this pass stays cheap.
channelnames: list[str] = []
for f in datasets_chunked:
    df = pd.read_csv(f, usecols=["channel_name"])
    # Cast to str so that purely numeric names cannot break later string
    # handling (sorting here, re.escape in the filtering cell).
    channelnames.extend(df["channel_name"].dropna().astype(str).unique())

# sorted() instead of a bare set -> list conversion: set iteration order
# depends on the string hash seed, which would make the saved file (and
# everything derived from this frame) differ from run to run.
channelname_df = pd.DataFrame({"channel_name": sorted(set(channelnames))})
print(f">>> Found {len(channelname_df)} unique channel names.")

save_path = "./output/allchannelnames.csv"
# to_csv does not create missing directories; make sure ./output exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
channelname_df.to_csv(save_path, index=False, encoding="utf-8")
print(f">>> Saved channel names to {save_path}")

In [None]:
channelnames_list = channelname_df["channel_name"].tolist()

# Build one regex that strips channel names out of the message text.
# * Longest names come first so a short name cannot pre-empt a longer name
#   that contains it; the alphabetical tie-break keeps the alternation order
#   (and therefore the cleaned text) identical across runs.
# * (?<!\w) / (?!\w) are used instead of \b: for names that start and end
#   with a word character they are equivalent, but \b can never match next
#   to a name that starts or ends with punctuation or an emoji.
pattern_names = sorted(
    {str(name) for name in channelnames_list if pd.notna(name) and str(name).strip()},
    key=lambda name: (-len(name), name),
)
channelnames_pattern = (
    r'(?<!\w)(?:' + '|'.join(map(re.escape, pattern_names)) + r')(?!\w)'
)
# Compile once instead of on every chunk; None means "nothing to strip".
channelnames_regex = re.compile(channelnames_pattern) if pattern_names else None

# Channels to keep. clubnames_list has missing values removed, so rows whose
# channel_name is NaN can no longer slip through isin() via a NaN in the CSV.
allowed_channels = set(clubnames_list)

docs: list[str] = []
unique_docs: set[str] = set()
failed_files: list[str] = []

print(">>> Filtering and deduplicating documents...")
total = len(datasets_chunked)
for i, dataset in enumerate(datasets_chunked, 1):
    print(f">>>   [{i}/{total}] {os.path.basename(dataset)}")
    try:
        # sep=None lets the python engine sniff the delimiter of each file.
        # The context manager closes the file even if a chunk fails mid-way.
        with pd.read_csv(
            dataset, usecols=["channel_name", "cleaned_message"],
            sep=None, engine="python", encoding="utf-8", chunksize=10000,
        ) as df_iter:
            for chunk in df_iter:
                in_network = chunk["channel_name"].isin(allowed_channels)
                # Drop missing messages *before* using the .str accessor: a
                # chunk whose messages are all NaN has a float dtype, and
                # .str would raise and abort the rest of the file.
                messages = (
                    chunk.loc[in_network, "cleaned_message"]
                    .dropna()
                    .astype(str)
                    .drop_duplicates()
                )
                if messages.empty:
                    continue
                if channelnames_regex is not None:
                    messages = messages.str.replace(channelnames_regex, '', regex=True)
                for msg in messages:
                    # Skip texts that are blank once channel names are
                    # removed, and anything already collected (global
                    # de-duplication across chunks and files).
                    if not msg.strip() or msg in unique_docs:
                        continue
                    unique_docs.add(msg)
                    docs.append(msg)
    except Exception as e:
        # Best effort: one unreadable file must not stop the whole run, but
        # remember it so the loss of data is visible in the summary below.
        failed_files.append(dataset)
        print(f">>>   Error processing {dataset}: {e}")

if failed_files:
    print(
        f">>> WARNING: {len(failed_files)} of {total} files raised errors and are "
        f"missing or only partially included: {', '.join(failed_files)}"
    )
print(f">>> Total unique documents collected: {len(docs)}")
if not docs:
    raise ValueError(
        "No documents were collected; check the channel list and the input files."
    )

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Single source of truth for the sentence-transformers checkpoint, so the
# log line and the model that is actually loaded cannot drift apart.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

print(f">>> Loading embedding model ({EMBEDDING_MODEL_NAME})...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Embed every document once up front; the resulting array is reused both for
# fitting the topic model and for the 2-D projection.
n_docs = len(docs)
print(f">>> Encoding {n_docs} documents (this may take a while)...")
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
)
print(">>> Encoding complete.")

In [None]:
print(">>> Initializing UMAP, HDBSCAN, and CountVectorizer...")

# Dimensionality reduction to 5 components with a fixed seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Density-based clustering on the reduced vectors; prediction_data=True is
# requested and clustering is kept on a single core.
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
    core_dist_n_jobs=1,
)

# Unigrams and bigrams that occur in at least two documents.
# NOTE(review): only English stop words are removed although the embedding
# model is multilingual -- confirm this matches the corpus language.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

print(">>> Fitting BERTopic model...")
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=20,
    verbose=True,
)
# The precomputed embeddings are passed in so the documents are not encoded
# a second time.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Topic -1 is not counted as a topic of its own.
n_topics = len(set(topics) - {-1})
print(f">>> BERTopic fitting complete. Found {n_topics} topics.")

In [None]:
print(">>> Computing reduced embeddings for visualization...")
# Separate 2-D projection of the document embeddings, independent of the 5-D
# reduction used for clustering. UMAP is stochastic, so the seed is fixed
# (same value as the clustering UMAP) to make the projection reproducible
# across kernel restarts.
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2,
    min_dist=0.0, metric='cosine', random_state=42,
).fit_transform(embeddings)
print(">>> Reduced embeddings complete.")

In [None]:
# Sanity line: the three counts should agree (one topic label per document).
summary_line = (
    f">>> Docs: {len(docs)} | Topics: {len(topics)} | "
    f"Model topics: {len(topic_model.topics_)}"
)
print(summary_line)

# One row per document with its assigned topic id, written as UTF-8 CSV.
save_path_df = "./output/topic-model-results.csv"
df = pd.DataFrame({"topic": topics, "document": docs})
df.to_csv(
    save_path_df,
    index=False,
    encoding="utf-8",
)
print(f">>> Results saved to {save_path_df}")
print(">>> Done.")