In [2]:
import os
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple

import pandas as pd
import numpy as np

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan

In [3]:
##############
# parameters #
##############

# Input file of public comments; load_data() reads it with pd.read_json(orient="records").
# NOTE(review): absolute, user-specific path — the notebook will not run on another
# machine without editing this line.
DATA_PATH = Path(r"C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\data\public_comments.json")

TEXT_COL = "comment_text"        # column holding the comment body that gets embedded
DOC_ID_COL = "comment_id"        # identifier column name, passed through to the mapping/summary helpers
DOCKET_TO_USE = "TTB-2025-0003"  # filter_by_docket() keeps only rows whose docket_id equals this; None keeps all rows

# repo / outputs
# __file__ is not defined inside a notebook kernel, so the NameError branch is the one
# that runs there: the repo root is then taken to be the parent of the current working
# directory (i.e. the notebook is expected to be started from a sub-folder of the repo).
try:
    REPO_ROOT = Path(__file__).parent.parent.resolve()
except NameError:
    REPO_ROOT = Path(os.getcwd()).parent.resolve()

# All CSV outputs go under <repo>/outputs; the directory is created on cell execution.
OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

TOPIC_SUMMARY_CSV = OUTPUTS_DIR / "bertopic_topic_summary.csv"  # one row per topic
OUTPUT_DF_CSV = OUTPUTS_DIR / "comments_with_bertopic.csv"      # comments plus topic columns
# NOTE(review): unlike the CSVs, the model is saved to an absolute user-specific path
# outside the repo — confirm this is intentional.
MODEL_SAVE_FILE = Path(r"C:\Users\linna\Documents\bertopic_model")

In [4]:
# embedding + clustering / reduction params
EMBEDDING_MODEL = "all-mpnet-base-v2"   # SentenceTransformer model name loaded by train_bertopic()
CHUNK_DOCS = False                      # set True to split docs into overlapping word windows before training (experimental)

# UMAP (dimensionality reduction applied to the embeddings before clustering)
UMAP_N_NEIGHBORS = 15   # passed as n_neighbors
UMAP_MIN_DIST = 0.1     # passed as min_dist
UMAP_N_COMPONENTS = 5   # passed as n_components (dimensionality handed to HDBSCAN)

# HDBSCAN controls (controls cluster granularity)
HDBSCAN_MIN_CLUSTER_SIZE = 2    # smaller => more, smaller topics (tune this)
HDBSCAN_MIN_SAMPLES = 1         # passed as min_samples

# output params
TOP_WORDS_PER_TOPIC = 10    # words per topic written to the summary CSV
SAMPLE_DOCS_PER_TOPIC = 5   # example comments per topic written to the summary CSV
TOP_N_FOR_LABEL = 7         # NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed

In [5]:
def load_data(path: Path) -> pd.DataFrame:
    """
    Read the comments JSON file and return a cleaned DataFrame.

    Steps:
      1. Read ``path`` as a JSON array of records.
      2. If a ``comment_title`` column exists, collapse repeated
         "Mass Comment N" submissions, keeping the first row for each N.
         When a ``docket_id`` column is present the de-duplication is done
         per docket, so the same N in two different dockets is not treated
         as a duplicate.
      3. Drop rows whose ``TEXT_COL`` is missing and reset the index.

    Parameters
    ----------
    path : Path
        Location of the JSON file.

    Returns
    -------
    pd.DataFrame
        The loaded comments with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``TEXT_COL`` is not a column of the loaded data.
    """
    df = pd.read_json(path, orient="records", lines=False)

    if TEXT_COL not in df.columns:
        raise ValueError(f"{TEXT_COL} not found in dataframe columns: {df.columns.tolist()}")

    if "comment_title" in df.columns:
        # deduplicate mass comments
        pattern = re.compile(r'^\s*Mass Comment\s*[#\(\-:\s]*\s*(\d+)', flags=re.IGNORECASE)

        def _extract_mass_num(title):
            """Return the integer N from a 'Mass Comment N' title, or None if it does not match."""
            if not isinstance(title, str):
                return None
            m = pattern.match(title)
            if m is None:
                return None
            try:
                return int(m.group(1))
            except ValueError:
                return None

        # Keep the extracted number in a local Series instead of a temporary
        # column, so the frame itself is never polluted with helper data.
        mass_num = df["comment_title"].apply(_extract_mass_num)

        # Rows are duplicates only if they carry a mass-comment number AND that
        # number was already seen (within the same docket, when dockets are known).
        dedupe_keys = pd.DataFrame({"mass_num": mass_num})
        if "docket_id" in df.columns:
            dedupe_keys["docket_id"] = df["docket_id"]
        is_repeat = mass_num.notna() & dedupe_keys.duplicated(keep="first")

        before_len = len(df)
        df = df.loc[~is_repeat].reset_index(drop=True)
        after_len = len(df)
        print(f"Dropped {before_len - after_len} duplicate 'Mass Comment N' rows (kept first of each N).")

    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print(f"Loaded {len(df)} comments from {path}")
    return df

def filter_by_docket(df: pd.DataFrame, docket: str | None) -> pd.DataFrame:
    if docket is None:
        return df
    if "docket_id" not in df.columns:
        raise ValueError("docket_id column not in dataframe")
    df_sub = df[df["docket_id"] == docket].reset_index(drop=True)
    print(f"Filtered to docket '{docket}': {len(df_sub)} comments")
    return df_sub

def chunk_documents(documents: List[str], chunk_size: int = 200, overlap: int = 50) -> Tuple[List[str], List[int]]:
    """
    Split documents into overlapping word windows.

    Each document is tokenised with ``str.split()`` (whitespace). Documents with
    at most ``chunk_size`` tokens become a single chunk; longer ones are cut
    into windows of ``chunk_size`` tokens, each window starting ``overlap``
    tokens before the previous one ended.

    Parameters
    ----------
    documents : list of str
        Input texts (non-strings are converted with ``str``).
    chunk_size : int
        Maximum number of whitespace tokens per chunk; must be positive.
    overlap : int
        Number of tokens shared between consecutive chunks; must satisfy
        ``0 <= overlap < chunk_size``.

    Returns
    -------
    (chunks, origin_ids)
        ``chunks[k]`` is the text of chunk k and ``origin_ids[k]`` is the
        integer position in ``documents`` of the document it came from.

    Raises
    ------
    ValueError
        If ``chunk_size`` / ``overlap`` would make the window fail to advance.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        # With overlap >= chunk_size the next window would start at or before
        # the current one, so the loop below would never terminate.
        raise ValueError(f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}")

    step = chunk_size - overlap
    chunks: List[str] = []
    origin_ids: List[int] = []
    for i, doc in enumerate(documents):
        toks = str(doc).split()
        if len(toks) <= chunk_size:
            chunks.append(" ".join(toks))
            origin_ids.append(i)
            continue
        start = 0
        while True:
            end = min(len(toks), start + chunk_size)
            chunks.append(" ".join(toks[start:end]))
            origin_ids.append(i)
            if end == len(toks):
                break
            start += step
    return chunks, origin_ids

def train_bertopic(documents: List[str], embedding_model_name: str, verbose: bool = True) -> Tuple[BERTopic, np.ndarray]:
    """
    Train BERTopic with a SentenceTransformer embedding model and custom UMAP/HDBSCAN.

    The documents are embedded once up front (L2-normalised) and those
    embeddings are handed to ``fit_transform`` so BERTopic does not re-embed.
    The same SentenceTransformer instance is also registered on the BERTopic
    model, so the returned (and later saved) model embeds unseen documents
    with the encoder it was trained on instead of having no embedding model.

    Parameters
    ----------
    documents : list of str
        Texts to cluster.
    embedding_model_name : str
        SentenceTransformer model name or path.
    verbose : bool
        Print progress messages, show the encoding progress bar, and enable
        BERTopic's own logging.

    Returns
    -------
    (model, embeddings)
        The fitted BERTopic model and the document embeddings
        (one row per entry of ``documents``).
    """
    if verbose:
        print("Loading embedding model:", embedding_model_name)
    embedder = SentenceTransformer(embedding_model_name)

    # compute embeddings first (we can reuse them)
    if verbose:
        print("Computing embeddings for", len(documents), "documents...")
    embeddings = embedder.encode(documents, show_progress_bar=verbose, convert_to_numpy=True, normalize_embeddings=True)

    # build UMAP and HDBSCAN instances with tunable params
    # (random_state is fixed so the reduction is repeatable between runs)
    umap_model = umap.UMAP(n_neighbors=UMAP_N_NEIGHBORS, n_components=UMAP_N_COMPONENTS, min_dist=UMAP_MIN_DIST, metric="cosine", random_state=42)
    # prediction_data=True is required for HDBSCAN to assign/score points after fitting
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE, min_samples=HDBSCAN_MIN_SAMPLES, metric="euclidean", cluster_selection_method="eom", prediction_data=True)

    # instantiate BERTopic with our encoder and reducers/clusterers
    topic_model = BERTopic(embedding_model=embedder, umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, verbose=verbose)
    if verbose:
        print("Training BERTopic...")
    topics, _ = topic_model.fit_transform(documents, embeddings)
    if verbose:
        n_topics = len(set(topics) - {-1})  # -1 is the outlier bucket, not a topic
        print("BERTopic training complete. Generated", n_topics, "non-outlier topics (excludes -1).")
    return topic_model, embeddings

def map_dominant_and_topN_bertopic(model: BERTopic, documents: List[str], df: pd.DataFrame,
                                   doc_id_col: str, embeddings: np.ndarray | None = None,
                                   topN: int = 3) -> pd.DataFrame:
    """
    Adds:
      - bertopic_dominant_topic
      - bertopic_top_topics (list)
      - bertopic_topic_rank_{i}
    Pass 'embeddings' (precomputed) to avoid BERTopic trying to re-embed.
    """
    # If we have embeddings, pass them; otherwise let BERTopic try (may fail if embedding_model is None)
    if embeddings is not None:
        topics, probs = model.transform(documents, embeddings=embeddings)
    else:
        topics, probs = model.transform(documents)  # may raise if model.embedding_model is None

    df["bertopic_dominant_topic"] = topics

    # compute topN from probs if available
    if probs is not None:
        probs_arr = np.array(probs)
        if probs_arr.ndim == 2:
            top_indices = np.argsort(probs_arr, axis=1)[:, ::-1][:, :topN]
            top_lists = top_indices.tolist()
            topic_info = model.get_topic_info().reset_index(drop=True)
            topic_ids_order = topic_info["Topic"].tolist()
            idx_to_topic = {i: tid for i, tid in enumerate(topic_ids_order)}
            top_topics = [[idx_to_topic.get(i, -1) for i in lst] for lst in top_lists]
            df["bertopic_top_topics"] = top_topics
            for i in range(topN):
                df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)
        else:
            df["bertopic_top_topics"] = df["bertopic_dominant_topic"].apply(lambda x: [int(x)] + [-1]*(topN-1))
            for i in range(topN):
                df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)
    else:
        df["bertopic_top_topics"] = df["bertopic_dominant_topic"].apply(lambda x: [int(x)] + [-1]*(topN-1))
        for i in range(topN):
            df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)

    return df

def build_topic_summary_bertopic(model: BERTopic, df: pd.DataFrame, documents: List[str],
                                 doc_id_col: str, top_words: int = 10, sample_docs: int = 5) -> pd.DataFrame:
    """
    Produce one summary row per topic reported by ``model.get_topic_info()``.

    Output columns:
      - topic_num       : topic id
      - size            : number of documents in the topic
      - top_words       : up to ``top_words`` topic words, comma separated
      - sample_comments : up to ``sample_docs`` example texts joined by " ||| ",
                          newlines flattened and each cut to 400 characters

    Example texts come from ``model.get_representative_docs``; if that yields
    nothing (or fails), rows of ``df`` whose ``bertopic_dominant_topic`` equals
    the topic are used instead. Both lookups are best-effort: any error simply
    results in no examples from that source.

    ``documents`` and ``doc_id_col`` are accepted but not used.
    The result is sorted by ``size`` in descending order.
    """

    def _examples_from_model(topic_id: int) -> list:
        # get_representative_docs may return None or any iterable, or raise.
        try:
            candidates = model.get_representative_docs(topic_id)
            if candidates is None:
                return []
            return list(candidates)[:sample_docs]
        except Exception:
            return []

    def _examples_from_frame(topic_id: int) -> list:
        # Fallback: texts of the rows assigned to this topic.
        try:
            in_topic = df["bertopic_dominant_topic"] == topic_id
            return df.loc[in_topic, TEXT_COL].astype(str).tolist()[:sample_docs]
        except Exception:
            return []

    def _flatten(text) -> str:
        # Single-line, length-limited rendering of one example.
        try:
            flat = str(text).replace("\n", " ")
        except Exception:
            flat = ""
        return flat[:400]

    records = []
    for _, info_row in model.get_topic_info().iterrows():
        topic_id = int(info_row["Topic"])
        doc_count = int(info_row["Count"])

        # get_topic returns (word, score) pairs; a falsy result means no words.
        scored_terms = model.get_topic(topic_id)
        terms = [term for term, _score in scored_terms][:top_words] if scored_terms else []

        examples = _examples_from_model(topic_id) or _examples_from_frame(topic_id)

        records.append({
            "topic_num": topic_id,
            "size": doc_count,
            "top_words": ", ".join(terms),
            "sample_comments": " ||| ".join([_flatten(example) for example in examples]),
        })

    summary = pd.DataFrame(records)
    return summary.sort_values("size", ascending=False).reset_index(drop=True)

In [6]:
# ------------------ main flow ------------------ #
def main_bertopic():
    """
    Run the full pipeline: load -> filter -> (optionally chunk) -> train ->
    assign topics -> summarise -> write outputs.

    When ``CHUNK_DOCS`` is enabled the working dataframe is expanded to one row
    per chunk (all original columns are repeated for each chunk of a comment)
    so that it stays aligned with the chunk-level topics. Two columns are
    added in that case:
      - chunk_origin_row : row position of the source comment in the
                           filtered, un-chunked frame
      - chunk_text       : the text of the chunk that was embedded

    Side effects: writes ``TOPIC_SUMMARY_CSV`` and ``OUTPUT_DF_CSV``, saves the
    model to ``MODEL_SAVE_FILE``, and prints progress.

    Returns
    -------
    (model, df, topic_summary)
        The fitted BERTopic model, the dataframe with topic columns, and the
        per-topic summary dataframe.
    """
    df = load_data(DATA_PATH)
    df = filter_by_docket(df, DOCKET_TO_USE)
    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print("Docs in df:", len(df))

    # optionally chunk documents to increase micro-topic detection
    if CHUNK_DOCS:
        docs = df[TEXT_COL].astype(str).tolist()
        chunks, origin_ids = chunk_documents(docs, chunk_size=120, overlap=30)
        print("Created", len(chunks), "chunks from", len(docs), "original docs.")
        train_docs = chunks
        # Topics are produced per chunk, so the frame must have one row per
        # chunk too; otherwise assigning the topic columns fails on length.
        df = df.iloc[origin_ids].reset_index(drop=True)
        df["chunk_origin_row"] = origin_ids
        df["chunk_text"] = chunks
    else:
        train_docs = df[TEXT_COL].astype(str).tolist()

    # Train BERTopic
    model, embeddings = train_bertopic(train_docs, EMBEDDING_MODEL)
    # Reuse the training embeddings so the model does not have to re-embed.
    df = map_dominant_and_topN_bertopic(model, train_docs, df, DOC_ID_COL, embeddings=embeddings, topN=3)

    # label topics and build summary
    topic_summary = build_topic_summary_bertopic(model, df, train_docs, DOC_ID_COL, top_words=TOP_WORDS_PER_TOPIC, sample_docs=SAMPLE_DOCS_PER_TOPIC)
    topic_summary.to_csv(TOPIC_SUMMARY_CSV, index=False)
    print("Topic summary saved to:", TOPIC_SUMMARY_CSV)

    # save df with topics
    df.to_csv(OUTPUT_DF_CSV, index=False)
    print("Comments with topic columns saved to:", OUTPUT_DF_CSV)

    # save model
    model.save(str(MODEL_SAVE_FILE))
    print("Model saved to:", MODEL_SAVE_FILE)

    print("Top rows of topic_summary:")
    print(topic_summary.head(10).to_string(index=False))

    return model, df, topic_summary

In [7]:
# Run the whole pipeline; keeps the fitted model, the comments with topic columns,
# and the per-topic summary available in the notebook namespace for inspection.
model, df_out, topic_summary = main_bertopic()

Loaded 12437 comments from C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\data\public_comments.json
Filtered to docket 'TTB-2025-0003': 189 comments
Docs in df: 189
Loading embedding model: all-mpnet-base-v2
Computing embeddings for 189 documents...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2025-09-08 10:42:21,370 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Training BERTopic...


2025-09-08 10:42:32,962 - BERTopic - Dimensionality - Completed ✓
2025-09-08 10:42:32,962 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-08 10:42:33,059 - BERTopic - Cluster - Completed ✓
2025-09-08 10:42:33,068 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-08 10:42:33,209 - BERTopic - Representation - Completed ✓
2025-09-08 10:42:33,454 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-09-08 10:42:33,461 - BERTopic - Dimensionality - Completed ✓
2025-09-08 10:42:33,463 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-09-08 10:42:33,478 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-09-08 10:42:33,552 - BERTopic - Probabilities - Completed ✓
2025-09-08 10:42:33,553 - BERTopic - Cluster - Completed ✓


BERTopic training complete. Generated 49 non-outlier topics (excludes -1).
Topic summary saved to: C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\outputs\bertopic_topic_summary.csv
Comments with topic columns saved to: C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\outputs\comments_with_bertopic.csv
Model saved to: C:\Users\linna\Documents\bertopic_model
Top rows of topic_summary:
 topic_num  size                                                                             top_words                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        