In [2]:
###########
# imports #
###########

import os
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple

import pandas as pd
import numpy as np
from nltk.corpus import stopwords

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan

In [3]:
##############
# parameters #
##############

DATA_PATH = Path(r"C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\data\comments_09DEC2025.json")

TEXT_COL = "comment_text"
DOC_ID_COL = "comment_id"
DOCKET_TO_USE = "TTB-2025-0002"

# repo / outputs
try:
    REPO_ROOT = Path(__file__).parent.parent.resolve()
except NameError:
    REPO_ROOT = Path(os.getcwd()).parent.resolve()

OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

TOPIC_SUMMARY_CSV = OUTPUTS_DIR / "bertopic_topic_summary.csv"
OUTPUT_DF_CSV = OUTPUTS_DIR / "comments_with_bertopic.csv"
MODEL_SAVE_FILE = Path(r"C:\Users\linna\Documents\bertopic_model")

In [4]:
#############
# functions #
#############

def load_data(path: Path) -> pd.DataFrame:
    df = pd.read_json(path, orient="records", lines=False)

    if TEXT_COL not in df.columns:
        raise ValueError(f"{TEXT_COL} not found in dataframe columns: {df.columns.tolist()}")

    if "comment_title" in df.columns:
        # deduplicate mass comments, compute docket-level counts
        pattern = re.compile(r'^\s*Mass Comment\s*[#\(\-:\s]*\s*(\d+)', flags=re.IGNORECASE)

        def _extract_mass_num(title):
            if not isinstance(title, str):
                return None
            m = pattern.match(title)
            if m:
                try:
                    return int(m.group(1))
                except ValueError:
                    return None
            return None

        df["__mass_num"] = df["comment_title"].apply(_extract_mass_num)

        # mask for mass rows
        mask_mass = df["__mass_num"].notna()

        # create mass_count column
        df["mass_count"] = 0

        if mask_mass.any():
            # if docket column exists -- counts per (__mass_num, docket_id)
            if "docket_id" in df.columns:
                # (mass_num, docket_id) -> count
                counts = df.loc[mask_mass].groupby(["__mass_num", "docket_id"]).size()
                # assign mass_count
                def _lookup_count(row):
                    key = (row["__mass_num"], row["docket_id"])
                    return int(counts.get(key, 0))
                df.loc[mask_mass, "mass_count"] = df.loc[mask_mass].apply(_lookup_count, axis=1)
            else:
                # counts per mass_num across whole df if docket unspecified 
                counts = df.loc[mask_mass].groupby("__mass_num").size().to_dict()
                df.loc[mask_mass, "mass_count"] = df.loc[mask_mass, "__mass_num"].map(lambda x: int(counts.get(x, 0)))

        # keep first occurrence for each mass_num
        before_len = len(df)
        duplicated_mask = df.loc[mask_mass, "__mass_num"].duplicated(keep="first")
        dup_index = df.loc[mask_mass].index[duplicated_mask]
        if len(dup_index) > 0:
            df = df.drop(index=dup_index).reset_index(drop=True)
        else:
            df = df.reset_index(drop=True)
        after_len = len(df)
        print(f"Dropped {before_len - after_len} duplicate 'Mass Comment N' rows (kept first of each N).")

        # drop helper column
        df = df.drop(columns="__mass_num")

        # explode & access attachment text:
        a = df.explode('comment_text_sources')
        b = a['comment_text_sources'].apply(pd.Series)
        df = pd.concat([a.drop(columns='comment_text_sources'), b], axis=1)

        # handle cases where attachments have identical text -- i.e. a pdf/docx submission of the same comment
        df['text_clean'] = df['text'].apply(lambda x: str(x).lower().strip())
        cluster = pd.DataFrame(df['text_clean'].value_counts())
        df = df.merge(cluster, how='left', on='text_clean')
        df = df.drop_duplicates(subset=['comment_tracking_nbr', 'comment_title', 'text_clean', 'count'])
        df = df.drop(columns=['text_clean', 'count'])

    return df

def filter_by_docket(df: pd.DataFrame, docket: str | None) -> pd.DataFrame:
    if docket is None:
        return df
    if "docket_id" not in df.columns:
        raise ValueError("docket_id column not in dataframe")
    df_sub = df[df["docket_id"] == docket].reset_index(drop=True)
    print(f"Filtered to docket '{docket}': {len(df_sub)} comments")
    return df_sub

def train_bertopic(documents: List[str], embedding_model_name: str, verbose: bool = True) -> Tuple[BERTopic, np.ndarray]:
    """
    Returns (model, embeddings)
    """
    if verbose:
        print("Loading embedding model:", embedding_model_name)
    embedder = SentenceTransformer(embedding_model_name)

    # compute embeddings (NN)
    if verbose:
        print("Computing embeddings for", len(documents), "documents...")
    embeddings = embedder.encode(documents, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

    # UMAP (dimension reduction of embeddings) and HDBSCAN instances (clustering algo used on embedding representation of comments)
    umap_model = umap.UMAP(n_neighbors=UMAP_N_NEIGHBORS, n_components=UMAP_N_COMPONENTS, min_dist=UMAP_MIN_DIST, metric="cosine", random_state=42)
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE, min_samples=HDBSCAN_MIN_SAMPLES, metric="euclidean", cluster_selection_method="eom", prediction_data=True)

    # instantiate BERTopic with our reducers/clusters
    topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, verbose=verbose)
    if verbose:
        print("Training BERTopic...")
    topics, probs = topic_model.fit_transform(documents, embeddings)
    if verbose:
        print("BERTopic training complete. Generated", len(set(topics)) - (1 if -1 in topics else 0), "non-outlier topics (excludes -1).")
    return topic_model, embeddings

def map_dominant_and_topN_bertopic(model: BERTopic, documents: List[str], df: pd.DataFrame,
                                   doc_id_col: str, embeddings: np.ndarray | None = None,
                                   topN: int = 3) -> pd.DataFrame:
    """
    Adds:
      - bertopic_dominant_topic
      - bertopic_top_topics (list of best-represented topics in comment)
      - bertopic_topic_rank_{i}
    Pass 'embeddings' (precomputed) to avoid BERTopic trying to re-embed.
    """
    # failsafe if (for some reason) we did not already compute/pass embeddings:
    if embeddings is not None:
        topics, probs = model.transform(documents, embeddings=embeddings)
    else:
        topics, probs = model.transform(documents)

    df["bertopic_dominant_topic"] = topics

    # compute topN from probs if available
    if probs is not None:
        probs_arr = np.array(probs)
        if probs_arr.ndim == 2:
            top_indices = np.argsort(probs_arr, axis=1)[:, ::-1][:, :topN]
            top_lists = top_indices.tolist()
            topic_info = model.get_topic_info().reset_index(drop=True)
            topic_ids_order = topic_info["Topic"].tolist()
            idx_to_topic = {i: tid for i, tid in enumerate(topic_ids_order)}
            top_topics = [[idx_to_topic.get(i, -1) for i in lst] for lst in top_lists]
            df["bertopic_top_topics"] = top_topics
            for i in range(topN):
                df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)
        else:
            df["bertopic_top_topics"] = df["bertopic_dominant_topic"].apply(lambda x: [int(x)] + [-1]*(topN-1))
            for i in range(topN):
                df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)
    else:
        df["bertopic_top_topics"] = df["bertopic_dominant_topic"].apply(lambda x: [int(x)] + [-1]*(topN-1))
        for i in range(topN):
            df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: l[i] if i < len(l) else -1)

    return df

def build_topic_summary_bertopic(model: BERTopic, df: pd.DataFrame, documents: List[str],
                                 doc_id_col: str, top_words: int = 10, sample_docs: int = 5) -> pd.DataFrame:
    """
    Build a topic summary df for output with columns: topic_num, size, top_words, sample_comments
    """
    info = model.get_topic_info()  # df with 'topic', 'count' and 'name'
    rows = []
    for _, row in info.iterrows():
        tnum = int(row["Topic"])
        size = int(row["Count"])

        # get top words for topic (BERTopic returns list of (word, score))
        topic_words = model.get_topic(tnum)
        if topic_words:
            words = [w for w, s in topic_words][:top_words]
        else:
            words = []

        # get representative comments
        rep_docs_list = []
        try:
            rep = model.get_representative_docs(tnum)
            # handle multiple output configurations
            if rep is None:
                rep_docs_list = []
            elif isinstance(rep, (list, tuple)):
                rep_docs_list = list(rep)[:sample_docs]
            else:
                try:
                    rep_docs_list = list(rep)[:sample_docs]
                except Exception:
                    rep_docs_list = []
        except Exception:
            rep_docs_list = []

        # if no representative docs from model grab df rows for that topic instead
        if not rep_docs_list:
            try:
                mask = df["bertopic_dominant_topic"] == tnum
                rep_docs_list = df.loc[mask, TEXT_COL].astype(str).tolist()[:sample_docs]
            except Exception:
                rep_docs_list = []

        # format sample comments
        sample_texts = []
        for s in rep_docs_list:
            try:
                s_str = str(s).replace("\n", " ")
            except Exception:
                s_str = ""
            sample_texts.append(s_str[:400])

        rows.append({
            "topic_num": tnum,
            "size": size,
            "top_words": ", ".join(words),
            "sample_comments": " ||| ".join(sample_texts)
        })

    summary_df = pd.DataFrame(rows).sort_values("size", ascending=False).reset_index(drop=True)
    return summary_df

def chunk_documents(documents: List[str], chunk_size: int = 80, overlap: int = 20):
    chunks = []
    origin_ids = []
    for i, doc in enumerate(documents):
        toks = str(doc).split()
        if len(toks) <= chunk_size:
            chunks.append(" ".join(toks))
            origin_ids.append(i)
        else:
            start = 0
            while start < len(toks):
                end = min(len(toks), start + chunk_size)
                chunks.append(" ".join(toks[start:end]))
                origin_ids.append(i)
                if end == len(toks):
                    break
                start = end - overlap
    return chunks, origin_ids

def train_bertopic_on_chunks(documents: List[str], embedding_model_name: str,
                             chunk_size=80, overlap=20,
                             umap_n_neighbors=10, umap_min_dist=0.0, umap_n_components=5, cluster_selection_epsilon=0.2,
                             hdb_min_cluster_size=3, hdb_min_samples=1,
                             verbose=True):
    # chunk
    chunks, origin_ids = chunk_documents(documents, chunk_size=chunk_size, overlap=overlap)
    print(f"Created {len(chunks)} chunks from {len(documents)} documents (ratio {len(chunks)/len(documents):.2f}).")

    # embed
    embedder = SentenceTransformer(embedding_model_name)
    embeddings = embedder.encode(chunks, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

    # build UMAP/HDBSCAN
    umap_model = umap.UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components,
                           min_dist=umap_min_dist, metric="cosine", random_state=42)
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=hdb_min_cluster_size, cluster_selection_epsilon=HDBSCAN_CLUSTER_SELECTION_EPSILON,
                                    min_samples=hdb_min_samples,
                                    metric="euclidean", cluster_selection_method="eom",
                                    prediction_data=True)
    topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model,
                           calculate_probabilities=True, verbose=verbose)

    # fit
    topics, probs = topic_model.fit_transform(chunks, embeddings)
    print("BERTopic (chunks) training complete. Non-outlier topics:",
          len(set(topics)) - (1 if -1 in topics else 0))
    return topic_model, chunks, origin_ids, embeddings, topics, probs

def aggregate_chunk_topics_to_docs(chunks, origin_ids, chunk_topics, chunk_probs, df, doc_id_col, topN=3):
    """
    Aggregates chunk-level topics back to doc-level.
    - Majority vote on chunk topics per document for dominant topic.
    - For topN, uses chunk counts per topic (you can extend to use probabilities).
    """
    tmp = _pd.DataFrame({
        "origin_idx": origin_ids,
        "chunk_topic": chunk_topics
    })
    # count topic frequency per original doc index
    counts = tmp.groupby(["origin_idx", "chunk_topic"]).size().rename("cnt").reset_index()
    # find dominant topic per origin_idx
    dominant = counts.sort_values(["origin_idx", "cnt"], ascending=[True, False]).groupby("origin_idx").first().reset_index()
    dominant_map = dict(zip(dominant["origin_idx"], dominant["chunk_topic"]))

    # build topN lists per origin_idx
    topn_map = {}
    for oid, group in counts.groupby("origin_idx"):
        top_topics = group.sort_values("cnt", ascending=False).head(topN)["chunk_topic"].astype(int).tolist()
        # pad if needed
        while len(top_topics) < topN:
            top_topics.append(-1)
        topn_map[oid] = top_topics

    # attach to df by original index
    # original index in df corresponds to 0..len(df)-1 as created in main flow
    df = df.copy()
    df["bertopic_dominant_topic"] = df.index.map(lambda i: int(dominant_map.get(i, -1)))
    df["bertopic_top_topics"] = df.index.map(lambda i: topn_map.get(i, [-1]*topN))
    # expand top rank columns
    for i in range(topN):
        df[f"bertopic_topic_rank_{i+1}"] = df["bertopic_top_topics"].apply(lambda l: int(l[i]) if i < len(l) else -1)
    return df

In [5]:
##############
# parameters #
##############

CHUNK_DOCS = True              
CHUNK_SIZE = 80                # tokens per chunk (words)
CHUNK_OVERLAP = 20             # overlapping tokens between chunks
HDBSCAN_MIN_CLUSTER_SIZE = 3   # 3-5 usually serves well, start tuning here. Also adjust UMAP_MIN_DISTANCE & N_COMPONENTS
HDBSCAN_MIN_SAMPLES = 1
HDBSCAN_CLUSTER_SELECTION_EPSILON = 0.2 # for data with meaningful large and small clusters 
UMAP_N_NEIGHBORS = 10
UMAP_MIN_DIST = 0.07 # min dist to maintain bt points when reducing embeddings
UMAP_N_COMPONENTS = 20 # n-dimensions
EMBEDDING_MODEL = "all-mpnet-base-v2" # pretrained NN

TOP_WORDS_PER_TOPIC = 7
SAMPLE_DOCS_PER_TOPIC = 5

In [6]:
# load data, deduplicate attachments and mass comments
# df = load_data(DATA_PATH)
# df = filter_by_docket(df, DOCKET_TO_USE)
# df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
# docs = df[TEXT_COL].astype(str).tolist()

In [7]:
###################################################
# implement topic modeling pipeline with chunking #
###################################################

# load, deduplicate (attachments and mass comments)
df = load_data(DATA_PATH)
df = filter_by_docket(df, DOCKET_TO_USE)
df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
docs = df[TEXT_COL].astype(str).tolist()

# embeddings, dimension reduction, clustering
topic_model, chunks, origin_ids, embeddings, topics, probs = train_bertopic_on_chunks(
    docs,
    EMBEDDING_MODEL,
    chunk_size=CHUNK_SIZE,
    overlap=CHUNK_OVERLAP,
    umap_n_neighbors=UMAP_N_NEIGHBORS,
    umap_min_dist=UMAP_MIN_DIST,
    umap_n_components=UMAP_N_COMPONENTS,
    hdb_min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE,
    hdb_min_samples=HDBSCAN_MIN_SAMPLES,
    cluster_selection_epsilon=HDBSCAN_CLUSTER_SELECTION_EPSILON,
    verbose=True
)

Dropped 8545 duplicate 'Mass Comment N' rows (kept first of each N).
Filtered to docket 'TTB-2025-0002': 262 comments
Created 627 chunks from 262 documents (ratio 2.39).


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2025-12-17 15:34:18,409 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-17 15:35:07,404 - BERTopic - Dimensionality - Completed ✓
2025-12-17 15:35:07,411 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-17 15:35:09,121 - BERTopic - Cluster - Completed ✓
2025-12-17 15:35:09,139 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-17 15:35:10,035 - BERTopic - Representation - Completed ✓


BERTopic (chunks) training complete. Non-outlier topics: 87


In [8]:
# # always an option to consolidate -- usually somewhere between 20-30 works well
# topic_model.reduce_topics(chunks, nr_topics=15)

In [9]:
# aggregate chunk-level topics back to original df rows
df_with_topics = aggregate_chunk_topics_to_docs(chunks, origin_ids, topics, probs, df, DOC_ID_COL, topN=3)

# build topic summary (prev step ensures example comments, not chunks)
topic_summary = build_topic_summary_bertopic(topic_model, df_with_topics, chunks, DOC_ID_COL, top_words=TOP_WORDS_PER_TOPIC, sample_docs=SAMPLE_DOCS_PER_TOPIC)

SAVE_NAME = OUTPUTS_DIR / f"bertopic_topic_summary_{DOCKET_TO_USE}.csv"
OUTPUT_NAME = OUTPUTS_DIR / f"comments_with_bertopic_{DOCKET_TO_USE}.csv"

# save outputs
topic_summary.to_csv(SAVE_NAME, index=False)
df_with_topics.to_csv(OUTPUT_NAME, index=False)
# topic_model.save(str(MODEL_SAVE_FILE))
print("Saved outputs. Topics:", len(topic_model.get_topic_info()) - (1 if -1 in topic_model.get_topic_info()["Topic"].tolist() else 0))

NameError: name '_pd' is not defined

In [None]:
# # example implementation without chunking:

# df = load_data(DATA_PATH)
# df = filter_by_docket(df, DOCKET_TO_USE)
# df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
# docs = df[TEXT_COL].astype(str).tolist()

# topic_model, embeddings = train_bertopic(
#     docs,
#     EMBEDDING_MODEL,
#     verbose=True
# )

# df = map_dominant_and_topN_bertopic(model=topic_model, documents=docs, df=df, doc_id_col=DOC_ID_COL, embeddings=embeddings)

# df_sum = build_topic_summary_bertopic(topic_model, df, docs, DOC_ID_COL)