In [1]:
import os
import re
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import numpy as np

from top2vec import Top2Vec

##############
# parameters #
##############

# Repository root. When run as a .py file this is the directory one level above
# the folder containing the file. In a notebook `__file__` is undefined, so fall
# back to the parent of the current working directory (this assumes the kernel
# was started from a first-level subfolder of the repo).
try:
    REPO_ROOT = Path(__file__).parent.parent.resolve()
except NameError:
    REPO_ROOT = Path(os.getcwd()).parent.resolve()

# Input data is addressed relative to the repo root instead of a user-specific
# absolute path, so the notebook runs on any checkout of the repository.
DATA_PATH = REPO_ROOT / "data" / "public_comments.json"

TEXT_COL = "comment_text"
DOC_ID_COL = "comment_id"
DOCKET_TO_USE = "TTB-2025-0003"   # change as needed (or set to None to use full df)

# Everything the pipeline writes (CSV summaries) goes under <repo>/outputs.
OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

TOPIC_SUMMARY_CSV = OUTPUTS_DIR / "top2vec_topic_summary.csv"
OUTPUT_DF_CSV = OUTPUTS_DIR / "comments_with_top2vec.csv"

# top2vec training
EMBEDDING_MODEL = "doc2vec"  # changed from the original all-MiniLM-L6-v2 for richer embeddings
SPEED = "deep-learn"         # options are: 'fast-learn', 'learn', 'deep-learn'
WORKERS = os.cpu_count() or 1  # cpu_count() may return None; fall back to a single worker

# output params
TOP_WORDS_PER_TOPIC = 10
SAMPLE_DOCS_PER_TOPIC = 5
TOP_N_FOR_LABEL = 5    # how many top words to use to create a label

In [2]:
# Where main() saves the trained model. Built from the current user's home
# directory rather than a hard-coded C:\Users\<name> path so the notebook is not
# tied to one machine (on a default Windows profile this is the same location).
MODEL_SAVE_FILE = Path.home() / "Documents" / "top2vec_model_classic.pkl"

In [3]:
def load_data(path: Path) -> pd.DataFrame:
    """Load the comments JSON, collapse repeated mass comments, drop rows without text.

    Rows whose ``comment_title`` starts with "Mass Comment <N>" are deduplicated,
    keeping the first occurrence. When a ``docket_id`` column is present the
    dedup key is (docket_id, N): this function runs on the whole file *before*
    any docket filter, so keying on N alone would let "Mass Comment 3" from one
    docket delete "Mass Comment 3" from another.

    Args:
        path: Location of a JSON file holding an array of comment records.

    Returns:
        A freshly indexed DataFrame with duplicates and null ``TEXT_COL`` rows removed.

    Raises:
        ValueError: If ``TEXT_COL`` is not one of the loaded columns.
    """
    df = pd.read_json(path, orient="records", lines=False)

    if TEXT_COL not in df.columns:
        raise ValueError(f"{TEXT_COL} not found in dataframe columns: {df.columns.tolist()}")

    if "comment_title" in df.columns:
        # deduplicate mass comments
        pattern = re.compile(r'^\s*Mass Comment\s*[#\(\-:\s]*\s*(\d+)', flags=re.IGNORECASE)

        def _extract_mass_num(title):
            """Return N from a 'Mass Comment N' title, or None if the title does not match."""
            if not isinstance(title, str):
                return None
            m = pattern.match(title)
            if m is None:
                return None
            try:
                return int(m.group(1))
            except ValueError:
                return None

        # Keep the dedup key in a side frame so no temporary column touches df.
        dedup_keys = pd.DataFrame({"mass_num": df["comment_title"].apply(_extract_mass_num)})
        if "docket_id" in df.columns:
            dedup_keys["docket_id"] = df["docket_id"]

        # Only rows that actually are mass comments may be dropped; ordinary
        # comments (mass_num is null) are never treated as duplicates of each other.
        is_mass = dedup_keys["mass_num"].notna()
        is_repeat = is_mass & dedup_keys.duplicated(keep="first")

        before_len = len(df)
        df = df.loc[~is_repeat].reset_index(drop=True)
        after_len = len(df)
        print(f"Dropped {before_len - after_len} duplicate 'Mass Comment N' rows (kept first of each N).")

    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print(f"Loaded {len(df)} comments from {path}")
    return df


def filter_by_docket(df: pd.DataFrame, docket: str | None) -> pd.DataFrame:
    if docket is None:
        return df
    if "docket_id" not in df.columns:
        raise ValueError("docket_id column not in dataframe")
    df_sub = df[df["docket_id"] == docket].reset_index(drop=True)
    print(f"Filtered to docket '{docket}': {len(df_sub)} comments")
    return df_sub


def train_top2vec(
    documents: List[str],
    document_ids: List[str],
    embedding_model: str,
    speed: str,
    workers: int
) -> Top2Vec:
    """Fit a Top2Vec model on the given corpus and return it.

    Args:
        documents: Raw document texts.
        document_ids: One identifier per document, in the same order as ``documents``.
        embedding_model: Forwarded to ``Top2Vec(embedding_model=...)``.
        speed: Forwarded to ``Top2Vec(speed=...)``.
        workers: Forwarded to ``Top2Vec(workers=...)``.

    Returns:
        The trained ``Top2Vec`` instance.
    """
    # Gather the tunable settings once so the log line and the constructor agree.
    settings = {
        "embedding_model": embedding_model,
        "speed": speed,
        "workers": workers,
    }
    print("Training Top2Vec:")
    print(f"  embedding_model={embedding_model}, speed={speed}, workers={workers}")
    trained = Top2Vec(documents=documents, document_ids=document_ids, **settings)
    print("Training complete.")
    return trained


def map_dominant_and_topN(model: Top2Vec, df: pd.DataFrame, doc_id_col: str, text_col: str, topN: int = 3) -> pd.DataFrame:
    """
    Attach each document's Top2Vec topic assignments to ``df``.

    ``df`` is modified in place and also returned. Adds columns:
      - top2vec_dominant_topic (int, -1 if missing)
      - top2vec_top_topics (list of ints; at most topN long on the primary path,
        exactly topN long, padded with -1, on the fallback path)
      - top2vec_topic_rank_<k> (int) for each rank k that is available

    Strategy: first ask the model for every document's top topics in one call
    (``get_documents_topics``). If that raises for any reason, fall back to
    walking the topics with ``search_documents_by_topic`` and recording only a
    dominant topic per document.

    Args:
        model: Trained Top2Vec model.
        df: Frame whose ``doc_id_col`` values are the ids the model was trained with.
        doc_id_col: Name of the id column in ``df``.
        text_col: Unused; kept so existing callers do not break.
        topN: Number of top topics to request per document.

    Raises:
        RuntimeError: If the fallback path cannot obtain the model's topic list.
    """
    df_ids_str = df[doc_id_col].astype(str).tolist()

    # Work out which Python type the model stores ids as, so lookups match.
    model_doc_type = getattr(model, "doc_id_type", None)
    if model_doc_type is None:
        try:
            model_doc_type = type(model.document_ids[0])
        except Exception:
            model_doc_type = str

    def to_model_type(x):
        """Best-effort cast of an id to the model's id type; leave it as-is on failure."""
        try:
            return model_doc_type(x)
        except Exception:
            return x

    coerced_ids = [to_model_type(x) for x in df_ids_str]
    try:
        topn_res = model.get_documents_topics(coerced_ids, num_topics=topN)
        # The first element of the result tuple holds the topic numbers.
        arr = np.array(topn_res[0]) if isinstance(topn_res, (list, tuple)) and len(topn_res) >= 1 else np.array(topn_res)
        # Top2Vec appears to return a flat (n_docs,) array when num_topics == 1
        # (verify against the installed version). Normalise to (n_docs, 1) so the
        # column indexing below works instead of needlessly hitting the fallback.
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        # arr is now shape (n_docs, k) with k <= topN
        df["top2vec_top_topics"] = list(arr.tolist())
        df["top2vec_dominant_topic"] = arr[:, 0].astype(int)
        for i in range(min(topN, arr.shape[1])):
            df[f"top2vec_topic_rank_{i+1}"] = arr[:, i].astype(int)
        return df
    except Exception as e:
        print("get_documents_topics failed; falling back to search_documents_by_topic iterative mapping.")
        print("Exception:", repr(e))

    # ---- fallback: one search per topic, dominant topic only ----
    docid_to_topic = {}
    try:
        topic_sizes, topic_nums = model.get_topic_sizes()
    except Exception:
        try:
            n_topics = model.get_num_topics()
            topic_nums = list(range(n_topics))
            # Sizes unknown: zeros make the loop below request a single doc per topic.
            topic_sizes = [0] * n_topics
        except Exception as exc:
            raise RuntimeError("Cannot obtain topic list from model for fallback mapping.") from exc

    for size, tnum in zip(topic_sizes, topic_nums):
        num_to_get = min(max(1, int(size)), len(df_ids_str))
        try:
            _, _, doc_ids_for_topic = model.search_documents_by_topic(topic_num=tnum, num_docs=num_to_get)
        except Exception:
            # if that fails, skip this topic
            continue
        for did in doc_ids_for_topic:
            docid_to_topic[str(did)] = int(tnum)

    # Map dominant topic (-1 for documents no topic search returned)
    df["top2vec_dominant_topic"] = df[doc_id_col].astype(str).map(docid_to_topic).fillna(-1).astype(int)

    # Top N: only the dominant topic is known here, pad the rest with -1
    df["top2vec_top_topics"] = df["top2vec_dominant_topic"].apply(lambda x: [int(x)] + [-1] * (topN - 1))
    for i in range(topN):
        df[f"top2vec_topic_rank_{i+1}"] = df["top2vec_top_topics"].apply(lambda l: int(l[i]) if i < len(l) else -1)

    return df


def build_topic_label_map(model: Top2Vec, top_k_words: int = 5) -> Dict[int, str]:
    """Build a ``{topic number: label}`` mapping for every topic in the model.

    Each label is the topic's first ``top_k_words`` words, as returned by
    ``model.get_topics``, joined with ", ".
    """
    all_words, _scores, all_nums = model.get_topics(model.get_num_topics())
    return {
        int(topic_id): ", ".join(vocab[:top_k_words])
        for topic_id, vocab in zip(all_nums, all_words)
    }


def build_topic_summary_df(model: Top2Vec, df: pd.DataFrame, text_col: str, top_words: int = 10, sample_docs: int = 5, doc_id_col: str | None = None) -> pd.DataFrame:
    """Summarise every topic: size, top words and a few sample comments.

    Args:
        model: Trained Top2Vec model.
        df: Frame holding the original comments; used to recover the original
            text for each sampled document id.
        text_col: Column of ``df`` containing the comment text.
        top_words: Number of leading topic words to list per topic.
        sample_docs: Maximum number of sample documents to show per topic.
        doc_id_col: Column of ``df`` holding the ids the model was trained with.
            Defaults to the notebook-level ``DOC_ID_COL`` setting.

    Returns:
        One row per topic with columns ``topic_num``, ``size``, ``top_words`` and
        ``sample_comments``, sorted by ``size`` descending. Each sample is cut to
        400 characters, newlines become spaces, and samples are joined by " ||| ".
    """
    id_col = DOC_ID_COL if doc_id_col is None else doc_id_col

    topic_sizes, topic_nums = model.get_topic_sizes()
    # to gather words: request all topics
    n_topics = len(topic_nums)
    topic_words_all, _, topic_numbers_all = model.get_topics(n_topics)

    # Position of each topic number inside the get_topics() output. Built once
    # rather than doing a linear list.index() per topic; first occurrence wins.
    words_pos = {}
    for pos, num in enumerate(topic_numbers_all):
        words_pos.setdefault(int(num), pos)

    # id -> original text, built once instead of rescanning df for every sample.
    # setdefault keeps the first row when an id occurs more than once.
    text_by_id = {}
    for did, text in zip(df[id_col].astype(str), df[text_col]):
        text_by_id.setdefault(did, text)

    rows = []
    for i, tnum in enumerate(topic_nums):
        size = int(topic_sizes[i])
        # If the topic number is somehow absent from get_topics(), assume the
        # two listings share the same order and use position i.
        idx = words_pos.get(int(tnum), i)
        words = topic_words_all[idx][:top_words]
        # sample docs (ask for at least one even if the reported size is 0)
        num_to_get = min(sample_docs, size if size > 0 else 1)
        docs, _, doc_ids = model.search_documents_by_topic(topic_num=tnum, num_docs=num_to_get)
        sample_texts = []
        for did, doc_text in zip(doc_ids, docs):
            # prefer the original df text; fall back to the text stored in the model
            key = str(did)
            sample_texts.append(text_by_id[key] if key in text_by_id else doc_text)
        rows.append({
            "topic_num": int(tnum),
            "size": size,
            "top_words": ", ".join(words),
            # str() guards against non-string cells (e.g. numbers), which cannot be sliced
            "sample_comments": " ||| ".join([str(s)[:400].replace("\n", " ") for s in sample_texts])
        })

    # Explicit columns keep the sort below valid even when there are no topics.
    summary_df = pd.DataFrame(rows, columns=["topic_num", "size", "top_words", "sample_comments"])
    summary_df = summary_df.sort_values("size", ascending=False).reset_index(drop=True)
    return summary_df



In [4]:
def main():
    """Run the whole pipeline: load, filter, train, label, export, save.

    Reads ``DATA_PATH``, optionally restricts to ``DOCKET_TO_USE``, trains a
    Top2Vec model, writes the per-topic summary and the per-comment topic
    assignments as CSV files, and saves the model to ``MODEL_SAVE_FILE``.

    Raises:
        ValueError: If no comments remain after loading and filtering.
    """
    # load and filter
    df = load_data(DATA_PATH)
    df = filter_by_docket(df, DOCKET_TO_USE)
    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print("Docs in df:", len(df))
    if df.empty:
        # Fail with a clear message instead of an obscure error from inside training.
        raise ValueError(f"No comments left to model after filtering (docket={DOCKET_TO_USE!r}).")

    # Make sure the model destination exists *before* the expensive training
    # step, so a bad path is reported up front rather than after training.
    MODEL_SAVE_FILE.parent.mkdir(parents=True, exist_ok=True)

    # prepare docs and ids
    documents = df[TEXT_COL].astype(str).tolist()
    document_ids = df[DOC_ID_COL].astype(str).tolist()

    # train
    model = train_top2vec(
        documents=documents,
        document_ids=document_ids,
        embedding_model=EMBEDDING_MODEL,
        speed=SPEED,
        workers=WORKERS
    )

    # outputs
    n_topics = model.get_num_topics()
    print("Number of topics discovered:", n_topics)
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("Top 10 topic sizes (docs per topic):")
    for size, num in zip(topic_sizes[:10], topic_nums[:10]):
        print(f"  Topic {num}: {size} documents")

    # map topics back to df (dominant + topN)
    df = map_dominant_and_topN(model, df, DOC_ID_COL, TEXT_COL, topN=3)

    # label topics using top terms
    topic_label_map = build_topic_label_map(model, top_k_words=TOP_N_FOR_LABEL)
    df["top2vec_terms"] = df["top2vec_dominant_topic"].map(topic_label_map).fillna("Unclear")

    # topic summary table
    topic_summary = build_topic_summary_df(model, df, TEXT_COL, top_words=TOP_WORDS_PER_TOPIC, sample_docs=SAMPLE_DOCS_PER_TOPIC)
    topic_summary.to_csv(TOPIC_SUMMARY_CSV, index=False)
    print(f"Topic summary saved to {TOPIC_SUMMARY_CSV}")

    # save df with topic columns
    df.to_csv(OUTPUT_DF_CSV, index=False)
    print(f"Comments with topic columns saved to {OUTPUT_DF_CSV}")

    # save model
    model.save(str(MODEL_SAVE_FILE))
    print("Model saved to:", MODEL_SAVE_FILE)

    # check: cap the column width so the long sample_comments cell does not
    # flood the cell output; the full text is in the CSV written above.
    print("topic_summary (top rows):")
    print(topic_summary.head(10).to_string(index=False, max_colwidth=80))


# Entry point: run the full pipeline when this code is executed directly
# (as a script, or by running this notebook cell).
if __name__ == "__main__":
    main()

2025-09-05 16:59:30,069 - top2vec - INFO - Pre-processing documents for training
2025-09-05 16:59:30,155 - top2vec - INFO - Creating joint document/word embedding


Loaded 12437 comments from C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\data\public_comments.json
Filtered to docket 'TTB-2025-0003': 189 comments
Docs in df: 189
Training Top2Vec:
  embedding_model=doc2vec, speed=deep-learn, workers=12


2025-09-05 16:59:43,458 - top2vec - INFO - Creating lower dimension embedding of documents
2025-09-05 16:59:56,079 - top2vec - INFO - Finding dense areas of documents
2025-09-05 16:59:56,095 - top2vec - INFO - Finding topics


Training complete.
Number of topics discovered: 2
Top 10 topic sizes (docs per topic):
  Topic 0: 118 documents
  Topic 1: 71 documents
Topic summary saved to C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\outputs\top2vec_topic_summary.csv
Comments with topic columns saved to C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\outputs\comments_with_top2vec.csv
Model saved to: C:\Users\linna\Documents\top2vec_model_classic.pkl
topic_summary (top rows):
 topic_num  size                                                                       top_words                                                                                                                                                                                                                                                                                                                                                                                                                                           