In [1]:
import os
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import numpy as np

from top2vec import Top2Vec

# parameters
DATA_PATH = Path("C:\\Users\\linna\\OneDrive\\Documents\\Python_Dev\\topic-modeling\\data\\public_comments.json")
TEXT_COL = "comment_text"
DOC_ID_COL = "comment_id"
DOCKET_TO_USE = "TTB-2025-0003"   # change as needed (or set to None to use full df)
MODEL_SAVE_DIR = Path("top2vec_model_classic")
TOPIC_SUMMARY_CSV = Path("top2vec_topic_summary.csv")
OUTPUT_DF_CSV = Path("comments_with_top2vec.csv")

# top2vec training options
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
SPEED = "learn"        # 'fast-learn', 'learn', 'deep-learn' — choose based on time/quality tradeoff
WORKERS = os.cpu_count() or 1

# output params
TOP_WORDS_PER_TOPIC = 10
SAMPLE_DOCS_PER_TOPIC = 5
TOP_N_FOR_LABEL = 5    # how many top words to use to create a label

In [2]:
print(MODEL_SAVE_DIR)

top2vec_model_classic


In [3]:
MODEL_SAVE_FILE = Path(r"C:\Users\linna\Documents\top2vec_model_classic.pkl") 

In [4]:
# translate notebook to functions
def load_data(path: Path) -> pd.DataFrame:
    df = pd.read_json(path, orient="records", lines=False)
    if TEXT_COL not in df.columns:
        raise ValueError(f"{TEXT_COL} not found in dataframe columns: {df.columns.tolist()}")
    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print(f"Loaded {len(df)} comments from {path}")
    return df


def filter_by_docket(df: pd.DataFrame, docket: str | None) -> pd.DataFrame:
    if docket is None:
        return df
    if "docket_id" not in df.columns:
        raise ValueError("docket_id column not in dataframe")
    df_sub = df[df["docket_id"] == docket].reset_index(drop=True)
    print(f"Filtered to docket '{docket}': {len(df_sub)} comments")
    return df_sub


def train_top2vec(
    documents: List[str],
    document_ids: List[str],
    embedding_model: str,
    speed: str,
    workers: int
) -> Top2Vec:
    print("Training Top2Vec:")
    print(f"  embedding_model={embedding_model}, speed={speed}, workers={workers}")
    model = Top2Vec(
        documents=documents,
        document_ids=document_ids,
        embedding_model=embedding_model,
        speed=speed,
        workers=workers
    )
    print("Training complete.")
    return model


def map_dominant_and_topN(model: Top2Vec, df: pd.DataFrame, doc_id_col: str, text_col: str, topN: int = 3) -> pd.DataFrame:
    """
    Adds columns:
      - top2vec_dominant_topic (int, -1 if missing)
      - top2vec_top_topics (list of ints, length topN)
    """
    doc_ids = df[doc_id_col].astype(str).tolist()
    # dominant topic
    try:
        dominant_arr = model.get_documents_topics(doc_ids, num_topics=1)  # shape (n_docs, 1)
        df["top2vec_dominant_topic"] = dominant_arr[:, 0].astype(int)
    except Exception:
        # fallback to iterative approach if API/version mismatch
        print("get_documents_topics failed; falling back to search_documents_by_topic iterative mapping.")
        docid_to_topic = {}
        topic_sizes, topic_nums = model.get_topic_sizes()
        for size, tnum in zip(topic_sizes, topic_nums):
            docs, doc_scores, doc_ids_for_topic = model.search_documents_by_topic(topic_num=tnum, num_docs=int(size))
            for did in doc_ids_for_topic:
                if did not in docid_to_topic:
                    docid_to_topic[did] = tnum
        df["top2vec_dominant_topic"] = df[doc_id_col].astype(str).map(docid_to_topic).fillna(-1).astype(int)

    # top-N topics per doc
    try:
        topN_arr = model.get_documents_topics(doc_ids, num_topics=topN)  # shape (n_docs, topN)
        df["top2vec_top_topics"] = list(topN_arr.tolist())
        for i in range(topN):
            df[f"top2vec_topic_rank_{i+1}"] = topN_arr[:, i].astype(int)
    except Exception:
        print("get_documents_topics (topN) not available; filling top2vec_top_topics with dominant only.")
        df["top2vec_top_topics"] = df["top2vec_dominant_topic"].apply(lambda x: [int(x)])

    return df


def build_topic_label_map(model: Top2Vec, top_k_words: int = 5) -> Dict[int, str]:
    n_topics = model.get_num_topics()
    topic_words, word_scores, topic_nums = model.get_topics(n_topics)
    label_map = {}
    for tnum, words in zip(topic_nums, topic_words):
        words_sel = words[:top_k_words]
        label_map[int(tnum)] = ", ".join(words_sel)
    return label_map


def build_topic_summary_df(model: Top2Vec, df: pd.DataFrame, text_col: str, top_words: int = 10, sample_docs: int = 5) -> pd.DataFrame:
    topic_sizes, topic_nums = model.get_topic_sizes()
    # to gather words: request all topics
    n_topics = len(topic_nums)
    topic_words_all, _, topic_numbers_all = model.get_topics(n_topics)

    rows = []
    for i, tnum in enumerate(topic_nums):
        size = int(topic_sizes[i])
        # find words corresponding to tnum (topic_numbers_all may be same order as topic_nums)
        idx = list(topic_numbers_all).index(tnum) if tnum in topic_numbers_all else i
        words = topic_words_all[idx][:top_words]
        # sample docs
        num_to_get = min(sample_docs, size if size > 0 else 1)
        docs, doc_scores, doc_ids = model.search_documents_by_topic(topic_num=tnum, num_docs=num_to_get)
        sample_texts = []
        for did, doc_text in zip(doc_ids, docs):
            # try to map back to original df text if present
            mask = df["comment_id"].astype(str) == str(did)
            if mask.any():
                sample_texts.append(df.loc[mask, text_col].iloc[0])
            else:
                sample_texts.append(doc_text)
        rows.append({
            "topic_num": int(tnum),
            "size": size,
            "top_words": ", ".join(words),
            "sample_comments": " ||| ".join([s[:400].replace("\n", " ") for s in sample_texts])
        })
    summary_df = pd.DataFrame(rows).sort_values("size", ascending=False).reset_index(drop=True)
    return summary_df



In [5]:
def main():
    # load and filter
    df = load_data(DATA_PATH)
    df = filter_by_docket(df, DOCKET_TO_USE)
    df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
    print("Docs in df:", len(df))

    # prepare docs and ids
    documents = df[TEXT_COL].astype(str).tolist()
    document_ids = df[DOC_ID_COL].astype(str).tolist()

    # train
    model = train_top2vec(
        documents=documents,
        document_ids=document_ids,
        embedding_model=EMBEDDING_MODEL,
        speed=SPEED,
        workers=WORKERS
    )

    # outputs
    n_topics = model.get_num_topics()
    print("Number of topics discovered:", n_topics)
    topic_sizes, topic_nums = model.get_topic_sizes()
    print("Top 10 topic sizes (docs per topic):")
    for size, num in zip(topic_sizes[:10], topic_nums[:10]):
        print(f"  Topic {num}: {size} documents")

    # map topics back to df (dominant + topN)
    df = map_dominant_and_topN(model, df, DOC_ID_COL, TEXT_COL, topN=3)

    # label topics using top terms
    topic_label_map = build_topic_label_map(model, top_k_words=TOP_N_FOR_LABEL)
    df["top2vec_terms"] = df["top2vec_dominant_topic"].map(topic_label_map).fillna("Unclear")

    # topic summary table
    topic_summary = build_topic_summary_df(model, df, TEXT_COL, top_words=TOP_WORDS_PER_TOPIC, sample_docs=SAMPLE_DOCS_PER_TOPIC)
    topic_summary.to_csv(TOPIC_SUMMARY_CSV, index=False)
    print(f"Topic summary saved to {TOPIC_SUMMARY_CSV}")

    # save df with topic columns
    df.to_csv(OUTPUT_DF_CSV, index=False)
    print(f"Comments with topic columns saved to {OUTPUT_DF_CSV}")

    # save model
    model.save(str(MODEL_SAVE_FILE))
    print("Model saved to:", MODEL_SAVE_FILE)
    # temporary until repo is finalized 
    # MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    # model.save(str(MODEL_SAVE_DIR))
    # print("Model saved to:", MODEL_SAVE_DIR)

    # check
    print("topic_summary (top rows):")
    print(topic_summary.head(10).to_string(index=False))


if __name__ == "__main__":
    main()

2025-08-27 15:44:47,913 - top2vec - INFO - Pre-processing documents for training


Loaded 12437 comments from C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\data\public_comments.json
Filtered to docket 'TTB-2025-0003': 189 comments
Docs in df: 189
Training Top2Vec:
  embedding_model=all-MiniLM-L6-v2, speed=learn, workers=12


2025-08-27 15:44:48,229 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-08-27 15:44:49,541 - top2vec - INFO - Creating joint document/word embedding
2025-08-27 15:45:02,616 - top2vec - INFO - Creating lower dimension embedding of documents
2025-08-27 15:45:42,179 - top2vec - INFO - Finding dense areas of documents
2025-08-27 15:45:42,205 - top2vec - INFO - Finding topics


Training complete.
Number of topics discovered: 2
Top 10 topic sizes (docs per topic):
  Topic 0: 118 documents
  Topic 1: 71 documents
get_documents_topics failed; falling back to search_documents_by_topic iterative mapping.
get_documents_topics (topN) not available; filling top2vec_top_topics with dominant only.
Topic summary saved to top2vec_topic_summary.csv
Comments with topic columns saved to comments_with_top2vec.csv
Model saved to: C:\Users\linna\Documents\top2vec_model_classic.pkl
topic_summary (top rows):
 topic_num  size                                                                                           top_words                                                                                                                                                                                                                                                                                                                                                                           