In [None]:
# Standard imports
import pandas as pd 
import numpy as np
import os
import sklearn
import scipy
from pathlib import Path

# Topic modeling with embeddings
from top2vec import Top2Vec
import joblib 

# Guided topic modeling
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as ss

# Sentiment analysis
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Interactive visuals
import streamlit as st
import plotly.express as px
import umap

In [None]:
# Path to the JSON file (one row per comment; each row a dict/object).
# The location can be overridden with the PUBLIC_COMMENTS_JSON environment
# variable so the notebook is not tied to a single machine; the original
# absolute path is kept as the default.
DATA_PATH = Path(
    os.environ.get(
        "PUBLIC_COMMENTS_JSON",
        "C:\\Users\\linna\\OneDrive\\Documents\\Python_Dev\\topic-modeling\\data\\public_comments.json",
    )
)

# Fail fast with an explicit message rather than relying on whatever error
# the JSON reader produces for a missing file.
if not DATA_PATH.is_file():
    raise FileNotFoundError(f"Comment file not found: {DATA_PATH}")

df = pd.read_json(DATA_PATH, orient="records", lines=False)  # use lines=True if the file is newline-delimited

# Ensure the comment text column exists before anything depends on it
TEXT_COL = "comment_text"
if TEXT_COL not in df.columns:
    raise ValueError(f"{TEXT_COL} not found in dataframe columns: {df.columns.tolist()}")

# Drop rows without comment text and renumber the index from 0
df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)

print("Loaded", len(df), "comments")
df.head()

Loaded 12437 comments


Unnamed: 0,attachment_count,comment_text_sources,comment_id,document_id,comment_type,comment_last_modified_date,comment_highlighted_content,comment_withdrawn,comment_title,comment_object_id,...,agency_id,comment_start_date,comment_end_date,within_comment_period,open_for_comment,cfr_part,fr_doc_num,fr_vol_num,start_end_page,document_abstract
0,1,"[{'source': 'DIRECT (NON-ATTACHMENT)', 'text':...",TTB-2007-0067-0124,TTB-2007-0067-0001,Public Submission,2008-03-21 19:57:10,,False,"Comment from Espinosa, Adriana",0900006480402e6b,...,TTB,2007-11-20 05:00:00,2008-03-21 03:59:59,False,False,"27 CFR 4, 9",E7-22715,72 FR,65256 - 65261,TTB has revised its 2005 proposal to establish...
1,1,"[{'source': 'DIRECT (NON-ATTACHMENT)', 'text':...",TTB-2007-0067-0136,TTB-2007-0067-0001,Public Submission,2008-03-22 17:58:34,,False,"Comment from Kaplan, Marc",0900006480403ff7,...,TTB,2007-11-20 05:00:00,2008-03-21 03:59:59,False,False,"27 CFR 4, 9",E7-22715,72 FR,65256 - 65261,TTB has revised its 2005 proposal to establish...
2,1,"[{'source': 'DIRECT (NON-ATTACHMENT)', 'text':...",TTB-2007-0067-0159,TTB-2007-0067-0001,Public Submission,2008-03-22 20:09:30,,False,"Comment from Parry, Cameron",0900006480403ff1,...,TTB,2007-11-20 05:00:00,2008-03-21 03:59:59,False,False,"27 CFR 4, 9",E7-22715,72 FR,65256 - 65261,TTB has revised its 2005 proposal to establish...
3,1,"[{'source': 'DIRECT (NON-ATTACHMENT)', 'text':...",TTB-2007-0067-0192,TTB-2007-0067-0001,Public Submission,2008-03-27 19:22:09,,False,Transmission e-mail for Comment from Paso Robl...,0900006480401ad3,...,TTB,2007-11-20 05:00:00,2008-03-21 03:59:59,False,False,"27 CFR 4, 9",E7-22715,72 FR,65256 - 65261,TTB has revised its 2005 proposal to establish...
4,1,"[{'source': 'DIRECT (NON-ATTACHMENT)', 'text':...",TTB-2007-0067-0179,TTB-2007-0067-0001,Public Submission,2008-03-26 19:09:49,,False,"Comment from Cass Vineyards (Cass, Stephen)",09000064804099ad,...,TTB,2007-11-20 05:00:00,2008-03-21 03:59:59,False,False,"27 CFR 4, 9",E7-22715,72 FR,65256 - 65261,TTB has revised its 2005 proposal to establish...


In [3]:
# Split the comments into one frame per docket id
docket_col = df['docket_id']
pr_238 = df.loc[docket_col.eq('TTB-2025-0003')]
pr_237 = df.loc[docket_col.eq('TTB-2025-0002')]

In [4]:
# Report how many comments each docket subset contains
for docket, subset in (('TTB-2025-0003', pr_238), ('TTB-2025-0002', pr_237)):
    print(f'{docket}: {len(subset)} comments')

TTB-2025-0003: 189 comments
TTB-2025-0002: 174 comments


In [5]:
# Narrow the working frame to the TTB-2025-0003 subset only.
# NOTE: this rebinds `df`, so the full dataset is no longer reachable under
# that name; re-running the docket filter cell after this one would filter
# the subset (leaving pr_237 empty) instead of the full frame.
df = pr_238

In [6]:
# Keep only rows that have comment text, then renumber the index from 0
has_text = df[TEXT_COL].notna()
df = df.loc[has_text].reset_index(drop=True)
print("Docs in df:", len(df))

Docs in df: 189


In [7]:
# Prepare documents (list of comments as strings) and document (comment) ids
documents = df[TEXT_COL].astype(str).tolist()
# Keep comment_id as strings (Top2Vec will return these when using document_ids)
document_ids = df["comment_id"].astype(str).tolist()

# Set embedding model and training parameters
embedding_model = "all-MiniLM-L6-v2"
# NOTE(review): the Top2Vec docs describe `speed` as applying only to the
# doc2vec embedding model; it is kept so switching embedding_model still works.
speed = "deep-learn"
workers = os.cpu_count() or 1

# Seed the UMAP reduction so the dimensionality-reduction step is repeatable
# between runs. The other values mirror Top2Vec's documented defaults, so only
# the seed is new.
RANDOM_SEED = 42
UMAP_ARGS = {
    "n_neighbors": 15,
    "n_components": 5,
    "metric": "cosine",
    "random_state": RANDOM_SEED,
}

# Train model (may take a while depending on number of comments and chosen 'speed')
model = Top2Vec(
    documents=documents,
    document_ids=document_ids,
    embedding_model=embedding_model,
    speed=speed,
    workers=workers,
    umap_args=UMAP_ARGS,
)

2025-08-27 13:52:29,818 - top2vec - INFO - Pre-processing documents for training
2025-08-27 13:52:29,983 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-08-27 13:52:31,969 - top2vec - INFO - Creating joint document/word embedding
2025-08-27 13:52:41,719 - top2vec - INFO - Creating lower dimension embedding of documents
2025-08-27 13:52:58,303 - top2vec - INFO - Finding dense areas of documents
2025-08-27 13:52:58,337 - top2vec - INFO - Finding topics


In [8]:
# How many topics did the model find?
n_topics = model.get_num_topics()
print("Number of topics discovered:", n_topics)

# Sizes and topic numbers as returned by the model; show at most the first ten
topic_sizes, topic_nums = model.get_topic_sizes()
print("Top 10 topic sizes (docs per topic):")
for size, num in list(zip(topic_sizes, topic_nums))[:10]:
    print("  Topic {}: {} documents".format(num, size))

Number of topics discovered: 1
Top 10 topic sizes (docs per topic):
  Topic 0: 189 documents


In [9]:
# Print the leading words (with scores) for up to ten topics
top_n = min(10, n_topics)
topic_words, word_scores, topic_numbers = model.get_topics(top_n)
for words, scores, tnum in zip(topic_words, word_scores, topic_numbers):
    print("\nTopic", tnum)
    # words arrive already ordered; list at most the first ten with their scores
    shown = min(10, len(words), len(scores))
    for rank in range(shown):
        print("  {} ({:.3f})".format(words[rank], scores[rank]))


Topic 0
  allergens (0.559)
  allergies (0.424)
  label (0.367)
  labeling (0.342)
  alcohol (0.341)
  beverages (0.327)
  allergen (0.321)
  labels (0.313)
  ingredients (0.283)
  alcoholic (0.261)


In [10]:
# Map dominant topic back to df
# Walk the topics in the order returned by get_topic_sizes, pull the documents
# of each topic, and record the first topic seen for every document id.
docid_to_topic = {}
topic_sizes, topic_nums = model.get_topic_sizes()  # topics ordered by size

for size, tnum in zip(topic_sizes, topic_nums):
    # ask for as many documents as the topic holds
    docs, doc_scores, doc_ids = model.search_documents_by_topic(topic_num=tnum, num_docs=int(size))
    # doc_ids are the document_ids originally passed to the model
    for did in doc_ids:
        # setdefault leaves an existing entry alone, so the first assignment wins
        docid_to_topic.setdefault(did, tnum)

# Add the dominant topic as a column; ids missing from the mapping get -1
comment_keys = df["comment_id"].astype(str)
dominant_topic = comment_keys.map(docid_to_topic)
df["top2vec_dominant_topic"] = dominant_topic.fillna(-1).astype(int)

In [11]:
# Build a short text label for each topic from its leading terms
top_n = 5

# Fetch the word lists for every topic the model discovered
num_topics = model.get_num_topics()
topic_words, word_scores, topic_nums = model.get_topics(num_topics)

# topic number -> "term1, term2, ..." using the first top_n words of the topic
topic_label_map = {
    int(tnum): ", ".join(words[:top_n])
    for tnum, words in zip(topic_nums, topic_words)
}

In [12]:
# Attach the topic label to each row; topics without a label become "Unclear"
term_labels = df["top2vec_dominant_topic"].map(topic_label_map)
df["top2vec_terms"] = term_labels.fillna("Unclear")

In [None]:
# Save Top2Vec model (Top2Vec has its own save/load)
model_save_path = "C:\\Users\\linna\\OneDrive\\Documents\\Python_Dev\\topic-modeling\\models\\top2vec_model"
# Create the target folder first so the save does not fail on a missing directory
Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)
model.save(model_save_path)
print("Model saved to:", model_save_path)

Model saved to: top2vec_model
