In [1]:
import pandas as pd
from pathlib import Path
import os
import math
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm.auto import tqdm
import re

In [2]:
docket_id = "TTB-2025-0003"

In [3]:
# repo / outputs / model
try:
    REPO_ROOT = Path(__file__).parent.parent.resolve()
except NameError:
    REPO_ROOT = Path(os.getcwd()).parent.resolve()

OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

COMMENTS_CSV = OUTPUTS_DIR / f"comments_with_bertopic_{docket_id}.csv"
TOPIC_SUMMARY_CSV = OUTPUTS_DIR / f"bertopic_topic_summary_{docket_id}.csv"

comments_df = pd.read_csv(COMMENTS_CSV)
topic_summary = pd.read_csv(TOPIC_SUMMARY_CSV)

# print("Comments:", len(comments_df))
# print("Topic summary rows:", len(topic_summary))
# comments_df.head(2)

In [4]:
print(COMMENTS_CSV)

C:\Users\linna\OneDrive\Documents\Python_Dev\topic-modeling\outputs\comments_with_bertopic_TTB-2025-0003.csv


In [5]:
##########
# params #
##########

MODEL_NAME = "SamLowe/roberta-base-go_emotions"  #huggingface.co for more
BATCH_SIZE = 16
DEVICE = 0 if torch.cuda.is_available() else -1  # use GPU if available
MAX_LENGTH = 512   
CHUNK_LONG_TEXTS = True   # set False to simply truncate long comments
CHUNK_SIZE_WORDS = 300    # approximate chunk size in words (must be <= MAX_LENGTH tokens)
CHUNK_OVERLAP_WORDS = 50  

In [6]:
def word_chunk_text(text: str, chunk_size: int = CHUNK_SIZE_WORDS, overlap: int = CHUNK_OVERLAP_WORDS):
    toks = text.split()
    if len(toks) <= chunk_size:
        return [text]
    chunks = []
    start = 0
    L = len(toks)
    while start < L:
        end = min(L, start + chunk_size)
        chunks.append(" ".join(toks[start:end]))
        if end == L:
            break
        start = end - overlap
    return chunks

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum(axis=-1, keepdims=True)

def classify_texts(texts: List[str], batch_size:int = BATCH_SIZE, chunk_long: bool = CHUNK_LONG_TEXTS):
    """
    Returns list of (top_label, top_score, full_scores_dict) per original text.
    If chunk_long==True, splits long texts into chunks and aggregates by max score across chunks.
    """
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        # For each text, create list of chunks (1 element if not chunking)
        doc_chunks = [word_chunk_text(t) if (chunk_long and len(t.split()) > CHUNK_SIZE_WORDS) else [t] for t in batch]
        # flatten chunks for batching
        flat_chunks = [c for sub in doc_chunks for c in sub]
        # tokenize with truncation and padding
        enc = tokenizer(flat_chunks, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
        enc = {k: v.to("cuda" if DEVICE != -1 else "cpu") for k, v in enc.items()}
        with torch.no_grad():
            out = model(**enc)
            logits = out.logits.detach().cpu().numpy()  # shape (n_chunks, n_labels)

        # split logits back into per-doc lists
        idx = 0
        for chunks in doc_chunks:
            n = len(chunks)
            if n == 1:
                log = logits[idx]
                probs = softmax(log)
                best_idx = int(np.argmax(probs))
                label = id2label[best_idx] if id2label is not None else str(best_idx)
                results.append((label, float(probs[best_idx]), {id2label[j] if id2label else str(j): float(probs[j]) for j in range(len(probs))}))
            else:
                # aggregate across chunks (take max probability per label)
                chunk_logits = logits[idx: idx + n]  # shape (n, n_labels)
                chunk_probs = softmax(chunk_logits)  # shape (n, n_labels)
                # aggregate by max across chunks (keeps strong signals)
                agg = chunk_probs.max(axis=0)  # shape (n_labels,)
                best_idx = int(np.argmax(agg))
                label = id2label[best_idx] if id2label is not None else str(best_idx)
                results.append((label, float(agg[best_idx]), {id2label[j] if id2label else str(j): float(agg[j]) for j in range(len(agg))}))
            idx += n
    return results

In [7]:
# load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to("cuda" if DEVICE != -1 else "cpu")
id2label = {int(k): v for k, v in model.config.id2label.items()} if hasattr(model.config, "id2label") else None

In [8]:
texts = comments_df["comment_text"].astype(str).tolist()
print("Classifying", len(texts), "comments (device)", "cuda" if DEVICE!=-1 else "cpu")
out = classify_texts(texts, batch_size=BATCH_SIZE, chunk_long=CHUNK_LONG_TEXTS)

Classifying 214 comments (device) cpu


In [9]:
# attach to dataframe
top_labels, top_scores, full_scores = zip(*out)
comments_df["top_emotion"] = top_labels
comments_df["top_emotion_score"] = top_scores
comments_df["top_emotion_scores_full"] = full_scores

In [10]:
print("Done. Sample:")
print(comments_df[["comment_id","top_emotion","top_emotion_score"]].head(5).to_string(index=False))

Done. Sample:
        comment_id top_emotion  top_emotion_score
TTB-2025-0003-2168   gratitude           0.987184
TTB-2025-0003-2630   gratitude           0.955423
TTB-2025-0003-2632     neutral           0.997809
TTB-2025-0003-1124   gratitude           0.986016
TTB-2025-0003-0161     neutral           0.600973


In [11]:
comments_df.to_csv(COMMENTS_CSV, index=False)