In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
import html
from bs4 import BeautifulSoup
import ftfy
import pandas as pd

In [2]:
# Docket identifiers whose per-docket BERTopic outputs get combined below.
# Order matters: it is used to pair each summary file with its docket ID.
dockets = [
    "TTB-2025-0003",
    "TTB-2025-0002",
]

In [3]:
# Pick the directory that sits one level below the repository root:
# the folder containing this file when run as a script, or the current
# working directory when `__file__` is undefined (as the NameError
# fallback anticipates, e.g. in an interactive session).
try:
    _anchor = Path(__file__).parent
except NameError:
    _anchor = Path(os.getcwd())

REPO_ROOT = _anchor.parent.resolve()

# All inputs and outputs of this notebook live under <repo>/outputs.
OUTPUTS_DIR = REPO_ROOT / "outputs"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# One comment-level CSV and one topic-summary CSV are expected per docket;
# both lists follow the order of `dockets`.
comments_paths = [
    OUTPUTS_DIR / f"comments_with_bertopic_{docket_id}.csv"
    for docket_id in dockets
]
summary_paths = [
    OUTPUTS_DIR / f"bertopic_topic_summary_{docket_id}.csv"
    for docket_id in dockets
]

In [5]:
# Sanity check: show the resolved comment-CSV paths before trying to read them.
print(comments_paths)

[WindowsPath('C:/Users/linna/OneDrive/Documents/Python_Dev/topic-modeling/outputs/comments_with_bertopic_TTB-2025-0003.csv'), WindowsPath('C:/Users/linna/OneDrive/Documents/Python_Dev/topic-modeling/outputs/comments_with_bertopic_TTB-2025-0002.csv')]


In [6]:
# Read every per-docket comments CSV that is present on disk.
# A missing file is reported and skipped rather than aborting the run.
comments_dfs = []
for comments_csv in comments_paths:
    if comments_csv.exists():
        comments_dfs.append(pd.read_csv(comments_csv))
    else:
        print(f"Warning: missing file {comments_csv}; skipping.")

In [7]:
# NOTE(review): this cell is an exact repeat of the previous loader cell. It
# resets `comments_dfs` and reads the same CSV files a second time, so the
# result is unchanged; one of the two cells can be removed.
comments_dfs = []

# Load each comments CSV that exists; report and skip any that are missing.
for p in comments_paths:
    if not p.exists():
        print(f"Warning: missing file {p}; skipping.")
        continue
    comments_dfs.append(pd.read_csv(p))

In [8]:
# Read every per-docket topic summary that is present on disk and tag its rows
# with the docket they came from, so the source stays identifiable after the
# frames are concatenated.
summary_dfs = []
for summary_csv, docket in zip(summary_paths, dockets):
    if summary_csv.exists():
        summary_dfs.append(pd.read_csv(summary_csv).assign(docket_id=docket))
    else:
        print(f"Warning: missing file {summary_csv}; skipping.")

In [9]:
# Stack the per-docket comment frames into one table with a fresh 0..n-1 index.
# With nothing loaded, fall back to an empty frame (pd.concat rejects an empty list).
all_comments = (
    pd.concat(comments_dfs, ignore_index=True) if comments_dfs else pd.DataFrame()
)

In [10]:
# Stack the per-docket topic summaries into one table with a fresh 0..n-1 index.
# With nothing loaded, fall back to an empty frame (pd.concat rejects an empty list).
all_summary = (
    pd.concat(summary_dfs, ignore_index=True) if summary_dfs else pd.DataFrame()
)

### Basic cleaning to remove leftover HTML markup such as `<br/>`

In [12]:
# Names of HTML entities / the <br> tag appearing in the text as bare words
# (presumably markup whose "&" / "<" punctuation was lost before this notebook
# -- TODO confirm against the raw data). An optional trailing ";" (entities) or
# "/" (br) is consumed together with the name so no stray punctuation remains.
# NOTE: this also removes real words/abbreviations spelled the same way
# (e.g. "amp", "lt", "gt").
_ENTITY_WORD_RE = re.compile(
    r'\b(?:mdash|ndash|nbsp|amp|lt|gt|quot|rsquo|lsquo|ldquo|rdquo)\b;?'
    r'|\bbr\b\s*/?',
    flags=re.IGNORECASE,
)

# Pixel sizes from inline styles: "30px", "12 px", ...
_PX_SIZE_RE = re.compile(r'\d+\s*px', flags=re.IGNORECASE)

# Tag / attribute names left over from inline markup. The "span ... span" form
# is listed first so a short span pair is removed together with what it wraps.
# NOTE: "class", "style", "div" and "span" are removed even when they are
# ordinary English words in a comment.
_MARKUP_WORD_RE = re.compile(
    r'\b(span\D{0,7}span|\d{0,2}span|div|padding-left|class|style|font-family'
    r'|stylepaddingleft|font-size)\b',
    flags=re.IGNORECASE,
)

# "39" standing in for an apostrophe (the numeric entity &#39; without its
# punctuation), e.g. "don39t", "it39s", "farmers39". A preceding letter is
# required so that genuine numbers such as "1939" or "39 members" are kept.
_APOSTROPHE_CODE_RE = re.compile(r'(?<=[A-Za-z])39(?=[A-Za-z]{0,2}\b)')

_WHITESPACE_RE = re.compile(r'\s+')


def clean_comment_text(s: str) -> str:
    """Strip HTML markup, entity debris and encoding glitches from one text value.

    Steps, in order: drop HTML tags (BeautifulSoup), decode any remaining HTML
    entities, repair mojibake (ftfy), remove bare entity/tag/CSS words, turn
    "39" used as an apostrophe code back into "'", and collapse whitespace.

    Parameters
    ----------
    s : str
        Raw text. Missing values (None / NaN, as detected by ``pd.isna``) are
        passed through untouched; other non-string scalars are converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned, whitespace-normalised text, or ``s`` itself when it is
        missing.
    """
    if pd.isna(s):
        return s
    if not isinstance(s, str):
        # e.g. a purely numeric cell parsed as int/float by read_csv
        s = str(s)

    # Tags are replaced by a space so words on either side do not get glued.
    soup_text = BeautifulSoup(s, "html.parser").get_text(" ")

    # Second decoding pass for anything still entity-encoded (double-escaped input).
    unescaped = html.unescape(soup_text)

    fixed = ftfy.fix_text(unescaped)
    fixed = _ENTITY_WORD_RE.sub(' ', fixed)
    fixed = _PX_SIZE_RE.sub(' ', fixed)
    fixed = _MARKUP_WORD_RE.sub(' ', fixed)
    fixed = _APOSTROPHE_CODE_RE.sub("'", fixed)

    return _WHITESPACE_RE.sub(' ', fixed).strip()

In [13]:
# Clean every comment body; `assign` builds a new frame rather than mutating in place.
all_comments = all_comments.assign(
    comment_text=all_comments['comment_text'].map(clean_comment_text)
)

  soup_text = BeautifulSoup(s, "html.parser").get_text(" ")


In [14]:
# Apply the same text cleaning to the example comments stored with each topic.
all_summary = all_summary.assign(
    sample_comments=all_summary['sample_comments'].map(clean_comment_text)
)

In [15]:
# Apply the same text cleaning to each topic's keyword string.
all_summary = all_summary.assign(
    top_words=all_summary['top_words'].map(clean_comment_text)
)

In [16]:
# Remove every "<whitespace>,<whitespace>" sequence from the keyword strings.
# NOTE(review): the match is replaced with an empty string, so the text on
# either side of it ends up joined with no separator -- confirm that is intended.
_isolated_comma = re.compile(r'(\s)\,\s', flags=re.IGNORECASE)
all_summary['top_words'] = all_summary['top_words'].str.replace(
    _isolated_comma, "", regex=True
)

In [17]:
# Write the combined, cleaned tables to the outputs folder (row index omitted).
for _frame, _filename in (
    (all_comments, "all_comments_with_bertopic_combined.csv"),
    (all_summary, "all_bertopic_topic_summary_combined.csv"),
):
    _frame.to_csv(OUTPUTS_DIR / _filename, index=False)

In [18]:
# all_comments.to_csv(OUTPUTS_DIR / "all_comments_with_bertopic_combined.csv", index=False)
# all_summary.to_csv(OUTPUTS_DIR / "all_bertopic_topic_summary_combined.csv", index=False)