In [4]:
# Import both raw files, ensure proceeding number exists

import json
import pandas as pd
import numpy as np
from pathlib import Path

rawdir = Path("../PTAB_Project_Data/Raw")


def _latest_raw_file(pattern):
    """Return the path in ``rawdir`` matching ``pattern`` that sorts last.

    Raises:
        FileNotFoundError: if nothing in ``rawdir`` matches ``pattern``
            (previously this surfaced as an opaque ``IndexError``).
    """
    matches = sorted(rawdir.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No files matching {pattern!r} in {rawdir.resolve()}")
    # NOTE(review): "last in sort order" is treated as "latest"; this assumes the
    # file names embed a sortable timestamp -- confirm against the export step.
    return matches[-1]


def _load_results(path):
    """Read a JSON file and return its ``"results"`` entry as a DataFrame.

    The file is opened in a ``with`` block so the handle is closed promptly,
    and decoded as UTF-8 explicitly rather than with the locale default.
    """
    with open(path, encoding="utf-8") as fh:
        return pd.DataFrame(json.load(fh)["results"])


latest_procs = _latest_raw_file("proceedings_*.json")
latest_decs  = _latest_raw_file("decisions_*.json")
print("Using:", latest_procs.name, "and", latest_decs.name)

procs = _load_results(latest_procs)
decs  = _load_results(latest_decs)

# Normalise the join key in both frames: it must be present, and is stored
# as whitespace-stripped text so the two frames compare consistently.
for frame in (procs, decs):
    if "proceedingNumber" in frame.columns:
        key_text = frame["proceedingNumber"].astype(str)
        frame["proceedingNumber"] = key_text.str.strip()
    else:
        raise ValueError("Missing 'proceedingNumber'")

# Guarantee procs has a "technologyCenterNumber" column. When it is absent,
# derive it from any column whose lower-cased name ends with
# "technologycenternumber": take the first non-null value across those
# columns per row, then keep the first run of four digits.
if "technologyCenterNumber" not in procs.columns:
    candidate_cols = [
        name for name in procs.columns
        if name.lower().endswith("technologycenternumber")
    ]
    if candidate_cols:
        # bfill along axis=1 pulls the first non-null value into column 0
        raw_tc = procs[candidate_cols].bfill(axis=1).iloc[:, 0]
    else:
        raw_tc = pd.NA
    procs["technologyCenterRaw"] = raw_tc

    tc_text = procs["technologyCenterRaw"].astype("string")
    procs["technologyCenterNumber"] = (
        tc_text.str.extract(r"(\d{4})", expand=False).str.strip()
    )

Using: proceedings_2025-08-01_to_2025-08-20_2025-08-21_091356.json and decisions_2025-08-01_to_2025-08-20_2025-08-21_091356.json


In [5]:
# Add TC into each decision row

# One technology-center value per proceeding (first occurrence wins), then
# left-join it onto the decisions so every decision row is kept.
tc_lookup = procs[["proceedingNumber", "technologyCenterNumber"]]
tc_lookup = tc_lookup.drop_duplicates("proceedingNumber")

df = pd.merge(decs, tc_lookup, how="left", on="proceedingNumber")
print("Rows, Cols:", df.shape)

Rows, Cols: (488, 35)


In [8]:
# Label decision stage

def s(col, frame=None):
    """Return column ``col`` as lower-cased text with missing values as "".

    Args:
        col: Column name to read.
        frame: DataFrame to read from. Defaults to the notebook-level ``df``,
            so existing ``s("name")`` calls behave as before.

    Returns:
        A "string"-dtype Series aligned to ``frame.index``. If ``col`` is not
        a column of ``frame``, every entry is the empty string.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level frame
    if col not in frame.columns:
        return pd.Series("", index=frame.index, dtype="string")
    return frame[col].astype("string").fillna("").str.lower()

# Build one lower-cased text blob per row from the descriptive columns
# (columns that are absent contribute an empty string via s()).
txt = s("decisionTypeCategory") + " " + s("subdecisionTypeCategory") + " " + s("documentTitle") + " " + s("decisionSummary")

# Statute citations must NOT end with \b: ")" is a non-word character, so a
# trailing \b only matches when a word character follows immediately, which
# meant "314(a)" / "325(d)" followed by a space, punctuation or end of text
# was never detected.
DISC_DENY_PATTERN = r"discretionary denial|\bfintiv\b|\b314\(a\)|\b325\(d\)"
FINAL_PATTERN = r"final written decision|final decision|final judgment"
INSTITUTE_PATTERN = (
    r"institution decision|decision on institution|\binstituted\b"
    r"|\binstitution denied\b|\bdeny institution\b|\binstitution\b"
)

is_disc_deny = txt.str.contains(DISC_DENY_PATTERN, na=False)
is_final     = txt.str.contains(FINAL_PATTERN, na=False)
is_institute = txt.str.contains(INSTITUTE_PATTERN, na=False)

# Later assignments overwrite earlier ones, so the precedence is:
# Discretionary Denial > Final > Institution > Other.
df["decision_stage"] = "Other"
df.loc[is_institute, "decision_stage"] = "Institution"
df.loc[is_final,     "decision_stage"] = "Final"
df.loc[is_disc_deny, "decision_stage"] = "Discretionary Denial"

In [14]:
# Nat Sec related flag targets

import numpy as np  

# Technology-center code (as a 4-digit string) -> subject label.
TC_TO_SUBJECT = {
    "1600": "Biotechnology",
    "1700": "New Energy",
    "2100": "Computing/Software",
    "2400": "Networking/Security",
    "2600": "Communications",
    "2800": "Semiconductors",
    "2900": "Designs",
    "3600": "Mechanical/Business Methods",
    "3700": "Mechanical/Manufacturing",
}

# The mapping is keyed on strings, so normalise the lookup key first: values
# that arrive as numbers (1600, 1600.0) or with surrounding whitespace would
# otherwise all fall into "Other/Unknown". Only an exact 4-digit code
# (optionally followed by ".0") is accepted; anything else stays unmatched.
# The stored technologyCenterNumber column itself is left untouched.
tc_key = (
    df["technologyCenterNumber"]
      .astype(str)
      .str.extract(r"^\s*(\d{4})(?:\.0+)?\s*$", expand=False)
)

df["subject_category"] = (
    tc_key
      .map(TC_TO_SUBJECT)
      .fillna("Other/Unknown")
)

# Subject categories that count as national-security related.
TARGET_SUBJECTS = {
    "Biotechnology",
    "Communications",
    "Networking/Security",
    "New Energy",
    "Semiconductors",
}

# National security flag: a row is flagged as national-security related
# exactly when its subject category is one of the target subjects.
in_target_subject = df["subject_category"].isin(TARGET_SUBJECTS)
df["national_security_flag"] = np.where(
    in_target_subject,
    "National Security related",
    "Not National Security related",
)

# Quick check: ten most common subject categories, then the flag split.
subject_counts = df["subject_category"].value_counts(dropna=False)
print(subject_counts.head(10))
flag_counts = df["national_security_flag"].value_counts(dropna=False)
print(flag_counts)

subject_category
Mechanical/Manufacturing       90
New Energy                     76
Networking/Security            54
Mechanical/Business Methods    54
Computing/Software             53
Biotechnology                  53
Semiconductors                 53
Communications                 49
Other/Unknown                   5
Designs                         1
Name: count, dtype: int64
national_security_flag
National Security related        285
Not National Security related    203
Name: count, dtype: int64


In [15]:
# Save outputs - write both files

from pathlib import Path
from datetime import datetime
# Timestamp (local time) used to give this run's output files unique names.
stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")

# Output goes to the sibling folder "PTAB_Project_Data/Processed",
# created on demand.
outdir = Path("../PTAB_Project_Data/Processed")
outdir.mkdir(parents=True, exist_ok=True)

extended_path = outdir / f"extended_{stamp}.csv"
compact_path = outdir / f"compact_natsec_{stamp}.csv"

# Extended file: every row. Compact file: only the flagged rows.
df.to_csv(extended_path, index=False)
natsec_rows = df.loc[df["national_security_flag"] == "National Security related"]
natsec_rows.to_csv(compact_path, index=False)

print("✅ Saved processed files to:", outdir.resolve())

✅ Saved processed files to: /Users/rellu/Documents/PTAB_Project/ABB_Projects/AI-Builders-Bootcamp-5/PTAB_Project_Data/Processed
