In [1]:
# preprocess_cases.py
import os, csv, json
from pathlib import Path

In [2]:
# Input and output locations, both relative to the current working directory.
_DATA_ROOT = Path("data")
RAW_DIR = _DATA_ROOT / "raw_cases"   # raw case files are read from here
OUT_DIR = _DATA_ROOT / "processed"   # processed outputs are written here

# Create the output directory (and any missing parents); no error if it exists.
os.makedirs(OUT_DIR, exist_ok=True)

In [7]:
def _extract_case_text(path):
    """Return the text of one raw case file, or "" if nothing usable is found.

    Supported inputs (matched on the lower-cased file suffix):

    * ``.txt``  -- the whole file, decoded as UTF-8 with undecodable bytes dropped.
    * ``.json`` -- ``data["casebody"]["opinions"][0]["text"]`` when that path
      exists; otherwise the fallback ``data["analysis"]["cardinality"]``.

    Any value that is not a string is treated as "no text", so the caller can
    safely call ``.strip()`` / ``.lower()`` on the result.

    Raises:
        json.JSONDecodeError: if a ``.json`` file does not contain valid JSON.
        UnicodeDecodeError: if a ``.json`` file is not valid UTF-8.
    """
    suffix = path.suffix.lower()
    if suffix == ".txt":
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix != ".json":
        return ""

    # Pin the encoding: the platform default is not UTF-8 everywhere, and the
    # .txt branch above already assumes UTF-8.
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        return ""

    text = ""

    # Extract from casebody -> opinions -> text (first opinion only).
    casebody = data.get("casebody")
    if isinstance(casebody, dict):
        opinions = casebody.get("opinions")
        if isinstance(opinions, list) and opinions and isinstance(opinions[0], dict):
            text = opinions[0].get("text", "")

    # Fallback if still empty (or if "text" held a non-string such as None).
    if not isinstance(text, str) or not text:
        analysis = data.get("analysis")
        text = analysis.get("cardinality", "") if isinstance(analysis, dict) else ""
        # NOTE(review): "cardinality" sounds like a numeric field rather than
        # case text -- confirm this is the intended fallback key.

    # A non-string here (e.g. a number) would crash the caller's .strip().
    return text if isinstance(text, str) else ""


rows = []
# Sort so the row order (and therefore the CSV written later) is reproducible;
# glob() yields entries in a filesystem-dependent order.
for file in sorted(RAW_DIR.glob("*")):
    # glob("*") also yields directories; reading one would raise.
    if not file.is_file():
        continue

    text = _extract_case_text(file)
    if not text.strip():
        continue

    # crude placeholder label: any occurrence of "guilty" (case-insensitive)
    # yields 1, so a text containing "not guilty" is labelled 1 as well.
    label = 1 if "guilty" in text.lower() else 0
    rows.append({"case_id": file.stem, "text": text, "label": label})

In [8]:
# Write every collected row to a single CSV file in the output directory.
csv_path = OUT_DIR / "ljp_data.csv"
columns = ["case_id", "text", "label"]

# newline="" lets the csv module control line endings itself.
with csv_path.open("w", newline="", encoding="utf-8") as handle:
    out = csv.DictWriter(handle, fieldnames=columns)
    out.writeheader()
    for record in rows:
        out.writerow(record)

print(f"Saved {len(rows)} cases to {csv_path}")

Saved 122 cases to data/processed/ljp_data.csv
