# Azure OCR

In [None]:
# debug_blackbook_markdown_dump.py
import os, base64, sys, time, traceback
from pathlib import Path
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat

IN_DIR  = Path("files/blackbooks")        # adjust if needed
OUT_DIR = Path("files/blackbooks_md")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def main():
    t0 = time.time()
    load_dotenv()
    AZURE_KEY = os.getenv("AZURE_KEY")
    AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")

    print(f"CWD: {Path.cwd()}")
    print(f"IN_DIR exists: {IN_DIR.exists()}  -> {IN_DIR.resolve()}")
    print(f"OUT_DIR: {OUT_DIR.resolve()}")
    print(f"AZURE_ENDPOINT set: {bool(AZURE_ENDPOINT)}  AZURE_KEY set: {bool(AZURE_KEY)}")
    if not AZURE_KEY or not AZURE_ENDPOINT:
        print("❌ Missing AZURE_KEY or AZURE_ENDPOINT in environment. Check your .env.")
        sys.exit(1)

    ai_client = DocumentIntelligenceClient(
        endpoint=AZURE_ENDPOINT,
        credential=AzureKeyCredential(AZURE_KEY)
    )

    files = sorted(list(IN_DIR.rglob("*.pdf")) + list(IN_DIR.rglob("*.docx")))
    print(f"Found {len(files)} file(s):")
    for p in files[:10]:
        print("  -", p)
    if len(files) > 10:
        print(f"  ... and {len(files)-10} more")

    if not files:
        print("⚠️ No .pdf or .docx files found. Check the path and extensions.")
        print("Tip: Are you running this from the repo root? Try absolute paths.")
        sys.exit(0)

    processed = 0
    for i, path in enumerate(files, 1):
        print(f"[{i}/{len(files)}] {path} …", end="", flush=True)
        try:
            data = path.read_bytes()
            poller = ai_client.begin_analyze_document(
                model_id="prebuilt-layout",
                body={"base64Source": base64.b64encode(data).decode()},
                output_content_format=DocumentContentFormat.MARKDOWN,
            )
            res = poller.result()

            # Prefer page-based extraction if pages are populated
            md_text = ""
            if getattr(res, "pages", None):
                parts = []
                for page in res.pages:
                    if not page.spans:
                        continue
                    span = page.spans[0]
                    parts.append(res.content[span.offset : span.offset + span.length])
                md_text = "\n\n---\n\n".join(parts)
            else:
                md_text = res.content or ""

            if not md_text.strip():
                print(" empty markdown returned")
            else:
                out_file = OUT_DIR / f"{path.stem}.md"
                out_file.write_text(md_text, encoding="utf-8")
                print(f" saved -> {out_file.name}")
                processed += 1
        except Exception as e:
            print(" FAILED")
            traceback.print_exc()

    dt = time.time() - t0
    print(f"✅ Done. Processed {processed}/{len(files)} file(s) in {dt:.2f}s.")

if __name__ == "__main__":
    main()


Processing blackbooks: 100%|██████████| 16/16 [00:01<00:00, 12.09it/s]

✅ Saved 2490 entries to ../files/excel/all_blackbooks_orders.xlsx





In [28]:
"""
Blackbook Extraction — ONE ROW PER *NAMED + DATED* BODY OF RULES (LLM)
- Counts "(2) AMENDED ORDER AMENDING RULES …"
- Skips headings containing "CORRECTIVE"
- Overlap=0, anchor_excerpt fingerprint, strong dedup
"""

from __future__ import annotations
import re
from pathlib import Path
from typing import List
import pandas as pd
from tqdm import tqdm
from pydantic import BaseModel, Field
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---- LLM ----
from langchain_openai import ChatOpenAI

# ====== Schemas ======
class RuleBody(BaseModel):
    body_title: str = Field(description="Exact name of the body of rules (e.g., 'Rules of Civil Procedure', 'Local Rules of Practice for the Yuma County Superior Court')")
    parent_order_title: str = Field(description="The full heading/title block of the order this body came from (everything before the first Dated/Filed/Effective/Approved line)")
    issued_year: int = Field(description="Earliest year from date lines inside the order (Dated/Filed/Effective/Approved). If none present, use nearest year header above the order.")
    anchor_excerpt: str = Field(description="First ~120 chars of parent_order_title (before date lines), whitespace-collapsed")
    category: str = Field(description="Short family inferred from the body title, e.g., 'Rules of the Supreme Court', 'Local Rules of Practice', 'Uniform Rules of Practice', 'Rules of Criminal Procedure'. If unclear, return empty string.")

class BodiesList(BaseModel):
    bodies: List[RuleBody]

# ====== Chunking ======
# Capture likely starts (ORDER..., any heading containing RULES, plus common ‘ADOPTION OF …’ / ‘STATE BAR – …’ banners)
UNICODE_DASH = r"\u2010-\u2015\u2212\uFE58\uFE63\uFF0D\-–—"
PUNCT = r" .,:;/'\"()\[\]&/"

ORDER_START_REGEX = re.compile(
    rf"""(?mix)
    ^\s*
    (?:\(\d+\)\s*)?                                     # optional numbering e.g. (2)
    (?:
        (?:AMENDED\s+)?ORDER\b[^\n]*                    # ORDER... or AMENDED ORDER...
        |
        [A-Z{PUNCT}{UNICODE_DASH}]*\bRULES\b[^\n]*      # any heading line containing RULES
        |
        ADOPTION\s+OF\b[^\n]*                           # ADOPTION OF ...
        |
        STATE\s+BAR\b[^\n]*                             # STATE BAR — RULES OF THE SUPREME COURT etc.
    )\s*$
    """,
)

YEAR_HEADER_REGEX = re.compile(r'(?m)^\s*((?:19|20)\d{2})\s*$')

def smart_chunk_text(text: str, max_chars: int = 6000, overlap: int = 0) -> list[str]:
    """
    Split on likely order starts and year headers, pack segments into chunks <= max_chars.
    Overlap kept at 0 to avoid cross-chunk duplicates.
    """
    t = text.replace('\r\n', '\n').replace('\r', '\n')
    starts = [m.start() for m in ORDER_START_REGEX.finditer(t)]
    starts += [m.start() for m in YEAR_HEADER_REGEX.finditer(t)]
    starts = sorted(set([0] + starts + [len(t)]))

    segments = []
    for i in range(len(starts) - 1):
        seg = t[starts[i]:starts[i+1]]
        if seg.strip():
            segments.append(seg)

    chunks, cur = [], ""
    for seg in segments:
        if not cur:
            cur = seg
            continue
        if len(cur) + len(seg) <= max_chars:
            cur += seg
        else:
            chunks.append(cur.strip())
            cur = seg  # overlap=0
    if cur.strip():
        chunks.append(cur.strip())

    chunks = [c for c in chunks if len(c.strip()) > 50]
    print(f"smart_chunk_text: produced {len(chunks)} chunks (max_chars={max_chars}, overlap={overlap})")
    return chunks

# ====== LLM extraction per chunk ======
LLM_MODEL = "gpt-4.1"  # or "gpt-4o-mini" if you want cheaper/faster

def build_prompt(chunk: str, chunk_id: int) -> str:
    return f"""
You are extracting **bodies of rules** from Blackbook text. A *valid row* is anything that is BOTH:
(1) **Named** (a heading like "ORDER …", "STATE BAR – RULES OF THE SUPREME COURT:", "LOCAL RULES …", "UNIFORM RULES …", "ADOPTION OF …", etc.)
AND
(2) **Dated** (has at least one date line such as "Dated …", "Filed …", "Effective …", or "Approved by Supreme Court …" with a year).

Your unit of output is **ONE BODY OF RULES** (not one order).
- A single order can contain multiple distinct bodies (e.g., "Rules of Civil Procedure" AND "Rules of Criminal Procedure"). Output a row for EACH body.
- Do **NOT** split when "and" only connects rule **numbers** within the SAME body (e.g., "amending Rules 16, 26, 37" → one body).
- Treat entries like "(2) AMENDED ORDER AMENDING RULES 26.11, 29, 30, and 41, RULES OF CRIMINAL PROCEDURE [R-18-0028]" as **valid** if dated.
- **Exclude** items whose heading contains the word **CORRECTIVE**.
- **parent_order_title** = full heading/title block up to the first date line (include any quote marks, punctuation, brackets, petition IDs, etc.).
- **issued_year** = the **earliest** year in the date lines (Dated/Filed/Effective/Approved). If no date line is present, use the nearest year header above the item.
- **anchor_excerpt** = the first ~120 characters of parent_order_title (whitespace collapsed).
- **category** = concise family label derived from the body title (e.g., "Local Rules of Practice", "Rules of the Supreme Court",
  "Uniform Rules of Practice", "Rules of Criminal Procedure", "Rules of Procedure in Traffic Cases"). If unclear, return "".

Edge cases that MUST count:
1) "(2) AMENDED ORDER AMENDING RULES 26.11, 29, 30, and 41, RULES OF CRIMINAL PROCEDURE [R-18-0028]"
   → body_title should reflect the body of rules (e.g., "Rules of Criminal Procedure"), not the petition number.

Return ONLY structured data with fields:
- body_title
- parent_order_title
- issued_year
- anchor_excerpt
- category

Text chunk {chunk_id}:
{chunk}
""".strip()

def extract_chunk(chunk_data):
    chunk, chunk_id = chunk_data
    try:
        llm = ChatOpenAI(model=LLM_MODEL, temperature=0)
        structured_llm = llm.with_structured_output(BodiesList)
        result = structured_llm.invoke(build_prompt(chunk, chunk_id))
        return [b.model_dump() for b in result.bodies]
    except Exception as e:
        print(f"[LLM ERROR] chunk {chunk_id}: {e}")
        return []

# ====== Post: normalize + strong dedup ======
def normalize_for_dedup(df: pd.DataFrame) -> pd.DataFrame:
    def norm(s: str) -> str:
        s = (s or "")
        s = s.lower()
        s = re.sub(r'\s+', ' ', s)
        s = re.sub(r'[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D\-–—]+', '-', s)  # unify dashes
        return s.strip(' :;,.()[]{}')

    for col in ["body_title", "parent_order_title", "anchor_excerpt", "category"]:
        df[col] = df[col].astype(str).str.strip()

    df["body_title_norm"] = df["body_title"].map(norm)
    df["parent_order_title_norm"] = df["parent_order_title"].map(norm)
    df["anchor_excerpt_norm"] = df["anchor_excerpt"].map(norm)
    return df

def strong_dedup(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    before = len(df)
    df = df.drop_duplicates(
        subset=["body_title_norm", "parent_order_title_norm", "anchor_excerpt_norm", "issued_year"],
        keep="first"
    ).copy()
    after = len(df)
    print(f"Deduped: {before} → {after}")
    return df

# ====== Main ======
def main():
    # Your markdown folder
    input_dir = Path("/Users/othmanbensouda/Desktop/jobtalk_paper/files/blackbooks_md")
    md_files = sorted(input_dir.glob("*.md"))
    if not md_files:
        print(f"[ERR] No markdown files found in {input_dir}")
        return

    combined = ""
    for f in tqdm(md_files, desc="Reading markdowns"):
        t = f.read_text(encoding="utf-8", errors="ignore")
        # keep structure; just remove HTML comments and fix hyphen linewraps
        t = t.replace("\r\n","\n").replace("\r","\n")
        t = re.sub(r'<!--.*?-->', '', t, flags=re.S)
        t = re.sub(r'-\s*\n\s*', '', t)
        combined += "\n\n" + t

    print(f"Total length: {len(combined):,} chars")

    # Chunk & extract via LLM
    chunks = smart_chunk_text(combined, max_chars=6000, overlap=0)
    all_rows = []
    with ThreadPoolExecutor(max_workers=8) as ex:
        futures = {ex.submit(extract_chunk, (chunk, i+1)): i+1 for i, chunk in enumerate(chunks)}
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Extracting bodies"):
            rows = fut.result() or []
            all_rows.extend(rows)

    print(f"Extracted {len(all_rows)} bodies of rules (pre-dedup)")
    if not all_rows:
        print("[ERR] 0 rows returned. Consider lowering chunk size or inspecting a sample chunk.")
        return

    # DataFrame + dedup
    df = pd.DataFrame(all_rows, columns=["body_title","parent_order_title","issued_year","anchor_excerpt","category"])
    df["issued_year"] = pd.to_numeric(df["issued_year"], errors="coerce").astype("Int64")
    df = normalize_for_dedup(df)
    df = strong_dedup(df)

    # Save
    out_dir = Path("/Users/othmanbensouda/Desktop/jobtalk_paper/excel"); out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "extracted_blackbooks_bodies_llm.xlsx"
    df.drop(columns=["body_title_norm","parent_order_title_norm","anchor_excerpt_norm"], errors="ignore").to_excel(out_path, index=False)
    print(f"✅ Excel saved to {out_path}")

if __name__ == "__main__":
    main()


Reading markdowns: 100%|██████████| 16/16 [00:00<00:00, 2164.24it/s]


Total length: 493,116 chars
smart_chunk_text: produced 85 chunks (max_chars=6000, overlap=0)


Extracting bodies: 100%|██████████| 85/85 [04:22<00:00,  3.09s/it]

Extracted 2193 bodies of rules (pre-dedup)
Deduped: 2193 → 2090
✅ Excel saved to /Users/othmanbensouda/Desktop/jobtalk_paper/excel/extracted_blackbooks_bodies_llm.xlsx





In [43]:
import pandas as pd
IN_PATH = "../excel/extracted_blackbooks_bodies_llm.xlsx"
OUT_PATH = "../excel/extracted_blackbooks_bodies_llm_categorized.xlsx"

# Load
df = pd.read_excel(IN_PATH)

# ---- config: set the column that holds the rule/body name ----
NAME_COL = "parent_order_title"  # change if needed

# Drop the column
if "anchor_excerpt" in df.columns:
    df = df.drop(columns=["anchor_excerpt"])

# Normalized helper
name = df[NAME_COL].fillna("").astype(str).str.lower()

# 1) Local vs statewide
df["is_local_rule"] = name.str.contains("local", na=False).astype(int)
df["is_statewide_rule"] = (~name.str.contains("local", na=False)).astype(int)

# 2) Statewide trial court rule
is_not_supreme = ~name.str.contains("supreme court", na=False)
is_appellate = name.str.contains("appellate", na=False)
mentions_superior = name.str.contains("superior", na=False)

df["is_statewide_trial_court_rule"] = (
    (df["is_statewide_rule"] == 1) &
    is_not_supreme &
    (~is_appellate | mentions_superior)
).astype(int)

# Save
df.to_excel(OUT_PATH, index=False)
print("Wrote:", OUT_PATH)


Wrote: ../excel/extracted_blackbooks_bodies_llm_categorized.xlsx


In [70]:
import pandas as pd
import matplotlib.pyplot as plt

IN_PATH  = "../excel/extracted_blackbooks_bodies_llm_categorized.xlsx"
OUT_XLSX = "../excel/blackbooks_fig2_fig3_tables.xlsx"
FIG2_PNG = "../excel/fig2_statewide_trial_rules_per_year.png"
FIG3_DECADE_PNG = "../excel/fig3_decade_100pct_stacked.png"

# ==== CONFIG ====
YEAR_COL = "issued_year"
LOCAL_COL = "is_local_rule"
STATEWIDE_TRIAL_COL = "is_statewide_trial_court_rule"

# ---- Load & normalize ----
df = pd.read_excel(IN_PATH).copy()
df[YEAR_COL] = pd.to_numeric(df[YEAR_COL], errors="coerce").astype("Int64")

# Universe: local OR statewide-trial
trial = df[(df[LOCAL_COL] == 1) | (df[STATEWIDE_TRIAL_COL] == 1)].copy()

# -------- FIGURE 2: statewide trial-court rules per year --------
fig2 = (trial[trial[STATEWIDE_TRIAL_COL] == 1]
        .groupby(YEAR_COL, dropna=True)
        .size()
        .rename("statewide_trial_rules")
        .reset_index())

if len(fig2):
    yr_min, yr_max = int(fig2[YEAR_COL].min()), int(fig2[YEAR_COL].max())
    years = pd.DataFrame({YEAR_COL: range(yr_min, yr_max + 1)})
    fig2 = years.merge(fig2, on=YEAR_COL, how="left").fillna({"statewide_trial_rules": 0})
    fig2["statewide_trial_rules"] = fig2["statewide_trial_rules"].astype(int)

plt.figure(figsize=(10, 4))
plt.plot(fig2[YEAR_COL], fig2["statewide_trial_rules"], marker="o")
plt.title("Figure 2. Statewide rule changes issued by the Arizona Supreme Court per year.\nIt shows a similarly upward trend.")
plt.xlabel("Year")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(FIG2_PNG, dpi=300)
plt.close()

# -------- FIGURE 3: Decade 100% stacked bar (percentages on each color) --------
def decade_label(y):
    if pd.isna(y): 
        return pd.NA
    y = int(y)
    start = (y // 10) * 10
    return f"{start}-{start+9}"

trial["decade"] = trial[YEAR_COL].apply(decade_label)
dcounts = (trial
           .groupby("decade", dropna=True)
           .agg(local_rules=(LOCAL_COL, "sum"),
                statewide_rules=(STATEWIDE_TRIAL_COL, "sum"))
           .reset_index())
dcounts["trial_total"] = dcounts["local_rules"] + dcounts["statewide_rules"]
dcounts = dcounts[dcounts["trial_total"] > 0].copy()
dcounts["pct_local"] = dcounts["local_rules"] / dcounts["trial_total"] * 100
dcounts["pct_statewide"] = dcounts["statewide_rules"] / dcounts["trial_total"] * 100

# Sort decades chronologically
try:
    dcounts["dec_start"] = dcounts["decade"].str.split("-").str[0].astype(int)
    dcounts = dcounts.sort_values("dec_start")
except:
    pass

plt.figure(figsize=(10, 5))
bars_local = plt.bar(dcounts["decade"], dcounts["pct_local"], label="Local")
bars_state = plt.bar(dcounts["decade"], dcounts["pct_statewide"], 
                     bottom=dcounts["pct_local"], label="Statewide")

# --- Add percentage labels inside each segment ---
for bar, pct in zip(bars_local, dcounts["pct_local"]):
    if pct > 0:
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height()/2,
                 f"{pct:.0f}%", ha="center", va="center", fontsize=9, color="white")

for bar, bottom, pct in zip(bars_state, dcounts["pct_local"], dcounts["pct_statewide"]):
    if pct > 0:
        plt.text(bar.get_x() + bar.get_width()/2, bottom + bar.get_height()/2,
                 f"{pct:.0f}%", ha="center", va="center", fontsize=9, color="white")

plt.title("The Share of Trial-Court Rules by Decade (Statewide on Top)")
plt.xlabel("Decade")
plt.ylabel("Share of Trial-Court Rules (%)")
plt.ylim(0, 100)
plt.legend()
plt.tight_layout()
plt.savefig(FIG3_DECADE_PNG, dpi=300)
plt.close()

# ---- Save tables ----
with pd.ExcelWriter(OUT_XLSX, engine="xlsxwriter") as xw:
    fig2.to_excel(xw, index=False, sheet_name="fig2_year_counts")
    dcounts.to_excel(xw, index=False, sheet_name="fig3_decade_counts_share")

print("Wrote plots:\n -", FIG2_PNG, "\n -", FIG3_DECADE_PNG)
print("And tables:\n -", OUT_XLSX)


Wrote plots:
 - ../excel/fig2_statewide_trial_rules_per_year.png 
 - ../excel/fig3_decade_100pct_stacked.png
And tables:
 - ../excel/blackbooks_fig2_fig3_tables.xlsx
