# Extract all orders from markdown blackbooks

In [None]:
"""
Blackbook Extraction — ORDER LEVEL (All Files)
Chunking ≈ 5000 chars, extends to next Effective boundary
Issued year priority:
Filed > Dated > Approved > Effective
"""

import re
from pathlib import Path
from typing import List, Optional
import pandas as pd
from tqdm import tqdm
from pydantic import BaseModel
from langchain_openai import ChatOpenAI


# ============================================================
# CONFIG
# ============================================================

# Model name handed to ChatOpenAI for the order-extraction calls.
LLM_MODEL = "gpt-5-mini"

# Directory scanned (non-recursively, "*.md") for the cleaned blackbook files.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
INPUT_DIR = Path(
    "/Users/othmanbensouda/Desktop/Orion/jobtalk_paper/files/blackbooks/md_format_clean"
)

# Excel workbook that receives the extracted orders; it is rewritten after
# every processed markdown file.
OUTPUT_FILE = Path(
    "/Users/othmanbensouda/Desktop/Orion/jobtalk_paper/files/order_extraction/extracted_orders_all_files.xlsx"
)


# ============================================================
# SCHEMA
# ============================================================

class OrderEntry(BaseModel):
    """Structured-output schema for a single extracted order.

    Only the title is required; every date is optional and kept as the raw
    string returned by the model (no parsing happens in this class).
    """

    # Heading / title text of the order.
    order_title: str
    # Date strings as found in the text; None when the model returns none.
    filed_date: Optional[str] = None
    dated_date: Optional[str] = None
    approved_date: Optional[str] = None
    effective_date: Optional[str] = None


class OrdersList(BaseModel):
    """Top-level structured-output schema: all orders found in one chunk."""

    # One OrderEntry per order the model extracted from the chunk.
    orders: List[OrderEntry]


# ============================================================
# CHUNKING (~5000 chars, extend to next Effective boundary)
# ============================================================

# Matches an "Effective" marker followed by a numeric (12/12/2019) or a
# spelled-out (December 12, 2019) date. Compiled once instead of per call.
_EFFECTIVE_PATTERN = re.compile(
    r"Effective\s*:?\s*"
    r"(?:\d{1,2}/\d{1,2}/\d{4}"
    r"|[A-Za-z]+\s+\d{1,2},?\s+\d{4})",
    re.IGNORECASE
)


def chunk_page_extend_to_effective(
    text: str,
    target_size: int = 5000,
    max_extension: int = 15000,
) -> List[str]:
    """Split ``text`` into chunks that end right after an "Effective <date>" marker.

    Each chunk starts where the previous one ended. Its tentative end is
    ``target_size`` characters later; the chunk is then extended to the end of
    the next "Effective" date found at or after that point, so an order is not
    cut before its effective date.

    Args:
        text: Raw page/file text. ``\\r\\n`` and ``\\r`` are normalised to ``\\n``.
        target_size: Approximate chunk length in characters (default 5000).
        max_extension: Safety cap on the total chunk length. If extending to
            the next marker would exceed it, the chunk is cut at the tentative
            end instead (default 15000).

    Returns:
        The non-empty chunks, each stripped of surrounding whitespace.

    Raises:
        ValueError: If ``target_size`` or ``max_extension`` is not positive
            (a non-positive size could stop the scan from advancing).
    """
    if target_size <= 0:
        raise ValueError("target_size must be a positive integer")
    if max_extension <= 0:
        raise ValueError("max_extension must be a positive integer")

    text = text.replace("\r\n", "\n").replace("\r", "\n")

    chunks = []
    start = 0
    n = len(text)

    while start < n:
        tentative_end = min(start + target_size, n)

        # Only markers beginning at/after the tentative end are considered.
        match = _EFFECTIVE_PATTERN.search(text, tentative_end)

        if match is None:
            # No further marker: everything that is left forms the last chunk.
            end = n
        elif match.end() - start > max_extension:
            # Prevent a runaway chunk: fall back to the plain size cut.
            end = tentative_end
        else:
            end = match.end()

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        start = end

    return chunks


# ============================================================
# PROMPT
# ============================================================

def build_prompt(chunk: str) -> str:
    """Return the order-extraction prompt with ``chunk`` appended after "TEXT:".

    The assembled prompt is stripped as a whole, so leading whitespace of the
    instructions and trailing whitespace of ``chunk`` are removed.
    """
    prompt = f"""
Extract complete court orders.

An order usually contains a heading/title, and at least one date. You might retrieve an order that has no date associated.

Note that an order is literally any paragraph that has a date after, such as 'AMENDING RULES 47.1, 48, AND 79, RULES OF PROCEDURE FOR THE JUVENILE COURT, ON A PERMANENT BASIS
Filed: 12/12/2019
Effective 12/12/2019'

Additionally extract if present:
‚Ä¢ Filed: date
‚Ä¢ Dated: date
‚Ä¢ Approved: date

Return structured output only.

TEXT:
{chunk}
"""
    return prompt.strip()


# ============================================================
# ISSUED YEAR LOGIC (DETERMINISTIC)
# ============================================================

def extract_year(date_str):
    """Return the first run of four digits in ``date_str`` as an int.

    Falsy inputs (None, "", 0) and values without four consecutive digits
    yield None. Non-string values are converted with ``str()`` first.
    """
    if date_str:
        found = re.search(r"\d{4}", str(date_str))
        if found is not None:
            return int(found.group())
    return None


def compute_issued_year(row):
    """Return the issued year of ``row`` using Filed > Dated > Approved > Effective.

    ``row`` must offer ``.get(field)``. The first field whose value yields a
    truthy year from ``extract_year`` wins; None is returned if none does.
    """
    priority = ("filed_date", "dated_date", "approved_date", "effective_date")
    # Lazy generator: later fields are only inspected when earlier ones fail.
    years = (extract_year(row.get(name)) for name in priority)
    return next((year for year in years if year), None)


# ============================================================
# MAIN
# ============================================================

def main():
    """Extract orders from every markdown file in INPUT_DIR into OUTPUT_FILE.

    Steps:
      1. Read each "*.md" file, strip HTML comments, and split it into chunks.
      2. Send every chunk to the structured LLM and collect the returned orders,
         tagging each with its source file name.
      3. After each file, rebuild the de-duplicated table (with ``issued_year``)
         from all rows so far and rewrite OUTPUT_FILE, so partial progress
         survives an interruption.

    LLM failures are reported per chunk and do not stop the run.
    """

    print("üöÄ Starting extraction (all files)")

    if not INPUT_DIR.exists():
        print("‚ùå Input directory not found:", INPUT_DIR)
        return

    md_files = sorted(INPUT_DIR.glob("*.md"))

    if not md_files:
        print("‚ùå No markdown files found.")
        return

    print(f"Found {len(md_files)} markdown files.")

    # Make sure the destination folder exists before any LLM call is made,
    # so the first save cannot fail on a missing directory.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    llm = ChatOpenAI(
        model=LLM_MODEL,
    )

    structured_llm = llm.with_structured_output(OrdersList)

    # --------------------------------------------------
    # PRECOMPUTE TOTAL CHUNKS
    # --------------------------------------------------

    file_chunks_map = {}
    total_chunks = 0

    for f in md_files:
        text = f.read_text(encoding="utf-8", errors="ignore")
        # Drop HTML comments (possibly multi-line) before chunking.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.S)
        chunks = chunk_page_extend_to_effective(text)

        file_chunks_map[f] = chunks
        total_chunks += len(chunks)

    print(f"Total chunks to process: {total_chunks}")

    # --------------------------------------------------
    # PROCESS CHUNKS WITH GLOBAL tqdm
    # --------------------------------------------------

    all_rows = []
    saved = False  # becomes True once OUTPUT_FILE has actually been written

    with tqdm(total=total_chunks, desc="Chunks") as pbar:

        for f, chunks in file_chunks_map.items():

            if not chunks:
                tqdm.write(f"‚ö†Ô∏è No chunks in {f.name}")
                continue

            for chunk in chunks:

                try:
                    result = structured_llm.invoke(build_prompt(chunk))
                    rows = [o.model_dump() for o in result.orders]

                    for r in rows:
                        r["source_file"] = f.name

                    all_rows.extend(rows)

                except Exception as e:
                    # Best effort: report the failed chunk and keep going.
                    tqdm.write(f"‚ùå LLM ERROR in {f.name}: {e}")

                pbar.update(1)

            # --------------------------------------------------
            # SAVE AFTER EACH FILE
            # --------------------------------------------------

            if all_rows:

                df = pd.DataFrame(all_rows)

                df["issued_year"] = df.apply(compute_issued_year, axis=1)

                # Case/whitespace-insensitive title used only for de-duplication.
                df["order_title_norm"] = (
                    df["order_title"]
                    .str.lower()
                    .str.replace(r"\s+", " ", regex=True)
                    .str.strip()
                )

                df = df.drop_duplicates(
                    subset=["order_title_norm", "effective_date"],
                    keep="first",
                ).drop(columns=["order_title_norm"])

                df.to_excel(OUTPUT_FILE, index=False)
                saved = True

                tqdm.write(
                    f"üíæ Saved after {f.name} "
                    f"({len(df)} total orders so far)"
                )

    print("\n‚úÖ Finished processing all chunks.")
    if saved:
        print(f"Final output saved to: {OUTPUT_FILE}")
    else:
        # Do not claim a file was written when no order was extracted.
        print("No orders were extracted; no output file was written.")



if __name__ == "__main__":
    main()


# Extract bodies of rules from orders

In [None]:
"""
Blackbook Extraction — BODY OF RULE LEVEL
Research-grade stable version
"""

from pathlib import Path
from typing import List
import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# ============================================================
# CONFIG
# ============================================================

# Model name handed to ChatOpenAI for the rule-body splitting calls.
LLM_MODEL = "gpt-5"
# Thread-pool size: upper bound on concurrent LLM requests.
# NOTE(review): 500 simultaneous requests may exceed API rate limits — confirm.
MAX_WORKERS = 500   

# Parent of the current working directory.
# NOTE(review): presumably the notebook runs from a sub-folder of the project — confirm.
BASE_DIR = Path.cwd().parent

# Workbook produced by the order-extraction step (read here).
INPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_orders_all_files.xlsx"
# Workbook written by this step: one row per (order, body of rules).
OUTPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_rule_bodies.xlsx"


# ============================================================
# STRUCTURED OUTPUT
# ============================================================

class SplitBody(BaseModel):
    """Structured-output schema for one body-of-rules segment of an order title."""

    body_segment: str   # ONLY the rule system + rule numbers
    body_name: str      # canonical name

class SplitBodiesList(BaseModel):
    """Top-level structured-output schema: all segments found in one order title."""

    # One SplitBody per distinct body of rules returned by the model.
    bodies: List[SplitBody]


# ============================================================
# METADATA EXTRACTION (DETERMINISTIC)
# ============================================================

def extract_metadata(order_title):
    """Split an order title into its descriptive text and its metadata.

    Metadata consists of, in this order and when present:
      * a bracket code such as ``[R-19-0001]``,
      * the ``Filed: ...`` part (up to the end of the line or the next
        "Effective" marker),
      * the ``Effective ...`` part (up to the end of the line).

    Args:
        order_title: Title text. None and NaN (e.g. an empty Excel cell) are
            treated as an empty title; other non-strings are converted with
            ``str()``.

    Returns:
        ``(base_text, metadata)`` — the title with the metadata parts removed
        and stripped, and the metadata parts joined by single spaces.
    """
    if not isinstance(order_title, str):
        # NaN is the only value that is not equal to itself.
        is_missing = order_title is None or order_title != order_title
        order_title = "" if is_missing else str(order_title)

    bracket = re.search(r"\[R-[^\]]+\]", order_title)
    # Stop the Filed part at the "Effective" marker rather than at any capital
    # "E", which used to truncate dates such as "DECEMBER 12, 2019".
    filed = re.search(r"Filed:\s*(?:(?!Effective)[^\n])+", order_title)
    effective = re.search(r"Effective:?\s*.+", order_title)

    metadata_parts = [
        m.group(0).strip()
        for m in (bracket, filed, effective)
        if m and m.group(0).strip()
    ]

    metadata = " ".join(metadata_parts)

    # Whatever remains after removing the metadata is the descriptive title.
    base_text = order_title
    for part in metadata_parts:
        base_text = base_text.replace(part, "")

    return base_text.strip(), metadata


# ============================================================
# PROMPT (LLM ONLY SPLITS)
# ============================================================

def build_split_prompt(base_text: str) -> str:
    """Return the splitting prompt with ``base_text`` placed after "TEXT:".

    The result is not stripped: it begins and ends with a newline.
    """
    prompt = f"""
Split this into one segment per distinct body of rules.

Return:
- body_segment: only the rule-system part with rule numbers
- body_name: canonical name of the rule system

Do NOT include bracket codes or dates.
Do NOT rewrite wording.

TEXT:
{base_text}
"""
    return prompt


# ============================================================
# PROCESS FUNCTION
# ============================================================

def process_row(row, structured_llm):
    """Expand one order row into one row per body of rules.

    The title's metadata (bracket code / Filed / Effective) is separated
    deterministically; only the remaining text is sent to the LLM for
    splitting. Each returned segment becomes a copy of ``row`` whose
    ``order_title`` is ``"<segment> <metadata>"`` and whose ``body_of_rules``
    is the segment's canonical name.

    Args:
        row: Object supporting ``row["order_title"]`` and ``row.to_dict()``
            (the caller passes rows from ``DataFrame.iterrows()``).
        structured_llm: Object whose ``invoke(prompt)`` returns something with
            a ``bodies`` attribute.

    Returns:
        A list of row dicts. If the LLM call fails or returns no bodies, a
        single row with the unsplit text and ``body_of_rules == "Unknown"``.
    """

    base_text, metadata = extract_metadata(row["order_title"])
    fallback = [SplitBody(body_segment=base_text, body_name="Unknown")]

    try:
        bodies = structured_llm.invoke(build_split_prompt(base_text)).bodies or fallback
    except Exception as exc:
        # Best effort: keep the row unsplit, but make the failure visible.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        tqdm.write(f"LLM split failed, keeping title unsplit: {exc}")
        bodies = fallback

    base_row = row.to_dict()

    return [
        {
            **base_row,
            "order_title": f"{body.body_segment} {metadata}".strip(),
            "body_of_rules": body.body_name,
        }
        for body in bodies
    ]


# ============================================================
# MAIN
# ============================================================

def main():
    """Split every extracted order into its bodies of rules and save the result.

    Reads INPUT_FILE, runs ``process_row`` for each row on a thread pool,
    de-duplicates on (normalised title, effective_date) and writes OUTPUT_FILE.
    Results are gathered in submission order so the output order — and which
    duplicate ``keep="first"`` retains — is reproducible between runs.
    """

    df = pd.read_excel(INPUT_FILE)

    llm = ChatOpenAI(model=LLM_MODEL)
    structured_llm = llm.with_structured_output(SplitBodiesList)

    all_rows = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        futures = [
            executor.submit(process_row, row, structured_llm)
            for _, row in df.iterrows()
        ]

        # as_completed only drives the progress bar here.
        with tqdm(total=len(futures), desc="Orders") as pbar:
            for _ in as_completed(futures):
                pbar.update(1)

        # Collect in the original row order (all futures are finished by now).
        for future in futures:
            try:
                all_rows.extend(future.result())
            except Exception as e:
                tqdm.write(f"Error: {e}")

    if not all_rows:
        # An empty frame has no "order_title" column; avoid a KeyError below.
        print("Finished.")
        print("Rows: 0 (nothing was written)")
        return

    final_df = pd.DataFrame(all_rows)

    # Deduplicate safely: compare titles case- and whitespace-insensitively.
    final_df["norm"] = (
        final_df["order_title"]
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    final_df = final_df.drop_duplicates(
        subset=["norm", "effective_date"],
        keep="first"
    ).drop(columns=["norm"])

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_excel(OUTPUT_FILE, index=False)

    print("Finished.")
    print("Rows:", len(final_df))


if __name__ == "__main__":
    main()


# Create an "Issued_Date" column

In [6]:
"""
Create issued_date column from filed_date, dated_date, approved_date, effective_date
"""
from pathlib import Path
import pandas as pd

# ============================================================
# CONFIG
# ============================================================
# Parent of the current working directory.
# NOTE(review): presumably the notebook runs from a sub-folder of the project — confirm.
BASE_DIR = Path.cwd().parent  

# Input and output are the same workbook: the file is updated in place.
INPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_rule_bodies.xlsx"
OUTPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_rule_bodies.xlsx"

# ============================================================
# MAIN
# ============================================================
def main():
    """Add an ``issued_date`` column to the workbook and write it back.

    For every row the value is the first non-null of filed_date, dated_date,
    approved_date and effective_date, in that order.
    """
    print("üöÄ Creating issued_date column")

    frame = pd.read_excel(INPUT_FILE)

    # Start from filed_date and fill the remaining gaps from each fallback
    # column in priority order.
    issued = frame['filed_date']
    for fallback in ('dated_date', 'approved_date', 'effective_date'):
        issued = issued.fillna(frame[fallback])
    frame['issued_date'] = issued

    frame.to_excel(OUTPUT_FILE, index=False)

    print("‚úÖ Done")
    print(f"Rows: {len(frame)}")

if __name__ == "__main__":
    main()

üöÄ Creating issued_date column
‚úÖ Done
Rows: 2157


# Categorize (local, statewide, statewide trial)

In [None]:
"""
Step 4 Robust Version — Deterministic + LLM Comparison

Creates:

Local Rule (Strict)
Local Rule (Expanded)
Local Rule (LLM)

Statewide Rule (Strict)
Statewide Rule (Expanded)
Statewide Rule (LLM)
"""

from pathlib import Path
import pandas as pd
import re
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ============================================================
# CONFIG
# ============================================================

# Model name handed to ChatOpenAI for the local/statewide classification.
LLM_MODEL = "gpt-5"
# Thread-pool size: upper bound on concurrent LLM requests.
# NOTE(review): 500 simultaneous requests may exceed API rate limits — confirm.
MAX_WORKERS = 500

# Parent of the current working directory.
# NOTE(review): presumably the notebook runs from a sub-folder of the project — confirm.
BASE_DIR = Path.cwd().parent

# Workbook with one row per (order, body of rules), read by this step.
INPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_rule_bodies.xlsx"
# Workbook written by this step, with the strict / expanded / LLM flag columns added.
OUTPUT_FILE = BASE_DIR / "files" / "order_extraction" / "extracted_rule_bodies_llm_regex.xlsx"

# ============================================================
# LLM SCHEMA
# ============================================================

class LocalClassification(BaseModel):
    """Structured-output schema for the local-vs-statewide LLM classification."""

    is_local: int  # 1 or 0


# ============================================================
# LLM PROMPT
# ============================================================

def build_prompt(text: str) -> str:
    """Return the local-vs-statewide classification prompt for ``text``.

    The assembled prompt is stripped as a whole, so trailing whitespace of
    ``text`` is removed as well.
    """
    prompt = f"""
Classify this court rule as either:

1 = Local Rule (applies to specific county, superior court, justice court, municipal court, etc.)
0 = Statewide Rule (applies statewide across Arizona courts)

Examples of Local:
- Rules of Practice for the Maricopa County Superior Court
- Local Rules of Civil Procedure, Mohave County Superior Court

Examples of Statewide:
- Arizona Rules of Civil Procedure
- Arizona Rules of Criminal Procedure

Return only:
is_local: 0 or 1

TEXT:
{text}
"""
    return prompt.strip()


# ============================================================
# MAIN
# ============================================================

def main():
    """Flag each row as local or statewide in three ways and save the result.

    Adds to the table read from INPUT_FILE:
      * Strict: title contains "local rule(s)".
      * Expanded: strict, or the title names both a county and a trial court.
      * LLM: the model's ``is_local`` answer for the title.
    Every "Statewide Rule (...)" column is the complement of its local column.
    The table is written to OUTPUT_FILE and a short count summary is printed.
    """

    df = pd.read_excel(INPUT_FILE)
    df["order_title"] = df["order_title"].astype(str)

    # ============================================================
    # STRICT DETERMINISTIC
    # ============================================================

    df["Local Rule (Strict)"] = df["order_title"].str.contains(
        r"\blocal rules?\b",
        case=False,
        na=False
    ).astype(int)

    df["Statewide Rule (Strict)"] = 1 - df["Local Rule (Strict)"]

    # ============================================================
    # EXPANDED DETERMINISTIC
    # ============================================================

    # Non-capturing groups: str.contains emits a UserWarning for patterns with
    # capture groups; the match result itself is unchanged.
    county_pattern = r"(?:Maricopa|Pima|Coconino|Yavapai|Mohave|Pinal|Yuma|Navajo|Gila|Cochise|Santa Cruz|La Paz|Greenlee|Graham|Apache)"
    trial_pattern = r"(?:Superior Court|Justice Court|Municipal Court)"

    mentions_county = df["order_title"].str.contains(
        county_pattern,
        case=False,
        na=False
    )

    mentions_trial = df["order_title"].str.contains(
        trial_pattern,
        case=False,
        na=False
    )

    expanded_local = (
        (df["Local Rule (Strict)"] == 1) |
        (mentions_county & mentions_trial)
    )

    df["Local Rule (Expanded)"] = expanded_local.astype(int)
    df["Statewide Rule (Expanded)"] = 1 - df["Local Rule (Expanded)"]

    # ============================================================
    # LLM CLASSIFICATION
    # ============================================================

    llm = ChatOpenAI(model=LLM_MODEL)
    structured_llm = llm.with_structured_output(LocalClassification)

    def classify(text):
        """Return the model's is_local answer; 0 (statewide) if the call fails."""
        try:
            result = structured_llm.invoke(build_prompt(text))
            return int(result.is_local)
        except Exception as exc:
            # Best effort default, but report it so silent failures do not
            # masquerade as "statewide" classifications.
            tqdm.write(f"LLM classification failed, defaulting to 0: {exc}")
            return 0

    titles = df["order_title"].tolist()
    llm_results = [0] * len(titles)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Remember each future's row position: as_completed yields futures in
        # completion order, which is unrelated to the row order.
        future_to_pos = {
            executor.submit(classify, text): pos
            for pos, text in enumerate(titles)
        }

        with tqdm(total=len(future_to_pos), desc="LLM Classification") as pbar:
            for future in as_completed(future_to_pos):
                llm_results[future_to_pos[future]] = future.result()
                pbar.update(1)

    # Results are now aligned with the original row order.
    df["Local Rule (LLM)"] = llm_results
    df["Statewide Rule (LLM)"] = 1 - df["Local Rule (LLM)"]

    # ============================================================
    # SAVE
    # ============================================================

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(OUTPUT_FILE, index=False)

    print("\nFinished.")
    print("Saved to:", OUTPUT_FILE)

    # Quick comparison summary
    print("\nComparison:")
    print("Strict Local:", df["Local Rule (Strict)"].sum())
    print("Expanded Local:", df["Local Rule (Expanded)"].sum())
    print("LLM Local:", df["Local Rule (LLM)"].sum())


if __name__ == "__main__":
    main()


  mentions_county = df["order_title"].str.contains(
  mentions_trial = df["order_title"].str.contains(
LLM Classification: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2157/2157 [01:26<00:00, 24.97it/s] 



Finished.
Saved to: /Users/othmanbensouda/Desktop/Orion/jobtalk_paper/files/order_extraction/extracted_rule_bodies_local_comparison.xlsx

Comparison:
Strict Local: 190
Expanded Local: 195
LLM Local: 77
