In [1]:
#!/usr/bin/env python3
"""
DRF PDF -> CSV parser (baseline v1)

Designed for DRF Past Performances / Entries-like PDFs where each horse entry
contains a post position, odds, horse name, and lines like:
- "Own:"
- Jockey line with stats
- "Tr:" trainer line

Usage:
  python drf_pdf_to_csv.py --pdf OP--12-26-2025.pdf --out OP--12-26-2025.csv
"""

from __future__ import annotations

import argparse
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Iterable, Iterator, Optional, List, Dict, Tuple

import pandas as pd
from pypdf import PdfReader


# -----------------------------
# Regex patterns (tweak as needed)
# -----------------------------

# Race header examples:
# "1 Oaklawn Park çClm 30000(30-25)B"
# Shape: leading race number, a track name built from letters, whitespace,
# apostrophes, dots and hyphens, then at least one more whitespace-separated
# chunk. The track class includes whitespace and is greedy, so purely
# alphabetic words that follow the track name are captured as part of `track`.
RACE_HEADER_RE = re.compile(
    r"^(?P<race_no>\d+)\s+(?P<track>[A-Za-z][A-Za-z\s'.-]+)\s+.+$"
)

# Horse block header examples often look like:
# "1\n6-1\nTaker Back"  OR "5\n7-2\nTartaria"
# This pattern only recognises the first of those lines: a line that is
# nothing but a 1-2 digit post position.
HORSE_HEADER_RE = re.compile(
    r"^(?P<post>\d{1,2})\s*$"
)

# A line that is nothing but odds: fractional ("6-1", "7/2") or decimal with an
# optional leading "*" ("*1.80"). The last alternative (\d+-\d+) is already
# covered by the first one.
ODDS_RE = re.compile(
    r"^(?P<odds>\d+[-/]\d+|\*?\d+\.\d+|\d+-\d+)\s*$"
)

# Lines inside a horse block
# OWNER_RE captures everything after "Own:".
OWNER_RE = re.compile(r"^Own:\s*(?P<owner>.+)$")
# TRAINER_RE needs an opening parenthesis after the name; a "Tr:" line without
# one does not match.
TRAINER_RE = re.compile(r"^Tr:\s*(?P<trainer>.+?)\s*\(")  # stops before (stats)
# JOCKEY_LINE_RE: a name starting with an uppercase letter, followed by "(" and
# two whitespace-separated numbers.
JOCKEY_LINE_RE = re.compile(r"^(?P<jockey>[A-Z][A-Za-z.'\s-]+)\s+\(\d+\s+\d+")
# WEIGHT_RE matches any standalone 3-digit token anywhere in a line; callers
# are responsible for range-checking the value.
WEIGHT_RE = re.compile(r"\b(?P<weight>\d{3})\b")  # often 118/120/122/124 on its own line
# NOTE(review): SEX_AGE_RE is not referenced by any code visible in this file.
SEX_AGE_RE = re.compile(r"\b(?P<sex>[A-Za-z.]+)\s*(?P<age>\d+)\b")  # e.g., "B. f. 4"
# CLAIM_PRICE_RE: a line that is only "$" plus digits/commas; the captured
# group excludes the "$".
CLAIM_PRICE_RE = re.compile(r"^\$(?P<claim_price>[\d,]+)\s*$")


@dataclass
class HorseRow:
    """One parsed horse entry.

    Every field defaults to None, so anything the parser could not find in a
    block is simply left unset.
    """

    # Race context, taken from the race header the block was found under.
    track: Optional[str] = None
    race_no: Optional[int] = None
    # The three header lines of a horse block: post position, odds text as
    # printed, and the horse name.
    post: Optional[int] = None
    odds: Optional[str] = None
    horse_name: Optional[str] = None

    # Details picked out of the lines that follow the block header.
    owner: Optional[str] = None
    trainer: Optional[str] = None
    jockey: Optional[str] = None
    weight: Optional[int] = None
    claim_price: Optional[str] = None  # digits/commas as printed, without "$"

    raw_block: Optional[str] = None  # keep for debugging / iteration


def extract_pages_text(pdf_path: Path) -> List[str]:
    """Return the pypdf-extracted text of every page, in page order.

    A page for which extraction yields nothing is represented by an empty
    string, so the result always has one entry per page.
    """
    pdf = PdfReader(str(pdf_path))
    return [(pg.extract_text() or "") for pg in pdf.pages]


def normalize_lines(text: str) -> List[str]:
    """Break *text* into lines, trim surrounding whitespace, and drop blanks."""
    trimmed = map(str.strip, text.splitlines())
    # filter(None, ...) discards the lines that became empty after trimming.
    return list(filter(None, trimmed))


def iter_race_sections(lines: List[str]) -> Iterator[Tuple[Dict[str, str], List[str]]]:
    """
    Group a flat line stream into race sections.

    A section opens at every line matching RACE_HEADER_RE and runs until the
    next such line (or the end of input). Each yielded pair is the header's
    named groups plus the section's lines, header line included. Lines that
    appear before the first header are discarded.
    """
    meta: Optional[Dict[str, str]] = None
    section: List[str] = []

    for text in lines:
        header = RACE_HEADER_RE.match(text)
        if header is None:
            # Ordinary line: keep it only once a race has been opened.
            if meta is not None:
                section.append(text)
            continue

        # New header: emit whatever was collected for the previous race.
        if meta is not None and section:
            yield meta, section
        meta, section = header.groupdict(), [text]

    # Emit the trailing race, if any.
    if meta is not None and section:
        yield meta, section


def split_horse_blocks(race_lines: List[str]) -> List[List[str]]:
    """
    Cut the lines of one race into per-horse blocks.

    A block opens where a post-position line is followed by an odds line and a
    non-empty third line (the horse name). It extends up to, but not
    including, the next post-position line that is followed by an odds line
    and has at least one further line after it. Lines outside any block are
    ignored.
    """
    found: List[List[str]] = []
    total = len(race_lines)
    pos = 0

    while pos < total:
        opens_block = (
            HORSE_HEADER_RE.match(race_lines[pos] or "")
            and pos + 2 < total
            and ODDS_RE.match(race_lines[pos + 1])
            and race_lines[pos + 2]
        )
        if not opens_block:
            pos += 1
            continue

        # Skip past the three header lines, then scan for the next header.
        end = pos + 3
        while end < total and not (
            HORSE_HEADER_RE.match(race_lines[end])
            and end + 2 < total
            and ODDS_RE.match(race_lines[end + 1])
        ):
            end += 1

        found.append(list(race_lines[pos:end]))
        pos = end

    return found


def parse_horse_block(block: List[str], track: str, race_no: int) -> HorseRow:
    """
    Parse a single horse block into a HorseRow.

    Args:
        block: Lines of one horse block. The first three are read as post
            position, odds and horse name; the rest are searched for owner,
            trainer, jockey, claim price and weight.
        track: Track name copied onto the row.
        race_no: Race number copied onto the row.

    Returns:
        A HorseRow. Fields that cannot be found stay None. raw_block holds the
        newline-joined input lines for debugging and iterative improvements.
    """
    row = HorseRow(track=track, race_no=race_no, raw_block="\n".join(block))

    # Header: [post, odds, horse_name]
    try:
        row.post = int(block[0].strip())
    except (IndexError, ValueError):
        # Empty block or a non-numeric first line: leave the post unset.
        row.post = None

    row.odds = block[1].strip() if len(block) > 1 else None
    row.horse_name = block[2].strip() if len(block) > 2 else None

    body = block[3:]

    # Each pattern's named group has the same name as the HorseRow field it
    # fills. The first matching line wins for each field, and a single line
    # may feed more than one field.
    field_patterns = (
        ("owner", OWNER_RE),
        ("trainer", TRAINER_RE),
        ("jockey", JOCKEY_LINE_RE),
        ("claim_price", CLAIM_PRICE_RE),
    )
    for ln in body:
        for name, pattern in field_patterns:
            if getattr(row, name) is not None:
                continue
            m = pattern.match(ln)
            if m:
                setattr(row, name, m.group(name).strip())

    # Weight is often a standalone number line (118/120/122/124), so prefer
    # the first line that is exactly three digits.
    for ln in body:
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        if len(ln) == 3 and ln.isdecimal():
            row.weight = int(ln)
            break

    # Fallback: the first 3-digit token of a line, if it is a plausible weight.
    if row.weight is None:
        for ln in body:
            m = WEIGHT_RE.search(ln)
            if m:
                w = int(m.group("weight"))
                if 100 <= w <= 140:
                    row.weight = w
                    break

    return row


def parse_pdf_to_rows(pdf_path: Path) -> List[HorseRow]:
    """
    Run the full pipeline on one PDF and return its horse rows.

    Page text is flattened into a single line stream, split into race
    sections, then into horse blocks, and each block is parsed. Rows without
    a horse name are discarded.
    """
    # One continuous stream across pages; race headers are in-flow.
    line_stream: List[str] = [
        ln
        for page_text in extract_pages_text(pdf_path)
        for ln in normalize_lines(page_text)
    ]

    parsed: List[HorseRow] = []

    for meta, section in iter_race_sections(line_stream):
        track_name = meta.get("track", "").strip()
        try:
            number = int(meta.get("race_no", "0"))
        except ValueError:
            number = 0

        candidates = [
            parse_horse_block(hb, track=track_name, race_no=number)
            for hb in split_horse_blocks(section)
        ]
        # Basic sanity check: require horse name
        parsed.extend(cand for cand in candidates if cand.horse_name)

    return parsed


def main(argv: Optional[List[str]] = None) -> None:
    """
    Command-line entry point: parse a DRF PDF and write its rows to CSV.

    Args:
        argv: Argument list to parse instead of sys.argv[1:]. Leave as None
            for normal script use. Pass an explicit list, e.g.
            ["--pdf", "in.pdf", "--out", "out.csv"], when calling from a
            notebook or any host process whose sys.argv belongs to something
            else.

    Raises:
        SystemExit: Raised by argparse for --help or missing/invalid arguments.
    """
    # Function-scope import so this block does not depend on a file-level edit.
    from dataclasses import fields

    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf", required=True, type=Path, help="Path to DRF PDF")
    ap.add_argument("--out", required=True, type=Path, help="Output CSV path")
    ap.add_argument("--debug_out", type=Path, default=None, help="Optional: write debug blocks as TXT")
    args = ap.parse_args(argv)

    rows = parse_pdf_to_rows(args.pdf)

    # Pin the columns to the dataclass fields so that a PDF yielding no rows
    # still produces a CSV with a header line instead of an empty file.
    columns = [f.name for f in fields(HorseRow)]
    df = pd.DataFrame([asdict(r) for r in rows], columns=columns)

    # raw_block is only useful when debugging; keep it out of the CSV otherwise.
    if args.debug_out is None:
        df = df.drop(columns=["raw_block"])

    df.to_csv(args.out, index=False)

    if args.debug_out is not None:
        # Write all raw blocks to help refine the regexes.
        debug_txt = [
            f"=== {r.track} R{r.race_no} Post {r.post} {r.horse_name} ===\n{r.raw_block}\n"
            for r in rows
        ]
        args.debug_out.write_text("\n".join(debug_txt), encoding="utf-8")

    print(f"Wrote {len(df)} rows -> {args.out}")


# Run the CLI only when this file is executed directly, not when imported.
# NOTE: a Jupyter/IPython cell also runs with __name__ == "__main__", and there
# sys.argv carries the kernel launcher's arguments, so argparse exits with a
# usage error (SystemExit: 2) because --pdf and --out are missing.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --pdf PDF --out OUT [--debug_out DEBUG_OUT]
ipykernel_launcher.py: error: the following arguments are required: --pdf, --out


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
