## Import PDF and Convert to TXT

In [320]:
from __future__ import annotations

import os
import re
import shutil
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Dict

import pandas as pd
from pypdf import PdfReader

# ----------------------------
# 1) Resolve repo + data paths
# ----------------------------
REPO_ROOT = Path.cwd()  # assumes notebook opened from repo root
DATA_DIR = REPO_ROOT / "race_data"

PDF_NAME = "OP--12-26-2025.pdf"
PDF_PATH = DATA_DIR / PDF_NAME
# The text dump lives next to the PDF and shares its stem (OP--12-26-2025.txt).
TXT_PATH = PDF_PATH.with_suffix(".txt")

print("REPO_ROOT:", REPO_ROOT)
print("DATA_DIR:", DATA_DIR)
print("PDF_PATH:", PDF_PATH)
print("TXT_PATH:", TXT_PATH)

if not PDF_PATH.exists():
    raise FileNotFoundError(f"Could not find PDF at: {PDF_PATH}")

# -----------------------------------------
# 2) Permission workaround: temp PDF copy
#    (avoids file-in-use locks / OneDrive)
# -----------------------------------------
# Falls back to the repo root when the TEMP environment variable is unset.
TEMP_DIR = Path(os.environ.get("TEMP", str(REPO_ROOT))) / "horse_racing_tmp"
TEMP_DIR.mkdir(parents=True, exist_ok=True)

TEMP_PDF_PATH = TEMP_DIR / PDF_NAME

# Copy PDF to temp to avoid permission/file-lock issues
shutil.copy2(PDF_PATH, TEMP_PDF_PATH)
print("Temp PDF copy created:", TEMP_PDF_PATH)

try:
    # -----------------------------------------
    # 3) Extract text from temp PDF -> write TXT
    #    directly to your data location
    # -----------------------------------------
    reader = PdfReader(str(TEMP_PDF_PATH))

    DATA_DIR.mkdir(parents=True, exist_ok=True)  # ensure exists
    with TXT_PATH.open("w", encoding="utf-8") as out:
        for i, page in enumerate(reader.pages, start=1):
            # extract_text() may yield nothing for a page; write "" instead.
            text = page.extract_text() or ""
            # The "--- Page N ---" marker is a boundary later cells look for.
            out.write(f"\n\n--- Page {i} ---\n")
            out.write(text)

    print(f"TXT written to: {TXT_PATH}")
finally:
    # -----------------------------------------
    # 4) Clean up temp PDF
    # -----------------------------------------
    # Best-effort: a temp copy that cannot be deleted (e.g. still locked)
    # must not turn a successful conversion into a failure.
    try:
        TEMP_PDF_PATH.unlink()
    except OSError as cleanup_error:
        print("Could not remove temp PDF copy:", cleanup_error)

# Zero-width spaces/joiners, directional marks and embedding/override
# controls, the word joiner, and the BOM / zero-width no-break space.
_ZERO_WIDTH = re.compile(r"[\u200B-\u200F\u202A-\u202E\u2060\uFEFF]")


def normalize_pdf_text(s: str) -> str:
    """Return *s* in a canonical form that is easier to match against.

    Two steps are applied, in this order:

    1. Unicode NFKC normalisation, which folds compatibility characters
       into their plain equivalents (for example the "fi" ligature becomes
       the two letters "fi").
    2. Removal of every character matched by ``_ZERO_WIDTH``.

    NOTE: NFKC also rewrites several Latin-1 symbols -- U+00A8 becomes a
    space followed by U+0308, U+00AA becomes "a", and the superscripts
    U+00B9/U+00B2/U+00B3 become "1"/"2"/"3" -- so any lookup table applied
    afterwards has to be keyed on the folded forms.
    """
    folded = unicodedata.normalize("NFKC", s)
    return _ZERO_WIDTH.sub("", folded)


def _replace_all(s: str, mapping: Dict[str, str]) -> str:
    """Substitute every key of *mapping* found in *s* with its value.

    All keys are combined into a single regex alternation, so the text is
    scanned once and replacement output is never re-scanned. Keys are
    escaped (treated literally) and ordered longest-first so that a long
    key wins over a shorter key that is a prefix of it.

    An empty *mapping* returns *s* unchanged.
    """
    if mapping:
        longest_first = sorted(mapping, key=len, reverse=True)
        alternation = "|".join(map(re.escape, longest_first))
        return re.sub(alternation, lambda hit: mapping[hit.group(0)], s)
    return s


# --- Your tiered converters ---
def convert_3seq_glyphs(s: str, map3: Dict[str, str]) -> str:
    """Apply the *map3* substitutions to *s* in a single pass.

    First tier run by ``convert_all_glyphs``; by its name it is meant for
    the three-glyph sequences, but any mapping is accepted.
    """
    return _replace_all(s, mapping=map3)

def convert_2seq_glyphs(s: str, map2: Dict[str, str]) -> str:
    """Apply the *map2* substitutions to *s* in a single pass.

    Second tier run by ``convert_all_glyphs``; by its name it is meant for
    the two-glyph sequences, but any mapping is accepted.
    """
    return _replace_all(s, mapping=map2)

def convert_1glyphs(s: str, map1: Dict[str, str]) -> str:
    """Apply the *map1* substitutions to *s* in a single pass.

    Third tier run by ``convert_all_glyphs``, after the multi-glyph tiers.
    """
    return _replace_all(s, mapping=map1)

def convert_0glyphs(s: str, map0: Dict[str, str]) -> str:
    """Apply the *map0* substitutions to *s* in a single pass.

    Fourth tier run by ``convert_all_glyphs``.
    """
    return _replace_all(s, mapping=map0)

def convert_Cglyphs(s: str, mapC: Dict[str, str]) -> str:
    """Apply the *mapC* substitutions to *s* in a single pass.

    Final tier run by ``convert_all_glyphs``; it operates on text already
    rewritten by the earlier tiers.
    """
    return _replace_all(s, mapping=mapC)

def convert_all_glyphs(s: str, map3: Dict[str, str], map2: Dict[str, str], map1: Dict[str, str], map0: Dict[str, str], mapC: Dict[str, str]) -> str:
    """Normalise *s*, then run the five substitution tiers over it in order.

    The text first goes through ``normalize_pdf_text``. Each tier then works
    on the output of the previous one, in the fixed order
    map3 -> map2 -> map1 -> map0 -> mapC, so longer glyph sequences are
    consumed before the shorter tables get a chance to match parts of them.
    """
    tiers = (
        (convert_3seq_glyphs, map3),
        (convert_2seq_glyphs, map2),
        (convert_1glyphs, map1),
        (convert_0glyphs, map0),
        (convert_Cglyphs, mapC),
    )
    text = normalize_pdf_text(s)
    for apply_tier, table in tiers:
        text = apply_tier(text, table)
    return text

map3 = {
    '¦¦õ': '^11.25 ',
    '¦¦ô': '^11.50 ',
    '¦¦ö': '^11.75 ',
    '¦§õ': '^12.25 ',
    '¦§ô': '^12.50 ',
    '¦§ö': '^12.75 ',
    '¦ ̈õ': '^13.25 ',
    '¦ ̈ô': '^13.50 ',
    '¦ ̈ö': '^13.75 ',
    '¦©õ': '^14.25 ',
    '¦©ô': '^14.50 ',
    '¦©ö': '^14.75 ',
    '¦aõ': '^15.25 ',
    '¦aô': '^15.50 ',
    '¦aö': '^15.75 ',
    '¦«õ': '^16.25 ',
    '¦«ô': '^16.50 ',
    '¦«ö': '^16.75 ',
    '¦¬õ': '^17.25 ',
    '¦¬ô': '^17.50 ',
    '¦¬ö': '^17.75 ',
    '¦¤õ': '^18.25 ',
    '¦¤ô': '^18.50 ',
    '¦¤ö': '^18.75 ',
    '¦®õ': '^19.25 ',
    '¦®ô': '^19.50 ',
    '¦®ö': '^19.75 ',
    '§¥õ': '^20.25 ',
    '§¥ô': '^20.50 ',
    '§¥ö': '^20.75 ',
    '§¦õ': '^21.25 ',
    '§¦ô': '^21.50 ',
    '§¦ö': '^21.75 ',
    '§§õ': '^22.25 ',
    '§§ô': '^22.50 ',
    '§§ö': '^22.75 ',
    '§ ̈õ': '^23.25 ',
    '§ ̈ô': '^23.50 ',
    '§ ̈ö': '^23.75 ',
    '§©õ': '^24.25 ',
    '§©ô': '^24.50 ',
    '§©ö': '^24.75 ',
    '§aõ': '^25.25 ',
    '§aô': '^25.50 ',
    '§aö': '^25.75 ',
    '§«õ': '^26.25 ',
    '§«ô': '^26.50 ',
    '§«ö': '^26.75 ',
    '§¬õ': '^27.25 ',
    '§¬ô': '^27.50 ',
    '§¬ö': '^27.75 ',
    '§¤õ': '^28.25 ',
    '§¤ô': '^28.50 ',
    '§¤ö': '^28.75 ',
    '§®õ': '^29.25 ',
    '§®ô': '^29.50 ',
    '§®ö': '^29.75 ',
    ' ̈¥õ': '^30.25 ',
    ' ̈¥ô': '^30.50 ',
    ' ̈¥ö': '^30.75 ',
    ' ̈¦õ': '^31.25 ',
    ' ̈¦ô': '^31.50 ',
    ' ̈¦ö': '^31.75 ',
    ' ̈§õ': '^32.25 ',
    ' ̈§ô': '^32.50 ',
    ' ̈§ö': '^32.75 ',
    ' ̈ ̈õ': '^33.25 ',
    ' ̈ ̈ô': '^33.50 ',
    ' ̈ ̈ö': '^33.75 ',
    ' ̈©õ': '^34.25 ',
    ' ̈©ô': '^34.50 ',
    ' ̈©ö': '^34.75 ',
    ' ̈aõ': '^35.25 ',
    ' ̈aô': '^35.50 ',
    ' ̈aö': '^35.75 ',
    ' ̈«õ': '^36.25 ',
    ' ̈«ô': '^36.50 ',
    ' ̈«ö': '^36.75 ',
    ' ̈¬õ': '^37.25 ',
    ' ̈¬ô': '^37.50 ',
    ' ̈¬ö': '^37.75 ',
    ' ̈¤õ': '^38.25 ',
    ' ̈¤ô': '^38.50 ',
    ' ̈¤ö': '^38.75 ',
    ' ̈®õ': '^39.25 ',
    ' ̈®ô': '^39.50 ',
    ' ̈®ö': '^39.75 ',
    '©¥õ': '^40.25 ',
    '©¥ô': '^40.50 ',
    '©¥ö': '^40.75 ',
    '©¦õ': '^41.25 ',
    '©¦ô': '^41.50 ',
    '©¦ö': '^41.75 ',
    '©§õ': '^42.25 ',
    '©§ô': '^42.50 ',
    '©§ö': '^42.75 ',
    '© ̈õ': '^43.25 ',
    '© ̈ô': '^43.50 ',
    '© ̈ö': '^43.75 ',
    '©©õ': '^44.25 ',
    '©©ô': '^44.50 ',
    '©©ö': '^44.75 ',
    '©aõ': '^45.25 ',
    '©aô': '^45.50 ',
    '©aö': '^45.75 ',
    '©«õ': '^46.25 ',
    '©«ô': '^46.50 ',
    '©«ö': '^46.75 ',
    '©¬õ': '^47.25 ',
    '©¬ô': '^47.50 ',
    '©¬ö': '^47.75 ',
    '©¤õ': '^48.25 ',
    '©¤ô': '^48.50 ',
    '©¤ö': '^48.75 ',
    '©®õ': '^49.25 ',
    '©®ô': '^49.50 ',
    '©®ö': '^49.75 ',
    'a¥õ': '^50.25 ',
    'a¥ô': '^50.50 ',
    'a¥ö': '^50.75 ',
    'a¦õ': '^51.25 ',
    'a¦ô': '^51.50 ',
    'a¦ö': '^51.75 ',
    'a§õ': '^52.25 ',
    'a§ô': '^52.50 ',
    'a§ö': '^52.75 ',
    'a ̈õ': '^53.25 ',
    'a ̈ô': '^53.50 ',
    'a ̈ö': '^53.75 ',
    'a©õ': '^54.25 ',
    'a©ô': '^54.50 ',
    'a©ö': '^54.75 ',
    'aaõ': '^55.25 ',
    'aaô': '^55.50 ',
    'aaö': '^55.75 ',
    'a«õ': '^56.25 ',
    'a«ô': '^56.50 ',
    'a«ö': '^56.75 ',
    'a¬õ': '^57.25 ',
    'a¬ô': '^57.50 ',
    'a¬ö': '^57.75 ',
    'a¤õ': '^58.25 ',
    'a¤ô': '^58.50 ',
    'a¤ö': '^58.75 ',
    'a®õ': '^59.25 ',
    'a®ô': '^59.50 ',
    'a®ö': '^59.75 ',
    '«¥õ': '^60.25 ',
    '«¥ô': '^60.50 ',
    '«¥ö': '^60.75 ',
    '«¦õ': '^61.25 ',
    '«¦ô': '^61.50 ',
    '«¦ö': '^61.75 ',
    '«§õ': '^62.25 ',
    '«§ô': '^62.50 ',
    '«§ö': '^62.75 ',
    '« ̈õ': '^63.25 ',
    '« ̈ô': '^63.50 ',
    '« ̈ö': '^63.75 ',
    '«©õ': '^64.25 ',
    '«©ô': '^64.50 ',
    '«©ö': '^64.75 ',
    '«aõ': '^65.25 ',
    '«aô': '^65.50 ',
    '«aö': '^65.75 ',
    '««õ': '^66.25 ',
    '««ô': '^66.50 ',
    '««ö': '^66.75 ',
    '«¬õ': '^67.25 ',
    '«¬ô': '^67.50 ',
    '«¬ö': '^67.75 ',
    '«¤õ': '^68.25 ',
    '«¤ô': '^68.50 ',
    '«¤ö': '^68.75 ',
    '«®õ': '^69.25 ',
    '«®ô': '^69.50 ',
    '«®ö': '^69.75 ',
    '¬¥õ': '^70.25 ',
    '¬¥ô': '^70.50 ',
    '¬¥ö': '^70.75 ',
    '¬¦õ': '^71.25 ',
    '¬¦ô': '^71.50 ',
    '¬¦ö': '^71.75 ',
    '¬§õ': '^72.25 ',
    '¬§ô': '^72.50 ',
    '¬§ö': '^72.75 ',
    '¬ ̈õ': '^73.25 ',
    '¬ ̈ô': '^73.50 ',
    '¬ ̈ö': '^73.75 ',
    '¬©õ': '^74.25 ',
    '¬©ô': '^74.50 ',
    '¬©ö': '^74.75 ',
    '¬aõ': '^75.25 ',
    '¬aô': '^75.50 ',
    '¬aö': '^75.75 ',
    '¬«õ': '^76.25 ',
    '¬«ô': '^76.50 ',
    '¬«ö': '^76.75 ',
    '¬¬õ': '^77.25 ',
    '¬¬ô': '^77.50 ',
    '¬¬ö': '^77.75 ',
    '¬¤õ': '^78.25 ',
    '¬¤ô': '^78.50 ',
    '¬¤ö': '^78.75 ',
    '¬®õ': '^79.25 ',
    '¬®ô': '^79.50 ',
    '¬®ö': '^79.75 ',
    '¤¥õ': '^80.25 ',
    '¤¥ô': '^80.50 ',
    '¤¥ö': '^80.75 ',
    '¤¦õ': '^81.25 ',
    '¤¦ô': '^81.50 ',
    '¤¦ö': '^81.75 ',
    '¤§õ': '^82.25 ',
    '¤§ô': '^82.50 ',
    '¤§ö': '^82.75 ',
    '¤ ̈õ': '^83.25 ',
    '¤ ̈ô': '^83.50 ',
    '¤ ̈ö': '^83.75 ',
    '¤©õ': '^84.25 ',
    '¤©ô': '^84.50 ',
    '¤©ö': '^84.75 ',
    '¤aõ': '^85.25 ',
    '¤aô': '^85.50 ',
    '¤aö': '^85.75 ',
    '¤«õ': '^86.25 ',
    '¤«ô': '^86.50 ',
    '¤«ö': '^86.75 ',
    '¤¬õ': '^87.25 ',
    '¤¬ô': '^87.50 ',
    '¤¬ö': '^87.75 ',
    '¤¤õ': '^88.25 ',
    '¤¤ô': '^88.50 ',
    '¤¤ö': '^88.75 ',
    '¤®õ': '^89.25 ',
    '¤®ô': '^89.50 ',
    '¤®ö': '^89.75 ',
    '®¥õ': '^90.25 ',
    '®¥ô': '^90.50 ',
    '®¥ö': '^90.75 ',
    '®¦õ': '^91.25 ',
    '®¦ô': '^91.50 ',
    '®¦ö': '^91.75 ',
    '®§õ': '^92.25 ',
    '®§ô': '^92.50 ',
    '®§ö': '^92.75 ',
    '® ̈õ': '^93.25 ',
    '® ̈ô': '^93.50 ',
    '® ̈ö': '^93.75 ',
    '®©õ': '^94.25 ',
    '®©ô': '^94.50 ',
    '®©ö': '^94.75 ',
    '®aõ': '^95.25 ',
    '®aô': '^95.50 ',
    '®aö': '^95.75 ',
    '®«õ': '^96.25 ',
    '®«ô': '^96.50 ',
    '®«ö': '^96.75 ',
    '®¬õ': '^97.25 ',
    '®¬ô': '^97.50 ',
    '®¬ö': '^97.75 ',
    '®¤õ': '^98.25 ',
    '®¤ô': '^98.50 ',
    '®¤ö': '^98.75 ',
    '®®õ': '^99.25 ',
    '®®ô': '^99.50 ',
    '®®ö': '^99.75 ',
}

map2 = {
    "¦õ": "^1.25 ",
    "¦ô": "^1.5 ",
    "¦ö": "^1.75 ",
    "§õ": "^2.25 ",
    "§ô": "^2.5 ",
    "§ö": "^2.75 ",
    " ̈õ": "^3.25 ",
    " ̈ô": "^3.5 ",
    " ̈ö": "^3.75 ",
    "©õ": "^4.25 ",
    "©ô": "^4.5 ",
    "©ö": "^4.75 ",
    "aõ": "^5.25 ",
    "aô": "^5.5 ",
    "aö": "^5.75 ",
    "«õ": "^6.25 ",
    "«ô": "^6.5 ",
    "«ö": "^6.75 ",
    "¬õ": "^7.25 ",
    "¬ô": "^7.5 ",
    "¬ö": "^7.75 ",
    "¤õ": "^8.25 ",
    "¤ô": "^8.5 ",
    "¤ö": "^8.75 ",
    "®õ": "^9.25 ",
    "®ô": "^9.5 ",
    "®ö": "^9.75 ",
    "¦¥": "^10 ",
    "¦¦": "^11 ",
    "¦§": "^12 ",
    "¦¨": "^13 ",
    "¦©": "^14 ",
    "¦a": "^15 ",
    "¦«": "^16 ",
    "¦¬": "^17 ",
    "¦¤­": "^18 ",
    "¦®": "^19 ",
    "§¥": "^20 ",
    "§¦": "^21 ",
    "§§": "^22 ",
    "§¨": "^23 ",
    "§©": "^24 ",
    "§a": "^25 ",
    "§«": "^26 ",
    "§¬": "^27 ",
    "§¤­": "^28 ",
    "§®": "^29 ",
    "¨¥": "^30 ",
    "¨¦": "^31 ",
    "¨§": "^32 ",
    "¨¨": "^33 ",
    "¨©": "^34 ",
    "¨a": "^35 ",
    "¨«": "^36 ",
    "¨¬": "^37 ",
    "¨¤­": "^38 ",
    "¨®": "^39 ",
    "©¥": "^40 ",
    "©¦": "^41 ",
    "©§": "^42 ",
    "©¨": "^43 ",
    "©©": "^44 ",
    "©a": "^45 ",
    "©«": "^46 ",
    "©¬": "^47 ",
    "©¤­": "^48 ",
    "©®": "^49 ",
    "a¥": "^50 ",
    "a¦": "^51 ",
    "a§": "^52 ",
    "a ̈ ": "^53 ",
    "a©": "^54 ",
    "aa": "^55 ",
    "a«": "^56 ",
    "a¬": "^57 ",
    "a¤­": "^58 ",
    "a®": "^59 ",
    "«¥": "^60 ",
    "«¦": "^61 ",
    "«§": "^62 ",
    "«¨": "^63 ",
    "«©": "^64 ",
    "«a": "^65 ",
    "««": "^66 ",
    "«¬": "^67 ",
    "«¤­": "^68 ",
    "«®": "^69 ",
    "¬¥": "^70",
    "¬¦": "^71 ",
    "¬§": "^72 ",
    "¬¨": "^73 ",
    "¬©": "^74 ",
    "¬a": "^75 ",
    "¬«": "^76 ",
    "¬¬": "^77 ",
    "¬¤­": "^78 ",
    "¬®": "^79 ",
    "¤¥": "^80 ",
    "¤­¦": "^81 ",
    "¤­§": "^82 ",
    "¤­¨": "^83 ",
    "¤­©": "^84 ",
    "¤­a": "^85 ",
    "¤­«": "^86 ",
    "¤­¬": "^87 ",
    "­¤¤­": "^88 ",
    "¤­®": "^89 ",
    "®¥": "^90 ",
    "®¦": "^91 ",
    "®§": "^92 ",
    "®¨": "^93 ",
    "®©": "^94 ",
    "®a": "^95 ",
    "®«": "^96 ",
    "®¬": "^97 ",
    "®¤­": "^98 ",
    "®®": "^99 ",
}

map1 = {

    '¥': '^0 ',
    'Â': '^0.0625',
    '°': '^0.125',
    'õ': '^0.25 ',
    'ô': '^0.50 ',
    'ö': '^0.75 ',
    '¦': '^1 ',
    '1±': '1^.1875 ',
    '1o': '1^.375 ',
    '§': '^2 ',
    ' ̈ ': '^3 ',
    ' Ë': '^3 ',
    ' ̈1': '^3 1',
    '©': '^4 ',
    'a ': '^5 ',
    '«': '^6 ',
    '¬': '^7 ',
    '¤': '^8 ',
    '®': '^9 ',
    "Ç": "^(head) ",
    "É": "^(neck) ",
    "ó": "^(nose) ",
        
}

map0 = {
    
    "= ": "=",
    "¹": "^1",
    "²": "^2",
    "³": "^3",
    "â": "Jan",
    "á": "Feb",
    "à": "Mar",
    "ß": "Apr",
    "Ü": "May",
    "Þ": "June",
    "Û": "July",
    "Ý": "Aug",
    "æ": "Sept",
    "å": "Oct",
    "ä": "Nov",
    "ã": "Dec",

    " fst": "fst",
    " fm": "fm",
    " gd": "gd",
    " sly": "sly",
    " my": "my",
    " Í": "(star) ",

    " f": "f",
    " C ": "(C) ",
    " S ": "(S) ",
    "ê": "(T)",
    "ý": "(c) ",
    " ý": "(c) ",
    "ï": "(X)",
    "ø": "^S",
    "ÿ": "(s)",
    " ÿ": "(s)",
    "ú": "(A)",

    "ç": "(fillie)",
    "Î": "(older)",
    "Md ": "(Md)",
    "Sp ": "(Sp)",
    "Wt ": "(Wt)",
    "Alw ": "(Alw)",
    "Clm ": "(Clm)",
    "OC ": "(OC)",
    "Ð": "(state)",
    " N2X": "N2X",
    " NC": "NC",
    " N2L": "N2L",
    " N3X": "N3X",
    " NC": "NC",
    " N3L": "N3L",
    " N4X": "N4X",
    " NC": "NC",
    " N4L": "N4L",
    " -N": "-N",
    " B ": "B ",
    

    " /": "/",
    "10/": " 10/",
    "11/": " 11/",
    "12/": " 12/",
    "13/": " 13/",
    "14/": " 14/",
    "15/": " 15/",
    "16/": " 16/",
    "17/": " 17/",
    "18/": " 18/",
        
}

mapC = {

    " C\n": "(C) ",
    " (C) ": "(C) ",
    " (X) ": "(X) ",
    " (A) ": "(A) ",
    " (T) ": "(T) ",
    " (s) ": "(s) ",
    " (c) ": "(c) ",
    
    }

# Read the raw text dump and run it through every glyph-substitution tier.
raw_text = TXT_PATH.read_text(encoding="utf-8", errors="ignore")
converted_text = convert_all_glyphs(raw_text, map3, map2, map1, map0, mapC)

# Write beside the source files via DATA_DIR rather than a hard-coded,
# user-specific absolute path, so the notebook runs from any checkout.
output_path = DATA_DIR / "converted_data.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)

# newline="\n" keeps LF line endings on every platform.
with output_path.open("w", encoding="utf-8", newline="\n") as f:
    f.write(converted_text)



REPO_ROOT: c:\Users\tnaut\horse_racing_calculator
DATA_DIR: c:\Users\tnaut\horse_racing_calculator\race_data
PDF_PATH: c:\Users\tnaut\horse_racing_calculator\race_data\OP--12-26-2025.pdf
TXT_PATH: c:\Users\tnaut\horse_racing_calculator\race_data\OP--12-26-2025.txt
Temp PDF copy created: C:\Users\tnaut\AppData\Local\Temp\horse_racing_tmp\OP--12-26-2025.pdf
TXT written to: c:\Users\tnaut\horse_racing_calculator\race_data\OP--12-26-2025.txt


## Stitch together any split lines and create new TXT

In [None]:
# ----------------------------
# 0) PRE-COMBINE (STITCH) SPLIT HISTORY ROWS
# ----------------------------

# Locations, relative to the current working directory; adjust to your project.
DATA_DIR = Path("race_data")
IN_TXT = DATA_DIR.joinpath("converted_data.txt")
OUT_TXT = DATA_DIR.joinpath("converted_data_combined.txt")

# --- Line classifiers ---
# A line beginning with "Oaklawn Park" (any case) signals a new race.
RACE_HEADER_RE = re.compile(r"^\s*Oaklawn\s+Park\b", flags=re.I)
# A line made of exactly one whitespace-free token.
HORSE_NUM_RE = re.compile(r"^\s*\S+\s*$")
# Morning-line odds such as 6-1 or 10-1 (spaces around the dash allowed).
MORNINGLINE_RE = re.compile(r"^\s*\d+\s*-\s*\d+\s*$")
# An alphanumeric run followed by "=", e.g. 14Dec25=...
HISTORY_START_RE = re.compile(r"^\s*[0-9A-Za-z]+=")

# Lines starting with any of these are never appended onto a history line.
STOP_PREFIXES = (
    "WORKS:",
    "TRAINER:",
    "Previously trained",
    "Claimedfrom",
    "Claimed from",
    "Daily Racing Form",
    "OP, race",
    "--- Page",
)

def is_race_header(line: str) -> bool:
    """Return True when *line* starts a new race according to RACE_HEADER_RE."""
    return RACE_HEADER_RE.match(line) is not None

def is_horse_header(lines, i: int) -> bool:
    """Report whether a three-line horse header begins at ``lines[i]``.

    The header is recognised when all of the following hold:
      lines[i]     : a single whitespace-free token (HORSE_NUM_RE)
      lines[i + 1] : morning-line odds such as 6-1 (MORNINGLINE_RE)
      lines[i + 2] : a non-blank name that does not start with one of the
                     known field labels

    Returns False when fewer than three lines remain from index *i*.
    """
    if len(lines) - i < 3:
        return False

    candidate_name = lines[i + 2].strip()
    # Labels that can follow a number/odds pair without being a horse name.
    label_prefixes = ("Own:", "Tr:", "Timeform", "Post time", "Wagers:", "Beyer par")

    return (
        HORSE_NUM_RE.match(lines[i]) is not None
        and MORNINGLINE_RE.match(lines[i + 1]) is not None
        and bool(candidate_name)
        and not candidate_name.startswith(label_prefixes)
    )

def should_stop_append(next_line: str, lines, j: int) -> bool:
    """Decide whether *next_line* ends the history record being stitched.

    *next_line* is expected to be ``lines[j]``; *lines* and *j* are needed
    only for the look-ahead horse-header check.

    A blank line never stops the join. Otherwise the join stops when the
    stripped line starts with one of STOP_PREFIXES, is a race header, opens
    a horse header, or itself starts a new history record.
    """
    stripped = next_line.strip()
    if not stripped:
        return False  # blank lines are skipped by the caller, not a boundary
    return (
        stripped.startswith(STOP_PREFIXES)
        or is_race_header(stripped)
        or is_horse_header(lines, j)
        or HISTORY_START_RE.match(stripped) is not None
    )

def stitch_history_rows(lines):
    """Merge history records that were split across several lines.

    Every line that starts a history record (HISTORY_START_RE) absorbs the
    lines after it until ``should_stop_append`` reports a boundary. Blank
    continuation lines are dropped, the remaining pieces are joined with
    single spaces, and runs of whitespace inside the result are collapsed.

    Lines that are not part of a history record are emitted stripped of
    leading and trailing whitespace, one output entry per input line.

    Returns a new list of strings; *lines* is not modified.
    """
    result = []
    total = len(lines)
    pos = 0

    while pos < total:
        text = lines[pos].strip()
        pos += 1

        if not HISTORY_START_RE.match(text):
            # Ordinary line: pass through (stripped).
            result.append(text)
            continue

        # History record: gather continuation lines up to the next boundary.
        pieces = [text]
        while pos < total and not should_stop_append(lines[pos], lines, pos):
            fragment = lines[pos].strip()
            if fragment:
                pieces.append(fragment)
            pos += 1

        # Collapse internal whitespace so the record has a stable layout.
        result.append(re.sub(r"\s+", " ", " ".join(pieces)).strip())

    return result

# --- Run stitch + write output ---
# Load the converted text as a list of lines (line endings removed).
raw = IN_TXT.read_text(encoding="utf-8", errors="ignore").splitlines()
combined_lines = stitch_history_rows(raw)

# Write one stitched row per line, always ending with a final newline.
OUT_TXT.parent.mkdir(parents=True, exist_ok=True)
with OUT_TXT.open("w", encoding="utf-8") as sink:
    sink.write("\n".join(combined_lines) + "\n")

print(f"Stitched history rows written to: {OUT_TXT}")
print("Tip: In your parsing cell, open OUT_TXT instead of IN_TXT.")


Stitched history rows written to: race_data\converted_data_combined.txt
Tip: In your parsing cell, open OUT_TXT instead of IN_TXT.


In [322]:
# ============================================================
# 1) Race + horse boundary detection (line-by-line safe)
# ============================================================

# Line that opens a race / Daily Racing Form footer line (both any case).
race_string = re.compile(r"^\s*Oaklawn\s+Park\b", flags=re.I)
drf_footer = re.compile(r"^\s*Daily\s+Racing\s+Form\b", flags=re.I)

# Horse header pieces: a digits-only line, then odds such as 6-1.
horse_num_line = re.compile(r"^\d+$")
morningline_string = re.compile(r"^(\d+)-(\d+)$")

# ============================================================
# 2) Horse-level single-line fields
# ============================================================

# Each pattern exposes the value(s) of interest as numbered groups.
earlylate_string = re.compile(r"Early\s+(\d+)\s+Late(\d+)")
jockey_string = re.compile(r"^(\w+)\(\S+\)\S+:\(\S+\)$")
trainer_string = re.compile(r"^Tr:\s+(\S+(?:\s+\w+)).*$")
dirtrec_string = re.compile(r"^D\.Fst\s+(\d+)\s+(\d+)")
synthrec_string = re.compile(r"^Synth\([^)]*\)\s+(\d+)\s+(\d+)")
turfrec_string = re.compile(r"^Turf\([^)]*\)\s+(\d+)\s+(\d+)")
distance_string = re.compile(r"^Dst\([^)]*\)\s+(\d+)\s+(\d+)")

# ============================================================
# 3) HISTORY REGEX — EXACTLY YOUR ORIGINAL PATTERN
#    (no token parsing, no group reinterpretation)
# ============================================================

# One named group per captured field; each source line below holds a group
# together with the separator text that must follow it.
history_string = re.compile(
    r"^(?P<date>\d{1,2}[A-Za-z]{1,}\d{2})=\S+\s+"
    r"(?P<type>\S+).*\:\S+\s+"
    r"(?P<class>.*)\s+"
    r"(?P<beyer>\S+)\s+\d+\/\d+\s+"
    r"(?P<first_call>\S+)\s+\S+\s+\S+\s+\S+\s+"
    r"(?P<last_call>\S+).*$"
)

# ============================================================
# 4) Normalization helper
# ============================================================

def norm_line(line: str) -> str:
    """Return *line* with bullets and no-break spaces turned into plain
    spaces, then stripped of leading and trailing whitespace."""
    cleaned = line
    for glyph in ("\u2022", "\u00A0"):
        cleaned = cleaned.replace(glyph, " ")
    return cleaned.strip()

# ============================================================
# 5) DataFrame schema (wide history columns)
# ============================================================

# Number of past-performance lines kept per horse (hist1_* .. hist12_*).
MAX_HIST = 12

# Fields stored for every history slot; these names are also the named
# groups read from the history regex.
HIST_FIELDS = ["date", "type", "class", "beyer", "first_call", "last_call"]

# Fixed per-horse columns come first ...
columns = [
    "race_number", "horse_number", "horse_name",
    "jockey", "trainer", "morning_line",
    "early_speed", "late_speed",
    "dirt_starts", "dirt_wins",
    "synth_starts", "synth_wins",
    "turf_starts", "turf_wins",
    "distance_starts", "distance_wins",
]
# ... followed by the wide history columns, slot by slot.
columns += [
    f"hist{slot}_{name}"
    for slot in range(1, MAX_HIST + 1)
    for name in HIST_FIELDS
]

# ============================================================
# 6) Parsing logic
# ============================================================

# Parser state shared by the helpers and the main loop:
#   race_number - count of race headers seen so far
#   rows        - finished horse records, in file order
#   current     - the horse record being filled in, or None between horses
race_number, rows, current = 0, [], None

def init_history_slots(d: dict):
    """Prepare record *d* for history capture (modifies *d* in place).

    Sets the private cursor ``_hist_idx`` to 1 (the next free slot) and
    creates every ``hist<i>_<field>`` key, for i in 1..MAX_HIST and each
    name in HIST_FIELDS, with the value None.
    """
    d["_hist_idx"] = 1
    d.update(
        {
            f"hist{slot}_{name}": None
            for slot in range(1, MAX_HIST + 1)
            for name in HIST_FIELDS
        }
    )

def flush_current():
    """Move the in-progress horse record into ``rows`` and clear it.

    The global ``current`` is always reset to None. The record is appended
    only when it is non-empty and carries a ``horse_number``; its private
    ``_hist_idx`` cursor is removed first.
    """
    global current
    record = current
    current = None

    if not record:
        return
    if record.get("horse_number") is None:
        return

    record.pop("_hist_idx", None)
    rows.append(record)

# Two-line look-behind window: a horse header is recognised on its third
# line (the name) by checking the two normalised lines read before it.
prev_prev = ""
prev = ""

# (pattern, key for group 1, key for group 2); both groups are stored as int.
_INT_PAIR_FIELDS = (
    (earlylate_string, "early_speed", "late_speed"),
    (dirtrec_string, "dirt_starts", "dirt_wins"),
    (synthrec_string, "synth_starts", "synth_wins"),
    (turfrec_string, "turf_starts", "turf_wins"),
    (distance_string, "distance_starts", "distance_wins"),
)

# (pattern, key); group 1 is stored as stripped text.
_TEXT_FIELDS = (
    (jockey_string, "jockey"),
    (trainer_string, "trainer"),
)

with open(OUT_TXT, "r", encoding="utf-8", errors="ignore") as f:
    for line_number, raw_line in enumerate(f, start=1):
        line = norm_line(raw_line.rstrip("\n"))

        if race_string.search(line) and not drf_footer.search(line):
            # RACE START: count it and close the horse that was open.
            race_number += 1
            flush_current()

        elif horse_num_line.fullmatch(prev_prev) and morningline_string.fullmatch(prev) and line:
            # HORSE START (num -> odds -> name): this line is the name.
            flush_current()

            ml = morningline_string.fullmatch(prev)
            current = {
                "race_number": race_number,
                "horse_number": int(prev_prev),
                "horse_name": line,
                "jockey": None,
                "trainer": None,
                "morning_line": "-".join(ml.groups()),
            }
            # Numeric fields start empty until their line is seen.
            for _pattern, _first_key, _second_key in _INT_PAIR_FIELDS:
                current[_first_key] = None
                current[_second_key] = None
            init_history_slots(current)

        elif current is not None and line:
            # FIELD EXTRACTION: every pattern is tried on every line that
            # belongs to the open horse.
            for _pattern, _first_key, _second_key in _INT_PAIR_FIELDS:
                m = _pattern.search(line)
                if m:
                    current[_first_key] = int(m.group(1))
                    current[_second_key] = int(m.group(2))

            for _pattern, _key in _TEXT_FIELDS:
                m = _pattern.search(line)
                if m:
                    current[_key] = m.group(1).strip()

            # HISTORY: fill the next free slot; matches beyond MAX_HIST
            # are ignored.
            m = history_string.search(line)
            if m:
                idx = current["_hist_idx"]
                if idx <= MAX_HIST:
                    for _name in HIST_FIELDS:
                        current[f"hist{idx}_{_name}"] = m.group(_name)
                    current["_hist_idx"] = idx + 1

        # Slide the window on every line, whichever branch ran.
        prev_prev, prev = prev, line

# Flush last horse
flush_current()

# One row per horse; `columns` fixes the column order and supplies any
# column that a record does not carry.
df = pd.DataFrame(rows, columns=columns)

# Export next to the stitched input file instead of a hard-coded,
# user-specific absolute path, so the notebook runs from any checkout.
output_dir = OUT_TXT.parent
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / "output.csv", index=False)

df.to_excel(output_dir / "output.xlsx", index=False)



