# LLM post-processing for OCR text

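- Uses the OpenAI Chat Completions API to clean up OCR output and answer questions about it. An API key is required: set the `OPENAI_API_KEY` environment variable, or paste your key into the cell below (do not commit it).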

- Install deps if needed: `pip install -q openai`.

- Store enhanced text in `enhanced_context`; reuse it to answer multiple user queries.


In [10]:
# Optional installs (uncomment if not already installed)

# %pip install -q openai



import os

from typing import Dict

from openai import OpenAI



# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
# SECURITY: a real-looking secret key was previously hard-coded here as the
# fallback. Never store a key in the notebook; revoke any key that was
# committed. The placeholder below contains "PASTE", which get_client()
# rejects with a clear error until a real key is supplied.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "<PASTE_YOUR_KEY>"

# Chat model used for every completion request in this notebook.
MODEL_NAME = "gpt-4o-mini"



def get_client() -> OpenAI:
    """Build an OpenAI client from the configured API key.

    The ``OPENAI_API_KEY`` environment variable is preferred; the
    module-level ``OPENAI_API_KEY`` value is used when the variable is
    unset or empty.

    Raises:
        ValueError: If no key is available, or the key still contains the
            ``PASTE`` placeholder marker.
    """
    api_key = os.getenv("OPENAI_API_KEY") or OPENAI_API_KEY
    if api_key and "PASTE" not in api_key:
        return OpenAI(api_key=api_key)
    raise ValueError("Set OPENAI_API_KEY env or replace <PASTE_YOUR_KEY> with your key.")



# Build one shared client when this cell runs; chat() reuses it for every request.
client = get_client()



def chat(prompt: str, max_tokens: int = 400) -> str:
    """Send *prompt* as a single user message and return the reply text.

    The request goes through the module-level ``client`` using
    ``MODEL_NAME`` with ``temperature=0``.

    Args:
        prompt: Full text of the user message.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no content.
    """
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=0,
    )
    # ``message.content`` is nullable in the API response; calling .strip()
    # on None would raise AttributeError, so coalesce to "" first.
    content = resp.choices[0].message.content
    return (content or "").strip()


In [7]:

# Sample raw OCR output of a filled health intake form; this is the input
# that the enhancement and extraction steps below operate on.
raw_ocr_text = """--- OCR TEXT ---
FakeDoc M.D.
HEALTH INTAKE FORM
Please fill out the questionnaire carefully. The information you provide will be used to complete
your health profile and will be kept confidential.
Date: 4 [ 4 : 14
d
Name: Da Walker DOB: dF | v4 [14 86
Address: 24 Barnes Lune City: Josato State: I Zip: 7932
Email: Sal}, val ed® Cmatl-t09 Phone #: ie) 17-3938 0
Gender: f Marital Status: S14 4 Le Occupation: Sol vure Ta Ancer
Referred By: AAA
Emergency Contact: = fA_Aaver Emergency Contact Phone: ( 4) 5 ) 334 dt lho
Describe your medical concerns (symptoms, diagnoses, etc):
Paan A%M®.e Mitas In lwo. wl phy ss
a /
ALACS Chile S bred
"""


In [12]:
# Enhance OCR text (cleanup + structured extraction) and store for reuse

from typing import Dict

import re



# Fall back to a placeholder when no earlier cell has defined raw_ocr_text
# (paste your OCR output in that case).
if "raw_ocr_text" not in globals():
    raw_ocr_text = """<paste your OCR output here>"""

# Cap on how many characters of text are sent to the model per request.
MAX_CHARS = 1500



def enhance_text(raw_text: str, max_tokens: int = 196) -> str:
    """Ask the model to clean up raw OCR text and return its reply.

    Only the first ``MAX_CHARS`` characters of *raw_text* are included in
    the prompt.

    Args:
        raw_text: OCR output to clean.
        max_tokens: Reply token limit forwarded to ``chat``.

    Returns:
        Whatever ``chat`` returns for the cleaning prompt.
    """
    instructions = "".join(
        [
            "Clean and normalize this OCR text from a filled form. ",
            "Keep all fields, fix spacing/casing, and remove obvious OCR artifacts only. ",
            "Do NOT invent values. Return the cleaned text.\n",
        ]
    )
    snippet = raw_text[:MAX_CHARS]
    return chat(instructions + f"OCR text:\n{snippet}", max_tokens=max_tokens)



# Form fields to extract, in prompt order; every field defaults to "".
FIELD_SCHEMA = dict.fromkeys(
    (
        "name",
        "dob",
        "address",
        "city",
        "state",
        "zip",
        "phone",
        "email",
        "gender",
        "marital_status",
        "occupation",
        "emergency_contact_name",
        "emergency_contact_phone",
        "policy_number",
        "date",
    ),
    "",
)



def clean_value(text: str) -> str:
    """Collapse whitespace runs to one space and trim separators at both ends.

    The characters trimmed from the edges are space, comma, semicolon,
    colon and hyphen.
    """
    single_spaced = re.sub(r"\s+", " ", text)
    edge_chars = " ,;:-"
    return single_spaced.strip(edge_chars)



def enforce_formats(fields: Dict[str, str]) -> Dict[str, str]:
    """Return a copy of *fields* with per-field character filters applied.

    Only keys that are present with a non-empty value are touched:

    * ``name``: keeps ASCII letters, space, ``.``, ``'`` and ``-``; anything
      from a standalone ``DOB`` token onward is dropped.
    * ``dob`` / ``date``: keeps digits, ``/``, backslash and ``-``; leading
      and trailing ``-`` and ``/`` are trimmed.
    * ``phone`` / ``emergency_contact_phone``: keeps digits and ``+``.
    * ``zip``: keeps digits.
    * ``email``: lower-cased, then reduced to the first address-like
      substring, or ``""`` when none is found.

    The input mapping is not modified.
    """
    cleaned = dict(fields)

    raw_name = cleaned.get("name")
    if raw_name:
        allowed_only = re.sub(r"[^A-Za-z .'-]", "", raw_name)
        before_dob = re.split(r"\bDOB\b", allowed_only, flags=re.IGNORECASE)[0]
        cleaned["name"] = before_dob.strip()

    for date_key in ("dob", "date"):
        raw_date = cleaned.get(date_key)
        if raw_date:
            cleaned[date_key] = re.sub(r"[^0-9/\\-]", "", raw_date).strip("-/")

    for phone_key in ("phone", "emergency_contact_phone"):
        raw_phone = cleaned.get(phone_key)
        if raw_phone:
            cleaned[phone_key] = re.sub(r"[^0-9+]", "", raw_phone)

    raw_zip = cleaned.get("zip")
    if raw_zip:
        cleaned["zip"] = re.sub(r"[^0-9]", "", raw_zip)

    raw_email = cleaned.get("email")
    if raw_email:
        found = re.search(
            r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", raw_email.lower()
        )
        cleaned["email"] = "" if found is None else found.group(0)

    return cleaned



def heuristic_extract(clean_text: str) -> Dict[str, str]:
    """Pull name, DOB, email and phone out of form text with regexes.

    Args:
        clean_text: Form text containing labels such as ``Name:``, ``DOB:``
            and ``Phone #:``.

    Returns:
        A dict holding only the keys (``name``, ``dob``, ``email``,
        ``phone``) whose pattern matched, each passed through
        ``clean_value``. Empty when nothing matched.
    """
    out = {}

    # The captured value may contain spaces/tabs but not newlines; otherwise
    # "Name: Da Walker\nAddress: ..." would yield "Da Walker Address".
    name_match = re.search(
        r"Name[:\s]+([A-Za-z][A-Za-z \t.'-]{1,40})", clean_text, re.IGNORECASE
    )
    dob_match = re.search(
        r"DOB[:\s]+([0-9]{1,2}[\-/][0-9]{1,2}[\-/][0-9]{2,4})",
        clean_text,
        re.IGNORECASE,
    )
    email_match = re.search(
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", clean_text
    )
    # A number may open with "(" (e.g. "(417) 393-8010"), and the capture is
    # kept on one line so digits from the following line are not appended.
    phone_match = re.search(
        r"Phone[:\s#]*([+(\d][\d \t()\-]{7,20})", clean_text, re.IGNORECASE
    )

    if name_match:
        out["name"] = clean_value(name_match.group(1))
    if dob_match:
        out["dob"] = clean_value(dob_match.group(1))
    if email_match:
        out["email"] = clean_value(email_match.group(0))
    if phone_match:
        out["phone"] = clean_value(phone_match.group(1))
    return out



def _parse_json_object(text: str):
    """Return the JSON object embedded in *text* as a dict, or ``None``.

    Only the span from the first ``{`` to the last ``}`` is parsed, so a
    reply wrapped in Markdown code fences or preceded by a short preamble is
    still accepted.
    """
    import json

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        parsed = json.loads(text[start : end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_fields(clean_text: str, max_tokens: int = 196) -> Dict[str, str]:
    """Extract the ``FIELD_SCHEMA`` fields from cleaned form text.

    The model is asked for a JSON object; its string values (and integer
    values, converted to text) are normalised with ``clean_value``. Fields
    still empty afterwards are filled from ``heuristic_extract``, and the
    result is passed through ``enforce_formats``.

    Args:
        clean_text: Cleaned form text; only the first ``MAX_CHARS``
            characters are used.
        max_tokens: Reply token limit forwarded to ``chat``.

    Returns:
        A dict with every ``FIELD_SCHEMA`` key. When no JSON object can be
        parsed from the reply, the raw reply is also stored under
        ``raw_extraction``.
    """
    clipped = clean_text[:MAX_CHARS]
    schema_lines = "\n".join(f"- {k}" for k in FIELD_SCHEMA)
    prompt = (
        "Extract the following fields from the cleaned OCR form text. "
        "Respond ONLY as a single-line JSON object with exactly these keys. "
        "If a field is missing, use an empty string. Do NOT add text outside JSON.\n"
        f"Fields:\n{schema_lines}\n"
        f"Clean text:\n{clipped}"
    )
    text = chat(prompt, max_tokens=max_tokens)

    fields = {k: "" for k in FIELD_SCHEMA}
    parsed = _parse_json_object(text)
    if parsed is None:
        # Keep the unparsed reply so the caller can inspect what came back.
        fields["raw_extraction"] = text
    else:
        for k in fields:
            value = parsed.get(k)
            # Models sometimes emit zip/phone as JSON numbers; keep them as
            # text instead of dropping them (bool is excluded on purpose).
            if isinstance(value, int) and not isinstance(value, bool):
                value = str(value)
            if isinstance(value, str):
                fields[k] = clean_value(value)

    # Regex fallbacks only fill fields the model left empty.
    for k, v in heuristic_extract(clipped).items():
        if not fields.get(k):
            fields[k] = v

    return enforce_formats(fields)



# Run both model passes once and keep the results so later queries reuse them
enhanced_context = enhance_text(raw_ocr_text)
structured_fields = extract_fields(enhanced_context)

# QA context: the structured fields (stringified dict) followed by the cleaned text
qa_context = "".join(
    (
        "Structured fields:\n",
        str(structured_fields),
        "\n\n",
        "Clean text:\n",
        enhanced_context,
    )
)

print("--- Enhanced Context ---\n", enhanced_context)
print("\n--- Structured Fields ---\n", structured_fields)


--- Enhanced Context ---
 --- CLEANED TEXT ---
FakeDoc M.D.  
HEALTH INTAKE FORM  
Please fill out the questionnaire carefully. The information you provide will be used to complete your health profile and will be kept confidential.  

Date: 4/4/14  
Name: Da Walker  
DOB: 4/14/86  
Address: 24 Barnes Lane  
City: Josato  
State: I  
Zip: 7932  
Email: Sal.valed@gmail.com  
Phone #: (17) 393-80  
Gender: F  
Marital Status: Single  
Occupation: Software Engineer  
Referred By: AAA  
Emergency Contact: A. Aaver  
Emergency Contact Phone: (4) 5 334 1 0  

Describe your medical concerns (symptoms, diagnoses, etc):  
Pain and muscle issues in lower back.

--- Structured Fields ---
 {'name': 'Da Walker', 'dob': '4/14/86', 'address': '24 Barnes Lane', 'city': 'Josato', 'state': 'I', 'zip': '7932', 'phone': '1739380', 'email': 'sal.valed@gmail.com', 'gender': 'F', 'marital_status': 'Single', 'occupation': 'Software Engineer', 'emergency_contact_name': 'A. Aaver', 'emergency_contact_phone': '45

In [13]:
# Query the enhanced context repeatedly



def answer_query(question: str, context: str | None = None, max_tokens: int = 128) -> str:
    """Answer *question* using only the supplied form context.

    Args:
        question: The user's question.
        context: Text to answer from. When ``None`` (the default), the
            module-level ``qa_context`` is used.
        max_tokens: Reply token limit forwarded to ``chat``.

    Returns:
        The reply from ``chat``; the prompt asks for
        ``Not found in context.`` when the answer is missing.
    """
    # Only None selects the default. An explicitly passed empty string is
    # respected, so an empty document is not silently replaced by the global
    # context of a different form.
    ctx = qa_context if context is None else context
    prompt = (
        "Answer the user's question using only the provided context from a filled form. "
        "Prefer the structured fields; if a field is empty, you may cite the clean text. "
        "If the answer is missing, say 'Not found in context.'\n"
        f"Context:\n{ctx}\n"
        f"Question: {question}"
    )
    return chat(prompt, max_tokens=max_tokens)



# Example queries (edit as needed); each one is answered against the default
# qa_context because answer_query() is called without an explicit context.

example_questions = [

    "What is the customer's name?",

    "What is the date of birth?",

    "What is the emergency contact phone?",

]



# Print each question, then the model's answer, then a blank separator line.
for q in example_questions:

    print(f"Q: {q}")

    print("A:", answer_query(q))

    print()


Q: What is the customer's name?
A: The customer's name is Da Walker.

Q: What is the date of birth?
A: The date of birth is 4/14/86.

Q: What is the emergency contact phone?
A: The emergency contact phone is 4533410.

