# LLM post-processing for OCR text

- Uses a free Hugging Face model (google/flan-t5-large by default; configurable via `MODEL_NAME`) for cleaning and Q&A over OCR output.

- Install deps if needed: `pip install -q torch transformers accelerate sentencepiece` (`torch` is imported directly by the setup cell).

- Store enhanced text in `enhanced_context`; reuse it to answer multiple user queries.


In [1]:
# Optional installs (uncomment if not already installed)

# %pip install -q transformers accelerate sentencepiece



from transformers import pipeline

import torch



# Model selection: any free FLAN-T5 checkpoint can be used here; larger
# checkpoints give better quality but run slower. Examples:
#   - google/flan-t5-large  (better than small/base; needs ~3–5GB RAM)
#   - google/flan-t5-xl     (stronger; needs ~8–12GB RAM)
# flan-t5-large is the default for better accuracy.
MODEL_NAME = "google/flan-t5-large"

# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise
# (the recorded cell output shows -1 resulting in "Device set to use cpu").
DEVICE = -1 if not torch.cuda.is_available() else 0

# One shared pipeline instance, reused by both the cleaning and the Q&A cells.
llm = pipeline(
    task="text2text-generation",
    model=MODEL_NAME,
    tokenizer=MODEL_NAME,
    device=DEVICE,
)


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [5]:

# Sample OCR output of a filled-in health intake form, kept verbatim
# (recognition errors included); later cells pass it to enhance_text().
raw_ocr_text = """--- OCR TEXT ---
FakeDoc M.D.
HEALTH INTAKE FORM
Please fill out the questionnaire carefully. The information you provide will be used to complete
your health profile and will be kept confidential.
Date: 4 [ 4 : 14
d
Name: Da Walker DOB: dF | v4 [14 86
Address: 24 Barnes Lune City: Josato State: I Zip: 7932
Email: Sal}, val ed® Cmatl-t09 Phone #: ie) 17-3938 0
Gender: f Marital Status: S14 4 Le Occupation: Sol vure Ta Ancer
Referred By: AAA
Emergency Contact: = fA_Aaver Emergency Contact Phone: ( 4) 5 ) 334 dt lho
Describe your medical concerns (symptoms, diagnoses, etc):
Paan A%M®.e Mitas In lwo. wl phy ss
a /
ALACS Chile S bred
"""


In [None]:
# Enhance OCR text (cleanup + structured extraction) and store for reuse

from typing import Dict

import re



# Guarantee that raw_ocr_text is defined: when no earlier cell has set it,
# fall back to a placeholder the user is expected to replace.
if "raw_ocr_text" not in globals():
    raw_ocr_text = "<paste your OCR output here>"



def enhance_text(raw_text: str, max_new_tokens: int = 256) -> str:
    """Ask the shared ``llm`` pipeline to clean up raw OCR text from a filled form.

    Args:
        raw_text: OCR output to normalize.
        max_new_tokens: Generation budget forwarded to ``llm``.

    Returns:
        The generated text with surrounding whitespace stripped, or ``""``
        when ``raw_text`` is empty or whitespace-only (the model is not
        called in that case).
    """
    # With nothing to clean, anything the model generated would be invented
    # content, which the prompt itself forbids, so skip the call entirely.
    if not raw_text or not raw_text.strip():
        return ""

    prompt = (
        "Clean and normalize this OCR text from a filled form. "
        "Keep all fields, fix spacing/casing, and remove obvious OCR artifacts only. "
        "Do NOT invent values. Return the cleaned text.\n"
        f"OCR text:\n{raw_text}"
    )
    # do_sample=False turns sampling off for this call.
    generated = llm(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    return generated[0]["generated_text"].strip()



# Keys of the structured extraction result; every field defaults to "".
FIELD_SCHEMA = dict.fromkeys(
    (
        "name",
        "dob",
        "address",
        "city",
        "state",
        "zip",
        "phone",
        "email",
        "gender",
        "marital_status",
        "occupation",
        "emergency_contact_name",
        "emergency_contact_phone",
        "policy_number",
        "date",
    ),
    "",
)



def clean_value(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim separator characters.

    Spaces, commas, semicolons, colons and hyphens are removed from BOTH ends
    of the value (not only from the end).
    """
    return re.sub(r"\s+", " ", text).strip(" ,;:-")


# A field label that can follow the name on the same line of the form
# (e.g. "Name: Da Walker DOB: ..."); everything from the label on is dropped.
_LABEL_AFTER_NAME = re.compile(
    r"(?:^|\s+)(?:DOB|Date|Address|City|State|Zip|E-?mail|Phone|Gender|Marital"
    r"|Occupation|Referred|Emergency|Policy)\b.*",
    re.IGNORECASE | re.DOTALL,
)


def heuristic_extract(clean_text: str) -> Dict[str, str]:
    """Regex fallback that pulls name, DOB, email and phone out of form text.

    Args:
        clean_text: Form text to scan.

    Returns:
        A dict containing only the keys that were found, out of
        ``"name"``, ``"dob"``, ``"email"`` and ``"phone"``; each value is
        passed through ``clean_value``.
    """
    out: Dict[str, str] = {}

    # Name: the value must stay on one line, and the label must not be part of
    # "Contact Name" so the emergency contact is not taken for the patient.
    name_match = re.search(
        r"(?<!Contact )\bName[:\s]+([A-Za-z][A-Za-z \t.'-]{1,40})",
        clean_text,
        re.IGNORECASE,
    )
    dob_match = re.search(
        r"DOB[:\s]+([0-9]{1,2}[\-/][0-9]{1,2}[\-/][0-9]{2,4})",
        clean_text,
        re.IGNORECASE,
    )
    email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", clean_text)
    # Phone: accept a leading "(" as in "(555) 123-4567", keep the number on a
    # single line, and skip "Contact Phone" (that is the emergency number).
    phone_match = re.search(
        r"(?<!Contact )Phone[:\s#]*([+(\d][\d \t()\-]{7,20})",
        clean_text,
        re.IGNORECASE,
    )

    if name_match:
        # The name character class also matches the letters of the next
        # label, so cut the capture at the first known label word.
        name = clean_value(_LABEL_AFTER_NAME.sub("", name_match.group(1)))
        if name:
            out["name"] = name
    if dob_match:
        out["dob"] = clean_value(dob_match.group(1))
    if email_match:
        out["email"] = clean_value(email_match.group(0))
    if phone_match:
        out["phone"] = clean_value(phone_match.group(1))
    return out



def _parse_json_object(text: str):
    """Return the JSON object contained in *text* as a dict, or None if there is none."""
    import json

    candidates = [text]
    # The model may wrap the object in extra words; also try the outermost
    # "{...}" span on its own.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def extract_fields(clean_text: str, max_new_tokens: int = 256) -> Dict[str, str]:
    """Extract the ``FIELD_SCHEMA`` fields from cleaned form text.

    The shared ``llm`` pipeline is asked for a JSON object; whatever it does
    not provide is filled in from ``heuristic_extract``.

    Args:
        clean_text: Cleaned form text.
        max_new_tokens: Generation budget forwarded to ``llm``.

    Returns:
        A dict with every ``FIELD_SCHEMA`` key (``""`` when unknown). When the
        model output contains no JSON object, the stripped output is also
        stored under ``"raw_extraction"``. For empty or whitespace-only input
        the model is not called and all fields are ``""``.
    """
    fields = {key: "" for key in FIELD_SCHEMA}
    if not clean_text or not clean_text.strip():
        return fields

    schema_lines = "\n".join(f"- {key}" for key in FIELD_SCHEMA)
    prompt = (
        "Extract the following fields from the cleaned OCR form text. "
        "Respond ONLY as a single-line JSON object with exactly these keys. "
        "If a field is missing, use an empty string. Do NOT add text outside JSON.\n"
        f"Fields:\n{schema_lines}\n"
        f"Clean text:\n{clean_text}"
    )
    result = llm(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    text = result.strip()

    parsed = _parse_json_object(text)
    if parsed is None:
        # Keep the unparseable output so the caller can inspect it.
        fields["raw_extraction"] = text
    else:
        for key in FIELD_SCHEMA:
            value = parsed.get(key)
            # Numbers are valid JSON for fields such as zip; keep them as
            # text instead of dropping them. bool is excluded explicitly
            # because it is a subclass of int.
            if isinstance(value, bool) or not isinstance(value, (str, int, float)):
                continue
            fields[key] = clean_value(str(value))

    # Fill whatever is still empty from the regex heuristics.
    for key, value in heuristic_extract(clean_text).items():
        if not fields.get(key):
            fields[key] = value
    return fields



# One-time pass: clean the OCR text, then pull structured fields from the
# cleaned version. Both results are kept for the query cell.
enhanced_context = enhance_text(raw_ocr_text)
structured_fields = extract_fields(enhanced_context)

# QA context: the structured fields first, followed by the cleaned text.
qa_context = f"Structured fields:\n{structured_fields}\n\nClean text:\n{enhanced_context}"

print(f"--- Enhanced Context ---\n {enhanced_context}")
print(f"\n--- Structured Fields ---\n {structured_fields}")


In [None]:
# Query the enhanced context repeatedly



def answer_query(question: str, context: str | None = None, max_new_tokens: int = 128) -> str:

    ctx = context or qa_context

    prompt = (

        "Answer the user's question using only the provided context from a filled form. "

        "Prefer the structured fields; if a field is empty, you may cite the clean text. "

        "If the answer is missing, say 'Not found in context.'\n"

        f"Context:\n{ctx}\n"

        f"Question: {question}"

    )

    result = llm(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]

    return result.strip()



# Sample questions asked against the stored context (edit as needed)
example_questions = [
    "What is the customer's name?",
    "What is the policy number?",
    "What is the date of birth?",
    "What is the emergency contact phone?",
]

# Print each question before its answer is computed, then a blank separator.
for q in example_questions:
    print("Q:", q)
    print(f"A: {answer_query(q)}")
    print()


Q: What is the customer's name?
A: Da Walker DOB: dF | v4 [14 86 Address: 24 Barnes Lune City: Josato State: I Zip: 7932 Email: Sal, val ed® Cmatl-t09 Phone #: ie) 17-3938 0 Gender: f Marital Status: S14 4 Le Occupation: Sol vure Ta Ancer Referred By: AAA Emergency Contact: = fA_Aaver Emergency Contact Phone: ( 4) 5 ) 334 dt lho

Q: What is the policy number?


KeyboardInterrupt: 