# Tesseract OCR for Filled Forms

- Install the Tesseract engine (Windows installer) and note the `tesseract.exe` path.

- If needed, `pip install pytesseract opencv-python pillow numpy pdf2image` in this environment.

- For PDFs, install Poppler for Windows and set `POPPLER_PATH` to its `bin` folder if not on PATH.

- Update the `TESSERACT_CMD` variable below if Tesseract is not on PATH.

- The pipeline: load image/PDF → clean (grayscale, denoise, threshold) → OCR with layout-friendly PSM/OEM values.


In [14]:
# Library imports and Tesseract/Poppler path config

from pathlib import Path

import cv2

import numpy as np

import pytesseract

from PIL import Image

from pdf2image import convert_from_path



# Point this to your Tesseract executable if it's not already on PATH.

TESSERACT_CMD = r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

if Path(TESSERACT_CMD).exists():

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD



# Poppler path for PDF to image conversion (set to your extracted poppler\bin directory)

# Example for your download: C:\\Users\\tsheikh\\Downloads\\Release-25.12.0-0\\poppler-25.12.0\\Library\\bin

POPPLER_PATH = r"C:\\Users\\tsheikh\\Downloads\\Release-25.12.0-0\\poppler-25.12.0\\Library\\bin"

if POPPLER_PATH and not Path(POPPLER_PATH).exists():

    POPPLER_PATH = None  # fall back to PATH if not found



def assert_tesseract_available() -> None:

    """Raise a helpful error if Tesseract is missing."""

    cmd = Path(pytesseract.pytesseract.tesseract_cmd or "tesseract")

    if not cmd.exists() and cmd.name == "tesseract":

        raise FileNotFoundError(

            "Tesseract not found. Install it and/or set TESSERACT_CMD to the exe path."

        )

    if cmd.exists():

        return

    raise FileNotFoundError(f"Tesseract binary not found at: {cmd}")


In [15]:
# Image loading and preprocessing helpers

def load_image(path: str) -> np.ndarray:

    img_path = Path(path)

    if not img_path.exists():

        raise FileNotFoundError(f"Image not found: {img_path}")

    # cv2.imread loads in BGR

    image = cv2.imread(str(img_path))

    if image is None:

        raise ValueError(f"Failed to read image: {img_path}")

    return image



def preprocess_image(image: np.ndarray) -> np.ndarray:

    """Lightweight cleanup tuned for filled forms (handwritten + printed)."""

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Reduce noise while keeping edges

    denoised = cv2.fastNlMeansDenoising(gray, h=15)

    # Adaptive threshold to separate ink from paper

    thresh = cv2.adaptiveThreshold(

        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 15

    )

    # Slight dilation to strengthen thin strokes

    kernel = np.ones((2, 2), np.uint8)

    processed = cv2.dilate(thresh, kernel, iterations=1)

    return processed


In [16]:
# Document loader (images or PDFs)

def load_document_pages(path: str) -> list[np.ndarray]:

    doc_path = Path(path)

    if not doc_path.exists():

        raise FileNotFoundError(f"Document not found: {doc_path}")



    if doc_path.suffix.lower() == ".pdf":

        # convert_from_path returns list of PIL Images (RGB). Convert to OpenCV BGR.

        pil_pages = convert_from_path(doc_path, poppler_path=POPPLER_PATH)

        pages = []

        for pil_img in pil_pages:

            rgb = np.array(pil_img)

            bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            pages.append(bgr)

        return pages



    # Otherwise treat as image

    return [load_image(str(doc_path))]


In [17]:
# OCR wrappers for single images and multi-page docs

def run_ocr_on_image(

    image: np.ndarray,

    lang: str = "eng",

    psm: int = 6,

    oem: int = 3,

    extra_config: str | None = None,

) -> dict:

    assert_tesseract_available()

    preprocessed = preprocess_image(image)



    config_parts = [f"--psm {psm}", f"--oem {oem}"]

    if extra_config:

        config_parts.append(extra_config)

    config = " ".join(config_parts)



    text = pytesseract.image_to_string(preprocessed, lang=lang, config=config)

    data = pytesseract.image_to_data(

        preprocessed, lang=lang, config=config, output_type=pytesseract.Output.DICT

    )



    return {

        "text": text.strip(),

        "raw_data": data,

        "config_used": config,

    }





def extract_text(

    image_path: str,

    lang: str = "eng",

    psm: int = 6,

    oem: int = 3,

    extra_config: str | None = None,

) -> dict:

    """

    OCR for a single image file (kept for backward compatibility).

    """

    image = load_image(image_path)

    return run_ocr_on_image(image, lang=lang, psm=psm, oem=oem, extra_config=extra_config)





def extract_text_document(

    doc_path: str,

    lang: str = "eng",

    psm: int = 6,

    oem: int = 3,

    extra_config: str | None = None,

) -> dict:

    """

    OCR for PDFs or images. Returns per-page results and combined text.

    """

    pages = load_document_pages(doc_path)

    results = []

    for idx, page in enumerate(pages, start=1):

        page_result = run_ocr_on_image(page, lang=lang, psm=psm, oem=oem, extra_config=extra_config)

        results.append({"page": idx, **page_result})



    combined_text = "\n\n".join(r["text"] for r in results)

    return {"pages": results, "combined_text": combined_text, "page_count": len(pages)}


In [21]:
# Example usage: pass either an image or PDF path

from pprint import pprint



# Update this to your actual file (PDF or image)

document_path = "C:/Users/tsheikh/Downloads/Sample filled form - Onlinesbi.pdf"



if Path(document_path).exists():

    if Path(document_path).suffix.lower() == ".pdf":

        doc_result = extract_text_document(document_path, psm=6, oem=3, lang="eng")

        print(f"Pages processed: {doc_result['page_count']}")

        for page_info in doc_result["pages"]:

            print(f"\n--- OCR TEXT (page {page_info['page']}) ---")

            print(page_info["text"])

        print("\n--- FIRST PAGE PER-WORD DATA (first 5 rows) ---")

        preview = {k: v[:5] for k, v in doc_result["pages"][0]["raw_data"].items()}

        pprint(preview)

    else:

        result = extract_text(document_path, psm=6, oem=3, lang="eng")

        print("--- OCR TEXT ---")

        print(result["text"])

        print("\n--- PER-WORD DATA (first 5 rows) ---")

        preview = {k: v[:5] for k, v in result["raw_data"].items()}

        pprint(preview)

else:

    print("Update document_path to a valid image or PDF before running.")


Pages processed: 2

--- OCR TEXT (page 1) ---
Sample filled form
St No. EZOOQOOT
dia
QEneers [Noocme [SS—*d
With sow «
+ Help Desk: 1800 22 0488 (Toll free for MTNUBSNL users) / Ph.: 022-26592123 * Fax : 022-26592127 » Email : querydp@sbi.co.In
eZ-Instruction Registration Form
To,
STATE BANK OF INDIA dee 13 11 10 1132 [01019
Depository Participant Centralised Processing Cell,
CMC House, C-18, Bandra-Kurla Complex,
Bandra (East), Mumbai 400 051
eorcient'© | 1]3 }o [119] 3]o Joo fo jolo (4 [3 [2 [3]
1} We have submitted form for opening a Demat Account with you. IAve wish to register for e2-Instruction tAve wish to avail mysetf/ourselves of the facility to
give Instruction through Internet and/or any means of cammunicabons as intimated by SBI from time to time to tansfer secunties from the above mentioned
source account to the following target accounts.
Sr.No Target DP -IO /CM-BP.ID Target DP Name # CM Name Target Account Number
(Not applicable fof Clearing Member)
a 12017200 00000001
IN

## Tips for better accuracy

- Try `psm` values: 4 (sparse text), 6 (blocks), 11 (single line), 12 (handwriting). 

- For multilingual forms, set `lang="eng+ara"` (or other traineddata you installed).

- If forms are skewed, deskew first with OpenCV (Hough lines or moments) before calling `extract_text`.

- To process many files, loop over a folder and call `extract_text` per image; save `raw_data` to CSV for auditing.
