In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from google.cloud import vision

def pdf_to_images(pdf_path, output_folder, dpi=300):
    """
    Converts a PDF to images for OCR processing.
    """
    pdf_document = fitz.open(pdf_path)
    image_paths = []

    for i in range(len(pdf_document)):
        page = pdf_document[i]
        pix = page.get_pixmap(dpi=dpi)
        image_path = f"{output_folder}/page_{i+1}.png"
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        img.save(image_path, format="PNG")
        image_paths.append(image_path)

    return image_paths

def extract_text_tesseract(image_path):
    """
    Extracts printed and structured text using Tesseract OCR.
    """
    img = Image.open(image_path)
    return pytesseract.image_to_string(img, config="--psm 6")

def extract_text_google_vision(image_path):
    """
    Extracts handwritten text using Google Vision API.
    """
    client = vision.ImageAnnotatorClient()
    with io.open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)

    return response.full_text_annotation.text

def process_pdf(pdf_path, output_folder):
    """
    Extracts both typed and handwritten text from a PDF.
    """
    images = pdf_to_images(pdf_path, output_folder)
    extracted_data = {}

    for img_path in images:
        print(f"Processing {img_path}...")
        typed_text = extract_text_tesseract(img_path)
        handwritten_text = extract_text_google_vision(img_path)

        extracted_data[img_path] = {
            "typed_text": typed_text,
            "handwritten_text": handwritten_text
        }

    return extracted_data

# Example Usage
pdf_path = ""
output_folder = "output_images"
extracted_text = process_pdf(pdf_path, output_folder)

# Print the extracted text
import json
print(json.dumps(extracted_text, indent=4, ensure_ascii=False))


{}


In [2]:
!pip install google-cloud-vision
!pip install --upgrade pip




In [3]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/regal-groove-451305-h1-6029feec77cc.json"


In [4]:
from google.cloud import vision

client = vision.ImageAnnotatorClient()
print("Google Vision API is successfully authenticated!")


Google Vision API is successfully authenticated!


In [5]:
!pip install torch torchvision torchaudio transformers accelerate
!pip install Pillow numpy requests tqdm matplotlib




In [6]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch
from PIL import Image

# Load LLaVA Model
model_name = "liuhaotian/llava-v1.5-7b"  # Change to "llava-v1.5-13b" for higher accuracy
processor = AutoProcessor.from_pretrained(model_name)
model = LlavaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Load a Sample Image
image_path = "sample.jpg"  # Replace with an actual document image
image = Image.open(image_path).convert("RGB")

# Define Prompt
prompt = "Extract all handwritten and printed text from this document."

# Process Image with LLaVA
inputs = processor(prompt, image, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# Generate Response
output = model.generate(**inputs)
print(processor.decode(output[0], skip_special_tokens=True))


OSError: liuhaotian/llava-v1.5-7b does not appear to have a file named preprocessor_config.json. Checkout 'https://huggingface.co/liuhaotian/llava-v1.5-7b/tree/main' for available files.

In [7]:
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import os
import json
import ollama
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

def pdf_to_image(pdf_path, output_folder, page_number=None, dpi=300):
    """
    Converts a PDF page (or all pages) to images for OCR processing.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    pdf_document = fitz.open(pdf_path)
    image_paths = []

    pages = [page_number - 1] if page_number else range(len(pdf_document))
    for i in pages:
        page = pdf_document[i]
        pix = page.get_pixmap(dpi=dpi)
        image_path = os.path.join(output_folder, f"page_{i+1}.png")
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        img.save(image_path, format="PNG")
        image_paths.append(image_path)

    return image_paths

def preprocess_image(image_path):
    """
    Loads and preprocesses the image to improve OCR accuracy.
    """
    img = Image.open(image_path).convert("L")
    img = ImageEnhance.Contrast(img).enhance(2)
    return img

def extract_text_tesseract(image_path):
    """
    Extracts printed and handwritten text using Tesseract OCR.
    """
    img = preprocess_image(image_path)
    text = pytesseract.image_to_string(img, config="--psm 6")
    return text

def detect_checkboxes(image_path):
    """
    Detects checkboxes and determines if they are checked or unchecked.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    _, binary = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    checkbox_results = {}
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if 0.8 <= w / float(h) <= 1.2 and 12 <= w <= 35 and 12 <= h <= 35:
            roi = binary[y:y+h, x:x+w]
            filled_ratio = cv2.countNonZero(roi) / float(w * h)
            status = "Checked" if filled_ratio > 0.3 else "Unchecked"
            checkbox_results[f"Checkbox at ({x},{y})"] = status

    return checkbox_results

import json

def extract_text_llava(image_path):
    """
    Uses LLaVA (Large Language and Vision Assistant) for structured document extraction.
    """
    response = ollama.generate(
        model="llava:7b",
        prompt="Extract all structured information, including text, handwritten fields, checkboxes, and labels from this image in JSON format.",
        images=[image_path]
    )

    # Ensure response exists
    if not response or "response" not in response or not response["response"].strip():
        return {"error": "No response from LLaVA"}

    # Try to parse JSON, otherwise return an error
    try:
        return json.loads(response["response"])
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format from LLaVA"}


def extract_text_donut(image_path):
    """
    Uses Donut (Vision-Language Model) for more accurate handwritten and text extraction.
    """
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
    img = Image.open(image_path).convert("RGB")
    pixel_values = processor(img, return_tensors="pt").pixel_values

    with torch.no_grad():
        output = model.generate(pixel_values)
    extracted_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    return extracted_text

def extract_structured_data(image_path):
    """
    Extracts text, checkboxes, and handwritten text dynamically using multiple models.
    """
    text_tesseract = extract_text_tesseract(image_path)
    checkboxes = detect_checkboxes(image_path)
    text_llava = extract_text_llava(image_path)
    text_donut = extract_text_donut(image_path)

    structured_data = {
        "ocr_text_tesseract": text_tesseract if text_tesseract else "No data extracted",
        "checkboxes": checkboxes if checkboxes else {},
        "llava_extracted_data": text_llava if text_llava else {"error": "LLaVA extraction failed"},
        "vision_language_model_data": text_donut if text_donut else "No data extracted"
    }

    return structured_data

def process_document():
    """
    Prompts user for input, processes a PDF or image file, extracts structured data, and saves as JSON.
    """
    file_path = input("Enter the path of the PDF or image file: ").strip()
    output_folder = "output_images"
    page_number = input("Enter page number to extract (or press Enter for all pages): ").strip()
    page_number = int(page_number) if page_number else None

    try:
        if file_path.lower().endswith(".pdf"):
            image_paths = pdf_to_image(file_path, output_folder, page_number)
        else:
            image_paths = [file_path]

        extracted_data = {}
        for img_path in image_paths:
            extracted_data[img_path] = extract_structured_data(img_path)

        # Convert to JSON safely
        try:
            json_output = json.dumps(extracted_data, indent=4, ensure_ascii=False)
        except Exception as json_error:
            print(f"Error converting extracted data to JSON: {json_error}")
            return

        # Print and save output
        print(json_output)
        try:
            with open("extracted_text.json", "w", encoding="utf-8") as f:
                f.write(json_output)
        except Exception as file_error:
            print(f"Error saving JSON file: {file_error}")
            return

        return json_output

    except Exception as e:
        print(f"An error occurred: {e}")
        return

# Run the script
if __name__ == "__main__":
    process_document()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    2560,
    1920
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_

{
    "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Extracting-handwritten-information-2.jpg": {
        "ocr_text_tesseract": "SPONSOR _‘ Strectear Musee 7 caTEcory_$@? fw\nFO. box deri Baer MD Zz y aaa\nSPONSOR'S MAILING ADDRESS _190/ Falls RD BAT mp ZIP Dizi _ __\ncontact PeRsO\\_ ro\nBOX OFFICE PHONE\nWEB ADDRESS_ a E-MAIL ADDRESS\nCODE DATES OF EVENT NO. OF ADMISSIONS. VOLUNTEER'S mitrars\nS44 “Sly aype OF EVENT (circle one) LOCATION OF EVENT\na us€ Concert Musical\n— f Dance Opera Pxlt, Mas cr Vrain\n~ - Fm Play\nST  oulidea tour Recital address (90S alle Pol.\n— ~C*‘«é@d inner There (Tour?\na —~—«—tzher outy Bldg. a\nas 4 TIME: 7AM PM\nge Yo? . _ 4\nrime Ferker 9 The Kale Norn - TM Lieu Stanley |\n9. Pa EZ =F,\nDESCRIPTION__ yp SPS: Tune 5 - fet, Satur dega\n50 -0TF wah SBC,\n- Qi Cardin nth A Pine,\nShans the Foun, 16 an Misatear (eka ene Por OY 7\nYaw fener ALk Veggies oo ple SO gat thy Vinabin Cover\nAisin S 80 mee aa $6 % bf Senda” bate, ne paraee, (ALSO fdr pe

In [15]:
import cv2
import pytesseract
import numpy as np
from PIL import Image

# Set Tesseract Path if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Uncomment for Windows

def preprocess_image(image_path):
    """Preprocess image to enhance text recognition"""
    # Load image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    
    # Apply Gaussian Blur to reduce noise
    blurred = cv2.GaussianBlur(image, (5, 5), 0)
    
    # Adaptive thresholding for better contrast
    thresholded = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    
    # Apply morphological operations to enhance text features
    kernel = np.ones((2, 2), np.uint8)
    processed_image = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, kernel)
    
    return processed_image

def extract_handwritten_text(image_path):
    """Extract handwritten text from image using Tesseract OCR"""
    processed_image = preprocess_image(image_path)
    
    # OCR configuration optimized for handwriting
    custom_config = "--oem 3 --psm 6"
    extracted_text = pytesseract.image_to_string(processed_image, config=custom_config)
    
    return extracted_text

# Example Usage
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"  # Ensure correct path
extracted_text = extract_handwritten_text(image_path)
print("Extracted Handwritten Text:\n", extracted_text)


Extracted Handwritten Text:
 13 Sours of Income Please sedect, |v] as eppicadie
Seay | . Caphat Gains
frcome Eom Dusinets /Profetsion — ushesuProfetsion code [_[_] (For cous: Reter ratctona} Income from Omer sources
Ircome from Houte property . : No income

134 Representative Assacese(RA} 0” . . . i
Ful nama, address of the Repeasentatve Assesses, who i assessible under the Income Tax Act In respect of the person, whose paticutars eve:
Deen piven inthe colun £13, .

++ Full Masse (Full expanded name: Initials are not permitted) :
Prasessiocttie [Jes arpucatta [T]sxa []sme [furan [7 )sa. .
Last Nama }Sumama + SRR
a OD
Made kame COery rrr ry rrr ree a a

«Fist Reom! Dost Block Ho. (DloloIRT IWoy- Ti qul-1Gr-T_ Ts TRI RIETE Te TAIN fe]

_ |) Mareotprinises rowcg ivi ISTH TID UIST TAIL, LT hw. TTT PTA}
Rood Steet LanesPost Orica rhitl IefomMioLT che men aT Ett tt
jranteabyiTaawso-ortin CITT TREN MEM ILININIEL | TEIGMIOMel L} -
Town f Cry fist [Tier tri rey er ri rer etri ii e
jen Prncooe.


In [18]:
!pip install transformers torchvision timm pytorch-lightning


Collecting timm
  Downloading timm-1.0.14-py3-none-any.whl.metadata (50 kB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.12.0-py3-none-any.whl.metadata (5.6 kB)
Downloading timm-1.0.14-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytorch_lightning-2.5.0.post0-py3-none-any.whl (819 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.12.0-py3-none-any.whl (28 kB)
Downloading torchmetrics-1.6.1-py3-none-any.whl (927 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927

In [2]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Use a publicly available model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

print("Model loaded successfully!")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    1280,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_v

Model loaded successfully!


In [3]:
!pip install transformers torch torchvision timm pillow




In [11]:
!pip install pytesseract opencv-python pdf2image numpy pillow




In [13]:
import cv2
import pytesseract
from PIL import Image
import numpy as np

# Load the image
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# Preprocess: Increase contrast and remove noise
image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# Optional: Denoising
image = cv2.fastNlMeansDenoising(image, h=10)

# Save and load with PIL for Tesseract
preprocessed_image_path = "temp_processed.jpg"
cv2.imwrite(preprocessed_image_path, image)
image = Image.open(preprocessed_image_path)

# Run Tesseract OCR
custom_config = r'--oem 3 --psm 6'  # OCR Engine Mode (OEM) and Page Segmentation Mode (PSM)
extracted_text = pytesseract.image_to_string(image, config=custom_config)

print("\n📜 Extracted Text:\n", extracted_text)



📜 Extracted Text:
 12 Soures of Income _ Please select,| ~| as applicable
salay Capital Gains
Income from Business / Profession Suslness/Professton code [__|_] {For Code: Refer instructions} Income from Other sources
Income from House property No Income
44 Roprnsantative Asseasce (RA)
Full name, address of the Representative Assessee, who Is assessible under the income Tax Actin respect of the person, whose particulars have
‘been given in the cokimn 1-13.
Full Name {Full expanded name : Initials are not permittad)
mreaso soc te [Y/]a0 spotcable [| ]sna _[_]smt.  []xuman_[_]ois .
taatname: Surana CPT rrr r TT PTT tre rrr et rt
cet Na TPT Tyrer rrr yr er
Middle Nore CETErTryTrrT rt yr rrp eer
Address
Flat Roarn/ Door Black No. DlofoRT Wier] Alu IBE [STARTED TANT E|
Name of Premises Butcing vilogs STAT LVINIDIUISTIRIVIAILT INE T hw. Trl TIP IAI
Road Street /Lane/Post Offce THT WRloiW Os] Ice me TA TTT | TTT |
Aroa/Localiy Teka! Sup-Onisen FE[V TT URTEIINTERT LIAIWIFL> | TElemfoliet | |
T

In [1]:
import os
import cv2
import pytesseract
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load TrOCR model (for handwriting recognition)
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
trocr_model.to(device)

# **1️⃣ Fix Image Path**
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/images.jpeg"
if not os.path.exists(image_path):
    raise FileNotFoundError(f"Error: The file {image_path} does not exist!")

# **2️⃣ Load & Validate Image**
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    raise ValueError(f"❌ Error: Unable to load image from {image_path}")

# **3️⃣ Preprocess Image for OCR**
image_tesseract = cv2.adaptiveThreshold(
    image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)

# **4️⃣ Save Preprocessed Image & Verify**
preprocessed_image_path = "temp_processed.jpg"
cv2.imwrite(preprocessed_image_path, image_tesseract)

if not os.path.exists(preprocessed_image_path):
    raise FileNotFoundError(f"❌ Preprocessed image not saved at {preprocessed_image_path}")

print(f"✅ Preprocessed image saved successfully at {preprocessed_image_path}")

# **5️⃣ Reload Image for OCR**
image_pil = Image.open(preprocessed_image_path).convert("RGB")

# **6️⃣ Extract Text using Tesseract OCR**
custom_config = r'--oem 3 --psm 6'
tesseract_text = pytesseract.image_to_string(image_pil, config=custom_config)

# **7️⃣ Use TrOCR to Enhance Handwritten Text Recognition**
pixel_values_trocr = trocr_processor(image_pil, return_tensors="pt").pixel_values.to(device)
with torch.no_grad():
    generated_ids_trocr = trocr_model.generate(pixel_values_trocr)
    extracted_text_trocr = trocr_processor.batch_decode(generated_ids_trocr, skip_special_tokens=True)[0]

# **8️⃣ Convert Extracted Text to Markdown Format**
markdown_text = (
    "# Extracted Markdown from Handwritten Text\n\n"
    "## 📝 Tesseract OCR Output:\n"
    "```\n"
    f"{tesseract_text.strip()}\n"
    "```\n\n"
    "## ✍ TrOCR Output (Enhanced Handwriting Recognition):\n"
    "```\n"
    f"{extracted_text_trocr.strip()}\n"
    "```\n"
)

# **9️⃣ Save Extracted Text as Markdown**
markdown_path = "extracted_text.md"
with open(markdown_path, "w", encoding="utf-8") as md_file:
    md_file.write(markdown_text)

print(f"\n✅ OCR completed! Extracted text saved as `{markdown_path}`")


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_tro

✅ Preprocessed image saved successfully at temp_processed.jpg

✅ OCR completed! Extracted text saved as `extracted_text.md`


In [2]:
from ultralytics import YOLO
import cv2
import numpy as np
from paddleocr import PaddleOCR
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load YOLOv8 model (pre-trained on forms)
yolo_model = YOLO("yolov8n.pt")  # Smallest YOLOv8 model, you can use "yolov8m.pt" for better accuracy

# Load OCR models
paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en")
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Move TrOCR model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
trocr_model.to(device)


Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...
⚠️ Download failure, retrying 1/3 https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt...


######################################################################## 100.0%


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /Users/akhsrip/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:13<00:00, 295.27it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /Users/akhsrip/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:13<00:00, 745.32it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /Users/akhsrip/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:02<00:00, 924.17it/s] 


[2025/02/18 16:17:39] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/akhsrip/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/akhsrip/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_tex

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-23): 24 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dens

In [3]:
def detect_boxes(image_path):
    """
    Uses YOLOv8 to detect boxed regions containing handwritten text.
    """
    image = cv2.imread(image_path)
    results = yolo_model(image)

    boxes = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Get bounding box coordinates
            boxes.append((x1, y1, x2, y2))
    
    return boxes


In [4]:
def extract_handwritten_text(image_path, boxes):
    """
    Extracts handwritten text from detected boxed regions using PaddleOCR + TrOCR.
    """
    image = Image.open(image_path).convert("RGB")
    image_cv2 = cv2.imread(image_path)

    extracted_texts = []

    for idx, (x1, y1, x2, y2) in enumerate(boxes):
        cropped_image = image.crop((x1, y1, x2, y2))
        
        # PaddleOCR for handwriting
        paddle_result = paddle_ocr.ocr(np.array(cropped_image), cls=True)
        paddle_text = " ".join([word[1][0] for line in paddle_result for word in line])

        # TrOCR for handwriting
        pixel_values = trocr_processor(cropped_image, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            generated_ids = trocr_model.generate(pixel_values)
            trocr_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        extracted_texts.append({
            "box_id": idx + 1,
            "coordinates": (x1, y1, x2, y2),
            "paddleocr_text": paddle_text,
            "trocr_text": trocr_text
        })
    
    return extracted_texts


In [5]:
if __name__ == "__main__":
    image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"  # Replace with your form image

    # Detect boxed fields
    boxes = detect_boxes(image_path)
    print(f"🟩 Detected {len(boxes)} boxed regions.")

    # Extract text from boxes
    results = extract_handwritten_text(image_path, boxes)

    # Print results
    for res in results:
        print(f"\n📌 Box {res['box_id']} at {res['coordinates']}:")
        print(f"📝 PaddleOCR: {res['paddleocr_text']}")
        print(f"✍ TrOCR: {res['trocr_text']}")



0: 384x640 1 laptop, 42.8ms
Speed: 3.2ms preprocess, 42.8ms inference, 5.3ms postprocess per image at shape (1, 3, 384, 640)
🟩 Detected 1 boxed regions.
[2025/02/18 16:19:43] ppocr DEBUG: dt_boxes num : 41, elapsed : 0.25136494636535645
[2025/02/18 16:19:43] ppocr DEBUG: cls num  : 41, elapsed : 0.20580506324768066
[2025/02/18 16:19:50] ppocr DEBUG: rec_res num  : 41, elapsed : 7.159555196762085

📌 Box 1 at (9, 0, 894, 520):
📝 PaddleOCR: 13 Source of Income Please select, as applicable Salary Capital Gains Income from Business/Profession Business/Profession code [For Code:Refer instructions] Income from Other sources Income from House property No income 14 Representative Assessee (RA) Full name,address of the Representative Assessee,who is assessible under the Income Tax Act in respect of the person. whose particulars have been given in the column 1-13 Full Name (Full expanded name : initials are not permitted) Please select titeas applicable Shri Smt. Kumari M/s Last Name/Sumame Firs

In [6]:
!pip install ultralytics paddleocr transformers numpy opencv-python pytorch torchvision


Collecting pytorch
  Downloading pytorch-1.0.2.tar.gz (689 bytes)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pytorch
  Building wheel for pytorch (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for pytorch [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[23 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 389, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_p

In [7]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu


In [8]:
import os
import cv2
import numpy as np
import torch
import json
from ultralytics import YOLO
from paddleocr import PaddleOCR
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# **1️⃣ Load Models**
# Load YOLOv8 model (for detecting boxed text fields)
yolo_model = YOLO("yolov8n.pt")  # You can use "yolov8m.pt" for better accuracy

# Load PaddleOCR (optimized for handwriting detection)
paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en")

# Load TrOCR (handwriting recognition model)
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Move TrOCR model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
trocr_model.to(device)


# **2️⃣ Function to Detect Boxes Using YOLO**
def detect_boxes(image_path):
    """
    Uses YOLOv8 to detect boxed regions containing handwritten text.
    """
    image = cv2.imread(image_path)
    results = yolo_model(image)

    boxes = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])  # Get bounding box coordinates
            boxes.append((x1, y1, x2, y2))
    
    return boxes


# **3️⃣ Function to Extract Handwritten Text from Detected Boxes**
def extract_handwritten_text(image_path, boxes):
    """
    Extracts handwritten text from detected boxed regions using PaddleOCR + TrOCR.
    """
    image = Image.open(image_path).convert("RGB")
    extracted_texts = []

    for idx, (x1, y1, x2, y2) in enumerate(boxes):
        cropped_image = image.crop((x1, y1, x2, y2))
        
        # PaddleOCR for handwriting
        paddle_result = paddle_ocr.ocr(np.array(cropped_image), cls=True)
        paddle_text = " ".join([word[1][0] for line in paddle_result for word in line])

        # TrOCR for handwriting
        pixel_values = trocr_processor(cropped_image, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            generated_ids = trocr_model.generate(pixel_values)
            trocr_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        extracted_texts.append({
            "box_id": idx + 1,
            "coordinates": (x1, y1, x2, y2),
            "paddleocr_text": paddle_text,
            "trocr_text": trocr_text
        })
    
    return extracted_texts


# **4️⃣ Run Full Pipeline**
if __name__ == "__main__":
    image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"  # Replace with your form image

    # Detect boxed fields
    boxes = detect_boxes(image_path)
    print(f"🟩 Detected {len(boxes)} boxed regions.")

    # Extract text from boxes
    results = extract_handwritten_text(image_path, boxes)

    # Print results
    for res in results:
        print(f"\n📌 Box {res['box_id']} at {res['coordinates']}:")
        print(f"📝 PaddleOCR: {res['paddleocr_text']}")
        print(f"✍ TrOCR: {res['trocr_text']}")

    # Save extracted data as JSON
    output_file = "handwritten_text_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Extracted text saved in {output_file}")


[2025/02/18 16:27:16] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/akhsrip/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/akhsrip/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_tex

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod


0: 384x640 1 laptop, 31.7ms
Speed: 2.2ms preprocess, 31.7ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
🟩 Detected 1 boxed regions.
[2025/02/18 16:27:29] ppocr DEBUG: dt_boxes num : 41, elapsed : 0.2421889305114746
[2025/02/18 16:27:29] ppocr DEBUG: cls num  : 41, elapsed : 0.20537424087524414
[2025/02/18 16:27:36] ppocr DEBUG: rec_res num  : 41, elapsed : 7.0373570919036865

📌 Box 1 at (9, 0, 894, 520):
📝 PaddleOCR: 13 Source of Income Please select, as applicable Salary Capital Gains Income from Business/Profession Business/Profession code [For Code:Refer instructions] Income from Other sources Income from House property No income 14 Representative Assessee (RA) Full name,address of the Representative Assessee,who is assessible under the Income Tax Act in respect of the person. whose particulars have been given in the column 1-13 Full Name (Full expanded name : initials are not permitted) Please select titeas applicable Shri Smt. Kumari M/s Last Name/Sumame Firs

In [5]:
!pip install easyocr


Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.9 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.3-py3-none-macosx_10_9_universal2.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.3-py3-none-macosx_10_9_universal2.whl (279 kB)
Downloading python_bidi-0.6.6-cp312-cp312-macosx_11_0_arm64.whl (262 kB)
Installing collected packages: python-bidi, ninja, easyocr
Successfully installed easyocr-1.7.2 ninja-1.11.1.3 python-bidi-0.6.6


In [6]:
import cv2
import numpy as np
import easyocr
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from pdf2image import convert_from_path
import logging
from typing import List, Union, Dict, Optional
import os
from pathlib import Path

class ImprovedTextExtractor:
    def __init__(self, languages=['en']):
        """
        Initialize the text extractor with EasyOCR and TrOCR.
        
        Args:
            languages: List of language codes for EasyOCR
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # Initialize EasyOCR for printed text
        self.logger.info("Initializing EasyOCR...")
        self.reader = easyocr.Reader(languages, gpu=torch.cuda.is_available())
        
        # Initialize TrOCR for handwritten text
        self.logger.info("Loading TrOCR model...")
        self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
        self.model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
        
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        
        self.logger.info("Text Extractor initialized successfully")

    def enhance_image(self, image: np.ndarray) -> np.ndarray:
        """
        Apply advanced image enhancement techniques.
        
        Args:
            image: Input image
            
        Returns:
            Enhanced image
        """
        # Convert to grayscale if needed
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        enhanced = clahe.apply(gray)

        # Denoise
        denoised = cv2.fastNlMeansDenoising(enhanced)

        # Adaptive thresholding
        binary = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        return binary

    def detect_text_regions(self, image: np.ndarray) -> List[Dict]:
        """
        Detect text regions using EasyOCR's built-in detection.
        
        Args:
            image: Input image
            
        Returns:
            List of dictionaries containing region information
        """
        # Use EasyOCR to detect text regions
        results = self.reader.readtext(image)
        
        regions = []
        for box, text, conf in results:
            # Convert box coordinates to x,y,w,h format
            box = np.array(box, dtype=np.int32)
            x, y = box.min(axis=0)
            w, h = box.max(axis=0) - box.min(axis=0)
            
            # Determine if region is likely handwritten
            is_handwritten = self._check_if_handwritten(
                image[y:y+h, x:x+w]
            )
            
            regions.append({
                'coords': (x, y, w, h),
                'type': 'handwritten' if is_handwritten else 'printed',
                'confidence': conf
            })
        
        return regions

    def _check_if_handwritten(self, region: np.ndarray) -> bool:
        """
        Improved handwritten text detection using multiple features.
        
        Args:
            region: Image region to analyze
            
        Returns:
            Boolean indicating if region is likely handwritten
        """
        try:
            # Convert to grayscale if needed
            if len(region.shape) == 3:
                region = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)

            # Calculate stroke width variation
            edges = cv2.Canny(region, 100, 200)
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            
            if not contours:
                return False

            # Calculate contour features
            contour_features = []
            for contour in contours:
                # Calculate contour area and perimeter
                area = cv2.contourArea(contour)
                perimeter = cv2.arcLength(contour, True)
                
                if perimeter == 0:
                    continue
                    
                # Calculate circularity
                circularity = 4 * np.pi * area / (perimeter * perimeter)
                contour_features.append(circularity)

            if not contour_features:
                return False

            # Handwritten text typically has more variance in these features
            variance = np.var(contour_features)
            
            # Threshold determined empirically
            return variance > 0.1

        except Exception as e:
            self.logger.error(f"Error in handwritten detection: {str(e)}")
            return False

    def extract_printed_text(self, image: np.ndarray) -> str:
        """
        Extract printed text using EasyOCR.
        
        Args:
            image: Input image region
            
        Returns:
            Extracted text
        """
        try:
            results = self.reader.readtext(image)
            return ' '.join([text for _, text, conf in results if conf > 0.5])
        except Exception as e:
            self.logger.error(f"Error in printed text extraction: {str(e)}")
            return ""

    def extract_handwritten_text(self, image: np.ndarray) -> str:
        """
        Extract handwritten text using TrOCR.
        
        Args:
            image: Input image region
            
        Returns:
            Extracted text
        """
        try:
            # Convert numpy array to PIL Image
            pil_image = Image.fromarray(image)
            
            # Prepare image for model
            pixel_values = self.processor(pil_image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(self.device)
            
            # Generate text
            generated_ids = self.model.generate(
                pixel_values,
                max_length=128,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
            
            generated_text = self.processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )[0]
            
            return generated_text.strip()
        except Exception as e:
            self.logger.error(f"Error in handwritten text extraction: {str(e)}")
            return ""

    def process_image(self, image_path: Union[str, Path, np.ndarray]) -> Dict[str, List[str]]:
        """
        Process an image and extract both printed and handwritten text.
        
        Args:
            image_path: Path to image file or numpy array
            
        Returns:
            Dictionary containing extracted text
        """
        try:
            # Handle different input types
            if isinstance(image_path, (str, Path)):
                image = cv2.imread(str(image_path))
                if image is None:
                    raise ValueError(f"Could not read image at {image_path}")
            else:
                image = image_path

            # Enhance image
            enhanced_image = self.enhance_image(image)
            
            # Detect text regions
            regions = self.detect_text_regions(enhanced_image)
            
            # Extract text from each region
            printed_text = []
            handwritten_text = []
            
            for region in regions:
                x, y, w, h = region['coords']
                region_image = enhanced_image[y:y+h, x:x+w]
                
                if region['type'] == 'printed':
                    text = self.extract_printed_text(region_image)
                    if text:
                        printed_text.append(text)
                else:
                    text = self.extract_handwritten_text(region_image)
                    if text:
                        handwritten_text.append(text)
            
            return {
                'printed_text': printed_text,
                'handwritten_text': handwritten_text
            }
            
        except Exception as e:
            self.logger.error(f"Error processing image: {str(e)}")
            raise

def main():
    """Example usage of the ImprovedTextExtractor."""
    # Initialize the extractor
    extractor = ImprovedTextExtractor()
    
    # Process an image
    image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"  # Replace with your image path
    try:
        results = extractor.process_image(image_path)
        
        print("Printed Text:")
        for text in results['printed_text']:
            print(text)
            
        print("\nHandwritten Text:")
        for text in results['handwritten_text']:
            print(text)
            
    except Exception as e:
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Initializing EasyOCR...


Progress: |██████████████████████████████████████████████████| 100.0% Complete

INFO:easyocr.easyocr:Download complete


Progress: |█████████████████████████████████████████████████-| 99.2% Complete

INFO:easyocr.easyocr:Download complete.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

INFO:__main__:Loading TrOCR model...
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16

Printed Text:
#Pplicabla:
Kuma
Addros
{Road
Plncode
submted

Handwritten Text:
