## Installations

In [7]:
# !pip install ollama-ocr
# !pip install sentence_transformers
# !pip install chromadb

In [3]:
import fitz  # PyMuPDF for PDF processing
from PIL import Image
import os
import cv2

def convert_pdf_to_images(pdf_path, output_folder, dpi=300, image_format="png"):
    """
    Converts each page of a PDF to high-quality images.
    
    Args:
        pdf_path (str): Path to the input PDF file.
        output_folder (str): Output folder for images.
        dpi (int, optional): Resolution (higher is better). Defaults to 300.
        image_format (str, optional): Output format ('png', 'jpeg', 'tiff'). Defaults to 'png'.
    """
    valid_formats = ["png", "jpeg", "tiff"]
    image_format = image_format.lower()
    if image_format not in valid_formats:
        print(f"Warning: Invalid format '{image_format}'. Using 'png' instead.")
        image_format = "png"

    try:
        pdf_document = fitz.open(pdf_path)
    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        return
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return

    os.makedirs(output_folder, exist_ok=True)
    zoom_matrix = fitz.Matrix(dpi / 72, dpi / 72)  # Scaling for higher DPI

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        pix = page.get_pixmap(matrix=zoom_matrix)

        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        image_name = f"pdf_page_{page_number + 1}.{image_format}"
        image_path = os.path.join(output_folder, image_name)

        try:
            image.save(image_path, image_format.upper(), quality=90 if image_format == "jpeg" else None)
            print(f"✅ PDF Page {page_number + 1} saved as {image_path} ({image_format.upper()}, DPI={dpi})")
        except Exception as save_error:
            print(f"❌ Error saving PDF page {page_number + 1}: {save_error}")

    pdf_document.close()
    print(f"\n🎉 PDF '{pdf_path}' converted successfully to images in '{output_folder}'\n")


def process_image(image_path, output_folder, image_format="png"):
    """
    Processes an existing image (resizing, format conversion, preprocessing).
    
    Args:
        image_path (str): Path to the input image.
        output_folder (str): Output folder for processed images.
        image_format (str, optional): Output format ('png', 'jpeg', 'tiff'). Defaults to 'png'.
    """
    valid_formats = ["png", "jpeg", "tiff"]
    image_format = image_format.lower()
    if image_format not in valid_formats:
        print(f"Warning: Invalid format '{image_format}'. Using 'png' instead.")
        image_format = "png"

    os.makedirs(output_folder, exist_ok=True)

    try:
        image = cv2.imread(image_path)
        if image is None:
            print(f"❌ Error: Could not read image {image_path}")
            return

        # Convert to grayscale and enhance contrast
        processed_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        processed_image = cv2.threshold(processed_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        image_name = f"processed_{os.path.basename(image_path).split('.')[0]}.{image_format}"
        image_path_out = os.path.join(output_folder, image_name)

        cv2.imwrite(image_path_out, processed_image)
        print(f"✅ Image {image_path} processed and saved as {image_path_out} ({image_format.upper()})")
    
    except Exception as e:
        print(f"❌ Error processing image {image_path}: {e}")


def process_files(input_path, output_folder, dpi=300, image_format="png"):
    """
    Processes both PDFs and images, converting them to high-quality images.

    Args:
        input_path (str): Path to the input file or folder.
        output_folder (str): Path to store processed images.
        dpi (int, optional): DPI for PDF conversion. Defaults to 300.
        image_format (str, optional): Output format ('png', 'jpeg', 'tiff'). Defaults to 'png'.
    """
    if os.path.isdir(input_path):  
        # If input is a directory, process all files inside
        for file_name in os.listdir(input_path):
            file_path = os.path.join(input_path, file_name)
            if file_name.lower().endswith(".pdf"):
                convert_pdf_to_images(file_path, output_folder, dpi, image_format)
            elif file_name.lower().endswith((".png", ".jpg", ".jpeg", ".tiff")):
                process_image(file_path, output_folder, image_format)
            else:
                print(f"❌ Skipping unsupported file: {file_name}")
    
    elif os.path.isfile(input_path):  
        # If input is a single file, process it
        if input_path.lower().endswith(".pdf"):
            convert_pdf_to_images(input_path, output_folder, dpi, image_format)
        elif input_path.lower().endswith((".png", ".jpg", ".jpeg", ".tiff")):
            process_image(input_path, output_folder, image_format)
        else:
            print(f"❌ Unsupported file type: {input_path}")

    else:
        print(f"❌ Error: '{input_path}' does not exist.")

    print("\n🎉 Processing complete!\n")


# --- Example Usage ---
if __name__ == "__main__":
    input_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/images.jpeg"  # Folder containing PDFs & images
    output_dir = "processed_images"
    
    # Process PDFs and images in the folder
    process_files(input_path, output_dir, dpi=300, image_format="png")


✅ Image /Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/images.jpeg processed and saved as processed_images/processed_images.png (PNG)

🎉 Processing complete!



In [5]:
import os
import fitz  # PyMuPDF for PDFs
import cv2
from PIL import Image
from ollama_ocr import OCRProcessor

# Initialize OCR processor
ocr = OCRProcessor(model_name='llama3.2-vision:11b')

def convert_pdf_to_images(pdf_path, output_folder, dpi=300, image_format="jpeg"):
    """
    Converts each page of a PDF to images with high DPI.
    
    Args:
        pdf_path (str): Input PDF file path.
        output_folder (str): Folder to store output images.
        dpi (int): Resolution (higher means better quality). Defaults to 300.
        image_format (str): Output format ('png', 'jpeg', 'tiff'). Defaults to 'jpeg'.
    """
    os.makedirs(output_folder, exist_ok=True)
    pdf_document = fitz.open(pdf_path)
    zoom_matrix = fitz.Matrix(dpi / 72, dpi / 72)  # Scale for higher DPI

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        pix = page.get_pixmap(matrix=zoom_matrix)

        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        image_name = f"pdf_page_{page_number + 1}.{image_format}"
        image_path = os.path.join(output_folder, image_name)

        image.save(image_path, image_format.upper(), quality=90 if image_format == "jpeg" else None)
        print(f"✅ PDF Page {page_number + 1} saved as {image_path}")

    pdf_document.close()
    print(f"\n🎉 PDF '{pdf_path}' converted successfully to images in '{output_folder}'\n")


def process_image(image_path, output_folder, image_format="jpeg"):
    """
    Preprocesses images for OCR by enhancing text visibility.
    
    Args:
        image_path (str): Path to input image.
        output_folder (str): Folder to store processed images.
        image_format (str): Output format ('png', 'jpeg', 'tiff'). Defaults to 'jpeg'.
    """
    os.makedirs(output_folder, exist_ok=True)
    
    image = cv2.imread(image_path)
    if image is None:
        print(f"❌ Error: Cannot read image {image_path}")
        return

    # Convert to grayscale & enhance contrast
    processed_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    processed_image = cv2.threshold(processed_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    image_name = f"processed_{os.path.basename(image_path).split('.')[0]}.{image_format}"
    image_path_out = os.path.join(output_folder, image_name)

    cv2.imwrite(image_path_out, processed_image)
    print(f"✅ Image {image_path} processed and saved as {image_path_out}")
    
    return image_path_out


def perform_ocr(image_path):
    """
    Runs OCR on a processed image using Ollama's `llama3.2-vision:11b`.
    
    Args:
        image_path (str): Path to image for OCR.
    """
    try:
        result = ocr.process_image(image_path=image_path, format_type="markdown")
        markdown_path = image_path.replace(image_path.split('.')[-1], "md")
        with open(markdown_path, "w", encoding="utf-8") as md_file:
            md_file.write(result)
        
        print(f"\n📜 OCR completed for {image_path} and saved as {markdown_path}")
    except Exception as e:
        print(f"❌ Error running OCR on {image_path}: {e}")


def process_files(input_path, output_folder, dpi=300, image_format="jpeg"):
    """
    Processes PDFs and images, converts them, and runs OCR.
    
    Args:
        input_path (str): Path to a file or folder.
        output_folder (str): Output folder for processed images.
        dpi (int): DPI for PDF conversion. Defaults to 300.
        image_format (str): Output format ('png', 'jpeg', 'tiff'). Defaults to 'jpeg'.
    """
    if os.path.isdir(input_path):  
        for file_name in os.listdir(input_path):
            file_path = os.path.join(input_path, file_name)
            if file_name.lower().endswith(".pdf"):
                convert_pdf_to_images(file_path, output_folder, dpi, image_format)
            elif file_name.lower().endswith((".png", ".jpg", ".jpeg", ".tiff")):
                processed_image = process_image(file_path, output_folder, image_format)
                if processed_image:
                    perform_ocr(processed_image)
            else:
                print(f"❌ Skipping unsupported file: {file_name}")
    
    elif os.path.isfile(input_path):  
        if input_path.lower().endswith(".pdf"):
            convert_pdf_to_images(input_path, output_folder, dpi, image_format)
        elif input_path.lower().endswith((".png", ".jpg", ".jpeg", ".tiff")):
            processed_image = process_image(input_path, output_folder, image_format)
            if processed_image:
                perform_ocr(processed_image)
        else:
            print(f"❌ Unsupported file type: {input_path}")

    else:
        print(f"❌ Error: '{input_path}' does not exist.")

    print("\n🎉 Processing complete!\n")


# --- Example Usage ---
if __name__ == "__main__":
    input_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg"  # Folder with PDFs & images
    output_dir = "processed_ocr_output"
    
    process_files(input_path, output_dir, dpi=300, image_format="jpeg")


✅ Image /Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg processed and saved as processed_ocr_output/processed_YdUqv.jpeg

📜 OCR completed for processed_ocr_output/processed_YdUqv.jpeg and saved as processed_ocr_output/processed_YdUqv.md

🎉 Processing complete!



In [7]:
import json
from ollama_ocr import OCRProcessor
import ollama
from pydantic import BaseModel, Field, validator, conlist
from typing import List, Optional, Dict, Any, Union

# Generalized Pydantic Models for Structured Document Output (No changes needed here)
class DataElement(BaseModel):
    type: str = Field(..., description="Type of data element (e.g., 'paragraph', 'heading', 'list', 'table', 'key_value', 'form_field_group', 'other')")
    content: Union[str, List['DataElement'], List[str], Dict[str, 'DataElement'], List[Dict[str, str]]] = Field(..., description="Content of the element; can be text, list of elements, list of strings, or key-value pairs")
    label: Optional[str] = Field(None, description="Optional label or title for the element (e.g., section heading, field group name)")

    class Config:
        arbitrary_types_allowed = True


class StructuredDocument(BaseModel):
    document_type: Optional[str] = Field(None, description="Inferred type of the document (e.g., 'receipt', 'invoice', 'page', 'report', 'form')")
    elements: List[DataElement] = Field(default_factory=list, description="List of structured data elements extracted from the document")
    other_details: Optional[Dict[str, Any]] = Field(None, description="Any other unstructured details or metadata extracted")


def perform_ocr(image_path, model_name='llama3.2-vision:11b', format_type='markdown'):
    """
    Performs OCR on an image using ollama_ocr and returns the output in specified format.
    """
    ocr = OCRProcessor(model_name=model_name)
    result = ocr.process_image(
        image_path=image_path,
        format_type=format_type
    )
    return result

def filter_and_structure_with_llm(markdown_text, llm_model_name = 'mistral:latest'): # You can change to a different text-based Ollama model
    prompt = f"""You are an expert in form processing and data extraction. Your task is to take a Markdown formatted text output from an OCR process, filter and clean the text to correct potential errors, and then analyze it to identify and extract relevant structured information from a form.

    Your goal is to understand the form structure and extract content into a generalized JSON format that is **strictly validated** against the following Pydantic models.  **It is critical that your JSON output is valid according to these models. Do not include any extra text or explanations outside the JSON.**

    Here are the Pydantic models you MUST adhere to for the JSON output:

    ```python
    class DataElement(BaseModel):
        type: str # e.g., 'paragraph', 'heading', 'list', 'table', 'key_value', 'form_field_group'
        content: Union[str, List['DataElement'], List[str], Dict[str, 'DataElement'], List[Dict[str, str]]]
        label: Optional[str] = None # For headings, field group names, etc.

    class StructuredDocument(BaseModel):
        document_type: Optional[str] = None # e.g., 'form', 'application'
        elements: List[DataElement]
        other_details: Optional[Dict[str, Any]] = None
    ```

    Analyze the text and identify the following types of form elements:

    - Form Title/Heading: The main title of the form. Represent as 'heading' type.
    - Section Headings:  Identify headings that divide the form into sections. Use 'section' type for these.
    - Form Field Groups:  Forms often group related fields.  Identify these groups and represent them as 'form_field_group' type. Use the group title as 'label'.
    - Form Fields (Key-Value Pairs): Within field groups or sections, identify individual form fields and their corresponding values. Represent these as 'key_value' type elements. Use the field label as the 'label' and the field value as 'content' (string).
    - Instructions/Help Text: Text that provides instructions or helpful information. Use 'paragraph' or 'other' type, depending on context.
    - Checkbox Options/Selections: If checkboxes and their associated labels are present, try to capture these as key-value pairs or lists.
    - Other Text: Any other text that doesn't fit into the above categories, use 'other' or 'paragraph' type elements.

    For the 'document_type', if you can identify it as a form or application, set it accordingly (e.g., 'form', 'application'). Otherwise, you can leave it as 'document' or null.

    Prioritize extracting form fields and their values into a structured format. Be dynamic and adapt the structure to the detected form layout.

    Here is the Markdown text from the OCR:

    ```markdown
    {markdown_text}
    ```

    Based on this Markdown text, create a JSON output that **strictly conforms** to the Pydantic `StructuredDocument` model. Extract what you consider to be the most relevant structured information from the form, focusing on form fields and their values.  **Provide only valid JSON in your response, with no extra text or explanations.**
    """

    response = ollama.chat(
        model = llm_model_name,
        format='json', # Ask Ollama to output JSON directly
        messages=[
            {
                'role': 'user',
                'content': prompt.format(markdown_text = markdown_text)
            }
        ]
    )
    json_output_str = response['message']['content']

    # Attempt to parse the JSON string with Pydantic, handle potential errors (no changes needed here)
    try:
        parsed_json = json.loads(json_output_str) # Parse JSON string to dict first
        structured_doc_model = StructuredDocument.parse_obj(parsed_json) # Parse with Pydantic
        return structured_doc_model.dict() # Return as dictionary
    except json.JSONDecodeError as e:
        print(f"\n--- JSONDecodeError ---")
        print(f"Error decoding JSON from LLM output: {e}")
        print("Raw LLM JSON output that caused the error:\n")
        print(json_output_str)
        print("\n--- End Raw JSON Output ---\n")
        return {"error": "Could not parse JSON from LLM", "raw_output": json_output_str}
    except Exception as e: # Catch Pydantic validation errors or other parsing issues
        print(f"\n--- Pydantic ValidationError ---")
        print(f"Pydantic validation error: {e}")
        print("Raw LLM JSON output that caused the error:\n")
        print(json_output_str)
        print("\n--- End Raw JSON Output ---\n")
        return {"error": "Pydantic validation error", "raw_output": json_output_str}


if __name__ == "__main__":
    image_file_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/YdUqv.jpg" #  <- Still using page_7.jpeg, you might want to use the Fidelity form image
    # For testing with the Fidelity image, you should replace "output_images_advanced/page_7.jpeg"
    # with the path to the Fidelity form image you are using (e.g., "fidelity_form.jpeg" if you save it as that).
    # image_file_path = "fidelity_form.jpeg"  <- Uncomment and use your Fidelity image path

    # Stage 1: Perform OCR and get Markdown output
    print("Performing OCR...")
    markdown_output = perform_ocr(image_file_path)
    print("Raw Markdown Output from OCR:\n---")
    print(markdown_output)
    print("---\n")

    # Stage 2: Filter and structure Markdown with a second LLM to get structured JSON for forms
    print("Filtering and structuring with LLM for form data...")
    json_result = filter_and_structure_with_llm(markdown_output)

    print("\nFinal Structured Form JSON Output (Pydantic):\n---")
    if "error" in json_result:
        print("Error during JSON processing. See error details above.") # Error already printed in exception handler
    else:
        print(json.dumps(json_result, indent=2, ensure_ascii=False)) # Print nicely formatted JSON, ensure_ascii=False for Unicode
    print("---\n")

Performing OCR...
Raw Markdown Output from OCR:
---
# Income Tax Return Form

## Introduction

This form is used to report income and pay taxes in India.

### Section 1: Details of the Assessee

* **Full Name (as expanded name):** Not provided
* **Last Name/Surname:** Not applicable
* **First Name:** Not provided
* **Middle Name:** Not provided
* **Address:** 
	+ Flat/Room/Door/Block No.: 14-15 SH INDUSTRIAL FIL ROAD, CHELTA LANE
	+ Road/Street/Lane/Post Office: KENNET LANE
	+ Area/Locality/Taluka/Sub-Division: Not provided
	+ Town/City/District: Not provided
	+ State/Union Territory: Not provided

### Section 2: Source of Income

* **Salary:** Not applicable
* **Income from Business/Profession:** Not applicable
* **Income from House Property:** Not applicable

### Section 3: Representative Assessee (RA)

* **Full Name (as expanded name):** Not provided
* **Last Name/Surname:** Not applicable
* **First Name:** Not provided
* **Middle Name:** Not provided
* **Address:** 
	+ Flat/Room/Do

/var/folders/sf/_2zf9n757vn216nb6yzlkmfh0000gq/T/ipykernel_90282/2639529940.py:91: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  structured_doc_model = StructuredDocument.parse_obj(parsed_json) # Parse with Pydantic
/var/folders/sf/_2zf9n757vn216nb6yzlkmfh0000gq/T/ipykernel_90282/2639529940.py:92: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  return structured_doc_model.dict() # Return as dictionary


In [3]:
!pip install torch transformers ollama datasets




In [5]:
!pip install pytesseract pdf2image pillow




In [8]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import json
import os
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments

# (For Windows Only) Set Tesseract path manually
# Uncomment and modify if you installed Tesseract on Windows
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# File path (Change this to your actual file path)
file_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/ab.png"


def extract_text(file_path):
    """ Extract text using Tesseract OCR from an image or PDF """
    extracted_text = ""

    # If the file is a PDF, convert it to images first
    if file_path.lower().endswith(".pdf"):
        images = convert_from_path(file_path)
    else:
        images = [Image.open(file_path)]

    for img in images:
        # Perform OCR
        text = pytesseract.image_to_string(img)
        extracted_text += text + "\n"

    return extracted_text.strip()

print("Extracting text from the document...")
ocr_text = extract_text(file_path)
print("Extracted Text:\n", ocr_text)

# Save extracted text for fine-tuning
with open("ocr_pseudo_labels.json", "w") as f:
    json.dump({"text": ocr_text}, f, indent=2)

# Check if text extraction was successful
if not ocr_text.strip():
    print("❌ OCR did not extract any text. Please check the image quality or Tesseract installation.")
    exit()

# Load Pre-Trained Llama Model & Tokenizer
model_name = "facebook/llama-3b"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

# Prepare Training Data
inputs = tokenizer(ocr_text, padding=True, truncation=True, return_tensors="pt")

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    learning_rate=5e-5,
    save_total_limit=1,
    logging_dir="./logs",
)

# Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=inputs,
)

print("Starting fine-tuning process...")
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained("./fine_tuned_llama")
tokenizer.save_pretrained("./fine_tuned_llama")

# Deploy Fine-Tuned Model to Ollama
print("Deploying fine-tuned model to Ollama...")
os.system("ollama create llama3.2-vision-finetuned -m ./fine_tuned_llama")

# Evaluate Fine-Tuned Model
def evaluate_ocr(image_path, model_name="llama3.2-vision-finetuned"):
    """Evaluate the fine-tuned OCR model."""
    result = pytesseract.image_to_string(Image.open(image_path))
    return result

# Test on the same image
print("Testing fine-tuned OCR model...")
ocr_output = evaluate_ocr(file_path)
print("Final OCR Output:\n", ocr_output)


Extracting text from the document...
Extracted Text:
 | DSO Ref No | DSD case location
(Official Use) (Official Use)

Part 1 - Patient's details

Please use this part of the form to tell us about the patient: this may be you or the person on whose behalf you
are making the claim.

‘Surname Address
JonN TEST Read, NEWTOWN,
rocseaet WALE
AMY : Posteode TB US ACA
Title (Me/Mrs/Miss/Ms/Other): Email address - The NHSBSA may use this method to contact,
ms you in relation to this claim
Sex. Male Female [X| bese @ best. com
aeotbian [alo T/T Tals Tal matter open dg tats
National Insurance No. OF a3 W567 4

UP] Ola) BLY Ge) &

Part 2 - Details of NHS dental charges paid

Please send us original receipts. We cannot deal with your claim without them.
Iwish to claim a refund of [E for NHS dental charges

(if the course of treatment is ongoing, send in this form within three months of for it. If the treatment is being paid for
By instalments, send inthis for when payments have finished ) ee hag

Y

OSError: facebook/llama-3b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [12]:
import pytesseract
import json
import os
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments

# (For Windows Only) Set the Tesseract path manually
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# File path (Change this to your uploaded file path)
file_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/ab.png"

def extract_text_with_tesseract(file_path):
    """ Extract text using Tesseract OCR from an image or PDF """
    extracted_text = ""

    # Convert PDF to images if needed
    if file_path.lower().endswith(".pdf"):
        images = convert_from_path(file_path)
    else:
        images = [Image.open(file_path)]

    for img in images:
        # Fix RGBA issue by converting to RGB before saving
        if img.mode == "RGBA":
            img = img.convert("RGB")

        text = pytesseract.image_to_string(img)  # Perform OCR
        extracted_text += text + "\n"

    return extracted_text.strip()

print("Extracting text using Tesseract OCR...")
ocr_text = extract_text_with_tesseract(file_path)
print("Extracted Text:\n", ocr_text)

# Save extracted text as training data
with open("ocr_pseudo_labels.json", "w") as f:
    json.dump({"text": ocr_text}, f, indent=2)

# Check if text extraction was successful
if not ocr_text.strip():
    print("❌ No text extracted. Please check the image or Tesseract installation.")
    exit()

# Load Pre-Trained Llama Model & Tokenizer
model_name ="meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

# Prepare Training Data
inputs = tokenizer(ocr_text, padding=True, truncation=True, return_tensors="pt")

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    learning_rate=5e-5,
    save_total_limit=1,
    logging_dir="./logs",
)

# Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=inputs,
)

print("Starting fine-tuning process...")
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained("./fine_tuned_llama")
tokenizer.save_pretrained("./fine_tuned_llama")

# Deploy Fine-Tuned Model to Ollama
print("Deploying fine-tuned model to Ollama...")
os.system("ollama create llama3.2-vision-finetuned -m ./fine_tuned_llama")

# Evaluate Fine-Tuned Model
def evaluate_ocr(image_path, model_name="llama3.2-vision-finetuned"):
    """Evaluate the fine-tuned OCR model."""
    img = Image.open(image_path)

    # Convert RGBA to RGB before processing
    if img.mode == "RGBA":
        img = img.convert("RGB")

    text = pytesseract.image_to_string(img)
    return text

# Test on the same image
print("Testing fine-tuned OCR model...")
ocr_output = evaluate_ocr(file_path)
print("Final OCR Output:\n", ocr_output)


Extracting text using Tesseract OCR...
Extracted Text:
 | DSO Ref No | DSD case location
(Official Use) (Official Use)

Part 1 - Patient's details

Please use this part of the form to tell us about the patient: this may be you or the person on whose behalf you
are making the claim.

‘Surname Address
JonN TEST Read, NEWTOWN,
rocseaet WALE
AMY : Posteode TB US ACA
Title (Me/Mrs/Miss/Ms/Other): Email address - The NHSBSA may use this method to contact,
ms you in relation to this claim
Sex. Male Female [X| bese @ best. com
aeotbian [alo T/T Tals Tal matter open dg tats
National Insurance No. OF a3 W567 4

UP] Ola) BLY Ge) &

Part 2 - Details of NHS dental charges paid

Please send us original receipts. We cannot deal with your claim without them.
Iwish to claim a refund of [E for NHS dental charges

(if the course of treatment is ongoing, send in this form within three months of for it. If the treatment is being paid for
By instalments, send inthis for when payments have finished ) ee hag


OSError: meta-llama/Llama-2-7b-hf is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [1]:
import pytesseract
import json
import os
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments

# (For Windows Only) Set the Tesseract path manually
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# File path (Change this to your uploaded file path)
file_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/ab.png"

def extract_text_with_tesseract(file_path):
    """ Extract handwritten text using Tesseract OCR while preserving its fields """

    extracted_text = ""

    # Instruction prompt for LLaMA
    prompt = (
        "Extract handwritten text while preserving its fields in a structured format. "
        "Ensure that names, dates, numbers, and addresses are captured correctly."
    )

    # Convert PDF to images if needed
    if file_path.lower().endswith(".pdf"):
        images = convert_from_path(file_path)
    else:
        images = [Image.open(file_path)]

    for img in images:
        # Convert RGBA to RGB before processing
        if img.mode == "RGBA":
            img = img.convert("RGB")

        # Perform OCR with structured data extraction
        text = pytesseract.image_to_string(img, config="--psm 6")  
        
        extracted_text += text + "\n"

    # Return structured input for LLaMA
    return f"{prompt}\nExtracted Handwritten Text:\n{text.strip()}"

print("Extracting handwritten text while preserving fields...")
ocr_text = extract_text_with_tesseract(file_path)
print("Extracted Text:\n", ocr_text)

# Save extracted text as training data
with open("ocr_pseudo_labels.json", "w") as f:
    json.dump({"text": ocr_text}, f, indent=2)

# Check if text extraction was successful
if not ocr_text.strip():
    print("❌ No text extracted. Please check the image or Tesseract installation.")
    exit()

# Load Pre-Trained Llama 3.2 Vision Model & Tokenizer
model_name ="meta-llama/llama3.2-vision"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

# Prepare Training Data (with explicit instruction)
inputs = tokenizer(ocr_text, padding=True, truncation=True, return_tensors="pt")

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    learning_rate=5e-5,
    save_total_limit=1,
    logging_dir="./logs",
)

# Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=inputs,
)

print("Starting fine-tuning process...")
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained("./fine_tuned_llama")
tokenizer.save_pretrained("./fine_tuned_llama")

# Deploy Fine-Tuned Model to Ollama
print("Deploying fine-tuned model to Ollama...")
os.system("ollama create llama3.2-vision-finetuned -m ./fine_tuned_llama")

# Evaluate Fine-Tuned Model
def evaluate_ocr(image_path, model_name="llama3.2-vision-finetuned"):
    """Evaluate the fine-tuned OCR model."""
    img = Image.open(image_path)

    # Convert RGBA to RGB before processing
    if img.mode == "RGBA":
        img = img.convert("RGB")

    text = pytesseract.image_to_string(img, config="--psm 6")

    # Use fine-tuned LLaMA 3.2 Vision for structured extraction
    input_text = (
        "Process the following handwritten text and extract it in a structured format:\n"
        f"{text.strip()}"
    )
    
    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
    outputs = model.generate(**inputs)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test on the same image
print("Testing fine-tuned OCR model...")
ocr_output = evaluate_ocr(file_path)
print("Final OCR Output:\n", ocr_output)


  from .autonotebook import tqdm as notebook_tqdm


Extracting handwritten text while preserving fields...
Extracted Text:
 Extract handwritten text while preserving its fields in a structured format. Ensure that names, dates, numbers, and addresses are captured correctly.
Extracted Handwritten Text:
[ovate tte “Ypspeaseloction ==
[Oficial Use) (Official Use)
Part 1 - Patient's details
Please use this part of the form to tell us about the patient: this may be you or the person on whose behalf you
are making the claim.
Surname address
JOHN TEST Road, NEWTOWN,
Forename(s) w
ALES
Posteode TBE 2CA
Tile (MenarsrMissiAs/Other): Email address - The NHSBSA may use this method to contact
Sex Mae[] Female DX)
one oni re
neotbith [afol Tyfal/ Talololo] rtm ote esosgngat hans
National Insurance No.
= 07183 456764
Of)
Part 2 - Details of NHS dental charges paid
Please send us original receipts. We cannot deal with your claim without them.
I wish to claim a refund of [EJ for nits dental charges
(if the course of treatment is ongoing, send in ths form

OSError: meta-llama/llama3.2-vision is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [2]:
!pip install torch transformers datasets accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0


In [2]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

# Load the model and processor
processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-448")
model = AutoModelForImageTextToText.from_pretrained("google/paligemma2-3b-mix-448")

# Load an image (replace 'image.jpg' with your image file)
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the prompt
prompt = "Extract all handwritten and printed text from the image."

# Preprocess the image and text prompt
inputs = processor(images=image, text=prompt, return_tensors="pt")

# Generate text output from the model
with torch.no_grad():
    output = model.generate(**inputs)

# Decode and print the extracted text
extracted_text = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Extracted Text:", extracted_text)


RuntimeError: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at /dev/null:488; latest registration was registered at /dev/null:488

In [3]:
!pip install --upgrade torch torchvision torchaudio transformers


Collecting torch
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchaudio-2.6.0-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25

In [5]:
!pip install torch==2.1.0


[31mERROR: Could not find a version that satisfies the requirement torch==2.1.0 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.0[0m[31m
[0m

In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Load the processor
processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-448")

# Load the model with efficient device mapping
model = AutoModelForImageTextToText.from_pretrained(
    "google/paligemma2-3b-mix-448",
    device_map="auto"  # Automatically assigns model to GPU/CPU as needed
)

# Load an image
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the prompt
prompt = "Extract all handwritten and printed text from the image."

# Preprocess image and prompt
inputs = processor(images=image, text=prompt, return_tensors="pt")

# Move inputs to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {k: v.to(device) for k, v in inputs.items()}
model.to(device)

# Generate text output
with torch.no_grad():
    output = model.generate(**inputs)

# Decode and print extracted text
extracted_text = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Extracted Text:", extracted_text)


  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 2/2 [04:06<00:00, 123.36s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.95s/it]
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


Extracted Text: Extract all handwritten and printed text from the image.
Sorry, as a base VLM I am not trained to answer this question.


In [3]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch

# Load model and processor
processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-448")
model = AutoModelForImageTextToText.from_pretrained("google/paligemma2-3b-mix-448", device_map="auto")

# Load and preprocess the image
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"  # Replace with your image path
image = Image.open(image_path).convert("RGB")

# **Corrected Prompt with `<image>` Token**
prompt = "<image> Extract all handwritten and printed text from the image."

# Process image and prompt together
inputs = processor(images=image, text=prompt, return_tensors="pt")

# Move inputs to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {k: v.to(device) for k, v in inputs.items()}
model.to(device)

# Generate text output
with torch.no_grad():
    output = model.generate(**inputs)

# Decode and print the extracted text
extracted_text = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Extracted Text:", extracted_text)


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.38s/it]


Extracted Text:  Extract all handwritten and printed text from the image.
Sorry, as a base VLM I am not trained to answer this question.


In [4]:
from transformers import pipeline
from PIL import Image

# Load the model as a pipeline
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-7B-Instruct")

# Load an image (Replace 'image.jpg' with your actual image file)
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the input prompt with an instruction
messages = [
    {"role": "user", "content": "<image> Extract all handwritten and printed text from the image."}
]

# Run inference
result = pipe({"image": image, "messages": messages})

# Print the extracted text
print("Extracted Text:", result[0]['generated_text'])


Downloading shards: 100%|██████████| 5/5 [11:21<00:00, 136.25s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:34<00:00,  6.86s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 36.11 GB, other allocations: 384.00 KB, max allowed: 36.27 GB). Tried to allocate 259.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [5]:
from transformers import pipeline
from PIL import Image

# Load the model as a pipeline
pipe = pipeline("image-text-to-text", model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)

# Load an image (Replace 'image.jpg' with your actual image file)
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the instruction prompt
messages = [
    {"role": "user", "content": "<image> Extract all handwritten and printed text from the image."}
]

# Run inference
result = pipe({"image": image, "messages": messages})

# Print the extracted text
print("Extracted Text:", result[0]['generated_text'])


A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- configuration_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ValueError: Could not load model microsoft/Phi-3.5-vision-instruct with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForImageTextToText'>,). See the original errors:

while loading with AutoModelForImageTextToText, an error is thrown:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/pipelines/base.py", line 290, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 567, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers_modules.microsoft.Phi-3.5-vision-instruct.4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca.configuration_phi3_v.Phi3VConfig'> for this kind of AutoModel: AutoModelForImageTextToText.
Model type should be one of AriaConfig, BlipConfig, Blip2Config, ChameleonConfig, Emu3Config, FuyuConfig, GitConfig, GotOcr2Config, IdeficsConfig, Idefics2Config, Idefics3Config, InstructBlipConfig, Kosmos2Config, LlavaConfig, LlavaNextConfig, LlavaOnevisionConfig, MllamaConfig, PaliGemmaConfig, Pix2StructConfig, PixtralVisionConfig, Qwen2_5_VLConfig, Qwen2VLConfig, UdopConfig, VipLlavaConfig, VisionEncoderDecoderConfig.




HF_TOKEN = "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"

In [4]:
!pip install transformers pillow torch




In [1]:
from transformers import pipeline
from PIL import Image

# Hugging Face token (Prompting for security)
HF_TOKEN = input("Enter your Hugging Face token: ").strip()

# ✅ Use the correct pipeline type: "image-text-to-text"
pipe = pipeline("image-text-to-text", model="meta-llama/Llama-3.2-11B-Vision-Instruct", token=HF_TOKEN)

# Load the image (Replace with actual file path)
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the instruction prompt
messages = [{"role": "user", "content": "Extract all handwritten and printed text from this image."}]

# ✅ Use pipeline correctly (No deprecated `prompt` argument)
result = pipe(image, messages)

# Print extracted text
print("Extracted Text:", result[0]['generated_text'])


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 5/5 [00:31<00:00,  6.20s/it]
Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 36.26 GB, other allocations: 384.00 KB, max allowed: 36.27 GB). Tried to allocate 224.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [2]:
from transformers import pipeline
from PIL import Image

# Hugging Face token (Prompting for security)
HF_TOKEN = input("Enter your Hugging Face token: ").strip()

# ✅ Use the correct pipeline type: "image-text-to-text"
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-7B-Instruct", token=HF_TOKEN)

# Load the image (Replace with actual file path)
image_path = "/Users/akhsrip/Desktop/Projects:Assignments/AidenAI_Assign/Data/test.jpg"
image = Image.open(image_path).convert("RGB")

# Define the instruction prompt
messages = [{"role": "user", "content": "Extract all handwritten and printed text from this image."}]

# ✅ Use pipeline correctly (No deprecated `prompt` argument)
result = pipe(image, messages)

# Print extracted text
print("Extracted Text:", result[0]['generated_text'])


Loading checkpoint shards: 100%|██████████| 5/5 [00:36<00:00,  7.36s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 36.26 GB, other allocations: 384.00 KB, max allowed: 36.27 GB). Tried to allocate 18.75 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [6]:
from transformers import pipeline
from PIL import Image
import os

# Use environment variable or replace with a new token (do NOT expose it publicly)
HF_TOKEN = "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"# Set this in your environment variables

# Load the pipeline with authentication
pipe = pipeline(
    "image-text-to-text", 
    model="OpenGVLab/InternVL2_5-8B", 
    trust_remote_code=True, 
    use_auth_token=HF_TOKEN  # Fixed variable name
)

# Load an image (Fixed invalid path issue)
image_path = "/Users/akhsrip/Desktop/Projects/Assignments/AidenAI_Assign/Data/test.jpg"  
image = Image.open(image_path).convert("RGB")

# Instruction for text extraction
messages = [
    {"role": "user", "content": "Extract the text from this image."},
]

# Run the pipeline
output = pipe(image, messages)

# Print the extracted text
print("Extracted Text:", output)


NameError: name 'hf_token' is not defined