In [1]:
# Install the notebook's runtime dependencies into the kernel's environment.
# NOTE(review): no versions are pinned here, so a fresh run may resolve to
# different releases than the ones listed in the captured output below.
%pip install pillow numpy opencv-python easyocr accelerate transformers hf_xet jupyter ipywidgets

Collecting torch
  Downloading torch-2.8.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting pillow
  Using cached pillow-11.3.0-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Collecting numpy
  Downloading numpy-2.3.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting easyocr
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from torch)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.9.0-py3-none-any.

In [2]:
# Install torch and torchvision from the PyTorch cu129 wheel index.
# NOTE(review): the previous install cell's output already shows a torch wheel
# being collected as a dependency; consider running this cell first so the
# build from this index is the one that ends up installed — confirm.
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129

Looking in indexes: https://download.pytorch.org/whl/cu129
Collecting torch
  Downloading https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu129/torchvision-0.23.0%2Bcu129-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Downloading https://download.pytorch.org/whl/cu129/torch-2.8.0%2Bcu129-cp311-cp311-win_amd64.whl (3571.8 MB)
   ---------------------------------------- 0.0/3.6 GB ? eta -:--:--
   ---------------------------------------- 0.0/3.6 GB ? eta -:--:--
   ---------------------------------------- 0.0/3.6 GB ? eta -:--:--
   ---------------------------------------- 0.0/3.6 GB 1.9 MB/s eta 0:31:03
   ---------------------------------------- 0.0/3.6 GB 3.1 MB/s eta 0:18:55
   ---------------------------------------- 0.0/3.6 GB 3.5 MB/s eta 0:16:50
   ---------------------------------------- 0.0/3.6 GB 4.3 MB/s eta 0:13:48
   --------------------------------------

# EasyOCR Only

In [1]:
from PIL import Image, ImageDraw
import numpy as np
import cv2
import re
import os
import json
from typing import Dict, List, Tuple
import easyocr
import torch

class GenericHandwritingOCR:
    """Read a form image with EasyOCR and pair label text with nearby value text.

    The pipeline is: run EasyOCR on the image, clean each recognised string,
    classify every region as a label or a value, then pair each label with the
    closest value on the same row to its right.

    The class-level constants below hold the tunable heuristics so they can be
    overridden on a subclass or an instance without editing the methods.
    """

    # Substrings that mark a (lower-cased) region text as a field label.
    # 'plin' is kept because LABEL_MAPPINGS treats 'plin code' as 'Pin Code'.
    LABEL_KEYWORDS = (
        'name', 'first', 'middle', 'last', 'gender', 'date', 'birth',
        'address', 'line', 'city', 'state', 'phone', 'email', 'code', 'pin', 'plin',
    )

    # Normalised label text -> canonical field name. Several entries are
    # misspellings mapped onto the field they stand for.
    LABEL_MAPPINGS = {
        'first name': 'First Name', 'first': 'First Name',
        'middle name': 'Middle Name', 'middle': 'Middle Name', 'midde': 'Middle Name', 'manne': 'Middle Name',
        'last name': 'Last Name', 'last': 'Last Name',
        'date of birth': 'Date of Birth', 'birth': 'Date of Birth', 'date': 'Date of Birth',
        'address line 1': 'Address Line 1', 'line 1': 'Address Line 1', 'address': 'Address Line 1',
        'address line 2': 'Address Line 2', 'line 2': 'Address Line 2',
        'city': 'City',
        'state': 'State',
        'pin code': 'Pin Code', 'plin code': 'Pin Code',
        'phone number': 'Phone Number', 'phone': 'Phone Number', 'mumbers': 'Phone Number',
        'email id': 'Email ID', 'email': 'Email ID',
        'gender': 'Gender'
    }

    # Layout heuristics, in pixels of the source image.
    LABEL_MAX_X = 200           # short alphabetic text left of this x counts as a label
    SAME_ROW_TOLERANCE = 50     # max vertical centre distance to count as the same row
    RIGHT_OF_LABEL_SLACK = 50   # a value may start this far left of the label's right edge
    MAX_PAIR_DISTANCE = 300     # pairs scoring at or above this distance are rejected

    def __init__(self):
        """Initializes the OCR class, selects the device and loads the model."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.text_detector = None
        self.load_model()

    def load_model(self):
        """Load the EasyOCR reader into ``self.text_detector``.

        Failures are reported on stdout and leave ``self.text_detector`` as
        ``None``; callers check for that instead of catching an exception.
        """
        try:
            print("Loading EasyOCR text detection and recognition model...")
            self.text_detector = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
            print("✅ EasyOCR model loaded successfully.")
        except Exception as e:
            print(f"❌ Model loading failed: {e}")

    def clean_text(self, text: str) -> str:
        """Normalise one recognised string.

        Whitespace is collapsed and a trailing period is dropped. After that:

        * text made up only of digits, dashes and spaces (phone-like) keeps
          just its digits and dashes;
        * text made up only of digits, dashes, slashes and spaces (date-like)
          keeps just its digits, dashes and slashes;
        * text containing ``@`` loses everything except word characters,
          ``@``, ``.`` and ``-``;
        * anything else keeps word characters, whitespace, ``@``, ``.`` and ``-``.

        Args:
            text: Raw text for one region; may be empty or ``None``.

        Returns:
            The cleaned text, or ``""`` when nothing usable remains.
        """
        if not text or len(text.strip()) == 0:
            return ""

        # Basic cleaning
        text = text.strip()
        text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space

        # Remove trailing periods that are common OCR artifacts
        text = re.sub(r'\s*\.\s*$', '', text)

        # The numeric checks must match the WHOLE string. A prefix match would
        # treat any text that merely starts with a digit as a phone number and
        # throw away the rest of it.
        if re.fullmatch(r'[\d\-\s]+', text):
            return re.sub(r'[^\d\-]', '', text)

        # Date-like text returns early so the '/' separators survive the
        # general cleanup further down.
        if re.fullmatch(r'[\d\-/\s]+', text):
            return re.sub(r'[^\d\-/]', '', text)

        # Email addresses
        if '@' in text:
            text = re.sub(r'[^\w@.-]', '', text)

        # General cleanup - keep alphanumeric, spaces, and common punctuation
        text = re.sub(r'[^\w\s@.-]', '', text)

        return text.strip()

    def create_debug_visualization(self, image_path: str, regions: List[Dict]):
        """Save a copy of the image with every region's bounding box outlined.

        Args:
            image_path: Path of the image that was processed.
            regions: Region dicts; each needs a ``'bbox'`` of ``[x1, y1, x2, y2]``
                and may carry an ``'id'`` used as the caption.

        The result is written to ``debug_detection.png`` in the working directory.
        """
        # Copy the pixels and close the source file straight away so the
        # handle is not kept open for the rest of the session.
        with Image.open(image_path) as image:
            debug_image = image.copy()
        draw = ImageDraw.Draw(debug_image)

        colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown',
                  'pink', 'gray', 'olive', 'navy', 'cyan', 'magenta']

        for i, region in enumerate(regions):
            color = colors[i % len(colors)]
            x1, y1, x2, y2 = region['bbox']
            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
            region_id = region.get('id', f'region_{i}')
            # Clamp so captions of boxes touching the top edge stay on the canvas.
            draw.text((x1, max(0, y1 - 15)), region_id, fill=color)

        debug_path = 'debug_detection.png'
        debug_image.save(debug_path)
        print(f"🔍 Debug image saved: {debug_path}")

    def pair_labels_with_values(self, regions: List[Dict]) -> Dict[str, str]:
        """Pair label regions with their corresponding value regions based on spatial proximity.

        Args:
            regions: Region dicts with ``'text'`` and a ``'position'`` dict
                holding ``x``, ``y``, ``width`` and ``height``.

        Returns:
            Mapping of formatted label name to cleaned value text. When two
            labels format to the same name, the first pairing is kept.
        """
        # Sort regions by position (top to bottom, left to right)
        sorted_regions = sorted(regions, key=lambda r: (r['position']['y'], r['position']['x']))

        # First pass: split regions into potential labels and values
        labels = []
        values = []
        for idx, region in enumerate(sorted_regions):
            text = region['text'].lower().strip()

            is_label = any(keyword in text for keyword in self.LABEL_KEYWORDS)

            # Fallback: short alphabetic text near the left margin is a label.
            if not is_label and len(text.split()) <= 3 and any(ch.isalpha() for ch in text):
                is_label = region['position']['x'] < self.LABEL_MAX_X

            (labels if is_label else values).append((idx, region))

        print(f"🏷️  Found {len(labels)} potential labels and {len(values)} potential values")

        # Second pass: give each label the nearest unused value on its row
        key_value_pairs = {}
        used_values = set()
        for _, label_region in labels:
            label_pos = label_region['position']
            label_center_y = label_pos['y'] + label_pos['height'] / 2
            label_right = label_pos['x'] + label_pos['width']

            best_value = None
            best_value_idx = -1
            min_distance = float('inf')

            for value_idx, value_region in values:
                if value_idx in used_values:
                    continue

                value_pos = value_region['position']
                value_center_y = value_pos['y'] + value_pos['height'] / 2
                value_left = value_pos['x']

                vertical_distance = abs(value_center_y - label_center_y)
                horizontal_distance = abs(value_left - label_right)

                is_same_row = vertical_distance < self.SAME_ROW_TOLERANCE
                is_to_the_right = value_left > label_right - self.RIGHT_OF_LABEL_SLACK
                if not (is_same_row and is_to_the_right):
                    continue

                # Horizontal gap dominates; vertical offset only breaks ties.
                distance = horizontal_distance + vertical_distance * 0.1
                if distance < min_distance:
                    min_distance = distance
                    best_value = value_region
                    best_value_idx = value_idx

            if best_value is None or min_distance >= self.MAX_PAIR_DISTANCE:
                continue

            clean_label = self.format_label(label_region['text'])
            clean_value = self.clean_text(best_value['text'])

            # Avoid overwriting a field with a less likely candidate
            if clean_label and clean_value and clean_label not in key_value_pairs:
                key_value_pairs[clean_label] = clean_value
                used_values.add(best_value_idx)
                print(f"✅ Paired: '{clean_label}' -> '{clean_value}'")

        return key_value_pairs

    def format_label(self, text: str) -> str:
        """Format label text into a proper field name.

        Lookup order: exact alias match, then the longest alias contained in
        the text, then the first alias that contains the text, and finally a
        title-cased version of the text itself.

        Args:
            text: Raw label text.

        Returns:
            The canonical field name, or ``""`` when the text has no word
            characters left after punctuation is removed.
        """
        text = text.lower().strip()
        text = re.sub(r'[^\w\s]', '', text).strip()

        # Without this guard the "text in key" test below is true for every
        # alias and empty input would be reported as 'First Name'.
        if not text:
            return ""

        mappings = self.LABEL_MAPPINGS
        if text in mappings:
            return mappings[text]

        # Longest alias first, so 'address line 2' wins over plain 'address'.
        for key in sorted(mappings, key=len, reverse=True):
            if key in text:
                return mappings[key]

        # Truncated labels (e.g. 'lin') still resolve through containment.
        for key, value in mappings.items():
            if text in key:
                return value

        return ' '.join(word.capitalize() for word in text.split())

    def process_image(self, image_path: str) -> Dict:
        """Main processing function: detect and recognize text using EasyOCR.

        Args:
            image_path: Path to the image to read.

        Returns:
            On success a dict with ``image_path``, ``total_regions``,
            ``form_fields`` (label -> value) and ``raw_regions`` (every kept
            region, ordered top-to-bottom then left-to-right). On failure a
            dict with a single ``'error'`` message.
        """
        print(f"\n🔍 Processing: {os.path.basename(image_path)}")

        if self.text_detector is None:
            return {'error': 'Model not loaded'}

        # Step 1: Use EasyOCR to get bounding boxes, text, and confidence in one go
        try:
            results = self.text_detector.readtext(image_path, detail=1, paragraph=False)
            print(f"🔍 Detected {len(results)} text regions")
        except Exception as e:
            print(f"❌ Text detection and recognition failed: {e}")
            return {'error': f'EasyOCR failed: {e}'}

        if not results:
            return {'error': 'No text regions detected'}

        # Step 2: Format the results into the required structure
        processed_regions = []
        for i, (bbox, text, confidence) in enumerate(results):
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                print(f"Region {i} ('{text}') was empty after cleaning.")
                continue

            # Reduce the detected polygon to an axis-aligned (x1, y1, x2, y2) box
            bbox_array = np.array(bbox)
            x1, y1 = bbox_array.min(axis=0).astype(int)
            x2, y2 = bbox_array.max(axis=0).astype(int)

            processed_regions.append({
                'id': f'text_region_{i}',
                'text': cleaned_text,
                'bbox': [int(x1), int(y1), int(x2), int(y2)],
                'confidence': float(confidence),
                'position': {
                    'x': int(x1),
                    'y': int(y1),
                    'width': int(x2 - x1),
                    'height': int(y2 - y1)
                }
            })
            print(f"Processing text_region_{i}... '{cleaned_text}'")

        # Create debug visualization
        self.create_debug_visualization(image_path, processed_regions)

        # Pair labels and values
        key_value_pairs = self.pair_labels_with_values(processed_regions)

        output_data = {
            'image_path': image_path,
            'total_regions': len(processed_regions),
            'form_fields': key_value_pairs,
            'raw_regions': {}
        }

        # Include raw regions for debugging
        sorted_regions_for_output = sorted(processed_regions, key=lambda r: (r['position']['y'], r['position']['x']))
        for i, region in enumerate(sorted_regions_for_output):
            key = f"text_field_{i+1}"
            output_data['raw_regions'][key] = {
                'value': region['text'],
                'confidence': region['confidence'],
                'bbox': region['bbox'],
                'position': region['position']
            }

        return output_data

def main():
    """Run EasyOCR on the sample form image and save the result as JSON.

    Looks for ``image.png`` in a few locations under the current working
    directory, processes the first one found, writes ``output.json`` and
    prints the extracted form fields.
    """
    print("🎯 Handwriting OCR with EasyOCR")

    ocr = GenericHandwritingOCR()
    if ocr.text_detector is None:
        print("❌ Cannot proceed without the model.")
        return

    # Use os.getcwd() for compatibility with notebooks
    base_dir = os.getcwd()
    possible_paths = [
        os.path.join(base_dir, *parts)
        for parts in (
            ("Images", "Handwriting", "image.png"),
            ("images", "handwriting", "image.png"),
            ("image.png",),
        )
    ]

    # First candidate that exists on disk, or None when there is none.
    image_path = next((candidate for candidate in possible_paths if os.path.exists(candidate)), None)

    if not image_path:
        print("❌ Image not found. Please ensure 'image.png' is in one of the following locations:")
        for candidate in possible_paths:
            print(f"   {candidate}")
        return

    print(f"📸 Found image: {image_path}")

    # Process the image
    results = ocr.process_image(image_path)
    if 'error' in results:
        print(f"❌ Error: {results['error']}")
        return

    output_file = 'output.json'
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(results, out, indent=2, ensure_ascii=False)

    separator = "=" * 50
    print(f"\n📋 EXTRACTED FORM FIELDS:")
    print(separator)

    form_fields = results.get('form_fields')
    if form_fields:
        for field_name, field_value in form_fields.items():
            print(f"{field_name}: {field_value}")
    else:
        print("No form fields detected")

    print(separator)
    print(f"✅ Successfully processed {results['total_regions']} text regions.")
    print(f"📁 Results saved to: {output_file}")


if __name__ == "__main__":
    main()

🎯 Handwriting OCR with EasyOCR
Using device: cuda
Loading EasyOCR text detection and recognition model...
✅ EasyOCR model loaded successfully.
📸 Found image: c:\Users\Darsh Veer Singh\Documents\GitHub\MOSIP-TextReading\MOSIP-TextReading\Images\Handwriting\image.png

🔍 Processing: image.png
🔍 Detected 35 text regions
Processing text_region_0... 'Fiust'
Processing text_region_1... 'Ma'
Processing text_region_2... 'Abigail'
Processing text_region_3... 'Midde'
Processing text_region_4... 'Mone'
Processing text_region_5... 'Guuce'
Processing text_region_6... 'Leust'
Processing text_region_7... 'Mame'
Processing text_region_8... 'Summ4'
Processing text_region_9... 'Gundu1'
Processing text_region_10... 'Femal'
Processing text_region_11... 'Datt a'
Processing text_region_12... 'Bith'
Processing text_region_13... '27-092000'
Processing text_region_14... 'Aderess'
Processing text_region_15... 'Lin'
Processing text_region_16... 'Raad 1'
Processing text_region_17... '2'
Processing text_region_18..

# Kosmos 2.5

In [None]:
import torch
from PIL import Image, ImageDraw
import re
import os
import json
from transformers import AutoProcessor, AutoModelForVision2Seq
from typing import Dict, List

class KosmosOcr:
    """Extract text regions and ``key: value`` form fields with KOSMOS-2.5.

    NOTE(review): the prompt used in ``process_image`` and the ``<box_...>``
    tag format expected by ``parse_kosmos_output`` have not been checked
    against real model output in this notebook — confirm both against the
    KOSMOS-2.5 model documentation before relying on the results.
    """

    # Side length of the normalised grid the parsed box coordinates are
    # assumed to live on — TODO confirm against actual model output.
    BBOX_GRID_SIZE = 1000

    def __init__(self):
        """Initializes the OCR class and loads the KOSMOS-2.5 model."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.model = None
        self.processor = None
        self.load_model()

    def load_model(self):
        """Load the KOSMOS-2.5 model and processor.

        Failures are reported on stdout and leave ``self.model`` and/or
        ``self.processor`` as ``None``; callers check for that.
        """
        try:
            model_id = "microsoft/kosmos-2.5"
            print(f"Loading KOSMOS-2.5 model from {model_id}...")
            self.model = AutoModelForVision2Seq.from_pretrained(model_id).to(self.device)
            self.processor = AutoProcessor.from_pretrained(model_id)
            print("✅ KOSMOS-2.5 model loaded successfully.")
        except Exception as e:
            print(f"❌ Model loading failed: {e}")

    def parse_kosmos_output(self, text_output: str) -> List[Dict]:
        """Parse generated text into regions of text plus bounding box.

        Every ``<text><box_x1,y1,x2,y2>`` occurrence becomes one region.

        Args:
            text_output: Decoded model output; empty or ``None`` yields ``[]``.

        Returns:
            A list of dicts with ``id``, ``text``, ``bbox_1000`` (the four
            integers exactly as they appear in the tag) and ``position``
            (``x``, ``y``, ``width``, ``height`` derived from that box).
            Matches whose text is blank are skipped, but still consume an
            index, so ids follow the match order.
        """
        if not text_output:
            return []

        # Captures the text and the four coordinates inside the <box_...> tag
        pattern = re.compile(r"([^<]+)<box_(\d+),(\d+),(\d+),(\d+)>\s*")

        parsed_regions = []
        for i, (text, x1, y1, x2, y2) in enumerate(pattern.findall(text_output)):
            text = text.strip()
            if not text:
                continue

            # Coordinates are stored unscaled; _scale_bbox converts to pixels.
            bbox = [int(x1), int(y1), int(x2), int(y2)]
            parsed_regions.append({
                'id': f'text_region_{i}',
                'text': text,
                'bbox_1000': bbox,
                'position': {
                    'x': bbox[0], 'y': bbox[1],
                    'width': bbox[2] - bbox[0], 'height': bbox[3] - bbox[1]
                }
            })
        return parsed_regions

    def _scale_bbox(self, bbox_1000: List[int], image_width: int, image_height: int) -> List[int]:
        """Convert a grid-space box to integer pixel coordinates."""
        grid = self.BBOX_GRID_SIZE
        return [
            int(bbox_1000[0] * image_width / grid),
            int(bbox_1000[1] * image_height / grid),
            int(bbox_1000[2] * image_width / grid),
            int(bbox_1000[3] * image_height / grid),
        ]

    def _extract_key_values(self, regions: List[Dict]) -> Dict[str, str]:
        """Split each region's text on its first ':' into a key/value pair."""
        key_value_pairs = {}
        for region in regions:
            key, separator, value = region['text'].partition(':')
            key = key.strip()
            # Text that starts with ':' has no usable field name.
            if separator and key:
                key_value_pairs[key] = value.strip()
        return key_value_pairs

    def create_debug_visualization(self, image_path: str, regions: List[Dict]):
        """Save a copy of the image with every region's box and text drawn on it.

        Args:
            image_path: Path of the image that was processed.
            regions: Region dicts carrying ``'bbox_1000'`` and ``'text'``.
                Each dict also gets a pixel-space ``'bbox'`` entry written
                into it.

        The result is written to ``debug_detection_kosmos.png``.
        """
        # Copy the pixels and close the source file straight away so the
        # handle is not kept open for the rest of the session.
        with Image.open(image_path) as image:
            original_width, original_height = image.size
            debug_image = image.copy()
        draw = ImageDraw.Draw(debug_image)

        colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown']

        for i, region in enumerate(regions):
            color = colors[i % len(colors)]

            x1, y1, x2, y2 = self._scale_bbox(region['bbox_1000'], original_width, original_height)
            region['bbox'] = [x1, y1, x2, y2]  # Add scaled bbox to dict

            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
            # Clamp so captions of boxes touching the top edge stay on the canvas.
            draw.text((x1, max(0, y1 - 15)), region['text'], fill=color)

        debug_path = 'debug_detection_kosmos.png'
        debug_image.save(debug_path)
        print(f"🔍 Debug image saved: {debug_path}")

    def process_image(self, image_path: str) -> Dict:
        """Main processing function to extract key-value pairs using KOSMOS-2.5.

        Args:
            image_path: Path to the image to read.

        Returns:
            On success a dict with ``image_path``, ``total_regions``,
            ``form_fields`` and ``raw_regions``; ``{'error': ...}`` when the
            model or processor is not loaded.
        """
        print(f"\n🔍 Processing with KOSMOS-2.5: {os.path.basename(image_path)}")

        if self.model is None or self.processor is None:
            return {'error': 'Model not loaded'}

        # convert() returns a detached copy, so the file can be closed at once.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        image_width, image_height = image.size

        # NOTE(review): confirm this prompt is one the KOSMOS-2.5 checkpoint
        # was trained with; the parser depends on the tag format it produces.
        prompt = "<grounding>Perform OCR on this image and provide bounding boxes for each piece of text."

        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.device)

        # --- DO NOT CONVERT TO FLOAT16 ---

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                use_cache=True,
                max_new_tokens=512,
            )

        decoded_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        parsed_answer = decoded_text.replace(prompt, "").strip()
        print(f"🤖 KOSMOS-2.5 Raw Output:\n{parsed_answer}")

        raw_regions = self.parse_kosmos_output(parsed_answer)

        # Fill in pixel-space boxes here so the returned data does not depend
        # on the debug drawing having run.
        for region in raw_regions:
            region['bbox'] = self._scale_bbox(region['bbox_1000'], image_width, image_height)

        key_value_pairs = self._extract_key_values(raw_regions)

        self.create_debug_visualization(image_path, raw_regions)

        output_data = {
            'image_path': image_path,
            'total_regions': len(raw_regions),
            'form_fields': key_value_pairs,
            'raw_regions': {}
        }

        for i, region in enumerate(raw_regions):
            key = f"text_field_{i+1}"
            output_data['raw_regions'][key] = {
                'value': region['text'],
                'bbox': region.get('bbox', []),
                'position': region['position']
            }

        return output_data

def main():
    """Run KOSMOS-2.5 on the sample form image and save the result as JSON.

    Tries ``Images/Handwriting/image.png`` under the current working
    directory, then ``image.png``; writes ``output_kosmos.json`` and prints
    the extracted form fields.
    """
    print("🎯 Handwriting Form OCR with KOSMOS-2.5")

    ocr = KosmosOcr()
    if ocr.model is None:
        print("❌ Cannot proceed without the model.")
        return

    candidates = (
        os.path.join(os.getcwd(), "Images", "Handwriting", "image.png"),
        "image.png",  # Fallback to current directory
    )
    image_path = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
    if image_path is None:
        print(f"❌ Image not found. Please ensure 'image.png' is present.")
        return

    print(f"📸 Found image: {image_path}")

    results = ocr.process_image(image_path)
    if 'error' in results:
        print(f"❌ Error: {results['error']}")
        return

    output_file = 'output_kosmos.json'
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(results, out, indent=2, ensure_ascii=False)

    separator = "=" * 50
    print(f"\n📋 EXTRACTED FORM FIELDS:")
    print(separator)

    form_fields = results.get('form_fields')
    if form_fields:
        for field_name, field_value in form_fields.items():
            print(f"{field_name}: {field_value}")
    else:
        print("No structured form fields were extracted. Check raw output.")

    print(separator)
    print(f"✅ Successfully processed {results['total_regions']} text regions.")
    print(f"📁 Results saved to: {output_file}")


if __name__ == "__main__":
    main()

🎯 Handwriting Form OCR with KOSMOS-2.5
Using device: cuda
Loading KOSMOS-2.5 model from microsoft/kosmos-2.5...


KeyboardInterrupt: 

: 