In [40]:
import os
import base64
from pathlib import Path
from typing import List, Dict, Optional, Union
from dotenv import load_dotenv
import json
import re
from datetime import datetime

# LangChain imports
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic

# For image processing
from PIL import Image
import io

print("All required libraries imported successfully!")

All required libraries imported successfully!


In [41]:
# Global Configuration Variables for Image Processing
# OCRBenchmark.__init__ copies these values onto each instance, so changing
# them here only affects instances created afterwards.
# Set to False to disable image resizing/optimization
# (the original file bytes are then sent unchanged).
ENABLE_IMAGE_OPTIMIZATION = False
# Resize ratio (0.5 = 50% of original size, 1.0 = no resize)
IMAGE_RESIZE_RATIO = 0.5
# Used as the bound for BOTH width and height when thumbnailing.
MAX_IMAGE_DIMENSION = 1024  # Maximum width or height in pixels
MAX_FILE_SIZE_MB = 4  # Maximum file size in MB before optimization
# Only applied when the source file has a .jpg/.jpeg extension.
JPEG_QUALITY = 85  # JPEG compression quality (1-100)

# JSON Parsing and Formatting Utilities
def parse_json_from_text(text: str):
    """
    Extract and parse JSON from text that might contain markdown code blocks or other formatting.

    Strategies are tried in order and the first success wins:

    1. Parse the whole text as JSON.
    2. Parse the content of fenced code blocks (```json fences first, then
       plain ``` fences). Only content that starts with ``{`` or ``[`` is
       considered.
    3. Scan for the first ``{`` from which a complete JSON object can be
       decoded. A real JSON decoder is used instead of a regex so that
       nested objects and braces inside string values are handled.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value, or ``text`` unchanged when nothing parses.
    """
    try:
        # First try to parse directly
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try to extract JSON from markdown code blocks
    fence_patterns = (
        r'```json\s*(.*?)```',  # ```json ... ```
        r'```\s*(.*?)```',      # ``` ... ```
    )
    for pattern in fence_patterns:
        for candidate in re.findall(pattern, text, re.DOTALL):
            candidate = candidate.strip()
            # Skip fences that hold something other than a JSON container
            # (e.g. a language tag followed by code).
            if not candidate.startswith(('{', '[')):
                continue
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

    # Fall back to the first decodable JSON object embedded in the text.
    # raw_decode stops at the end of the object, so trailing prose is fine.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError:
            start = text.find('{', start + 1)

    # If all parsing fails, return the original text
    return text

def format_extraction_result(result: Dict) -> Dict:
    """
    Annotate a successful extraction result with its parsed JSON form.

    The dict is modified in place and also returned. Results that are not
    marked successful, or that carry no "extracted_data", pass through
    untouched. Otherwise "extraction_format" is set to "json" (and the parsed
    value stored under "extracted_data_parsed") when the extracted text
    decodes to a dict or list, and to "text" in every other case. If parsing
    raises, the message is recorded under "parsing_error".
    """
    # Guard clause: nothing to format for failed or empty results.
    if not result.get("success") or "extracted_data" not in result:
        return result

    try:
        candidate = parse_json_from_text(result["extracted_data"])
        is_structured = isinstance(candidate, (dict, list))
        if is_structured:
            result["extracted_data_parsed"] = candidate
        result["extraction_format"] = "json" if is_structured else "text"
    except Exception as err:
        # Best effort: a parsing problem must not discard the raw extraction.
        result["extraction_format"] = "text"
        result["parsing_error"] = str(err)

    return result


class OCRBenchmark:
    """
    A model-agnostic OCR system using LangChain that can extract data from images
    using different AI models (Gemini, OpenAI GPT-4V, Claude).

    Typical use: construct with a model name, then call
    ``extract_data_from_image`` for one file or ``process_directory`` for a
    folder. Extraction methods report problems through a result dict with
    ``success == False`` instead of raising, so a batch run is not aborted by
    a single bad image.
    """

    def __init__(self, model_name: str = "gemini", temperature: float = 0.1):
        """
        Initialize the OCR Benchmark system.

        Loads environment variables via ``load_dotenv()`` and tries to build
        the chat model immediately. A ``ValueError`` during model creation
        (e.g. a missing API key) is printed as a warning and initialization
        is retried lazily on first use.

        Args:
            model_name: The model to use ("gemini", "openai", "anthropic")
            temperature: Temperature for model generation
        """
        # Load environment variables
        load_dotenv()

        self.model_name = model_name
        self.temperature = temperature
        self.model = None

        # Supported image formats
        self.supported_formats = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}

        # MIME type mapping for correct image format handling
        self.mime_type_map = {
            '.jpg': 'jpeg',
            '.jpeg': 'jpeg',
            '.png': 'png',
            '.webp': 'webp',
            '.gif': 'gif'
        }

        # Image optimization settings (now using global variables)
        self.enable_optimization = ENABLE_IMAGE_OPTIMIZATION
        self.resize_ratio = IMAGE_RESIZE_RATIO
        self.max_image_size = (MAX_IMAGE_DIMENSION, MAX_IMAGE_DIMENSION)
        self.max_file_size_mb = MAX_FILE_SIZE_MB
        self.jpeg_quality = JPEG_QUALITY

        # Try to initialize model, but don't fail if API key is missing
        try:
            self.model = self._initialize_model()
        except ValueError as e:
            print(f"‚ö†Ô∏è Warning: {e}")
            print(
                f"Model '{model_name}' will be initialized when needed if API key becomes available.")

    @staticmethod
    def _read_api_key(env_var: str) -> Optional[str]:
        """
        Return the API key stored in *env_var*, or None when it is unusable.

        A key is unusable when the variable is unset, empty, or still holds
        the placeholder ``your_<env_var lowercased>_here``
        (e.g. ``your_google_api_key_here``).
        """
        value = os.getenv(env_var)
        if not value or value == f"your_{env_var.lower()}_here":
            return None
        return value

    def _require_api_key(self, env_var: str) -> str:
        """Return the key from *env_var* or raise ValueError if it is unusable."""
        api_key = self._read_api_key(env_var)
        if api_key is None:
            raise ValueError(
                f"{env_var} not found or not set. Please set your {env_var} in the .env file")
        return api_key

    def _initialize_model(self):
        """
        Initialize the specified model.

        Returns:
            A LangChain chat model for ``self.model_name`` (case-insensitive).

        Raises:
            ValueError: If the provider's API key is missing/placeholder, or
                the model name is not one of "gemini", "openai", "anthropic".
        """
        provider = self.model_name.lower()

        if provider == "gemini":
            return ChatGoogleGenerativeAI(
                model="gemini-2.5-flash",  # Updated to newer model
                temperature=self.temperature,
                google_api_key=self._require_api_key("GOOGLE_API_KEY"),
                timeout=60  # Set reasonable timeout
            )

        if provider == "openai":
            # The key is validated here but not passed explicitly;
            # presumably the client picks it up from the environment.
            self._require_api_key("OPENAI_API_KEY")
            return ChatOpenAI(
                model="gpt-4o",
                temperature=self.temperature,
                timeout=60
            )

        if provider == "anthropic":
            # Same key validation as the other providers, so a missing key
            # yields the same kind of ValueError and warning.
            self._require_api_key("ANTHROPIC_API_KEY")
            return ChatAnthropic(
                model_name="claude-3-5-sonnet-20241022",
                temperature=self.temperature,
                timeout=60,
                stop=None
            )

        raise ValueError(f"Unsupported model: {self.model_name}")

    @staticmethod
    def _sniff_image_subtype(data: bytes) -> Optional[str]:
        """
        Identify the image format of *data* from its leading signature bytes.

        Returns:
            "jpeg", "png", "gif" or "webp", or None if no signature matches.
        """
        if data.startswith(b'\xff\xd8\xff'):
            return 'jpeg'
        if data.startswith(b'\x89PNG\r\n\x1a\n'):
            return 'png'
        if data.startswith((b'GIF87a', b'GIF89a')):
            return 'gif'
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return 'webp'
        return None

    def _optimize_image(self, image_path: str) -> bytes:
        """
        Optimize image size and quality for API processing.

        When optimization is disabled the original file bytes are returned.
        Otherwise the image is scaled by ``self.resize_ratio``, then capped to
        ``self.max_image_size`` if it is still larger than that (or if the
        source file exceeds ``self.max_file_size_mb``), and re-encoded: JPEG
        for .jpg/.jpeg files, PNG for everything else. The returned bytes can
        therefore be in a different format than the file extension suggests.

        Any failure falls back to returning the original file bytes.
        """
        try:
            # Check if optimization is enabled
            if not self.enable_optimization:
                print(f"üîß Image optimization disabled - using original file")
                with open(image_path, "rb") as f:
                    return f.read()

            # Check file size first
            file_size_mb = os.path.getsize(image_path) / (1024 * 1024)
            print(f"üìè Original file size: {file_size_mb:.2f} MB")

            with Image.open(image_path) as img:
                # Convert to RGB if necessary
                if img.mode in ('RGBA', 'P'):
                    img = img.convert('RGB')

                original_size = img.size
                print(
                    f"üìê Original dimensions: {original_size[0]}x{original_size[1]}")

                # Apply resize ratio if specified
                if self.resize_ratio != 1.0:
                    # Clamp to 1px so tiny images never produce a 0-sized target.
                    new_width = max(1, int(img.size[0] * self.resize_ratio))
                    new_height = max(1, int(img.size[1] * self.resize_ratio))
                    img = img.resize((new_width, new_height),
                                     Image.Resampling.LANCZOS)
                    print(
                        f"üîß Resized by ratio {self.resize_ratio}: {img.size[0]}x{img.size[1]}")

                # Additional resize if still too large. This is a separate
                # check (not an elif) so the cap also applies after a ratio
                # resize.
                if (img.size[0] > self.max_image_size[0] or
                        img.size[1] > self.max_image_size[1] or
                        file_size_mb > self.max_file_size_mb):

                    img.thumbnail(self.max_image_size,
                                  Image.Resampling.LANCZOS)
                    print(
                        f"üîß Thumbnail resize to: {img.size[0]}x{img.size[1]}")

                # Save to bytes with optimized quality
                img_byte_arr = io.BytesIO()

                # Choose format and quality based on original
                if Path(image_path).suffix.lower() in ['.jpg', '.jpeg']:
                    img.save(img_byte_arr, format='JPEG',
                             quality=self.jpeg_quality, optimize=True)
                    print(f"üíæ JPEG quality: {self.jpeg_quality}")
                else:
                    img.save(img_byte_arr, format='PNG', optimize=True)
                    print(f"üíæ PNG optimized")

                optimized_data = img_byte_arr.getvalue()
                optimized_size_mb = len(optimized_data) / (1024 * 1024)
                print(f"‚úÖ Final optimized size: {optimized_size_mb:.2f} MB")

                return optimized_data

        except Exception as e:
            # Deliberate best effort: optimization problems must not block OCR.
            print(f"‚ö†Ô∏è Image optimization failed, using original: {e}")
            # Fallback to original file
            with open(image_path, "rb") as f:
                return f.read()

    def _ensure_model_initialized(self):
        """
        Ensure model is initialized before use.

        Raises:
            ValueError: If the model cannot be created (cause is chained).
        """
        if self.model is None:
            try:
                self.model = self._initialize_model()
            except ValueError as e:
                raise ValueError(
                    f"Cannot initialize model '{self.model_name}': {e}") from e

    def encode_image(self, image_path: str) -> str:
        """Encode image to base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _failure(image_path: str, message: str) -> Dict:
        """Build the standard failure result for *image_path*."""
        return {
            "image_path": image_path,
            "error": message,
            "success": False,
            "timestamp": datetime.now().isoformat()
        }

    def extract_data_from_image(self, image_path: str, prompt: str) -> Dict:
        """
        Extract data from a single image using the specified prompt.

        Args:
            image_path: Path to the image file
            prompt: Custom prompt for data extraction

        Returns:
            Dictionary containing the extracted data and metadata. On success
            it has "success": True plus "extracted_data", "model_used",
            "prompt", "timestamp", "processing_time_seconds" and the fields
            added by ``format_extraction_result``. On any failure it has
            "success": False, "error", "image_path" and "timestamp".
        """
        import time  # function-scope import, as in the original implementation

        try:
            print(f"üöÄ Starting image processing...")

            # Ensure model is initialized
            self._ensure_model_initialized()

            # Validate image file
            if not os.path.exists(image_path):
                return self._failure(
                    image_path, f"Image file not found: {image_path}")

            # Check file extension
            file_ext = Path(image_path).suffix.lower()
            if file_ext not in self.supported_formats:
                return self._failure(
                    image_path, f"Unsupported image format: {file_ext}")

            # Get correct MIME type
            mime_type = self.mime_type_map.get(file_ext, file_ext[1:])
            print(
                f"üîç File extension: {file_ext} ‚Üí MIME type: image/{mime_type}")

            # Optimize image
            print(f"üñºÔ∏è Optimizing image...")
            image_data = self._optimize_image(image_path)

            # Optimization may re-encode (e.g. .webp/.gif -> PNG), and files
            # can carry a wrong extension; declare what the bytes really are.
            sniffed = self._sniff_image_subtype(image_data)
            if sniffed is not None and sniffed != mime_type:
                print(
                    f"MIME type corrected to image/{sniffed} to match the encoded bytes")
                mime_type = sniffed

            # Encode to base64
            print(f"üìù Encoding to base64...")
            base64_image = base64.b64encode(image_data).decode()
            base64_size_mb = len(base64_image) / (1024 * 1024)
            print(f"üìä Base64 size: {base64_size_mb:.2f} MB")

            # Construct data URL
            data_url = f"data:image/{mime_type};base64,{base64_image}"

            # Prepare message
            print(f"ü§ñ Sending to {self.model_name.upper()}...")
            message = HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": data_url}
                    }
                ]
            )

            # Get response and measure the round trip
            start_time = time.time()
            response = self.model.invoke([message])
            elapsed = time.time() - start_time
            print(f"‚è±Ô∏è API call completed in {elapsed:.2f} seconds")

            result = {
                "image_path": image_path,
                "model_used": self.model_name,
                "prompt": prompt,
                "extracted_data": response.content,
                "success": True,
                "timestamp": datetime.now().isoformat(),
                "processing_time_seconds": elapsed
            }

            # Format the result with parsed JSON data
            return format_extraction_result(result)

        except Exception as e:
            # Boundary handler: report every failure through the result dict.
            print(f"‚ùå Error during processing: {str(e)}")
            return self._failure(image_path, str(e))

    def get_images_from_directory(self, directory_path: str) -> List[str]:
        """
        Get all supported image files from a directory.

        Only direct children are considered (no recursion); the extension
        check is case-insensitive.

        Args:
            directory_path: Path to the directory containing images

        Returns:
            Sorted list of image file paths

        Raises:
            ValueError: If the path does not exist or is not a directory.
        """
        directory = Path(directory_path)
        if not directory.is_dir():
            raise ValueError(f"Directory not found: {directory_path}")

        return sorted(
            str(entry)
            for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in self.supported_formats
        )

    def save_results(self, results: List[Dict], output_file: str):
        """Save results to a JSON file."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    def save_results_formatted(self, results: List[Dict], output_file: str,
                               include_parsed_data: bool = True):
        """
        Save results with enhanced formatting and optional separation of parsed data.

        Writes ``output_file`` and, when ``include_parsed_data`` is true, a
        second file next to it named ``<output_file without extension>_structured_only.json``
        that holds only the entries with structured data. The input dicts are
        not modified (each one is shallow-copied first).

        Args:
            results: List of OCR results
            output_file: Path to save the formatted results
            include_parsed_data: Whether to include parsed JSON data separately
        """
        # Create formatted results
        formatted_results = []

        for result in results:
            formatted_result = result.copy()

            # If we have parsed JSON data, optionally restructure
            if include_parsed_data and "extracted_data_parsed" in result:
                # Move parsed data to top level for easier access
                formatted_result["structured_data"] = result["extracted_data_parsed"]

                # Keep original text for reference but make it shorter in display
                original_text = result["extracted_data"]
                if len(original_text) > 200:
                    formatted_result["extracted_data_preview"] = original_text[:200] + "..."
                    formatted_result["extracted_data_full"] = original_text
                    del formatted_result["extracted_data"]

            formatted_results.append(formatted_result)

        # Save with proper formatting
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(formatted_results, f, indent=2, ensure_ascii=False)

        print(f"üìÑ Formatted results saved to: {output_file}")

        # Also create a summary file with just the structured data
        if include_parsed_data:
            # Build the name from the extension-less path rather than with
            # str.replace('.json', ...): replace() leaves names without
            # ".json" unchanged, which made the summary overwrite output_file.
            base_path, _ = os.path.splitext(output_file)
            summary_file = base_path + '_structured_only.json'

            structured_only = [
                {
                    "image_path": entry["image_path"],
                    "model_used": entry["model_used"],
                    "success": entry["success"],
                    "structured_data": entry["structured_data"],
                    "timestamp": entry["timestamp"]
                }
                for entry in formatted_results
                if "structured_data" in entry
            ]

            with open(summary_file, 'w', encoding='utf-8') as f:
                json.dump(structured_only, f, indent=2, ensure_ascii=False)

            print(f"üìã Structured data summary saved to: {summary_file}")

    def process_directory(self, directory_path: str, prompt: str,
                          output_file: Optional[str] = None,
                          save_formatted: bool = True) -> List[Dict]:
        """
        Process all images in a directory with enhanced result formatting.

        Args:
            directory_path: Directory whose supported images are processed
                in sorted order.
            prompt: Prompt sent with every image.
            output_file: If given, results are written there after the run.
            save_formatted: Choose ``save_results_formatted`` (True) or the
                plain ``save_results`` (False) for writing.

        Returns:
            One result dict per image, in processing order; an empty list if
            the directory contains no supported images.

        Raises:
            ValueError: If the directory does not exist.
        """
        image_files = self.get_images_from_directory(directory_path)

        if not image_files:
            print(f"No supported image files found in {directory_path}")
            return []

        print(f"Found {len(image_files)} image(s) to process...")

        results = []
        for i, image_path in enumerate(image_files, 1):
            image_name = Path(image_path).name
            print(f"Processing image {i}/{len(image_files)}: {image_name}")
            result = self.extract_data_from_image(image_path, prompt)
            results.append(result)

            if result["success"]:
                print(f"‚úÖ Successfully processed {image_name}")
                # Show preview of structured data if available
                if "extracted_data_parsed" in result:
                    print(f"üìä Extracted structured data format: {result.get('extraction_format', 'unknown')}")
            else:
                print(f"‚ùå Failed to process {image_name}: {result.get('error', 'Unknown error')}")

        # Save results with enhanced formatting
        if output_file:
            if save_formatted:
                self.save_results_formatted(results, output_file, include_parsed_data=True)
            else:
                self.save_results(results, output_file)
            print(f"Results saved to {output_file}")

        return results

    def change_model(self, model_name: str):
        """
        Change the model being used.

        The new model is built immediately; if that raises ValueError a
        warning is printed and creation is retried lazily on the next use.
        """
        self.model_name = model_name
        self.model = None  # Reset model
        try:
            self.model = self._initialize_model()
            print(f"‚úÖ Model changed to: {model_name}")
        except ValueError as e:
            print(f"‚ö†Ô∏è Warning: Failed to initialize {model_name}: {e}")
            print(f"Model will be initialized when needed if API key becomes available.")

    def check_api_keys(self) -> Dict[str, bool]:
        """
        Check which API keys are available.

        Returns:
            Mapping of model name ("gemini", "openai", "anthropic") to whether
            its environment variable holds a non-empty, non-placeholder value.
        """
        env_var_by_model = {
            "gemini": "GOOGLE_API_KEY",
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
        }
        return {
            model: self._read_api_key(env_var) is not None
            for model, env_var in env_var_by_model.items()
        }


# Notebook feedback: confirm the cell ran and echo the module-level image
# settings that new OCRBenchmark instances will pick up.
print("OCRBenchmark class defined successfully!")
print(
    f"üîß Global settings: Optimization={ENABLE_IMAGE_OPTIMIZATION}, Resize ratio={IMAGE_RESIZE_RATIO}")

OCRBenchmark class defined successfully!
üîß Global settings: Optimization=False, Resize ratio=0.5


In [42]:
# Example usage and demonstration
def create_sample_prompts() -> Dict[str, str]:
    """
    Create sample prompts for different OCR tasks.

    Returns:
        A new dict mapping prompt name to prompt text, in this order:
        "general_ocr", "payslip_extraction", "structured_data",
        "key_value_pairs", "cash_expense_extraction",
        "cash_expense_extraction_V2". The V2 cash-expense prompt differs from
        the first one only in its tax/VAT wording and an extra
        "Tax/VAT Recognition" guideline.
    """
    # NOTE: each key is defined exactly once. Repeating a key in a dict
    # literal silently keeps only the last value, which hides edits made to
    # the earlier copy.
    return {
        "general_ocr": "Extract all text from this image and format it nicely. Preserve the structure and layout as much as possible.",

        "payslip_extraction": """
        This is a payslip/salary slip image. Please extract the following information in JSON format:
        {
          "employee_name": "",
          "employee_id": "",
          "pay_period": "",
          "gross_salary": "",
          "net_salary": "",
          "deductions": [],
          "company_name": ""
        }
        If any field is not found, use null.
        """,

        "structured_data": """
        Extract all structured data from this image and organize it in a clear, readable format. 
        Identify tables, forms, or any structured information and present it clearly.
        """,

        "key_value_pairs": """
        Extract all key-value pairs from this image. Present them as:
        Key: Value
        Key: Value
        etc.
        """,

        # The Bangla examples below are literal Bengali script; they are part
        # of the instruction sent to the model and must stay valid Unicode.
        "cash_expense_extraction": """
You are an expert document processing assistant. Your task is to extract structured data from invoice/receipt images and return the information in a specific JSON format.

## Instructions:
1. Analyze the provided image carefully to identify all relevant financial information
2. Extract the data according to the JSON schema provided below
3. If any required field is not visible or unclear in the image, use reasonable defaults or null values
4. For monetary amounts, use numbers without currency symbols
5. For dates, use the format found in the document or convert to YYYY-MM-DD if possible
6. Be precise with calculations - ensure totals match the extracted item details
7. The document may contain text in both English and Bangla (Bengali). For Bangla text, provide English transliteration (not translation) - write the Bangla words using English letters to show how they sound. For Bangla numbers convert them to
    English numerals (e.g., "১২৩" becomes "123").
## Required JSON Schema:
Return your response as a valid JSON object following this exact structure:

{
  "payment_details": {
    "supplier": "string - Name of the supplier/vendor",
    "payment_account": "string - Account used for payment",
    "payment_date": "string - Date of payment",
    "payment_method": "string - Mode of payment (cash, card, bank transfer, etc.)",
    "ref_no": "string - Reference/invoice number",
    "tags": ["array of relevant tags"]
  },
  "item_details": [
    {
      "category": "string - Expense category",
      "description": "string - Item description (optional)",
      "quantity": "number - Quantity of items",
      "unit_price": "number - Price per unit",
      "total": "number - Total amount for this item"
    },
    {
      "category": "string - Expense category for item 2",
      "description": "string - Item description (optional)",
      "quantity": "number - Quantity of items",
      "unit_price": "number - Price per unit", 
      "total": "number - Total amount for this item"
    }
    // ... additional items as found in the document
  ],
  "attachment": "string - File reference if mentioned",
  "memo": "string - Any additional notes or memo",
  "totals": {
    "sub_total": "number - Subtotal before tax and discount",
    "sales_tax": {
      "is_percentage": "boolean - true if tax is %, false if fixed amount",
      "tax": "number - Tax rate / VAT rate (%)",
      "amount": "number - Calculated tax amount"
    },
    "discount": {
      "is_percentage": "boolean - true if discount is %, false if fixed amount", 
      "discount": "number - Discount rate (%)",
      "calculated_amount": "number - Calculated discount amount"
    },
    "total_amount": "number - Final total amount"
  }
}

## Extraction Guidelines:
- **Supplier**: Look for business name, vendor name, or "Bill To" information
- **Payment Account**: Extract account numbers, card details, or payment method info
- **Payment Date**: Find transaction date, invoice date, or payment date
- **Payment Method**: Identify if paid by cash, card, check, bank transfer, etc.
- **Reference Number**: Look for invoice #, receipt #, transaction ID, or reference number
- **Tags**: Generate relevant tags based on the business type or expense category
- **Item Details**: Extract ALL line items from the document - create a separate object for each item/product/service listed
- **Categories**: Classify expenses (office supplies, travel, meals, equipment, etc.)
- **Calculations**: Verify that item totals sum to subtotal, and final calculations are accurate
- **Language Handling**: If you encounter Bangla/Bengali text, transliterate it into English letters (e.g., "টাকা" becomes "taka", "নাম" becomes "naam"). Do not translate the meaning, just write how the Bangla words sound in English.

## Important Notes:
- Return ONLY the JSON object, no additional text or explanations
- If information is missing, use null for strings and 0 for numbers
- Ensure all numbers are numeric values, not strings
- **Extract every single line item** - the item_details array should contain one object for each product/service listed in the document
- Double-check mathematical accuracy of totals and ensure all item totals sum to the subtotal
- VAT and tax should be considered same.
- If no tax/VAT or discount is present, set the respective amounts to 0

Now please analyze the provided image and extract the data according to this schema.
        """,

        "cash_expense_extraction_V2": """
You are an expert document processing assistant. Your task is to extract structured data from invoice/receipt images and return the information in a specific JSON format.

## Instructions:
1. Analyze the provided image carefully to identify all relevant financial information
2. Extract the data according to the JSON schema provided below
3. If any required field is not visible or unclear in the image, use reasonable defaults or null values
4. For monetary amounts, use numbers without currency symbols
5. For dates, use the format found in the document or convert to YYYY-MM-DD if possible
6. Be precise with calculations - ensure totals match the extracted item details
7. The document may contain text in both English and Bangla (Bengali). For Bangla text, provide English transliteration (not translation) - write the Bangla words using English letters to show how they sound. For Bangla numbers convert them to
    English numerals (e.g., "১২৩" becomes "123").
## Required JSON Schema:
Return your response as a valid JSON object following this exact structure:

{
  "payment_details": {
    "supplier": "string - Name of the supplier/vendor",
    "payment_account": "string - Account used for payment",
    "payment_date": "string - Date of payment",
    "payment_method": "string - Mode of payment (cash, card, bank transfer, etc.)",
    "ref_no": "string - Reference/invoice number",
    "tags": ["array of relevant tags"]
  },
  "item_details": [
    {
      "category": "string - Expense category",
      "description": "string - Item description (optional)",
      "quantity": "number - Quantity of items",
      "unit_price": "number - Price per unit",
      "total": "number - Total amount for this item"
    },
    {
      "category": "string - Expense category for item 2",
      "description": "string - Item description (optional)",
      "quantity": "number - Quantity of items",
      "unit_price": "number - Price per unit", 
      "total": "number - Total amount for this item"
    }
    // ... additional items as found in the document
  ],
  "attachment": "string - File reference if mentioned",
  "memo": "string - Any additional notes or memo",
  "totals": {
    "sub_total": "number - Subtotal before tax and discount",
    "sales_tax": {
      "is_percentage": "boolean - true if tax/VAT is %, false if fixed amount",
      "tax": "number - Tax/VAT rate (%)", 
      "amount": "number - Calculated tax/VAT amount"
    },
    "discount": {
      "is_percentage": "boolean - true if discount is %, false if fixed amount", 
      "discount": "number - Discount rate (%)",
      "calculated_amount": "number - Calculated discount amount"
    },
    "total_amount": "number - Final total amount"
  }
}

## Extraction Guidelines:
- **Supplier**: Look for business name, vendor name, or "Bill To" information
- **Payment Account**: Extract account numbers, card details, or payment method info
- **Payment Date**: Find transaction date, invoice date, or payment date
- **Payment Method**: Identify if paid by cash, card, check, bank transfer, etc.
- **Reference Number**: Look for invoice #, receipt #, transaction ID, or reference number
- **Tags**: Generate relevant tags based on the business type or expense category
- **Item Details**: Extract ALL line items from the document - create a separate object for each item/product/service listed
- **Categories**: Classify expenses (office supplies, travel, meals, equipment, etc.)
- **Calculations**: Verify that item totals sum to subtotal, and final calculations are accurate
- **Language Handling**: If you encounter Bangla/Bengali text, transliterate it into English letters (e.g., "টাকা" becomes "taka", "নাম" becomes "naam"). Do not translate the meaning, just write how the Bangla words sound in English.
- **Tax/VAT Recognition**: Look for any tax-related terms including "Tax", "VAT", "Sales Tax", "Service Tax", "GST", or similar terms in both English and Bangla. Treat all of these as sales tax in the JSON structure.


## Important Notes:
- Return ONLY the JSON object, no additional text or explanations
- If information is missing, use null for strings and 0 for numbers
- Ensure all numbers are numeric values, not strings
- **Extract every single line item** - the item_details array should contain one object for each product/service listed in the document
- Double-check mathematical accuracy of totals and ensure all item totals sum to the subtotal
- VAT and tax should be considered same.
- If no tax/VAT or discount is present, set the respective amounts to 0

Now please analyze the provided image and extract the data according to this schema.
        """
    }


# Create sample prompts
# `sample_prompts` is a notebook-level name that later cells index by prompt key.
sample_prompts = create_sample_prompts()

print("Sample prompts created:")
for name, prompt in sample_prompts.items():
    print(f"\nüìù {name.upper()}:")
    # Preview at most the first 1000 characters. The trailing "..." is
    # printed unconditionally, even when the prompt was not truncated.
    print(f"   {prompt[:1000]}...")

print("\nReady to use! Check the next cells for usage examples.")

Sample prompts created:

üìù GENERAL_OCR:
   Extract all text from this image and format it nicely. Preserve the structure and layout as much as possible....

üìù PAYSLIP_EXTRACTION:
   
        This is a payslip/salary slip image. Please extract the following information in JSON format:
        {
          "employee_name": "",
          "employee_id": "",
          "pay_period": "",
          "gross_salary": "",
          "net_salary": "",
          "deductions": [],
          "company_name": ""
        }
        If any field is not found, use null.
        ...

üìù STRUCTURED_DATA:
   
        Extract all structured data from this image and organize it in a clear, readable format. 
        Identify tables, forms, or any structured information and present it clearly.
        ...

üìù KEY_VALUE_PAIRS:
   
        Extract all key-value pairs from this image. Present them as:
        Key: Value
        Key: Value
        etc.
        ...

üìù CASH_EXPENSE_EXTRACTION:
   
You are an 

In [47]:
# Initialize the OCR Benchmark system
# By default, it uses Gemini model
# `ocr` is a notebook-level name reused by the cells that follow.

try:
    # Initialize with Gemini (default)
    # NOTE: OCRBenchmark.__init__ catches a ValueError from model creation and
    # only prints a warning, so the success message below is printed even when
    # the model could not be created (e.g. missing API key).
    ocr = OCRBenchmark(model_name="gemini", temperature=0.1)
    print("‚úÖ OCR Benchmark system initialized successfully with Gemini!")
    print(f"Current model: {ocr.model_name}")
    
    # Check if we have images to process
    # Machine-specific absolute path; nothing is listed if it does not exist.
    # NOTE(review): the variable name and the message below say "Pay Slip",
    # but the path points at "Good Pictures" - confirm which is intended.
    pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/Good Pictures"
    if os.path.exists(pay_slip_dir):
        image_files = ocr.get_images_from_directory(pay_slip_dir)
        print(f"Found {len(image_files)} image(s) in Pay Slip directory")
        if image_files:
            print("Sample images:")
            for i, img in enumerate(image_files):
                print(f"  {i+1}. {Path(img).name}")
        
except Exception as e:
    print(f"‚ùå Error initializing OCR system: {e}")
    print("Please make sure to set your API keys in the .env file")

✅ OCR Benchmark system initialized successfully with Gemini!
Current model: gemini
Found 11 image(s) in Pay Slip directory
Sample images:
  1. IMG_20250826_154715.jpg
  2. IMG_20250826_154715_1.jpg
  3. IMG_20250826_154929.jpg
  4. IMG_20250826_155237.jpg
  5. IMG_20250826_155355.jpg
  6. IMG_20250826_155901.jpg
  7. IMG_20250826_160504.jpg
  8. IMG_20250826_160529.jpg
  9. IMG_20250826_160544.jpg
  10. IMG_20250826_160721.jpg
  11. IMG_20250826_160928.jpg


In [51]:
# Example 1: Process a single image
# Run the cash-expense extraction prompt on the first image found in the
# sample directory and print the raw model response.

try:
    pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/Good Pictures"
    if os.path.exists(pay_slip_dir):
        image_files = ocr.get_images_from_directory(pay_slip_dir)

        if image_files:
            # Keep the prompt key in one place so the log line below always
            # names the prompt that is actually sent to the model.
            prompt_name = "cash_expense_extraction_V2"
            first_image = image_files[0]
            print(f"Processing: {Path(first_image).name}")
            print(f"Using {prompt_name} prompt...")

            result = ocr.extract_data_from_image(first_image, sample_prompts[prompt_name])

            if result["success"]:
                print("\n‚úÖ SUCCESS!")
                print(f"Model used: {result['model_used']}")
                print(f"Extracted data:\n{result['extracted_data']}")
            else:
                print(f"\n‚ùå FAILED: {result.get('error', 'Unknown error')}")
        else:
            print("No images found in the directory")
    else:
        print("Pay Slip directory not found")

except Exception as e:
    # Any failure in this cell is reported with the API-key hint below.
    print(f"Error: {e}")
    print("Make sure your API key is set correctly in the .env file")

Processing: IMG_20250826_154715.jpg
Using general OCR prompt...
🚀 Starting image processing...
🔍 File extension: .jpg → MIME type: image/jpeg
🖼️ Optimizing image...
🔧 Image optimization disabled - using original file
📝 Encoding to base64...
📊 Base64 size: 5.75 MB
🤖 Sending to GEMINI...
⏱️ API call completed in 24.77 seconds

✅ SUCCESS!
Model used: gemini
Extracted data:
```json
{
  "payment_details": {
    "supplier": "SHWAPNO",
    "payment_account": null,
    "payment_date": "2025-06-26",
    "payment_method": "City Bank",
    "ref_no": "D0612506260221",
    "tags": [
      "grocery",
      "supermarket",
      "food",
      "retail"
    ]
  },
  "item_details": [
    {
      "category": "Groceries",
      "description": "Fresh Soyabean Oil 5 Ltr",
      "quantity": 1,
      "unit_price": 922.00,
      "total": 922.00
    },
    {
      "category": "Dairy & Refrigerated",
      "description": "ULTRA Sweet Curd 500Gm",
      "quantity": 1,
      "unit_pr

In [45]:
# Example 2: Process all images in a directory (batch processing)
# Runs the cash-expense extraction prompt on every image in the benchmark
# directory and passes `output_file` to `process_directory` as the results
# destination.
# NOTE: this cell is live - it issues API calls for the images it processes,
# so be mindful of costs before re-running it.

try:
    pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/For benchmark"
    output_file = "/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_1.json"

    print("Starting batch processing...")
    print(f"Input directory: {pay_slip_dir}")
    print(f"Output file: {output_file}")
    print("=" * 50)

    # Process all images with the cash-expense extraction prompt
    results = ocr.process_directory(
        directory_path=pay_slip_dir,
        prompt=sample_prompts["cash_expense_extraction_V2"],
        output_file=output_file
    )

    # Summary: every result dict carries a boolean "success" flag.
    successful = sum(1 for r in results if r["success"])
    failed = len(results) - successful

    print("=" * 50)
    print("üìä PROCESSING SUMMARY:")
    print(f"Total images: {len(results)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Results saved to: {output_file}")

except Exception as e:
    print(f"Error during batch processing: {e}")

Starting batch processing...
Input directory: /home/tanjim_noor/Work/AI OCR BenchMark/For benchmark
Output file: /home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_1.json
Found 2 image(s) to process...
Processing image 1/2: IMG_20250826_154308~2.jpg
🚀 Starting image processing...
🔍 File extension: .jpg → MIME type: image/jpeg
🖼️ Optimizing image...
🔧 Image optimization disabled - using original file
📝 Encoding to base64...
📊 Base64 size: 1.77 MB
🤖 Sending to GEMINI...
⏱️ API call completed in 12.05 seconds
✅ Successfully processed IMG_20250826_154308~2.jpg
📊 Extracted structured data format: json
Processing image 2/2: IMG_20250826_155754.jpg
🚀 Starting image processing...
🔍 File extension: .jpg → MIME type: image/jpeg
🖼️ Optimizing image...
🔧 Image optimization disabled - using original file
📝 Encoding to base64...
📊 Base64 size: 4.35 MB
🤖 Sending to GEMINI...
⏱️ API call completed in 12.05 seconds
✅ Successful

In [17]:
output_json = {
    "payment_details": {
        "supplier": "Maa Fruits & Departmental Store",
        "payment_account": None,
        "payment_date": "2020-06-20",
        "payment_method": "Cash",
        "ref_no": None,
        "tags": [
            "Fruits",
            "Departmental Store",
            "Retail",
            "Wholesale"
        ]
    },
    "item_details": [
        {
            "category": "Fruits",
            "description": "Tomatoes",
            "quantity": 80,
            "unit_price": 7.5,
            "total": 600
        }
    ],
    "attachment": None,
    "memo": "Soles goods are not returnable. Welcome come again.",
    "totals": {
        "sub_total": 600,
        "sales_tax": {
            "is_percentage": False,
            "tax": 0,
            "amount": 0
        },
        "discount": {
            "is_percentage": False,
            "discount": 0,
            "calculated_amount": 0
        },
        "total_amount": 600
    }
}


actual_json = {
    "payment_details": {
        "supplier": "Maa Fruits & Departmental Store",
        "payment_account": None,
        "payment_date": "2020-06-20",
        "payment_method": "Cash",
        "ref_no": None,
        "tags": [
            "Fruits",
            "Departmental Store",
            "Retail",
            "Wholesale"
        ]
    },
    "item_details": [
        {
            "category": "Fruits",
            "description": "Tomatoes",
            "quantity": 80,
            "unit_price": 10,
            "total": 800
        }
    ],
    "attachment": None,
    "memo": "Soled goods are not returnable. Welcome come again.",
    "totals": {
        "sub_total": 800,
        "sales_tax": {
            "is_percentage": False,
            "tax": 0,
            "amount": 0
        },
        "discount": {
            "is_percentage": False,
            "discount": 0,
            "calculated_amount": 0
        },
        "total_amount": 800
    }
}



output_json_2 = {
  "payment_details": {
    "supplier": "SHWAPNO (ACI Logistics Limited)",
    "payment_account": "City Bank",
    "payment_date": "2025-06-26",
    "payment_method": "Bank Transfer/Card (implied by City Bank)",
    "ref_no": "D0612506260068",
    "tags": ["Groceries", "Supermarket", "Retail", "Food"]
  },
  "item_details": [
    {
      "category": "Groceries",
      "description": "Beef Premium Cube kg",
      "quantity": 6.33,
      "unit_price": 775.00,
      "total": 4901.88
    },
    {
      "category": "Groceries",
      "description": "Broiler Chicken Breast Bon",
      "quantity": 0.50,
      "unit_price": 534.00,
      "total": 267.00
    },
    {
      "category": "Groceries",
      "description": "Capsicum Green kg",
      "quantity": 1.01,
      "unit_price": 320.00,
      "total": 321.60
    },
    {
      "category": "Groceries",
      "description": "Carrot China (China Gajor) K",
      "quantity": 1.40,
      "unit_price": 160.00,
      "total": 224.00
    },
    {
      "category": "Groceries",
      "description": "Cucumber (Shosha) kg",
      "quantity": 2.69,
      "unit_price": 55.00,
      "total": 147.68
    },
    {
      "category": "Groceries",
      "description": "Green Chili (Kacha Morich)",
      "quantity": 0.67,
      "unit_price": 100.00,
      "total": 66.50
    },
    {
      "category": "Groceries",
      "description": "Green Papaya (Kacha Pepe)",
      "quantity": 3.00,
      "unit_price": 45.00,
      "total": 134.78
    },
    {
      "category": "Groceries",
      "description": "Lemon Long (Lomba Lebu) PC",
      "quantity": 8.00,
      "unit_price": 6.00,
      "total": 48.00
    },
    {
      "category": "Groceries",
      "description": "Long Bean (Boroboti) KG",
      "quantity": 1.83,
      "unit_price": 90.00,
      "total": 164.70
    },
    {
      "category": "Groceries",
      "description": "Piyaj Deshi Loose kg",
      "quantity": 2.66,
      "unit_price": 58.00,
      "total": 154.28
    },
    {
      "category": "Groceries",
      "description": "Shonalika Dressed Classic",
      "quantity": 1.65,
      "unit_price": 667.00,
      "total": 1097.22
    },
    {
      "category": "Groceries",
      "description": "ACI Pure Corn Flour 150g",
      "quantity": 1.00,
      "unit_price": 60.00,
      "total": 60.00
    },
    {
      "category": "Groceries",
      "description": "Shwapno Black Pepper Powde",
      "quantity": 1.00,
      "unit_price": 129.00,
      "total": 129.00
    }
  ],
  "attachment": False,
  "memo": "Thank you for shopping with SHWAPNO. Please visit www.shwapno.com for home delivery. Purchase of defected item must be exchanged by 24 hours with invoice. For any queries, suggestions or complaints, please call 16469 (9:00 AM - 6:00 PM). Earned points will expire within 6 months from the date of the transaction if not redeemed. VAT against this challan is payable through central registration. Prices inclusive of standard VAT except exempted items, VAT Payable TK. : 7.17.",
  "totals": {
    "sub_total": 7716.64,
    "sales_tax": {
      "is_percentage": False,
      "tax": 0,
      "amount": 0
    },
    "discount": {
      "is_percentage": False,
      "discount": 0,
      "calculated_amount": 0
    },
    "total_amount": 7716.64
  }
}

actual_json_2 = {"payment_details": {
    "supplier": "SHWAPNO (ACI Logistics Limited)",
    "payment_account": "City Bank",
    "payment_date": "2025-06-26",
    "payment_method": "Bank Transfer/Card (implied by City Bank)",
    "ref_no": "D0612506260068",
    "tags": ["Groceries", "Supermarket", "Retail", "Food"]
  },
  "item_details": [
    {
      "category": "Groceries",
      "description": "Beef Premium Cube kg",
      "quantity": 6.33,
      "unit_price": 775.00,
      "total": 4901.88
    },
    {
      "category": "Groceries",
      "description": "Broiler Chicken Breast Bon",
      "quantity": 0.50,
      "unit_price": 534.00,
      "total": 267.00
    },
    {
      "category": "Groceries",
      "description": "Capsicum Green kg",
      "quantity": 1.01,
      "unit_price": 320.00,
      "total": 321.60
    },
    {
      "category": "Groceries",
      "description": "Carrot China (China Gajor) K",
      "quantity": 1.40,
      "unit_price": 160.00,
      "total": 224.00
    },
    {
      "category": "Groceries",
      "description": "Cucumber (Shosha) kg",
      "quantity": 2.69,
      "unit_price": 55.00,
      "total": 147.68
    },
    {
      "category": "Groceries",
      "description": "Green Chili (Kacha Morich)",
      "quantity": 0.67,
      "unit_price": 100.00,
      "total": 66.50
    },
    {
      "category": "Groceries",
      "description": "Green Papaya (Kacha Pepe)",
      "quantity": 3.00,
      "unit_price": 45.00,
      "total": 134.78
    },
    {
      "category": "Groceries",
      "description": "Lemon Long (Lomba Lebu) PC",
      "quantity": 8.00,
      "unit_price": 6.00,
      "total": 48.00
    },
    {
      "category": "Groceries",
      "description": "Long Bean (Boroboti) KG",
      "quantity": 1.83,
      "unit_price": 90.00,
      "total": 164.70
    },
    {
      "category": "Groceries",
      "description": "Piyaj Deshi Loose kg",
      "quantity": 2.66,
      "unit_price": 58.00,
      "total": 154.28
    },
    {
      "category": "Groceries",
      "description": "Shonalika Dressed Classic",
      "quantity": 1.65,
      "unit_price": 667.00,
      "total": 1097.22
    },
    {
      "category": "Groceries",
      "description": "ACI Pure Corn Flour 150g",
      "quantity": 1.00,
      "unit_price": 60.00,
      "total": 60.00
    },
    {
      "category": "Groceries",
      "description": "Shwapno Black Pepper Powde",
      "quantity": 1.00,
      "unit_price": 129.00,
      "total": 129.00
    }
  ],
  "attachment": False,
  "memo": "Thank you for shopping with SHWAPNO. Please visit www.shwapno.com for home delivery. Purchase of defected item must be exchanged by 24 hours with invoice. For any queries, suggestions or complaints, please call 16469 (9:00 AM - 6:00 PM). Earned points will expire within 6 months from the date of the transaction if not redeemed. VAT against this challan is payable through central registration. Prices inclusive of standard VAT except exempted items, VAT Payable TK. : 7.17.",
  "totals": {
    "sub_total": 7716.64,
    "sales_tax": {
      "is_percentage": False,
      "tax": 0,
      "amount": 7.17
    },
    "discount": {
      "is_percentage": False,
      "discount": 0,
      "calculated_amount": 30.80
    },
    "total_amount": 7686.00
  }
}




In [89]:
# %pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting tokenizers<0.22,>=0.

In [6]:
import math
from typing import Any, Dict, List
from difflib import SequenceMatcher
import numpy as np

# Optional: install if not already
# pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Load a small model once (fast + lightweight)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")


# --- helpers ---
def get_by_path(data: dict, path: str):
    """
    Resolve a dotted path against nested dicts/lists.

    Supports:
    - normal dot paths ("totals.sub_total")
    - indexed paths ("item_details[0].quantity"), including several indices
      in one segment ("matrix[1][0]")
    - numeric segments as list indices ("item_details.0.quantity")
    - wildcard array paths ("item_details.quantity") -> returns list of values;
      an indexed segment after a wildcard is applied to every element
      ("groups.items[0].name" -> the first item's name of each group)

    Args:
        data: The nested structure to read from.
        path: Dot-separated path as described above.

    Returns:
        The value at the path, or a list of values when the path crosses a list
        without an explicit index.

    Raises:
        KeyError / IndexError / TypeError: if the path does not exist in `data`.
        ValueError: if a bracketed index is not an integer.
    """
    def _step(node, field, indices):
        # An empty field is only skipped when the segment is pure indices
        # ("[0]"); a genuinely empty segment still looks up the "" key.
        if field or not indices:
            if isinstance(node, list):
                if field.isdigit():
                    node = node[int(field)]
                else:
                    # Wildcard: apply the whole segment to each element.
                    return [_step(item, field, indices) for item in node]
            else:
                node = node[field]
        for idx in indices:
            node = node[idx]
        return node

    val = data
    for segment in path.split("."):
        # "name[1][0]" -> field "name", indices ["1", "0"]
        field, *raw_indices = segment.replace("]", "").split("[")
        val = _step(val, field, [int(i) for i in raw_indices])
    return val


# --- comparators ---
def exact_match(a, b) -> float:
    """Return 1.0 when both values have the same ``str()`` form, else 0.0."""
    if str(a) == str(b):
        return 1.0
    return 0.0

def numeric_tolerance(a, b, tolerance: float = 0.1) -> float:
    """
    Score how close `a` is to `b` as a value in [0, 1].

    Equal values score 1.0. Otherwise the score falls linearly with the
    error relative to `b`, reaching 0.0 once that error equals `tolerance`.

    Args:
        a: Predicted value; anything `float()` accepts.
        b: Reference value; anything `float()` accepts.
        tolerance: Relative error at which the score drops to 0.0.
            A tolerance of 0 or below means "exact match only".

    Returns:
        A float in [0, 1]; 0.0 when either value cannot be converted.
    """
    try:
        a, b = float(a), float(b)
    except (ValueError, TypeError, OverflowError):
        return 0.0
    if a == b:
        return 1.0
    # Without this guard a zero tolerance divides by zero and a negative one
    # produces scores above 1.0.
    if tolerance <= 0:
        return 0.0
    # The small epsilon keeps the division defined when the reference is 0.
    rel_error = abs(a - b) / (abs(b) + 1e-8)
    return max(0.0, 1 - rel_error / tolerance)

def semantic_similarity(a, b) -> float:
    """
    Cheap "semantic" score: case-insensitive edit similarity of the two values.

    Values are compared by their `str()` form using
    `difflib.SequenceMatcher.ratio()`.

    Missing values (None or a blank string) are handled explicitly: two
    missing values agree (1.0), so a correct null prediction is not
    penalised; one missing value against a present one scores 0.0.

    Returns:
        A float in [0, 1].
    """
    def _is_missing(value) -> bool:
        return value is None or (isinstance(value, str) and not value.strip())

    a_missing, b_missing = _is_missing(a), _is_missing(b)
    if a_missing and b_missing:
        return 1.0
    if a_missing or b_missing:
        return 0.0
    return SequenceMatcher(None, str(a).lower(), str(b).lower()).ratio()

def embedding_similarity(a, b) -> float:
    """
    Semantic similarity using embeddings + cosine similarity.

    Both values are converted with `str()`, encoded with the module-level
    `embed_model`, and their cosine similarity is mapped from [-1, 1] to
    [0, 1].

    Missing values (None or a blank string) are handled explicitly: two
    missing values agree (1.0), so a correct null prediction is not
    penalised; one missing value against a present one scores 0.0.

    Returns:
        A float in [0, 1].
    """
    def _is_missing(value) -> bool:
        return value is None or (isinstance(value, str) and not value.strip())

    a_missing, b_missing = _is_missing(a), _is_missing(b)
    if a_missing and b_missing:
        return 1.0
    if a_missing or b_missing:
        return 0.0

    emb_a = embed_model.encode(str(a), convert_to_numpy=True)
    emb_b = embed_model.encode(str(b), convert_to_numpy=True)
    norm_product = np.linalg.norm(emb_a) * np.linalg.norm(emb_b)
    if norm_product == 0:
        # A zero vector has no direction; avoid dividing by zero (NaN score).
        return 0.0
    cos_sim = float(np.dot(emb_a, emb_b) / norm_product)
    # map cosine [-1,1] to [0,1]; clamp away floating-point overshoot
    # NOTE(review): with this mapping a cosine of 0 scores 0.5, not 0.
    return min(1.0, max(0.0, (cos_sim + 1) / 2))


# Registry mapping the "comparator" name used in a scoring config entry to the
# function that scores one predicted/actual value pair.
COMPARATORS = dict(
    exact=exact_match,
    numeric=numeric_tolerance,
    semantic=semantic_similarity,    # lightweight string-based
    embedding=embedding_similarity,  # heavy but meaningful
)


# --- main scoring ---
def score_json(pred: dict, actual: dict, config: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Score a predicted JSON document against the ground truth, field by field.

    Each config entry is a dict with:
        path:       location of the field, resolved with `get_by_path`
        comparator: key into `COMPARATORS`
        weight:     optional relative weight (default 1.0)
        tolerance:  optional, only used by the "numeric" comparator (default 0.1)

    Scalars are compared as one-element lists. List values are compared
    position by position; the shorter side is padded with None so missing or
    extra items lower the field's average. Two empty lists count as full
    agreement.

    A path that cannot be resolved in `pred` is treated as a None prediction,
    so an incomplete model output scores low instead of aborting the run. A
    path that cannot be resolved in `actual` still raises, because that
    points at a config or ground-truth error.

    Returns:
        {"final_score": weighted mean of the field scores, rounded to 3 places,
         "breakdown": one dict per compared value}
    """
    def _lookup_pred(path):
        try:
            return get_by_path(pred, path)
        except (KeyError, IndexError, TypeError):
            return None

    def _as_list(value):
        return value if isinstance(value, list) else [value]

    # Same default as the per-field lookup below, so an entry without an
    # explicit "weight" does not raise KeyError here.
    total_weight = sum(field.get("weight", 1.0) for field in config)
    breakdown = []
    weighted_sum = 0.0

    for field in config:
        path = field["path"]
        weight = field.get("weight", 1.0)
        comparator_name = field["comparator"]
        comparator = COMPARATORS[comparator_name]
        tolerance = field.get("tolerance", 0.1)

        # Normalise each side on its own: a list on one side and a scalar on
        # the other must not crash or be compared character by character.
        pred_val = _as_list(_lookup_pred(path))
        actual_val = _as_list(get_by_path(actual, path))

        # If lengths differ, penalize missing items
        max_len = max(len(pred_val), len(actual_val))
        scores = []
        for i in range(max_len):
            pv = pred_val[i] if i < len(pred_val) else None
            av = actual_val[i] if i < len(actual_val) else None

            if comparator_name == "numeric":
                s = comparator(pv, av, tolerance)
            else:
                s = comparator(pv, av)
            scores.append(s)

            breakdown.append({
                "path": f"{path}[{i}]",
                "pred": pv,
                "actual": av,
                "comparator": comparator_name,
                "score": round(s, 3),
                "weight": weight,
                "weighted": round(weight * s, 3)
            })

        # Both sides empty means the prediction matches the truth exactly.
        avg_score = sum(scores) / max_len if max_len > 0 else 1.0
        weighted_sum += weight * avg_score

    final_score = weighted_sum / total_weight if total_weight > 0 else 0
    return {
        "final_score": round(final_score, 3),
        "breakdown": breakdown
    }


In [53]:
# Scoring configuration: one entry per field to compare.
#   path       - location of the field, resolved by get_by_path; the
#                "item_details.<key>" paths yield one value per line item
#   weight     - relative weight of the field in the final score
#   comparator - key into COMPARATORS
#   tolerance  - passed to the "numeric" comparator only
config = [
    {"path": "payment_details.supplier", "weight": 1, "comparator": "exact"},
    {"path": "payment_details.payment_date", "weight": 1, "comparator": "exact"},
    {"path": "payment_details.payment_account", "weight": 1, "comparator": "embedding"},
    {"path": "payment_details.payment_method", "weight": 1, "comparator": "exact"},
    {"path": "payment_details.ref_no", "weight": 1, "comparator": "exact"},
    {"path": "payment_details.tags", "weight": 1, "comparator": "semantic"},
    {"path": "item_details.category", "weight": 1, "comparator": "exact"},
    {"path": "item_details.description", "weight": 1, "comparator": "embedding"},
    {"path": "item_details.quantity", "weight": 1, "comparator": "numeric", "tolerance": 0.05},
    {"path": "item_details.unit_price", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "item_details.total", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "attachment", "weight": 1, "comparator": "exact"},
    {"path": "memo", "weight": 1, "comparator": "embedding"},
    {"path": "totals.sub_total", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "totals.sales_tax.is_percentage", "weight": 1, "comparator": "exact"},
    {"path": "totals.sales_tax.tax", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "totals.sales_tax.amount", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "totals.discount.is_percentage", "weight": 1, "comparator": "exact"},
    {"path": "totals.discount.discount", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "totals.discount.calculated_amount", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
    {"path": "totals.total_amount", "weight": 1, "comparator": "numeric", "tolerance": 0.1},
]


# Score the second sample prediction against its hand-corrected ground truth.
result = score_json(output_json_2, actual_json_2, config)

print("Final Score:", result["final_score"])
# One breakdown row per compared value (list fields produce one row per element).
for r in result["breakdown"]:
    print(r)

Final Score: 0.952
{'path': 'payment_details.supplier[0]', 'pred': 'SHWAPNO (ACI Logistics Limited)', 'actual': 'SHWAPNO (ACI Logistics Limited)', 'comparator': 'exact', 'score': 1.0, 'weight': 1, 'weighted': 1.0}
{'path': 'payment_details.payment_date[0]', 'pred': '2025-06-26', 'actual': '2025-06-26', 'comparator': 'exact', 'score': 1.0, 'weight': 1, 'weighted': 1.0}
{'path': 'payment_details.payment_account[0]', 'pred': 'City Bank', 'actual': 'City Bank', 'comparator': 'embedding', 'score': 1.0, 'weight': 1, 'weighted': 1.0}
{'path': 'payment_details.payment_method[0]', 'pred': 'Bank Transfer/Card (implied by City Bank)', 'actual': 'Bank Transfer/Card (implied by City Bank)', 'comparator': 'exact', 'score': 1.0, 'weight': 1, 'weighted': 1.0}
{'path': 'payment_details.ref_no[0]', 'pred': 'D0612506260068', 'actual': 'D0612506260068', 'comparator': 'exact', 'score': 1.0, 'weight': 1, 'weighted': 1.0}
{'path': 'payment_details.tags[0]', 'pred': 'Groceries', 'actual': 'Groceries', 'compar

In [54]:
# Batch scoring for all images in JSON files
import json
import os

def score_all_images(results_json_path, actual_json_path, config):
    """
    Score every image in an OCR results file against a ground-truth file.

    Both files must contain a JSON list of objects that carry an
    ``image_path`` key and, optionally, a ``structured_data`` object.
    Results are paired with ground truth by identical ``image_path``. A
    result with no ground-truth entry, or a pair where either side has no
    (or empty) structured data, is reported and skipped.

    Args:
        results_json_path: Path to the OCR results JSON file
        actual_json_path: Path to the actual/ground truth JSON file
        config: Configuration list for scoring, passed through to `score_json`

    Returns:
        None if either file is missing or is not valid JSON. Otherwise a dict
        with ``processed_count``, ``average_score``, ``total_score`` and
        ``image_scores`` (one entry per scored image, including its breakdown).
    """
    # Load JSON files. JSON is UTF-8 by specification; relying on the platform
    # default encoding breaks on non-ASCII text where the locale is not UTF-8.
    try:
        with open(results_json_path, 'r', encoding='utf-8') as f:
            results_data = json.load(f)
        with open(actual_json_path, 'r', encoding='utf-8') as f:
            actual_data = json.load(f)
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON - {e}")
        return None

    # Create lookup dictionary for actual data by image path
    actual_lookup = {item['image_path']: item for item in actual_data}

    # Score each image
    image_scores = []
    total_score = 0
    processed_count = 0

    print("=" * 80)
    print("BATCH SCORING RESULTS")
    print("=" * 80)

    for result_item in results_data:
        image_path = result_item['image_path']
        image_name = os.path.basename(image_path)

        # Find corresponding actual data
        if image_path not in actual_lookup:
            print(f"‚ùå No ground truth found for: {image_name}")
            continue

        actual_item = actual_lookup[image_path]

        # Extract structured data for comparison
        result_structured = result_item.get('structured_data', {})
        actual_structured = actual_item.get('structured_data', {})

        # Both sides need non-empty structured data to be comparable.
        if not (result_structured and actual_structured):
            print(f"‚ö†Ô∏è  Missing structured data for: {image_name}")
            continue

        score_result = score_json(result_structured, actual_structured, config)
        final_score = score_result['final_score']

        image_scores.append({
            'image_name': image_name,
            'image_path': image_path,
            'score': final_score,
            'breakdown': score_result['breakdown']
        })

        total_score += final_score
        processed_count += 1

        # Print score for this image
        print(f"üìä {image_name}: {final_score:.3f}")

    # Calculate overall statistics
    if processed_count > 0:
        average_score = total_score / processed_count
        all_scores = [item['score'] for item in image_scores]

        print("\n" + "=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)
        print(f"üìà Total Images Processed: {processed_count}")
        print(f"üìä Average Score: {average_score:.3f}")
        print(f"üéØ Total Score Sum: {total_score:.3f}")
        print(f"üìâ Lowest Score: {min(all_scores):.3f}")
        print(f"üìà Highest Score: {max(all_scores):.3f}")

        # Histogram: each bucket takes scores strictly below its upper bound;
        # everything from 0.8 upwards falls into the last bucket.
        upper_bounds = [(0.2, '0.0-0.2'), (0.4, '0.2-0.4'), (0.6, '0.4-0.6'), (0.8, '0.6-0.8')]
        last_bucket = '0.8-1.0'
        score_ranges = {label: 0 for _, label in upper_bounds}
        score_ranges[last_bucket] = 0
        for score in all_scores:
            label = next((name for bound, name in upper_bounds if score < bound), last_bucket)
            score_ranges[label] += 1

        print("\nüìä Score Distribution:")
        for range_name, count in score_ranges.items():
            print(f"   {range_name}: {count} images")

    else:
        average_score = 0
        print("‚ùå No images were processed successfully")

    return {
        'processed_count': processed_count,
        'average_score': average_score,
        'total_score': total_score,
        'image_scores': image_scores
    }

# Example usage:
# Set the paths to your JSON files
results_json_path = "/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_1_structured_only.json"
actual_json_path = "/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_actual_1_structured_only.json"

# Use the same config as defined above
batch_results = score_all_images(results_json_path, actual_json_path, config)

# Optionally, save detailed results to a file
# score_all_images returns None when a file is missing or invalid, so nothing
# is written in that case.
if batch_results:
    output_path = "/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/batch_scoring_results.json"
    with open(output_path, 'w') as f:
        json.dump(batch_results, f, indent=2)
    print(f"\nüíæ Detailed results saved to: {output_path}")

BATCH SCORING RESULTS
📊 IMG_20250826_154308~2.jpg: 0.952
📊 IMG_20250826_155754.jpg: 0.846

SUMMARY STATISTICS
📈 Total Images Processed: 2
📊 Average Score: 0.899
🎯 Total Score Sum: 1.798
📉 Lowest Score: 0.846
📈 Highest Score: 0.952

📊 Score Distribution:
   0.0-0.2: 0 images
   0.2-0.4: 0 images
   0.4-0.6: 0 images
   0.6-0.8: 0 images
   0.8-1.0: 2 images

💾 Detailed results saved to: /home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/batch_scoring_results.json


In [None]:
# Example 2: Process all images in directory (Batch Processing)
# This will process all images in the Pay Slip directory

# Uncomment the following code to process all images:
# The example below is wrapped in a string literal so it does not execute;
# remove the surrounding triple quotes to run it.
"""
try:
    pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/Pay Slip"
    output_file = "/home/tanjim_noor/Work/AI OCR BenchMark/ocr_results.json"
    
    print("Starting batch processing...")
    print(f"Input directory: {pay_slip_dir}")
    print(f"Output file: {output_file}")
    print("=" * 50)
    
    # Process all images with payslip extraction prompt
    results = ocr.process_directory(
        directory_path=pay_slip_dir,
        prompt=sample_prompts["payslip_extraction"],
        output_file=output_file
    )
    
    # Summary
    successful = sum(1 for r in results if r["success"])
    failed = len(results) - successful
    
    print("=" * 50)
    print(f"üìä PROCESSING SUMMARY:")
    print(f"Total images: {len(results)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Results saved to: {output_file}")
    
except Exception as e:
    print(f"Error during batch processing: {e}")
"""

# Only these reminder messages are printed when the cell runs.
print("Batch processing code is commented out.")
print("Uncomment the code above to process all images in the directory.")
print("Note: This will use API calls for each image, so be mindful of costs!")

In [None]:
# Example 3: Switch between different models
# Demonstrate model-agnostic capabilities
# This cell only prints guidance; the actual switching code at the bottom is
# kept inside a string literal and does not execute.

print("üîÑ Model Switching Examples")
print("=" * 40)

# Current model
print(f"Current model: {ocr.model_name}")

print("\nüìù Available models:")
print("1. gemini - Google Gemini 1.5 Flash")
print("2. openai - OpenAI GPT-4 Vision")  
print("3. anthropic - Claude 3.5 Sonnet")

print("\nüîß How to switch models:")
print("ocr.change_model('openai')    # Switch to OpenAI")
print("ocr.change_model('anthropic') # Switch to Claude")
print("ocr.change_model('gemini')    # Switch to Gemini")

print("\n‚ö†Ô∏è Note: Make sure to set the corresponding API keys in .env file")

# Example of switching (commented out)
# NOTE: the snippet refers to `image_path` and `prompt`, which must be defined
# before it is enabled.
"""
# Switch to OpenAI
try:
    ocr.change_model('openai')
    # Process an image with OpenAI
    result = ocr.extract_data_from_image(image_path, prompt)
except Exception as e:
    print(f"OpenAI error: {e}")

# Switch to Claude
try:
    ocr.change_model('anthropic')
    # Process an image with Claude
    result = ocr.extract_data_from_image(image_path, prompt)
except Exception as e:
    print(f"Claude error: {e}")
"""

In [None]:
# Example 4: Custom Prompts and Utilities

def create_custom_prompt(task_type: str, specific_fields: Optional[List[str]] = None) -> str:
    """
    Build an extraction prompt for a given kind of document.

    Args:
        task_type: One of 'invoice', 'receipt', 'form', 'table', 'general';
            any other value falls back to the 'general' prompt.
        specific_fields: Optional field names to emphasise. When given and
            non-empty, a "Focus specifically on extracting: ..." sentence is
            added to the prompt.

    Returns:
        The base prompt, the optional focus sentence, and a closing sentence
        asking for a structured presentation, joined into one string.
    """
    general_text = "Extract all text content from this image, preserving structure and formatting."
    templates = {
        'invoice': "Extract invoice information including invoice number, date, vendor, amount, tax, and line items.",
        'receipt': "Extract receipt information including store name, date, items purchased, prices, and total amount.",
        'form': "Extract all form fields and their values from this document.",
        'table': "Extract table data and organize it in a structured format with headers and rows.",
        'general': general_text,
    }

    # Assemble the prompt from its sentences and join once at the end.
    pieces = [templates.get(task_type, general_text)]
    if specific_fields:
        pieces.append(f" Focus specifically on extracting: {', '.join(specific_fields)}.")
    pieces.append(" Present the information in a clear, structured format.")
    return "".join(pieces)

# Example custom prompts
print("üéØ Custom Prompt Examples")
print("=" * 30)

# Create some custom prompts
# Each entry is a ready-to-use prompt string built by create_custom_prompt;
# the list argument names the fields the prompt should emphasise.
custom_prompts = {
    "payslip_detailed": create_custom_prompt("form", [
        "employee name", "employee ID", "salary period", 
        "gross pay", "net pay", "tax deductions", "company name"
    ]),
    
    "receipt_analysis": create_custom_prompt("receipt", [
        "store name", "purchase date", "items", "total amount", "payment method"
    ]),
    
    # No field list: these use the base prompt for the task type as-is.
    "table_extraction": create_custom_prompt("table"),
    
    "general_ocr": create_custom_prompt("general")
}

# Print every generated prompt in full for inspection.
for name, prompt in custom_prompts.items():
    print(f"\nüìù {name.upper()}:")
    print(f"   {prompt}")

print("\n‚ú® You can use these prompts with:")
print("   result = ocr.extract_data_from_image(image_path, custom_prompts['payslip_detailed'])")

In [None]:
# Example 5: Benchmarking Multiple Models
# Compare results from different models on the same image

def benchmark_models(image_path: str, prompt: str, models: Optional[List[str]] = None) -> Dict:
    """
    Run one image/prompt pair through several models and collect the outcomes.

    A fresh ``OCRBenchmark`` (temperature 0.1) is created for every model. A
    model whose instance exposes ``model`` as ``None`` is recorded as a
    failure with an "API key not available" error, and any exception raised
    while testing a model is recorded as a failure for that model instead of
    aborting the whole run.

    Args:
        image_path: Path to the image file
        prompt: Prompt to use for extraction
        models: Model names to test; ``None`` selects
            ``["gemini", "openai", "anthropic"]``

    Returns:
        Dictionary with ``image_path``, ``prompt``, an ISO-format
        ``timestamp`` and ``results``, a mapping of model name to that
        model's result dictionary
    """
    candidates = ["gemini", "openai", "anthropic"] if models is None else models

    per_model: Dict = {}
    report = {
        "image_path": image_path,
        "prompt": prompt,
        "timestamp": datetime.now().isoformat(),
        "results": per_model,
    }

    def record_failure(name: str, message: str) -> None:
        # Same shape for "skipped" and "crashed" entries.
        per_model[name] = {
            "success": False,
            "error": message,
            "model_used": name,
        }

    for model_name in candidates:
        print(f"Testing {model_name}...")
        try:
            # One independent OCR instance per model under test.
            runner = OCRBenchmark(model_name=model_name, temperature=0.1)

            # No underlying model object means it could not be initialised.
            if runner.model is None:
                print(f"‚ö†Ô∏è {model_name} - Skipped: API key not available")
                record_failure(model_name, f"API key not available for {model_name}")
                continue

            outcome = runner.extract_data_from_image(image_path, prompt)
            per_model[model_name] = outcome

            if not outcome["success"]:
                print(f"‚ùå {model_name} - Failed: {outcome.get('error', 'Unknown error')}")
            else:
                print(f"‚úÖ {model_name} - Success")

        except Exception as exc:
            # Keep going with the remaining models; just note what went wrong.
            message = str(exc)
            print(f"‚ùå {model_name} - Error: {message}")
            record_failure(model_name, message)

    return report

# Utility function to compare and analyze results
def analyze_benchmark_results(benchmark_results: Dict):
    """Analyze and display benchmark results.

    Prints the image name, how many models were tested, which succeeded and
    which failed, followed by a preview (at most 200 characters) of what each
    successful model extracted.

    Args:
        benchmark_results: Dictionary shaped like the return value of
            benchmark_models(): it must contain an "image_path" string and a
            "results" mapping of model name to result dictionary.
    """
    print("\nüìä BENCHMARK ANALYSIS")
    print("=" * 50)
    
    results = benchmark_results["results"]
    # A result without a "success" key counts as failed.
    successful_models = [model for model, result in results.items() if result.get("success", False)]
    failed_models = [model for model, result in results.items() if not result.get("success", False)]
    
    print(f"Image: {Path(benchmark_results['image_path']).name}")
    print(f"Total models tested: {len(results)}")
    print(f"Successful: {len(successful_models)} - {successful_models}")
    print(f"Failed: {len(failed_models)} - {failed_models}")
    
    if successful_models:
        print("\nüìù Extracted Data Comparison:")
        for model in successful_models:
            print(f"\nü§ñ {model.upper()}:")
            extracted_data = results[model].get("extracted_data")
            if extracted_data is None:
                # Successful result without a payload: say so rather than crash.
                text = "(no extracted data)"
            elif isinstance(extracted_data, str):
                text = extracted_data
            else:
                # Parsed payloads (dict/list/...) cannot be sliced like text;
                # default=str is acceptable because this is display-only.
                text = json.dumps(extracted_data, ensure_ascii=False, default=str)
            # Show first 200 characters
            preview = text[:200] + "..." if len(text) > 200 else text
            print(f"   {preview}")

print("üöÄ Benchmarking utilities defined!")
print("Use benchmark_models() to compare multiple models on the same image.")

# Example usage (commented out):
"""
# Benchmark all models on a single image
image_path = "/path/to/your/image.png"
prompt = sample_prompts["payslip_extraction"]

benchmark_results = benchmark_models(image_path, prompt)
analyze_benchmark_results(benchmark_results)

# Save benchmark results
with open("benchmark_results.json", "w") as f:
    json.dump(benchmark_results, f, indent=2)
"""

# 🚀 Quick Start Guide

## Setup Instructions

1. **Set up API Keys** - Add your API keys to the `.env` file:
   ```bash
   GOOGLE_API_KEY=your_actual_google_api_key
   OPENAI_API_KEY=your_actual_openai_api_key  
   ANTHROPIC_API_KEY=your_actual_anthropic_api_key
   ```

2. **Initialize the OCR System**:
   ```python
   ocr = OCRBenchmark(model_name="gemini")  # Default: Gemini
   ```

3. **Process a Single Image**:
   ```python
   result = ocr.extract_data_from_image("path/to/image.png", "Extract all text")
   ```

4. **Process All Images in Directory**:
   ```python
   results = ocr.process_directory("path/to/directory", "Extract payslip data")
   ```

## Available Models
- **gemini** - Google Gemini 1.5 Flash (Default)
- **openai** - OpenAI GPT-4 Vision  
- **anthropic** - Claude 3.5 Sonnet

## Features
✅ **Model Agnostic** - Switch between models easily  
✅ **Batch Processing** - Process entire directories  
✅ **Custom Prompts** - Flexible prompt engineering  
✅ **JSON Output** - Save results to files  
✅ **Error Handling** - Robust error management  
✅ **Benchmarking** - Compare model performance  

## Next Steps
1. Set your API keys in `.env`
2. Run the initialization cell
3. Try processing a single image first
4. Experiment with different prompts
5. Compare results across models

In [33]:
# Check API Keys Availability
# Reports, per model, whether check_api_keys() flags it as available and
# suggests the first available one.
print("üîë API Keys Status Check")
print("=" * 30)

# Create a temporary OCR instance to check API keys
# Built with default arguments; only its check_api_keys() method is used here.
temp_ocr = OCRBenchmark()
# Used below as a mapping of model name -> truthy/falsy availability flag.
api_status = temp_ocr.check_api_keys()

print("Available models:")
for model, available in api_status.items():
    status = "‚úÖ Available" if available else "‚ùå Missing API Key"
    print(f"  {model}: {status}")

# First model (in the mapping's iteration order) whose flag is truthy, or the
# literal text 'None available' when no flag is set.
print(f"\nRecommended model: {next((model for model, available in api_status.items() if available), 'None available')}")

if not any(api_status.values()):
    print("\n‚ö†Ô∏è No API keys found!")
    print("Please set at least one API key in your .env file:")
    print("  GOOGLE_API_KEY=your_key_here")
    print("  OPENAI_API_KEY=your_key_here") 
    print("  ANTHROPIC_API_KEY=your_key_here")
else:
    # sum() over the flags counts the available models.
    # NOTE(review): assumes the flags are bools (or 0/1) - confirm in check_api_keys().
    print(f"\n‚úÖ {sum(api_status.values())} out of {len(api_status)} models available!")

üîë API Keys Status Check
Available models:
  gemini: ‚úÖ Available
  openai: ‚ùå Missing API Key
  anthropic: ‚ùå Missing API Key

Recommended model: gemini

‚úÖ 1 out of 3 models available!


In [None]:
# Test JPG file support and MIME type handling
# Relies on the notebook-level `ocr` instance created in an earlier cell.
print("üîç Testing JPG File Support")
print("=" * 40)

# Show supported formats
print("Supported image formats:")
for fmt in sorted(ocr.supported_formats):
    print(f"  {fmt}")

# mime_type_map values are printed after an "image/" prefix, i.e. they are
# treated here as MIME subtypes keyed by file extension.
print("\nMIME type mappings:")
for ext, mime in ocr.mime_type_map.items():
    print(f"  {ext} ‚Üí image/{mime}")

print("\nüìÅ Checking for JPG files in your directories...")

# Check both directories for JPG files
# Machine-specific absolute paths; adjust them for your environment.
directories_to_check = [
    "/home/tanjim_noor/Work/AI OCR BenchMark/Pay Slip",
    "/home/tanjim_noor/Work/AI OCR BenchMark/test png"
]

for directory in directories_to_check:
    if os.path.exists(directory):
        print(f"\nüìÇ {directory}:")
        try:
            # Non-recursive listing; extensions are compared case-insensitively.
            all_files = list(Path(directory).iterdir())
            jpg_files = [f for f in all_files if f.suffix.lower() in ['.jpg', '.jpeg']]
            
            if jpg_files:
                print(f"  Found {len(jpg_files)} JPG/JPEG file(s):")
                for jpg_file in jpg_files[:5]:  # Show first 5
                    file_ext = jpg_file.suffix.lower()
                    # Extensions missing from the map fall back to the
                    # extension itself without its leading dot.
                    mime_type = ocr.mime_type_map.get(file_ext, file_ext[1:])
                    print(f"    {jpg_file.name} ({file_ext} ‚Üí image/{mime_type})")
                if len(jpg_files) > 5:
                    print(f"    ... and {len(jpg_files) - 5} more")
            else:
                print("  No JPG files found")
                
            # Show all supported image files
            supported_images = ocr.get_images_from_directory(directory)
            if supported_images:
                print(f"  Total supported images: {len(supported_images)}")
        except Exception as e:
            # Best-effort diagnostic: report the problem and move on to the
            # next directory.
            print(f"  Error reading directory: {e}")
    else:
        print(f"\nüìÇ {directory}: Directory not found")

# Printed unconditionally, regardless of what the checks above found.
print(f"\n‚úÖ JPG files are fully supported!")
print("The system will automatically convert .jpg extensions to 'image/jpeg' MIME type.")

In [32]:
# Quick MIME type test
# Prints the MIME subtype that `ocr.mime_type_map` yields for a few common
# extensions, then repeats the lookup for one concrete file.
print("üß™ Testing MIME Type Mapping")
print("=" * 30)

test_extensions = ['.jpg', '.jpeg', '.png', '.webp', '.gif']
for ext in test_extensions:
    # Unmapped extensions fall back to the extension without its leading dot.
    mime_type = ocr.mime_type_map.get(ext, ext[1:])
    print(f"{ext} ‚Üí image/{mime_type}")

# Test with your specific file
# Machine-specific absolute path; the check is skipped when the file is absent.
test_file = "/home/tanjim_noor/Work/AI OCR BenchMark/Pay Slip/IMG_20250826_154052.jpg"
if os.path.exists(test_file):
    file_ext = Path(test_file).suffix.lower()
    mime_type = ocr.mime_type_map.get(file_ext, file_ext[1:])
    print(f"\nüìÅ Your file: {Path(test_file).name}")
    print(f"Extension: {file_ext}")
    print(f"MIME type: image/{mime_type}")
    print(f"Should be: image/jpeg")
    # Passes only when the looked-up subtype is exactly 'jpeg'.
    print(f"Mapping works: {'‚úÖ' if mime_type == 'jpeg' else '‚ùå'}")
else:
    print(f"\nüìÅ Test file not found: {test_file}")

üß™ Testing MIME Type Mapping
.jpg ‚Üí image/jpeg
.jpeg ‚Üí image/jpeg
.png ‚Üí image/png
.webp ‚Üí image/webp
.gif ‚Üí image/gif

üìÅ Your file: IMG_20250826_154052.jpg
Extension: .jpg
MIME type: image/jpeg
Should be: image/jpeg
Mapping works: ‚úÖ


In [31]:
# Diagnostic: Check image file sizes and dimensions
# Reports size, dimensions, colour mode and a rough base64 size for the first
# three images of the pay-slip directory. The warning thresholds are read from
# the notebook-wide MAX_FILE_SIZE_MB / MAX_IMAGE_DIMENSION settings (instead of
# repeating their values) so they stay in step with update_global_settings().
print("üîç Image Diagnostics")
print("=" * 40)

# Machine-specific absolute path; adjust it for your environment.
pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/Pay Slip"
if os.path.exists(pay_slip_dir):
    image_files = ocr.get_images_from_directory(pay_slip_dir)
    
    if image_files:
        print(f"Found {len(image_files)} images. Analyzing first few...")
        
        for i, img_path in enumerate(image_files[:3]):
            print(f"\nüìÅ {i+1}. {Path(img_path).name}")
            
            # File size
            file_size_mb = os.path.getsize(img_path) / (1024 * 1024)
            print(f"   File size: {file_size_mb:.2f} MB")
            
            # Image dimensions
            try:
                with Image.open(img_path) as img:
                    print(f"   Dimensions: {img.size[0]}x{img.size[1]} pixels")
                    print(f"   Mode: {img.mode}")
                    
                    # Estimate base64 size
                    estimated_base64_mb = file_size_mb * 1.33  # Base64 is ~33% larger
                    print(f"   Estimated base64 size: {estimated_base64_mb:.2f} MB")
                    
                    # NOTE(review): the messages say "will be ..." even when
                    # ENABLE_IMAGE_OPTIMIZATION is False - confirm against the
                    # OCR class before relying on them.
                    if file_size_mb > MAX_FILE_SIZE_MB:
                        print(f"   ‚ö†Ô∏è Large file - will be optimized")
                    if img.size[0] > MAX_IMAGE_DIMENSION or img.size[1] > MAX_IMAGE_DIMENSION:
                        print(f"   ‚ö†Ô∏è Large dimensions - will be resized")
                        
            except Exception as e:
                # Best-effort diagnostic: report unreadable images and continue.
                print(f"   ‚ùå Error reading image: {e}")
    else:
        print("No images found")
else:
    print("Directory not found")

üîç Image Diagnostics
Found 25 images. Analyzing first few...

üìÅ 1. IMG_20250826_154052.jpg
   File size: 2.98 MB
   Dimensions: 4624x3472 pixels
   Mode: RGB
   Estimated base64 size: 3.96 MB
   ‚ö†Ô∏è Large dimensions - will be resized

üìÅ 2. IMG_20250826_154131.jpg
   File size: 3.70 MB
   Dimensions: 4624x3472 pixels
   Mode: RGB
   Estimated base64 size: 4.92 MB
   ‚ö†Ô∏è Large dimensions - will be resized

üìÅ 3. IMG_20250826_154154.jpg
   File size: 3.52 MB
   Dimensions: 4624x3472 pixels
   Mode: RGB
   Estimated base64 size: 4.69 MB
   ‚ö†Ô∏è Large dimensions - will be resized


In [None]:
# Image Optimization Configuration Utility
# Banner for the cell that defines update_global_settings() and
# show_global_settings().
print("üõ†Ô∏è Image Optimization Configuration")
print("=" * 50)

def update_global_settings(enable_optimization=None, resize_ratio=None, max_dimension=None,
                          max_file_size_mb=None, jpeg_quality=None):
    """Update global image optimization settings.

    Every argument is optional; ``None`` leaves the corresponding setting
    untouched. An out-of-range value is rejected with a printed message and
    the current value is kept. Each accepted change is confirmed with a
    printed message.

    Args:
        enable_optimization: New value for ENABLE_IMAGE_OPTIMIZATION.
        resize_ratio: New IMAGE_RESIZE_RATIO; must be within 0.1-1.0.
        max_dimension: New MAX_IMAGE_DIMENSION in pixels; must be greater than 0.
        max_file_size_mb: New MAX_FILE_SIZE_MB; must be greater than 0.
        jpeg_quality: New JPEG_QUALITY; must be within 1-100.
    """
    global ENABLE_IMAGE_OPTIMIZATION, IMAGE_RESIZE_RATIO, MAX_IMAGE_DIMENSION
    global MAX_FILE_SIZE_MB, JPEG_QUALITY
    
    if enable_optimization is not None:
        ENABLE_IMAGE_OPTIMIZATION = enable_optimization
        print(f"‚úÖ Global optimization: {'Enabled' if enable_optimization else 'Disabled'}")
    
    if resize_ratio is not None:
        if 0.1 <= resize_ratio <= 1.0:
            IMAGE_RESIZE_RATIO = resize_ratio
            print(f"‚úÖ Global resize ratio: {resize_ratio}")
        else:
            print(f"‚ùå Invalid resize ratio: {resize_ratio} (must be 0.1-1.0)")
    
    if max_dimension is not None:
        # A zero or negative pixel limit is meaningless, so reject it like
        # the other range-checked settings.
        if max_dimension > 0:
            MAX_IMAGE_DIMENSION = max_dimension
            print(f"‚úÖ Global max dimension: {max_dimension}px")
        else:
            print(f"‚ùå Invalid max dimension: {max_dimension} (must be greater than 0)")
    
    if max_file_size_mb is not None:
        # Same reasoning for the size limit: it has to be positive.
        if max_file_size_mb > 0:
            MAX_FILE_SIZE_MB = max_file_size_mb
            print(f"‚úÖ Global max file size: {max_file_size_mb}MB")
        else:
            print(f"‚ùå Invalid max file size: {max_file_size_mb} (must be greater than 0)")
    
    if jpeg_quality is not None:
        if 1 <= jpeg_quality <= 100:
            JPEG_QUALITY = jpeg_quality
            print(f"‚úÖ Global JPEG quality: {jpeg_quality}")
        else:
            print(f"‚ùå Invalid JPEG quality: {jpeg_quality} (must be 1-100)")

def show_global_settings():
    """Print the current value of every global image-optimization setting."""
    print("üåê Current Global Settings:")
    # (label, value) pairs in the order they are listed.
    current = (
        ("ENABLE_IMAGE_OPTIMIZATION", ENABLE_IMAGE_OPTIMIZATION),
        ("IMAGE_RESIZE_RATIO", IMAGE_RESIZE_RATIO),
        ("MAX_IMAGE_DIMENSION", MAX_IMAGE_DIMENSION),
        ("MAX_FILE_SIZE_MB", MAX_FILE_SIZE_MB),
        ("JPEG_QUALITY", JPEG_QUALITY),
    )
    for label, value in current:
        print(f"  {label} = {value}")

# Show current settings
show_global_settings()

# The calls below are printed as text for the reader to copy; none of them is
# executed by this cell.
print("\nüìñ Usage Examples:")
print("# Disable optimization completely:")
print("update_global_settings(enable_optimization=False)")
print("\n# Resize images to 25% of original size:")
print("update_global_settings(resize_ratio=0.25)")
print("\n# Set maximum dimension to 512px:")
print("update_global_settings(max_dimension=512)")
print("\n# Multiple settings at once:")
print("update_global_settings(enable_optimization=True, resize_ratio=0.3, jpeg_quality=70)")

In [None]:
# Enhanced processing with better formatting
# Runs the notebook-level `ocr` instance over a whole directory, prints a
# summary and previews the parsed data of the first image.
try:
    # Machine-specific absolute paths; adjust them for your environment.
    # NOTE: despite its name, pay_slip_dir points at the "For benchmark" folder.
    pay_slip_dir = "/home/tanjim_noor/Work/AI OCR BenchMark/For benchmark"
    output_file = "/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_formatted.json"
    
    print("üöÄ Starting enhanced batch processing with formatting...")
    print(f"Input directory: {pay_slip_dir}")
    print(f"Output file: {output_file}")
    print("=" * 50)
    
    # Process all images with the cash-expense extraction prompt (V2)
    results = ocr.process_directory(
        directory_path=pay_slip_dir,
        prompt=sample_prompts["cash_expense_extraction_V2"],
        output_file=output_file,
        save_formatted=True  # Enable enhanced formatting
    )
    
    # Summary
    # `results` is consumed as a sequence of per-image dicts with a "success" key.
    successful = sum(1 for r in results if r["success"])
    failed = len(results) - successful
    parsed_json = sum(1 for r in results if "extracted_data_parsed" in r)
    
    print("=" * 50)
    print(f"üìä ENHANCED PROCESSING SUMMARY:")
    print(f"Total images: {len(results)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Successfully parsed JSON: {parsed_json}")
    print(f"üìÅ Files created:")
    print(f"  ‚Ä¢ Main results: {output_file}")
    # NOTE(review): this name is derived here by string replacement; presumably
    # it mirrors what process_directory() writes - confirm.
    print(f"  ‚Ä¢ Structured data only: {output_file.replace('.json', '_structured_only.json')}")
    
    # Show sample of structured data
    if results and "extracted_data_parsed" in results[0]:
        print(f"\nüìã Sample structured data from first image:")
        sample_data = results[0]["extracted_data_parsed"]
        # Serialize once and decide on truncation from the text that is actually
        # printed; measuring str(sample_data) let long indented dumps through.
        sample_json = json.dumps(sample_data, indent=2)
        print(sample_json[:500] + "..." if len(sample_json) > 500 else sample_json)
    
except Exception as e:
    # Top-level notebook boundary: report the failure instead of a traceback.
    print(f"Error during enhanced batch processing: {e}")

In [None]:
# Utility function to view formatted results
def view_formatted_results(results_file: str, show_full_data: bool = False):
    """
    View formatted OCR results in a readable way.

    Reads a JSON file holding a list of per-image result objects and prints,
    for each one, the image name, model and status followed by either a
    summary of its structured data, its extraction format/preview, a preview
    of the raw extracted data, or its error. A final line reports how many
    entries succeeded.

    An unreadable file, invalid JSON or a non-list document is reported with
    a printed message. Individual entries with missing or oddly typed fields
    are displayed with fallbacks instead of aborting the whole listing.

    Args:
        results_file: Path to the JSON results file
        show_full_data: Whether to show complete structured data or just summary
    """
    # Guard only the read itself; JSONDecodeError and UnicodeDecodeError are
    # both ValueError subclasses.
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except (OSError, ValueError) as e:
        print(f"‚ùå Error reading results file: {e}")
        return

    if not isinstance(results, list):
        print(f"‚ùå Error reading results file: expected a JSON list of results, got {type(results).__name__}")
        return

    def as_dict(value):
        # Sections that should be objects may be null or absent in the file.
        return value if isinstance(value, dict) else {}

    def as_text(value):
        # Previews are sliced as text, so render non-string payloads as JSON.
        if isinstance(value, str):
            return value
        return json.dumps(value, ensure_ascii=False, default=str)

    print(f"üìä OCR Results Summary from {Path(results_file).name}")
    print("=" * 60)

    successful = 0
    for i, result in enumerate(results, 1):
        if not isinstance(result, dict):
            print(f"\nüñºÔ∏è Image {i}: Unknown")
            print(f"   Error: Unrecognized entry of type {type(result).__name__}")
            continue

        image_path = result.get('image_path')
        image_name = Path(image_path).name if isinstance(image_path, str) else 'Unknown'
        print(f"\nüñºÔ∏è Image {i}: {image_name}")
        print(f"   Model: {result.get('model_used', 'Unknown')}")
        print(f"   Status: {'‚úÖ Success' if result.get('success') else '‚ùå Failed'}")

        if not result.get('success'):
            print(f"   Error: {result.get('error', 'Unknown error')}")
            continue
        successful += 1

        if 'structured_data' in result:
            structured = result['structured_data']
            print(f"   Format: üìã Structured JSON")

            # Show key information from structured data
            if isinstance(structured, dict):
                if 'payment_details' in structured:
                    payment = as_dict(structured['payment_details'])
                    totals = as_dict(structured.get('totals'))
                    items = structured.get('item_details')
                    item_count = len(items) if hasattr(items, '__len__') else 0
                    print(f"   Supplier: {payment.get('supplier', 'N/A')}")
                    print(f"   Date: {payment.get('payment_date', 'N/A')}")
                    print(f"   Total: {totals.get('total_amount', 'N/A')}")
                    print(f"   Items: {item_count}")

                if show_full_data:
                    print(f"\n   üìã Full Structured Data:")
                    print(json.dumps(structured, indent=4))

        elif 'extraction_format' in result:
            print(f"   Format: üìù {str(result['extraction_format']).upper()}")
            if 'extracted_data_preview' in result:
                print(f"   Preview: {result['extracted_data_preview']}")
        elif 'extracted_data' in result:
            # Fallback for older format
            text = as_text(result['extracted_data'])
            preview = text[:100] + "..." if len(text) > 100 else text
            print(f"   Data Preview: {preview}")

    print(f"\nüìà Summary: {successful} successful out of {len(results)} total")

print("üîç Enhanced result viewing utility created!")
print("\nUsage examples:")
print("# View summary of formatted results:")
print("view_formatted_results('/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_formatted.json')")
print("\n# View with full structured data:")
print("view_formatted_results('/home/tanjim_noor/Work/AI OCR BenchMark/OCR_Output/ocr_result_formatted.json', show_full_data=True)")