# Imports

In [2]:
%%time

from IPython.display import clear_output

!pip install transformers==4.45.0
!pip install bitsandbytes==0.44.1 accelerate
! pip install einops flash_attn # florence 2
!pip install rouge_score


clear_output()

CPU times: user 1.41 s, sys: 403 ms, total: 1.82 s
Wall time: 1min 59s


In [None]:
import os
os.environ["HF_TOKEN"] = 'paste your token'

In [22]:
import os
import gc
import time
import json
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from rouge_score import rouge_scorer
import re
import subprocess
import psutil
import pandas as pd

# Configuration class
class CFG:
    # Model
    model_name = 'microsoft/Florence-2-base'
    max_tokens = 512
    
    # Dataset paths
    receipt_dir = "/kaggle/input/handwritten-data-form-receipts/receipts/receipts"  # Folder with receipt images and txt files
    form_dir = "/kaggle/input/handwritten-data-form-receipts/forms/forms"        # Folder with form images and txt files
    
    # Output data
    output_dir = "./florence2_results"
    receipt_output = "florence2_receipt_results.json"
    form_output = "florence2_form_results.json"
    summary_output = "florence2_summary.json"
    
    # Image formats to process
    img_extensions = ['.jpg', '.jpeg', '.png', '.tif', '.tiff']

def get_memory_usage():
    """Get current memory usage of the process"""
    try:
        process = psutil.Process(os.getpid())
        memory_info = process.memory_info()
        return {
            'rss': memory_info.rss / (1024 * 1024),  # RSS in MB
            'vms': memory_info.vms / (1024 * 1024)   # VMS in MB
        }
    except:
        return {'rss': 0, 'vms': 0}

def get_cuda_memory_usage():
    """Get CUDA memory usage using torch"""
    if torch.cuda.is_available():
        return {
            'allocated': torch.cuda.memory_allocated() / (1024 * 1024),  # MB
            'reserved': torch.cuda.memory_reserved() / (1024 * 1024),    # MB
            'max_allocated': torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
        }
    return None

def build_model():
    """Build Florence-2 model"""
    print(f'\nLoading model: {CFG.model_name}\n')
    
    # Import required modules
    from transformers import AutoProcessor, AutoModelForCausalLM
    
    try:
        # Try to import optional dependencies
        subprocess.run(["pip", "install", "einops", "flash-attn", "--quiet"])
    except:
        pass
    
    # Load the processor
    processor = AutoProcessor.from_pretrained(
        CFG.model_name, 
        trust_remote_code=True
    )
    
    # Load the model without device_map="auto" since it's not supported
    model = AutoModelForCausalLM.from_pretrained(
        CFG.model_name,
        trust_remote_code=True,
        torch_dtype=torch.float32  # Using float32 consistently
    ).eval()
    # Move model to appropriate device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    print(f"Model loaded successfully on device: {device}")
    
    return processor, model

def inference(image, model, processor):
    """Run inference with Florence-2 model - using OCR task"""
    start_time = time.time()
    
    # Record metrics before inference
    metrics_before = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }
    
    try:
        # Determine if this is Florence model by checking for post_process_generation method
        is_florence = hasattr(processor, 'post_process_generation') and callable(getattr(processor, 'post_process_generation'))
        
        if is_florence:
            # Florence-specific way - using OCR task
            inputs = processor(
                text="<OCR>",  # Simple OCR task
                images=image,
                return_tensors="pt"
            )
            
            # Move inputs to device
            device = model.device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Run generation
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=CFG.max_tokens,
                    do_sample=False
                )
            
            # Process output using Florence-specific method
            output_text = processor.decode(output_ids[0], skip_special_tokens=False)
            result = processor.post_process_generation(
                output_text, 
                task="<OCR>", 
                image_size=(image.width, image.height)
            )
            
            # Extract result from OCR task output
            if isinstance(result, dict) and '<OCR>' in result:
                result = result['<OCR>']
        else:
            # Florence model should have post_process_generation but just in case
            inputs = processor(
                text="Extract all text from this image.",
                images=image,
                return_tensors="pt"
            )
            
            # Move inputs to device
            device = model.device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Run generation
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=CFG.max_tokens,
                    do_sample=False
                )
            
            # Generic decoding
            result = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        result = f"Error: {str(e)}"
    
    # Record metrics after inference
    metrics_after = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }
    
    # Calculate runtime
    inference_time = metrics_after['timestamp'] - metrics_before['timestamp']
    
    # Clear CUDA cache
    torch.cuda.empty_cache()
    gc.collect()
    
    # Return results and metrics
    return {
        'result': result,
        'metrics': {
            'before': metrics_before,
            'after': metrics_after,
            'inference_time': inference_time
        }
    }

def load_annotation_from_txt(txt_path):
    """Load annotation from a text file"""
    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception as e:
        print(f"Error reading annotation file {txt_path}: {e}")
        return ""

def normalize_text(text):
    """Normalize text for better ROUGE matching"""
    # Convert to lowercase
    text = text.lower()
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove non-alphanumeric chars except spaces
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Trim leading/trailing spaces
    text = text.strip()
    return text

def calculate_rouge_scores(predicted_text, reference_text):
    """Calculate ROUGE scores between prediction and reference"""
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Normalize texts
    normalized_reference = normalize_text(reference_text)
    normalized_prediction = normalize_text(predicted_text)
    
    # Skip if either text is empty
    if not normalized_reference or not normalized_prediction:
        return {
            'rouge1': 0,
            'rouge2': 0,
            'rougeL': 0,
            'debug': {
                'reference_empty': not normalized_reference,
                'prediction_empty': not normalized_prediction
            }
        }
    
    # Calculate scores
    scores = scorer.score(normalized_reference, normalized_prediction)
    
    # Debug info
    debug_info = {
        'reference_sample': normalized_reference[:100] + "..." if len(normalized_reference) > 100 else normalized_reference,
        'prediction_sample': normalized_prediction[:100] + "..." if len(normalized_prediction) > 100 else normalized_prediction,
        'reference_length': len(normalized_reference),
        'prediction_length': len(normalized_prediction)
    }
    
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure,
        'debug': debug_info
    }

def find_image_txt_pairs(directory):
    """Find matching image and txt files in a directory"""
    # Get all files
    all_files = os.listdir(directory)
    
    # Find image files
    image_files = [f for f in all_files if any(f.lower().endswith(ext) for ext in CFG.img_extensions)]
    
    # Find matching txt files
    pairs = []
    for img_file in image_files:
        base_name = os.path.splitext(img_file)[0]
        txt_file = f"{base_name}.txt"
        
        if txt_file in all_files:
            pairs.append({
                'image': os.path.join(directory, img_file),
                'txt': os.path.join(directory, txt_file),
                'base_name': base_name
            })
    
    return pairs

def process_dataset(dataset_dir, output_file, model, processor, dataset_name):
    """Process a single dataset (receipts or forms)"""
    print(f"\n{'='*50}")
    print(f"Processing {dataset_name} dataset from {dataset_dir}")
    print(f"{'='*50}")
    
    # Find image-txt pairs
    pairs = find_image_txt_pairs(dataset_dir)
    print(f"Found {len(pairs)} image-txt pairs")
    
    if len(pairs) == 0:
        print(f"No valid pairs found in {dataset_dir}")
        return None
    
    results = []
    
    # Process each pair
    for i, pair in enumerate(pairs):
        print(f"\nProcessing {i+1}/{len(pairs)}: {pair['base_name']}")
        
        # Load image
        try:
            image = Image.open(pair['image'])
            print(f"Loaded image: {pair['image']}, size: {image.size}")
        except Exception as e:
            print(f"Error loading image {pair['image']}: {e}")
            continue
        
        # Load annotation
        ground_truth = load_annotation_from_txt(pair['txt'])
        if not ground_truth:
            print(f"Empty or invalid annotation file: {pair['txt']}")
            continue
        
        # Run inference
        print("Running OCR extraction...")
        ocr_result = inference(image=image, model=model, processor=processor)
        
        # Print OCR result preview
        print("\nOCR Result Preview:")
        preview = str(ocr_result['result'])[:150] + "..." if len(str(ocr_result['result'])) > 150 else str(ocr_result['result'])
        print(preview)
        
        # Ground truth preview
        print("\nGround Truth Preview:")
        gt_preview = ground_truth[:150] + "..." if len(ground_truth) > 150 else ground_truth
        print(gt_preview)
        
        # Calculate ROUGE scores
        print("\nCalculating ROUGE scores...")
        rouge_scores = calculate_rouge_scores(ocr_result['result'], ground_truth)
        
        print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
        print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
        print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
        
        # Store results
        results.append({
            'filename': pair['base_name'],
            'image_path': pair['image'],
            'txt_path': pair['txt'],
            'ocr_result': str(ocr_result['result']),  # Convert to string in case it's a dict or other type
            'ground_truth': ground_truth,
            'rouge_scores': rouge_scores,
            'inference_time': ocr_result['metrics']['inference_time']
        })
        
        # Save intermediate results
        os.makedirs(CFG.output_dir, exist_ok=True)
        with open(os.path.join(CFG.output_dir, output_file), 'w') as f:
            json.dump({
                'model': CFG.model_name,
                'dataset': dataset_name,
                'results': results
            }, f, indent=2)
    
    print(f"\nDataset processing complete. Results saved to {os.path.join(CFG.output_dir, output_file)}")
    
    # Calculate average scores
    if results:
        avg_rouge1 = sum(r['rouge_scores']['rouge1'] for r in results) / len(results)
        avg_rouge2 = sum(r['rouge_scores']['rouge2'] for r in results) / len(results)
        avg_rougeL = sum(r['rouge_scores']['rougeL'] for r in results) / len(results)
        avg_time = sum(r['inference_time'] for r in results) / len(results)
        
        summary = {
            'dataset': dataset_name,
            'num_images': len(results),
            'avg_rouge1': avg_rouge1,
            'avg_rouge2': avg_rouge2,
            'avg_rougeL': avg_rougeL,
            'avg_inference_time': avg_time
        }
        
        print("\nAverage ROUGE Scores:")
        print(f"  ROUGE-1: {avg_rouge1:.4f}")
        print(f"  ROUGE-2: {avg_rouge2:.4f}")
        print(f"  ROUGE-L: {avg_rougeL:.4f}")
        print(f"Average inference time: {avg_time:.2f} seconds")
        
        return summary
    
    return None

def visualize_results(receipt_results, form_results):
    """Create visualizations comparing performance on both datasets"""
    os.makedirs(CFG.output_dir, exist_ok=True)
    
    if not receipt_results or not form_results:
        print("Not enough data to create visualizations")
        return
    
    # Extract metrics
    datasets = ['Receipts', 'Forms']
    rouge1_scores = [receipt_results['avg_rouge1'], form_results['avg_rouge1']]
    rouge2_scores = [receipt_results['avg_rouge2'], form_results['avg_rouge2']]
    rougeL_scores = [receipt_results['avg_rougeL'], form_results['avg_rougeL']]
    inf_times = [receipt_results['avg_inference_time'], form_results['avg_inference_time']]
    
    # Create ROUGE scores comparison
    plt.figure(figsize=(10, 6))
    x = np.arange(len(datasets))
    width = 0.25
    
    plt.bar(x - width, rouge1_scores, width, label='ROUGE-1', color='#2196F3')
    plt.bar(x, rouge2_scores, width, label='ROUGE-2', color='#4CAF50')
    plt.bar(x + width, rougeL_scores, width, label='ROUGE-L', color='#FFC107')
    
    plt.xlabel('Datasets')
    plt.ylabel('ROUGE Score')
    plt.title('Florence-2 Performance on Different Datasets')
    plt.xticks(x, datasets)
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    
    # Add values on top of bars
    for i, model_idx in enumerate(x):
        plt.text(model_idx - width, rouge1_scores[i] + 0.02, f"{rouge1_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
        plt.text(model_idx, rouge2_scores[i] + 0.02, f"{rouge2_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
        plt.text(model_idx + width, rougeL_scores[i] + 0.02, f"{rougeL_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
    
    plt.tight_layout()
    plt.savefig(os.path.join(CFG.output_dir, 'dataset_comparison.png'))
    plt.close()
    
    # Create inference time comparison
    plt.figure(figsize=(8, 5))
    plt.bar(datasets, inf_times, color=['#2196F3', '#4CAF50'])
    plt.xlabel('Datasets')
    plt.ylabel('Average Inference Time (seconds)')
    plt.title('Inference Time Comparison')
    
    # Add values on top of bars
    for i, time in enumerate(inf_times):
        plt.text(i, time + 0.1, f"{time:.2f}s", ha='center', va='bottom')
    
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(CFG.output_dir, 'inference_time_comparison.png'))
    plt.close()
    
    # Return file paths for reporting
    return {
        'rouge_comparison': os.path.join(CFG.output_dir, 'dataset_comparison.png'),
        'time_comparison': os.path.join(CFG.output_dir, 'inference_time_comparison.png')
    }

def create_comparison_table(receipt_summary, form_summary):
    """Create a comparison table for the two datasets"""
    if not receipt_summary or not form_summary:
        return None
    
    # Determine which dataset performs better for each metric
    better_dataset = {
        'rouge1': 'Receipts' if receipt_summary['avg_rouge1'] > form_summary['avg_rouge1'] else 'Forms',
        'rouge2': 'Receipts' if receipt_summary['avg_rouge2'] > form_summary['avg_rouge2'] else 'Forms',
        'rougeL': 'Receipts' if receipt_summary['avg_rougeL'] > form_summary['avg_rougeL'] else 'Forms',
        'time': 'Receipts' if receipt_summary['avg_inference_time'] < form_summary['avg_inference_time'] else 'Forms'
    }
    
    # Calculate percentage difference
    diff_percent = {
        'rouge1': abs(receipt_summary['avg_rouge1'] - form_summary['avg_rouge1']) / 
                 max(receipt_summary['avg_rouge1'], form_summary['avg_rouge1']) * 100,
        'rouge2': abs(receipt_summary['avg_rouge2'] - form_summary['avg_rouge2']) / 
                 max(receipt_summary['avg_rouge2'], form_summary['avg_rouge2']) * 100,
        'rougeL': abs(receipt_summary['avg_rougeL'] - form_summary['avg_rougeL']) / 
                 max(receipt_summary['avg_rougeL'], form_summary['avg_rougeL']) * 100,
        'time': abs(receipt_summary['avg_inference_time'] - form_summary['avg_inference_time']) / 
               max(receipt_summary['avg_inference_time'], form_summary['avg_inference_time']) * 100
    }
    
    # Create DataFrame for the table
    df = pd.DataFrame({
        'Metric': ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Inference Time (s)'],
        'Receipts': [
            f"{receipt_summary['avg_rouge1']:.4f}",
            f"{receipt_summary['avg_rouge2']:.4f}",
            f"{receipt_summary['avg_rougeL']:.4f}",
            f"{receipt_summary['avg_inference_time']:.2f}"
        ],
        'Forms': [
            f"{form_summary['avg_rouge1']:.4f}",
            f"{form_summary['avg_rouge2']:.4f}",
            f"{form_summary['avg_rougeL']:.4f}",
            f"{form_summary['avg_inference_time']:.2f}"
        ],
        'Better Dataset': [
            better_dataset['rouge1'],
            better_dataset['rouge2'],
            better_dataset['rougeL'],
            better_dataset['time']
        ],
        'Difference %': [
            f"{diff_percent['rouge1']:.2f}%",
            f"{diff_percent['rouge2']:.2f}%",
            f"{diff_percent['rougeL']:.2f}%",
            f"{diff_percent['time']:.2f}%"
        ]
    })
    
    # Save to CSV
    csv_path = os.path.join(CFG.output_dir, 'dataset_comparison.csv')
    df.to_csv(csv_path, index=False)
    
    # Also save as markdown table
    md_table = df.to_markdown(index=False)
    with open(os.path.join(CFG.output_dir, 'dataset_comparison.md'), 'w') as f:
        f.write(md_table)
    
    return df

def main():
    # Create output directory
    os.makedirs(CFG.output_dir, exist_ok=True)
    
    # Build model once
    processor, model = build_model()
    
    # Process receipt dataset
    receipt_summary = process_dataset(
        dataset_dir=CFG.receipt_dir,
        output_file=CFG.receipt_output,
        model=model,
        processor=processor,
        dataset_name="Receipts"
    )
    
    # Process form dataset
    form_summary = process_dataset(
        dataset_dir=CFG.form_dir,
        output_file=CFG.form_output,
        model=model,
        processor=processor,
        dataset_name="Forms"
    )
    
    # Create visualizations
    if receipt_summary and form_summary:
        chart_paths = visualize_results(receipt_summary, form_summary)
        
        # Create comparison table
        comparison_table = create_comparison_table(receipt_summary, form_summary)
        
        # Save combined summary
        combined_summary = {
            'model': CFG.model_name,
            'receipts': receipt_summary,
            'forms': form_summary,
            'charts': chart_paths
        }
        
        with open(os.path.join(CFG.output_dir, CFG.summary_output), 'w') as f:
            json.dump(combined_summary, f, indent=2)
        
        print(f"\nComparison summary saved to {os.path.join(CFG.output_dir, CFG.summary_output)}")
        print("\nDataset Comparison:")
        print(comparison_table)
    else:
        print("\nCould not create comparison - one or both datasets failed to process")

if __name__ == "__main__":
    main()


Loading model: microsoft/Florence-2-base





Model loaded successfully on device: cuda:0

Processing Receipts dataset from /kaggle/input/handwritten-data-form-receipts/receipts/receipts
Found 5 image-txt pairs

Processing 1/5: image2
Loaded image: /kaggle/input/handwritten-data-form-receipts/receipts/receipts/image2.jpg, size: (517, 748)
Running OCR extraction...

OCR Result Preview:
6 January1939M.2s & MartinNo.8Dr. to A Mumphy.2 weeks rent of cottage dhsfrom of 6 to 23 Jan 39@ 357310100Famitary fee10cleaning feeF3-16-0fos deposit...

Ground Truth Preview:
9th January 1939

Mrs E. Martin
No. 8
Dr. to A. Murphy,

2 weeks rent of cottage No. 8
from 7th to 21st Jan 39 @ 35/-    3  10  0
Sanitary fee        ...

Calculating ROUGE scores...
ROUGE-1: 0.3248
ROUGE-2: 0.1565
ROUGE-L: 0.3077

Processing 2/5: image1
Loaded image: /kaggle/input/handwritten-data-form-receipts/receipts/receipts/image1.jpg, size: (400, 532)
Running OCR extraction...

OCR Result Preview:
Date19A PHOTOGRAPHERS PLACEM133 MERCER STREETNo.NEW YORK N.Y. 10012Reg. N

In [23]:
!zip -r handwritten_florence /kaggle/working/florence2_results

  adding: kaggle/working/florence2_results/ (stored 0%)
  adding: kaggle/working/florence2_results/florence2_summary.json (deflated 57%)
  adding: kaggle/working/florence2_results/dataset_comparison.png (deflated 22%)
  adding: kaggle/working/florence2_results/inference_time_comparison.png (deflated 20%)
  adding: kaggle/working/florence2_results/dataset_comparison.md (deflated 62%)
  adding: kaggle/working/florence2_results/dataset_comparison.csv (deflated 30%)
  adding: kaggle/working/florence2_results/florence2_form_results.json (deflated 82%)
  adding: kaggle/working/florence2_results/florence2_receipt_results.json (deflated 66%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
import os
import gc
import time
import json
import xml.etree.ElementTree as ET
from PIL import Image
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
import psutil
import re

# Configuration class
class CFG:
    # Model
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    max_tokens = 256
    temperature = 0.01
    
    # Dataset paths
    receipt_dir = "/kaggle/input/handwritten-data-form-receipts/receipts/receipts"  # Folder with receipt images and txt files
    form_dir = "/kaggle/input/handwritten-data-form-receipts/forms/forms"        # Folder with form images and txt files
    
    # Output data
    output_dir = "./llama32_results"
    receipt_output = "llama32_receipt_results.json"
    form_output = "llama32_form_results.json"
    summary_output = "llama32_summary.json"
    
    # Image formats to process
    img_extensions = ['.jpg', '.jpeg', '.png', '.tif', '.tiff']
    
    # Prompts for different document types
    receipt_prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>I have a receipt image. Please extract all the text from it including store name, date, items, prices, and total amount.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    
    form_prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>I have a hospital registration form image. Please extract all text from it including patient details, emergency contacts, medical history (yes/no responses), and insurance information.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

def get_memory_usage():
    """Get current memory usage of the process"""
    try:
        process = psutil.Process(os.getpid())
        memory_info = process.memory_info()
        return {
            'rss': memory_info.rss / (1024 * 1024),  # RSS in MB
            'vms': memory_info.vms / (1024 * 1024)   # VMS in MB
        }
    except:
        return {'rss': 0, 'vms': 0}

def get_cuda_memory_usage():
    """Get CUDA memory usage using torch"""
    if torch.cuda.is_available():
        return {
            'allocated': torch.cuda.memory_allocated() / (1024 * 1024),  # MB
            'reserved': torch.cuda.memory_reserved() / (1024 * 1024),    # MB
            'max_allocated': torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
        }
    return None

def build_model():
    """Build Llama 3.2 model without triton dependencies"""
    print(f'\nLoading model: {CFG.model_name}\n')
    
    try:
        # Import without using BitsAndBytes quantization
        from transformers import MllamaForConditionalGeneration, AutoProcessor
        
        # Processor
        processor = AutoProcessor.from_pretrained(
            CFG.model_name,
            use_auth_token=os.environ.get("HF_TOKEN")
        )

        # Model with half precision but no quantization
        model = MllamaForConditionalGeneration.from_pretrained(
            CFG.model_name,
            torch_dtype=torch.float16,
            device_map="auto",  # Automatically decide which parts go on which devices
            low_cpu_mem_usage=True,
            use_auth_token=os.environ.get("HF_TOKEN")
        ).eval()
        
        print(f"Model loaded successfully, using device: {next(model.parameters()).device}")
        return processor, model
    
    except Exception as e:
        print(f"Error loading Llama model with MllamaForConditionalGeneration: {str(e)}")
        
        # Try with AutoModelForCausalLM as fallback
        try:
            from transformers import AutoModelForCausalLM, AutoProcessor
            
            processor = AutoProcessor.from_pretrained(
                CFG.model_name,
                use_auth_token=os.environ.get("HF_TOKEN")
            )
            
            model = AutoModelForCausalLM.from_pretrained(
                CFG.model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True,
                use_auth_token=os.environ.get("HF_TOKEN")
            ).eval()
            
            print(f"Model loaded successfully with AutoModelForCausalLM, using device: {next(model.parameters()).device}")
            return processor, model
            
        except Exception as e2:
            print(f"Error with AutoModelForCausalLM approach: {str(e2)}")
            raise RuntimeError("Failed to load Llama 3.2 model with both approaches")

def inference(image, model, processor, prompt):
    """Run inference with Llama 3.2"""
    start_time = time.time()
    
    # Record metrics before inference
    metrics_before = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }
    
    try:
        # Prepare inputs
        inputs = processor(
            image,
            prompt,
            return_tensors="pt"
        ).to(model.device)

        # Run generation
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=CFG.max_tokens,
                temperature=CFG.temperature
            )

        # Process output
        ans = processor.decode(output[0])
        result = ans.split('<|eot_id|><|start_header_id|>assistant<|end_header_id|>')[-1].split('<|eot_id|>')[0]
    
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        result = f"Error: {str(e)}"
    
    # Record metrics after inference
    metrics_after = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }
    
    # Calculate runtime
    inference_time = metrics_after['timestamp'] - metrics_before['timestamp']
    
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    # Return results and metrics
    return {
        'result': result,
        'metrics': {
            'before': metrics_before,
            'after': metrics_after,
            'inference_time': inference_time
        }
    }

def load_annotation_from_txt(txt_path):
    """Load annotation from a text file"""
    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception as e:
        print(f"Error reading annotation file {txt_path}: {e}")
        return ""

def normalize_text(text):
    """Normalize text for better ROUGE matching"""
    # Convert to lowercase
    text = text.lower()
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove non-alphanumeric chars except spaces
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Trim leading/trailing spaces
    text = text.strip()
    return text

def calculate_rouge_scores(predicted_text, reference_text):
    """Calculate ROUGE scores between prediction and reference"""
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Normalize texts
    normalized_reference = normalize_text(reference_text)
    normalized_prediction = normalize_text(predicted_text)
    
    # Skip if either text is empty
    if not normalized_reference or not normalized_prediction:
        return {
            'rouge1': 0,
            'rouge2': 0,
            'rougeL': 0,
            'debug': {
                'reference_empty': not normalized_reference,
                'prediction_empty': not normalized_prediction
            }
        }
    
    # Calculate scores
    scores = scorer.score(normalized_reference, normalized_prediction)
    
    # Debug info
    debug_info = {
        'reference_sample': normalized_reference[:100] + "..." if len(normalized_reference) > 100 else normalized_reference,
        'prediction_sample': normalized_prediction[:100] + "..." if len(normalized_prediction) > 100 else normalized_prediction,
        'reference_length': len(normalized_reference),
        'prediction_length': len(normalized_prediction)
    }
    
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure,
        'debug': debug_info
    }

def find_image_txt_pairs(directory):
    """Find matching image and txt files in a directory"""
    # Get all files
    all_files = os.listdir(directory)
    
    # Find image files
    image_files = [f for f in all_files if any(f.lower().endswith(ext) for ext in CFG.img_extensions)]
    
    # Find matching txt files
    pairs = []
    for img_file in image_files:
        base_name = os.path.splitext(img_file)[0]
        txt_file = f"{base_name}.txt"
        
        if txt_file in all_files:
            pairs.append({
                'image': os.path.join(directory, img_file),
                'txt': os.path.join(directory, txt_file),
                'base_name': base_name
            })
    
    return pairs

def process_dataset(dataset_dir, output_file, model, processor, dataset_name):
    """Process a single dataset (receipts or forms)"""
    print(f"\n{'='*50}")
    print(f"Processing {dataset_name} dataset from {dataset_dir}")
    print(f"{'='*50}")
    
    # Find image-txt pairs
    pairs = find_image_txt_pairs(dataset_dir)
    print(f"Found {len(pairs)} image-txt pairs")
    
    if len(pairs) == 0:
        print(f"No valid pairs found in {dataset_dir}")
        return None
    
    results = []
    
    # Select the appropriate prompt based on dataset type
    if dataset_name.lower() == 'receipts':
        prompt = CFG.receipt_prompt
    else:  # forms
        prompt = CFG.form_prompt
    
    # Process each pair
    for i, pair in enumerate(pairs):
        print(f"\nProcessing {i+1}/{len(pairs)}: {pair['base_name']}")
        
        # Load image
        try:
            image = Image.open(pair['image'])
            print(f"Loaded image: {pair['image']}, size: {image.size}")
        except Exception as e:
            print(f"Error loading image {pair['image']}: {e}")
            continue
        
        # Load annotation
        ground_truth = load_annotation_from_txt(pair['txt'])
        if not ground_truth:
            print(f"Empty or invalid annotation file: {pair['txt']}")
            continue
        
        # Run inference
        print(f"Running inference with {dataset_name.lower()} prompt...")
        ocr_result = inference(image=image, model=model, processor=processor, prompt=prompt)
        
        # Print OCR result preview
        print("\nOCR Result Preview:")
        preview = str(ocr_result['result'])[:150] + "..." if len(str(ocr_result['result'])) > 150 else str(ocr_result['result'])
        print(preview)
        
        # Ground truth preview
        print("\nGround Truth Preview:")
        gt_preview = ground_truth[:150] + "..." if len(ground_truth) > 150 else ground_truth
        print(gt_preview)
        
        # Calculate ROUGE scores
        print("\nCalculating ROUGE scores...")
        rouge_scores = calculate_rouge_scores(ocr_result['result'], ground_truth)
        
        print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
        print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
        print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
        
        # Store results
        results.append({
            'filename': pair['base_name'],
            'image_path': pair['image'],
            'txt_path': pair['txt'],
            'ocr_result': str(ocr_result['result']),
            'ground_truth': ground_truth,
            'rouge_scores': rouge_scores,
            'inference_time': ocr_result['metrics']['inference_time']
        })
        
        # Save intermediate results
        os.makedirs(CFG.output_dir, exist_ok=True)
        with open(os.path.join(CFG.output_dir, output_file), 'w') as f:
            json.dump({
                'model': CFG.model_name,
                'dataset': dataset_name,
                'results': results
            }, f, indent=2)
    
    print(f"\nDataset processing complete. Results saved to {os.path.join(CFG.output_dir, output_file)}")
    
    # Calculate average scores
    if results:
        avg_rouge1 = sum(r['rouge_scores']['rouge1'] for r in results) / len(results)
        avg_rouge2 = sum(r['rouge_scores']['rouge2'] for r in results) / len(results)
        avg_rougeL = sum(r['rouge_scores']['rougeL'] for r in results) / len(results)
        avg_time = sum(r['inference_time'] for r in results) / len(results)
        
        summary = {
            'dataset': dataset_name,
            'num_images': len(results),
            'avg_rouge1': avg_rouge1,
            'avg_rouge2': avg_rouge2,
            'avg_rougeL': avg_rougeL,
            'avg_inference_time': avg_time
        }
        
        print("\nAverage ROUGE Scores:")
        print(f"  ROUGE-1: {avg_rouge1:.4f}")
        print(f"  ROUGE-2: {avg_rouge2:.4f}")
        print(f"  ROUGE-L: {avg_rougeL:.4f}")
        print(f"Average inference time: {avg_time:.2f} seconds")
        
        return summary
    
    return None

def visualize_results(receipt_results, form_results):
    """Create visualizations comparing performance on both datasets"""
    os.makedirs(CFG.output_dir, exist_ok=True)
    
    if not receipt_results or not form_results:
        print("Not enough data to create visualizations")
        return
    
    # Extract metrics
    datasets = ['Receipts', 'Forms']
    rouge1_scores = [receipt_results['avg_rouge1'], form_results['avg_rouge1']]
    rouge2_scores = [receipt_results['avg_rouge2'], form_results['avg_rouge2']]
    rougeL_scores = [receipt_results['avg_rougeL'], form_results['avg_rougeL']]
    inf_times = [receipt_results['avg_inference_time'], form_results['avg_inference_time']]
    
    # Create ROUGE scores comparison
    plt.figure(figsize=(10, 6))
    x = np.arange(len(datasets))
    width = 0.25
    
    plt.bar(x - width, rouge1_scores, width, label='ROUGE-1', color='#2196F3')
    plt.bar(x, rouge2_scores, width, label='ROUGE-2', color='#4CAF50')
    plt.bar(x + width, rougeL_scores, width, label='ROUGE-L', color='#FFC107')
    
    plt.xlabel('Datasets')
    plt.ylabel('ROUGE Score')
    plt.title('Llama 3.2 Performance on Different Datasets')
    plt.xticks(x, datasets)
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    
    # Add values on top of bars
    for i, model_idx in enumerate(x):
        plt.text(model_idx - width, rouge1_scores[i] + 0.02, f"{rouge1_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
        plt.text(model_idx, rouge2_scores[i] + 0.02, f"{rouge2_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
        plt.text(model_idx + width, rougeL_scores[i] + 0.02, f"{rougeL_scores[i]:.3f}", 
                ha='center', va='bottom', fontsize=9)
    
    plt.tight_layout()
    plt.savefig(os.path.join(CFG.output_dir, 'dataset_comparison.png'))
    plt.close()
    
    # Create inference time comparison
    plt.figure(figsize=(8, 5))
    plt.bar(datasets, inf_times, color=['#2196F3', '#4CAF50'])
    plt.xlabel('Datasets')
    plt.ylabel('Average Inference Time (seconds)')
    plt.title('Inference Time Comparison')
    
    # Add values on top of bars
    for i, time in enumerate(inf_times):
        plt.text(i, time + 0.1, f"{time:.2f}s", ha='center', va='bottom')
    
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(CFG.output_dir, 'inference_time_comparison.png'))
    plt.close()
    
    # Return file paths for reporting
    return {
        'rouge_comparison': os.path.join(CFG.output_dir, 'dataset_comparison.png'),
        'time_comparison': os.path.join(CFG.output_dir, 'inference_time_comparison.png')
    }

def create_comparison_table(receipt_summary, form_summary):
    """Create a comparison table for the two datasets"""
    if not receipt_summary or not form_summary:
        return None
    
    # Determine which dataset performs better for each metric
    better_dataset = {
        'rouge1': 'Receipts' if receipt_summary['avg_rouge1'] > form_summary['avg_rouge1'] else 'Forms',
        'rouge2': 'Receipts' if receipt_summary['avg_rouge2'] > form_summary['avg_rouge2'] else 'Forms',
        'rougeL': 'Receipts' if receipt_summary['avg_rougeL'] > form_summary['avg_rougeL'] else 'Forms',
        'time': 'Receipts' if receipt_summary['avg_inference_time'] < form_summary['avg_inference_time'] else 'Forms'
    }
    
    # Calculate percentage difference
    diff_percent = {
        'rouge1': abs(receipt_summary['avg_rouge1'] - form_summary['avg_rouge1']) / 
                 max(receipt_summary['avg_rouge1'], form_summary['avg_rouge1']) * 100,
        'rouge2': abs(receipt_summary['avg_rouge2'] - form_summary['avg_rouge2']) / 
                 max(receipt_summary['avg_rouge2'], form_summary['avg_rouge2']) * 100,
        'rougeL': abs(receipt_summary['avg_rougeL'] - form_summary['avg_rougeL']) / 
                 max(receipt_summary['avg_rougeL'], form_summary['avg_rougeL']) * 100,
        'time': abs(receipt_summary['avg_inference_time'] - form_summary['avg_inference_time']) / 
               max(receipt_summary['avg_inference_time'], form_summary['avg_inference_time']) * 100
    }
    
    # Create DataFrame for the table
    import pandas as pd
    df = pd.DataFrame({
        'Metric': ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Inference Time (s)'],
        'Receipts': [
            f"{receipt_summary['avg_rouge1']:.4f}",
            f"{receipt_summary['avg_rouge2']:.4f}",
            f"{receipt_summary['avg_rougeL']:.4f}",
            f"{receipt_summary['avg_inference_time']:.2f}"
        ],
        'Forms': [
            f"{form_summary['avg_rouge1']:.4f}",
            f"{form_summary['avg_rouge2']:.4f}",
            f"{form_summary['avg_rougeL']:.4f}",
            f"{form_summary['avg_inference_time']:.2f}"
        ],
        'Better Dataset': [
            better_dataset['rouge1'],
            better_dataset['rouge2'],
            better_dataset['rougeL'],
            better_dataset['time']
        ],
        'Difference %': [
            f"{diff_percent['rouge1']:.2f}%",
            f"{diff_percent['rouge2']:.2f}%",
            f"{diff_percent['rougeL']:.2f}%",
            f"{diff_percent['time']:.2f}%"
        ]
    })
    
    # Save to CSV
    csv_path = os.path.join(CFG.output_dir, 'dataset_comparison.csv')
    df.to_csv(csv_path, index=False)
    
    # Also save as markdown table
    with open(os.path.join(CFG.output_dir, 'dataset_comparison.md'), 'w') as f:
        f.write(df.to_markdown(index=False))
    
    return df

def main():
    # Create output directory
    os.makedirs(CFG.output_dir, exist_ok=True)
    
    # Build model once
    processor, model = build_model()
    
    # Process receipt dataset
    receipt_summary = process_dataset(
        dataset_dir=CFG.receipt_dir,
        output_file=CFG.receipt_output,
        model=model,
        processor=processor,
        dataset_name="Receipts"
    )
    
    # Process form dataset
    form_summary = process_dataset(
        dataset_dir=CFG.form_dir,
        output_file=CFG.form_output,
        model=model,
        processor=processor,
        dataset_name="Forms"
    )
    
    # Create visualizations
    if receipt_summary and form_summary:
        chart_paths = visualize_results(receipt_summary, form_summary)
        
        # Create comparison table
        comparison_table = create_comparison_table(receipt_summary, form_summary)
        
        # Save combined summary
        combined_summary = {
            'model': CFG.model_name,
            'receipts': receipt_summary,
            'forms': form_summary,
            'charts': chart_paths
        }
        
        with open(os.path.join(CFG.output_dir, CFG.summary_output), 'w') as f:
            json.dump(combined_summary, f, indent=2)
        
        print(f"\nComparison summary saved to {os.path.join(CFG.output_dir, CFG.summary_output)}")
        print("\nDataset Comparison:")
        print(comparison_table)
    else:
        print("\nCould not create comparison - one or both datasets failed to process")

if __name__ == "__main__":
    main()


Loading model: meta-llama/Llama-3.2-11B-Vision-Instruct



2025-05-21 07:24:56.959305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747812297.139391      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747812297.191889      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Model loaded successfully, using device: cuda:0

Processing Receipts dataset from /kaggle/input/handwritten-data-form-receipts/receipts/receipts
Found 5 image-txt pairs

Processing 1/5: image2
Loaded image: /kaggle/input/handwritten-data-form-receipts/receipts/receipts/image2.jpg, size: (517, 748)
Running inference with receipts prompt...

OCR Result Preview:

The receipt is from "Dr. to A. Murphy" and is dated January 9th, 1939. The store name is not visible. The items purchased are:

* 2 weeks rent of cot...

Ground Truth Preview:
9th January 1939

Mrs E. Martin
No. 8
Dr. to A. Murphy,

2 weeks rent of cottage No. 8
from 7th to 21st Jan 39 @ 35/-    3  10  0
Sanitary fee        ...

Calculating ROUGE scores...
ROUGE-1: 0.5241
ROUGE-2: 0.3636
ROUGE-L: 0.4690

Processing 2/5: image1
Loaded image: /kaggle/input/handwritten-data-form-receipts/receipts/receipts/image1.jpg, size: (400, 532)
Running inference with receipts prompt...

OCR Result Preview:

The receipt is from A Photographers 

In [5]:
!zip -r llama2_handwritten /kaggle/working/llama32_results

  adding: kaggle/working/llama32_results/ (stored 0%)
  adding: kaggle/working/llama32_results/llama32_summary.json (deflated 54%)
  adding: kaggle/working/llama32_results/dataset_comparison.csv (deflated 28%)
  adding: kaggle/working/llama32_results/inference_time_comparison.png (deflated 19%)
  adding: kaggle/working/llama32_results/dataset_comparison.png (deflated 22%)
  adding: kaggle/working/llama32_results/dataset_comparison.md (deflated 60%)
  adding: kaggle/working/llama32_results/llama32_form_results.json (deflated 82%)
  adding: kaggle/working/llama32_results/llama32_receipt_results.json (deflated 66%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
