# Imports

In [4]:
%%time

from IPython.display import clear_output

!pip install transformers==4.45.0
!pip install bitsandbytes==0.44.1 accelerate
! pip install einops flash_attn # florence 2
!pip install rouge_score


clear_output()

CPU times: user 217 ms, sys: 128 ms, total: 346 ms
Wall time: 13.4 s


In [None]:
import os
os.environ["HF_TOKEN"] = 'paste ur hf token'

FLORENCE 2 

In [None]:
import pandas as pd
import json
import os
import numpy as np
import xml.etree.ElementTree as ET
from rouge_score import rouge_scorer
import re
import glob
from PIL import Image
import cv2
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
import time
import gc

# Configuration
class CFG:
    """Settings for the Florence-2 receipt OCR evaluation run."""

    # Hugging Face checkpoint to load (the large variant needs more GPU memory)
    florence_model = "microsoft/Florence-2-base"

    # Dataset locations: receipt images and the XML file with their annotations
    image_root = '/kaggle/input/ocr-receipts-text-detection/images'
    annotation_file = '/kaggle/input/ocr-receipts-text-detection/annotations.xml'

    # Files written by the run: per-image OCR output and per-image ROUGE scores
    output_csv = 'florence2_ocr_results.csv'
    rouge_output = 'florence2_rouge_scores.json'

    # Upper bound on how many images are processed; None removes the bound
    num_images = 20

# Load model
def build_model():
    """Load the Florence processor and model named by ``CFG.florence_model``.

    The first attempt loads the model in float16 on CUDA when a GPU is
    available, otherwise in float32 on CPU. If anything in that attempt
    raises, the error is printed and loading is retried once with float16,
    ``device_map="auto"`` and ``low_cpu_mem_usage=True``. An exception raised
    by the retry propagates to the caller.

    Returns:
        A ``(processor, model)`` tuple; the model is in eval mode.
    """
    print(f'\nLoading Florence model: {CFG.florence_model}\n')

    try:
        first_processor = AutoProcessor.from_pretrained(
            CFG.florence_model,
            trust_remote_code=True,
        )

        # Half precision only when a GPU is present; CPU stays in float32
        if torch.cuda.is_available():
            target_device = torch.device("cuda")
            precision = torch.float16
        else:
            target_device = torch.device("cpu")
            precision = torch.float32
        print(f"Using device: {target_device}")

        first_model = AutoModelForCausalLM.from_pretrained(
            CFG.florence_model,
            trust_remote_code=True,
            torch_dtype=precision,
        )
        return first_processor, first_model.to(target_device).eval()

    except Exception as e:
        print(f"Error loading Florence model: {str(e)}")
        print("\nAttempting to load with different configuration...")

        # Second attempt: let the loader place the weights itself
        retry_processor = AutoProcessor.from_pretrained(
            CFG.florence_model,
            trust_remote_code=True,
        )
        retry_model = AutoModelForCausalLM.from_pretrained(
            CFG.florence_model,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        return retry_processor, retry_model.eval()

# Process single image - Keep original OCR processing
def process_image(image_path, processor, model):
    """Run the Florence ``<OCR>`` task on one image and collect run metrics.

    Args:
        image_path: Path of the image file, read with OpenCV.
        processor: Florence processor used to build inputs and decode output.
        model: Florence model. Its first parameter determines the device the
            inputs are moved to and the dtype ``pixel_values`` is cast to.

    Returns:
        ``(result, inference_time, max_memory)`` where ``result`` is the value
        returned by ``processor.post_process_generation``, ``inference_time``
        is wall-clock seconds and ``max_memory`` is the peak CUDA allocation
        in MB (0 when CUDA is unavailable). Returns ``(None, None, None)``
        when the image cannot be read or any step raises.
    """
    try:
        print(f"Processing image: {os.path.basename(image_path)}")

        # Read image with OpenCV (BGR) then convert to an RGB PIL image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error: Could not read image {image_path}")
            return None, None, None

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)

        # Start timing; reset the CUDA peak counter so it reflects this image only
        start_time = time.time()
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        # Run OCR inference with Florence model - using only "<OCR>" as requested
        inputs = processor(
            text="<OCR>",  # Keep this exactly as "<OCR>" - do not modify
            images=pil_image,
            return_tensors="pt"
        )

        # Move inputs to the model's device. pixel_values must match the
        # model's own dtype: the model is float32 on CPU and float16 on GPU,
        # so a hard-coded float16 cast breaks the CPU path.
        reference_param = next(model.parameters())
        device = reference_param.device
        model_dtype = reference_param.dtype
        inputs = {
            k: v.to(device=device, dtype=model_dtype if k == "pixel_values" else v.dtype)
            for k, v in inputs.items()
        }

        # Generate text
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                num_beams=3
            )

        # Decode with special tokens kept, then let the processor post-process
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        result = processor.post_process_generation(
            generated_text,
            task="<OCR>",
            image_size=(pil_image.width, pil_image.height)
        )

        # Get metrics
        inference_time = time.time() - start_time
        max_memory = torch.cuda.max_memory_allocated() / (1024 ** 2) if torch.cuda.is_available() else 0

        # Clean up tensors before flushing the CUDA cache
        del inputs, generated_ids
        torch.cuda.empty_cache()
        gc.collect()

        print(f"Inference completed in {inference_time:.2f} seconds")

        # Return OCR result, timing, and memory usage
        return result, inference_time, max_memory

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this image
        print(f"Error processing {image_path}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None, None

# Parse annotations from XML file - improved version
def parse_annotations(xml_file):
    """Parse the annotation XML into a dict keyed by image file name.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        ``{file_name: {'image_id': int, 'boxes': [{'label', 'text'}, ...]}}``.
        A box is included only if it has a ``text`` attribute element with
        non-empty content. Returns ``{}`` if parsing raises for any reason;
        the error and traceback are printed.
    """
    print(f"Parsing annotations from: {xml_file}")
    try:
        root = ET.parse(xml_file).getroot()

        annotations = {}

        # Process each image
        for image in root.findall('.//image'):
            img_id = int(image.get('id'))
            # Keep only the file name. os.path.basename splits on the host OS
            # separator only, so a Windows-style name would stay intact on
            # Linux; split on both '/' and '\' explicitly.
            img_name = re.split(r'[\\/]', image.get('name'))[-1]

            # Extract annotations for this image
            boxes = []
            for box in image.findall('.//box'):
                label_type = box.get('label')

                # Use the first 'text' attribute that actually has content
                for attr in box.findall('.//attribute'):
                    if attr.get('name') == 'text' and attr.text is not None:
                        boxes.append({
                            'label': label_type,
                            'text': attr.text
                        })
                        break

            # Store annotations
            annotations[img_name] = {
                'image_id': img_id,
                'boxes': boxes
            }

        print(f"Successfully parsed annotations for {len(annotations)} images")
        return annotations

    except Exception as e:
        # Best-effort: report and return an empty mapping so callers can continue
        print(f"Error parsing XML file: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}

# Improved text cleaning that preserves important receipt characters
def clean_text(text):
    """Lowercase and normalize text, keeping common receipt punctuation.

    Args:
        text: Value to clean; converted with ``str()``. ``None`` and the
            literal string ``"None"`` are treated as missing.

    Returns:
        The lowercased text containing only ``a-z``, ``0-9``, whitespace and
        the characters ``$ . , : ; / % - @``, with runs of whitespace
        collapsed to one space and the ends trimmed. ``""`` for missing input.
    """
    if text is None or text == "None":
        return ""

    text = str(text).lower()

    # The hyphen must be escaped: an unescaped '%-@' is a character RANGE
    # (0x25-0x40) that would also keep & ' ( ) * + < = > ? by accident.
    text = re.sub(r'[^a-z0-9\s$.,:;/%\-@]', '', text)

    # Collapse whitespace last so removed symbols do not leave double spaces
    return re.sub(r'\s+', ' ', text).strip()

# Extract the ground truth text from annotation boxes - focus on text only
def extract_ground_truth(image_filename, annotations):
    """Return the space-joined text of every annotation box of an image.

    Yields an empty string when ``image_filename`` has no entry in
    ``annotations``.
    """
    if image_filename in annotations:
        entry_boxes = annotations[image_filename]['boxes']
        return ' '.join(entry['text'] for entry in entry_boxes)
    return ""

# Main processing function for OCR
def process_ocr():
    """Run Florence OCR over the configured images and save the results.

    Images are taken from ``CFG.image_root`` in sorted order, limited to
    ``CFG.num_images`` when that is not ``None``. Images without an annotation
    entry, or for which ``process_image`` returns no result, are skipped.
    The collected rows are written to ``CFG.output_csv``.

    Returns:
        ``(df, annotations)``: the results DataFrame and the parsed
        annotation mapping.
    """
    annotations = parse_annotations(CFG.annotation_file)
    print(f"Loaded annotations for {len(annotations)} images")

    # Collect image paths by extension, in a stable order
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    all_images = sorted(
        os.path.join(CFG.image_root, entry)
        for entry in os.listdir(CFG.image_root)
        if entry.lower().endswith(image_suffixes)
    )

    # Apply the optional cap
    if CFG.num_images is not None:
        all_images = all_images[:CFG.num_images]

    total = len(all_images)
    print(f"Found {total} images to process")

    # The model is loaded a single time and reused for every image
    processor, model = build_model()

    records = []
    for position, img_path in enumerate(all_images, start=1):
        img_filename = os.path.basename(img_path)
        print(f"\nProcessing image {position}/{total}: {img_filename}")

        # Without ground truth there is nothing to score against
        if img_filename not in annotations:
            print(f"No annotations found for {img_filename}, skipping")
            continue

        result, inf_time, mem_usage = process_image(img_path, processor, model)
        if result is None:
            print(f"Failed to process {img_filename}, skipping")
            continue

        # Prefer the '<OCR>' entry of a dict result; otherwise stringify it
        has_ocr_entry = isinstance(result, dict) and '<OCR>' in result
        ocr_text = result['<OCR>'] if has_ocr_entry else str(result)

        # Show at most the first 150 characters
        preview = ocr_text if len(ocr_text) <= 150 else ocr_text[:150] + "..."
        print(f"OCR Result Preview: {preview}")

        records.append({
            'image_id': img_filename,
            'ocr_text': ocr_text,
            'inference_time_sec': inf_time,
            'gpu_memory_usage_mb': mem_usage,
        })

    df = pd.DataFrame(records)
    df.to_csv(CFG.output_csv, index=False)
    print(f"Results saved to {CFG.output_csv}")

    return df, annotations

# Calculate ROUGE scores between OCR results and ground truth - improved version
def calculate_rouge(ocr_results_df, annotations):
    """Score each OCR result against its ground truth with ROUGE-1/2/L.

    Rows whose cleaned OCR text or cleaned ground truth is empty are skipped.
    Per-image scores (including per-label F1 values) are written to
    ``CFG.rouge_output``; when at least one image was scored, the averages are
    printed and written to ``florence2_rouge_summary.json``, followed by the
    five best and five worst images by ROUGE-L F1.

    Args:
        ocr_results_df: DataFrame with ``image_id``, ``ocr_text``,
            ``inference_time_sec`` and ``gpu_memory_usage_mb`` columns.
        annotations: Mapping of image file name to its annotation entry.

    Returns:
        Dict of averaged metrics, or ``None`` if no image could be scored.
    """
    print("\nCalculating ROUGE scores...")

    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)

    rouge_scores = []

    for _, row in ocr_results_df.iterrows():
        image_filename = row['image_id']

        ocr_text = clean_text(row['ocr_text'])
        if not ocr_text:
            print(f"Skipping {image_filename} - No OCR text available")
            continue

        ground_truth = clean_text(extract_ground_truth(image_filename, annotations))
        if not ground_truth:
            print(f"Empty ground truth for {image_filename}")
            continue

        # Whole-receipt comparison
        scores = scorer.score(ground_truth, ocr_text)

        # Comparison against the text of each label type on its own
        per_label_scores = {}
        if image_filename in annotations:
            image_boxes = annotations[image_filename]['boxes']
            for label_type in set(box['label'] for box in image_boxes):
                label_text = clean_text(' '.join(
                    box['text'] for box in image_boxes
                    if box['label'] == label_type
                ))
                if not label_text:
                    continue

                label_scores = scorer.score(label_text, ocr_text)
                per_label_scores[label_type] = {
                    rouge_type: label_scores[rouge_type].fmeasure
                    for rouge_type in rouge_types
                }

        # Assemble the per-image record in a fixed key order
        record = {'image_file': image_filename}
        for rouge_type in rouge_types:
            measured = scores[rouge_type]
            record[f'{rouge_type}_precision'] = measured.precision
            record[f'{rouge_type}_recall'] = measured.recall
            record[f'{rouge_type}_f1'] = measured.fmeasure
        record['ground_truth_length'] = len(ground_truth)
        record['ocr_text_length'] = len(ocr_text)
        record['per_label_scores'] = per_label_scores
        record['inference_time'] = row['inference_time_sec']
        record['memory_usage'] = row['gpu_memory_usage_mb']
        rouge_scores.append(record)

        print(f"Calculated ROUGE for {image_filename}: ROUGE-1 F1 = {scores['rouge1'].fmeasure:.4f}")

    # The per-image file is written even when it ends up empty
    with open(CFG.rouge_output, 'w') as f:
        json.dump(rouge_scores, f, indent=4)

    if not rouge_scores:
        print("No ROUGE scores calculated. Check your OCR results and annotation files.")
        return None

    # Averages over all scored images
    avg_scores = {}
    for rouge_type in rouge_types:
        for part in ('precision', 'recall', 'f1'):
            column = f'{rouge_type}_{part}'
            avg_scores[f'avg_{column}'] = np.mean([s[column] for s in rouge_scores])
    avg_scores['avg_inference_time'] = np.mean([s['inference_time'] for s in rouge_scores])
    avg_scores['avg_memory_usage'] = np.mean(
        [s['memory_usage'] for s in rouge_scores if s['memory_usage'] is not None]
    )

    print("\nAverage ROUGE Scores:")
    for metric, value in avg_scores.items():
        print(f"{metric}: {value:.4f}")

    with open('florence2_rouge_summary.json', 'w') as f:
        json.dump(avg_scores, f, indent=4)

    # Best-first ranking by ROUGE-L F1
    sorted_by_f1 = sorted(rouge_scores, key=lambda x: x['rougeL_f1'], reverse=True)

    print("\nTop 5 images by ROUGE-L F1 score:")
    for rank, score in enumerate(sorted_by_f1[:5], start=1):
        print(f"{rank}. {score['image_file']} - ROUGE-L F1: {score['rougeL_f1']:.4f}")

    print("\nBottom 5 images by ROUGE-L F1 score:")
    for rank, score in enumerate(sorted_by_f1[-5:], start=1):
        print(f"{rank}. {score['image_file']} - ROUGE-L F1: {score['rougeL_f1']:.4f}")

    return avg_scores

# Function to debug annotations for verification
def debug_annotations(annotations, num_samples=3):
    """Print the first few parsed annotations for a manual sanity check.

    For up to ``num_samples`` images this prints the image key and id, the
    box count, the first three boxes, and the raw and cleaned ground-truth
    text (each truncated to 100 characters).
    """
    print("\nDebugging annotation parsing:")

    if not annotations:
        print("No annotations found!")
        return

    def show_truncated(text):
        # At most 100 characters, with an ellipsis when something was cut
        shown = f"{text[:100]}..." if len(text) > 100 else text
        print(f"    {shown}")

    for img_key in list(annotations)[:num_samples]:
        annotation = annotations[img_key]
        boxes = annotation['boxes']
        print(f"\nImage: {img_key}, ID: {annotation['image_id']}")
        print(f"Number of annotation boxes: {len(boxes)}")

        for number, box in enumerate(boxes[:3], start=1):
            print(f"  Box {number}:")
            print(f"    Label: {box['label']}")
            print(f"    Text: {box['text']}")

        # Ground truth exactly as the scoring code would build it
        ground_truth = extract_ground_truth(img_key, annotations)
        print(f"\n  Full ground truth text ({len(ground_truth)} chars):")
        show_truncated(ground_truth)

        cleaned_gt = clean_text(ground_truth)
        print(f"\n  Cleaned ground truth text ({len(cleaned_gt)} chars):")
        show_truncated(cleaned_gt)

# Main execution
if __name__ == "__main__":
    # Run Florence OCR over the configured images; also returns the parsed annotations
    ocr_results, annotations = process_ocr()
    
    # Print a few parsed annotations so the ground truth can be checked by eye
    debug_annotations(annotations)
    
    # Score the OCR output against the annotation text with ROUGE
    calculate_rouge(ocr_results, annotations)

LLAMA 3.2 VISION INSTRUCT

In [None]:
!pip install -q transformers==4.45.0 bitsandbytes==0.44.1 accelerate
!pip install -q rouge-score psutil
!pip install -q triton flash-attn

In [None]:
import os
import gc
import time
import json
import xml.etree.ElementTree as ET
from PIL import Image
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
import psutil
import re

# Configuration class
class CFG:
    """Settings for the Llama 3.2 Vision receipt OCR evaluation run."""

    # Hugging Face checkpoint and generation settings
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    max_tokens = 256
    temperature = 0.01

    # Dataset locations: receipt images and the XML file with their annotations
    image_root = '/kaggle/input/ocr-receipts-text-detection/images'
    annotation_file = '/kaggle/input/ocr-receipts-text-detection/annotations.xml'

    # JSON file that receives the per-image results
    output_path = "./llama32_results.json"

    # True evaluates every annotated image; False evaluates only the first 3
    process_all = True

    # Chat-formatted prompt asking for all text on the receipt
    ocr_prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Extract text from Grocery Store bill Image:

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

    # Chat-formatted prompt asking only for total, items and shop name
    filter_prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Extract the total amount paid, items and the shop name from Grocery Store bill Image, use bullet points to organize the answer:

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

def get_memory_usage():
    """Return the current process's RSS and VMS memory, both in megabytes."""
    bytes_per_mb = 1024 * 1024
    info = psutil.Process(os.getpid()).memory_info()
    return {
        'rss': info.rss / bytes_per_mb,
        'vms': info.vms / bytes_per_mb,
    }

def get_cuda_memory_usage():
    """Return torch's CUDA memory counters in megabytes, or None without CUDA.

    The dict holds the currently allocated, currently reserved and peak
    allocated amounts as reported by ``torch.cuda``.
    """
    if not torch.cuda.is_available():
        return None

    bytes_per_mb = 1024 * 1024
    return {
        'allocated': torch.cuda.memory_allocated() / bytes_per_mb,
        'reserved': torch.cuda.memory_reserved() / bytes_per_mb,
        'max_allocated': torch.cuda.max_memory_allocated() / bytes_per_mb,
    }

def build_model(model_repo):
    """Load the Llama 3.2 Vision processor and model in float16.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable. No quantization is applied; ``device_map="auto"`` lets the
    loader decide where the weights are placed.

    Args:
        model_repo: Hugging Face repository id of the model to load.

    Returns:
        A ``(processor, model)`` tuple; the model is in eval mode.

    Raises:
        Exception: Any loading error is printed and then re-raised, because
            there is no fallback model.
    """
    print(f'\nLoading model: {model_repo}\n')

    try:
        # Imported here so a missing/old transformers install is reported
        # through the same error path as a failed download
        from transformers import MllamaForConditionalGeneration, AutoProcessor

        hf_token = os.environ.get("HF_TOKEN")

        # `token` replaces the deprecated `use_auth_token` argument
        processor = AutoProcessor.from_pretrained(
            model_repo,
            token=hf_token
        )

        # Model with half precision but no quantization
        model = MllamaForConditionalGeneration.from_pretrained(
            model_repo,
            torch_dtype=torch.float16,
            device_map="auto",  # Automatically decide which parts go on which devices
            low_cpu_mem_usage=True,
            token=hf_token
        ).eval()

        print(f"Model loaded successfully, using device: {next(model.parameters()).device}")
        return processor, model

    except Exception as e:
        print(f"Error loading Llama model: {str(e)}")
        raise  # Re-raise the exception since you want to use Llama 3.2 only

def inference(prompt, image, model, processor):
    """Run one Llama 3.2 Vision generation and capture before/after metrics.

    Args:
        prompt: Chat-formatted prompt string passed to the processor.
        image: Image passed to the processor together with the prompt.
        model: Loaded model; inputs are moved to ``model.device``.
        processor: Processor used to build the inputs and decode the output.

    Returns:
        Dict with ``'result'`` (the text after the assistant header up to the
        next ``<|eot_id|>``, or ``"Error: ..."`` if any step raised) and
        ``'metrics'`` holding the ``before``/``after`` snapshots and
        ``inference_time`` in seconds.
    """
    # Snapshot before inference; its timestamp is the start of the timing window
    metrics_before = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }

    try:
        # Prepare inputs
        inputs = processor(
            image,
            prompt,
            return_tensors="pt"
        ).to(model.device)

        # Run generation
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=CFG.max_tokens,
                temperature=CFG.temperature
            )

        # The decoded sequence still contains the prompt; keep only what
        # follows the assistant header, up to the end-of-turn marker
        ans = processor.decode(output[0])
        result = ans.split('<|eot_id|><|start_header_id|>assistant<|end_header_id|>')[-1].split('<|eot_id|>')[0]

    except Exception as e:
        # Best-effort: the failure is reported in-band as the result text
        print(f"Error during inference: {str(e)}")
        result = f"Error: {str(e)}"

    # Snapshot after inference
    metrics_after = {
        'memory': get_memory_usage(),
        'cuda': get_cuda_memory_usage(),
        'timestamp': time.time()
    }

    # Runtime is the span between the two snapshots
    inference_time = metrics_after['timestamp'] - metrics_before['timestamp']

    # Clear CUDA cache
    torch.cuda.empty_cache()
    gc.collect()

    # Return results and metrics
    return {
        'result': result,
        'metrics': {
            'before': metrics_before,
            'after': metrics_after,
            'inference_time': inference_time
        }
    }

def parse_annotations(xml_file):
    """Parse the annotation XML into a dict keyed by integer image id.

    Each value holds the image's file name (the part of ``name`` after the
    last ``/``) and a list of ``{'label', 'text'}`` boxes. Boxes without a
    ``text`` attribute element, or with an empty one, are left out.
    """
    root = ET.parse(xml_file).getroot()

    annotations = {}
    for image_el in root.findall('.//image'):
        image_id = int(image_el.get('id'))
        file_name = image_el.get('name').split('/')[-1]

        kept_boxes = []
        for box_el in image_el.findall('.//box'):
            label = box_el.get('label')
            text_el = box_el.find('.//attribute[@name="text"]')
            if text_el is None or text_el.text is None:
                continue
            kept_boxes.append({'label': label, 'text': text_el.text})

        annotations[image_id] = {'filename': file_name, 'boxes': kept_boxes}

    return annotations

def normalize_text(text):
    """Lowercase text and reduce it to letters, digits and whitespace.

    Whitespace runs are collapsed to a single space before the other
    characters are removed, and the ends are trimmed afterwards.
    """
    collapsed = re.sub(r'\s+', ' ', text.lower())
    alnum_only = re.sub(r'[^a-z0-9\s]', '', collapsed)
    return alnum_only.strip()

def calculate_rouge_scores(predicted_text, reference_boxes):
    """Score a prediction against annotation boxes with ROUGE-1/2/L.

    The prediction is compared with the normalized text of all boxes joined
    together, and separately with the joined text of each label.

    Returns:
        Dict with overall ``rouge1``/``rouge2``/``rougeL`` F-measures, a
        ``per_label`` dict of the same three F-measures for every label, and
        a ``debug`` dict with text lengths, 100-character samples and the
        labels that were found.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    def sample(text):
        # First 100 characters, with an ellipsis when something was cut
        return text[:100] + "..." if len(text) > 100 else text

    # Overall comparison on normalized text
    normalized_reference = normalize_text(' '.join(box['text'] for box in reference_boxes))
    normalized_prediction = normalize_text(str(predicted_text))
    overall_scores = scorer.score(normalized_reference, normalized_prediction)

    # Group reference texts by label, keeping first-seen label order
    references_by_label = {}
    for box in reference_boxes:
        references_by_label.setdefault(box['label'], []).append(box['text'])

    per_label = {}
    for label, texts in references_by_label.items():
        label_result = scorer.score(normalize_text(' '.join(texts)), normalized_prediction)
        per_label[label] = {name: label_result[name].fmeasure for name in metric_names}

    # Extra context for understanding why scores might be low
    debug_info = {
        'total_reference_chars': len(normalized_reference),
        'total_prediction_chars': len(normalized_prediction),
        'normalized_reference_sample': sample(normalized_reference),
        'normalized_prediction_sample': sample(normalized_prediction),
        'labels_found': list(references_by_label.keys()),
    }

    return {
        'rouge1': overall_scores['rouge1'].fmeasure,
        'rouge2': overall_scores['rouge2'].fmeasure,
        'rougeL': overall_scores['rougeL'].fmeasure,
        'per_label': per_label,
        'debug': debug_info,
    }

def main():
    """Evaluate Llama 3.2 Vision OCR on the annotated receipts and report ROUGE.

    Parses the annotations, loads the model, then for each selected image runs
    both the OCR prompt and the filter prompt, scores the OCR output against
    the annotation text, and rewrites ``CFG.output_path`` with all results so
    far. Afterwards it prints average scores and writes
    ``llama32_summary.json`` (only when at least one image was processed).
    """
    # Load annotations
    annotations = parse_annotations(CFG.annotation_file)
    print(f"Loaded annotations for {len(annotations)} images")
    
    # Build model
    processor, model = build_model(CFG.model_name)
    
    results = []
    
    # Process images - either all or just first 3
    if CFG.process_all:
        test_img_ids = list(annotations.keys())
    else:
        test_img_ids = list(annotations.keys())[:3]
    
    print(f"Processing {len(test_img_ids)} images")
    
    # Process each image
    for img_id in test_img_ids:
        annotation = annotations[img_id]
        print(f"\nProcessing image {img_id}: {annotation['filename']}")
        
        # Display annotation summary
        print(f"Annotations: {len(annotation['boxes'])} boxes")
        for i, box in enumerate(annotation['boxes']):
            if i < 3 or i == len(annotation['boxes']) - 1:  # Show first 3 and last annotation
                print(f"  {box['label']}: {box['text']}")
            elif i == 3:
                # Printed once, standing in for the boxes between the first 3 and the last
                print(f"  ... ({len(annotation['boxes']) - 4} more) ...")
        
        # Load image
        img_path = os.path.join(CFG.image_root, annotation['filename'])
        image = cv2.imread(img_path)
        if image is None:
            print(f"Image {img_path} not found. Skipping.")
            continue
            
        # OpenCV loads BGR; convert to RGB before building the PIL image
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        
        # Run OCR inference
        print("\nRunning OCR extraction...")
        ocr_result = inference(prompt=CFG.ocr_prompt, image=pil_image, model=model, processor=processor)
        
        # Print OCR result preview (first 150 characters)
        print("\nOCR Result Preview:")
        preview = ocr_result['result'][:150] + "..." if len(ocr_result['result']) > 150 else ocr_result['result']
        print(preview)
        
        # Run filtered OCR inference
        print("\nRunning filtered extraction...")
        filter_result = inference(prompt=CFG.filter_prompt, image=pil_image, model=model, processor=processor)
        
        # Print filtered result preview (first 150 characters)
        print("\nFiltered Result Preview:")
        filter_preview = filter_result['result'][:150] + "..." if len(filter_result['result']) > 150 else filter_result['result']
        print(filter_preview)
        
        # Calculate ROUGE scores (only the OCR prompt's output is scored)
        print("\nCalculating ROUGE scores...")
        rouge_scores = calculate_rouge_scores(ocr_result['result'], annotation['boxes'])
        
        print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
        print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
        print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
        
        # Print per-label scores
        print("\nPer-label ROUGE-1 scores:")
        for label, scores in rouge_scores['per_label'].items():
            print(f"  {label}: {scores['rouge1']:.4f}")
        
        # Store results
        results.append({
            'image_id': img_id,
            'filename': annotation['filename'],
            'ocr_result': ocr_result['result'],
            'filter_result': filter_result['result'],
            'ground_truth': [box for box in annotation['boxes']],
            'rouge_scores': rouge_scores,
            'ocr_metrics': ocr_result['metrics'],
            'filter_metrics': filter_result['metrics']
        })
        
        # Save intermediate results: the file is rewritten in full after every image
        with open(CFG.output_path, 'w') as f:
            json.dump({
                'model': CFG.model_name,
                'results': results
            }, f, indent=2)
    
    print(f"\nEvaluation complete. Results saved to {CFG.output_path}")
    
    # Calculate and display average scores
    if results:
        avg_rouge1 = sum(r['rouge_scores']['rouge1'] for r in results) / len(results)
        avg_rouge2 = sum(r['rouge_scores']['rouge2'] for r in results) / len(results)
        avg_rougeL = sum(r['rouge_scores']['rougeL'] for r in results) / len(results)
        
        # Calculate average inference time (OCR prompt only)
        avg_inference_time = sum(r['ocr_metrics']['inference_time'] for r in results) / len(results)
        
        # Calculate average memory usage if data is available
        cuda_metrics = [r['ocr_metrics']['after']['cuda'] for r in results if r['ocr_metrics']['after']['cuda'] is not None]
        if cuda_metrics:
            avg_allocated_memory = sum(m['allocated'] for m in cuda_metrics) / len(cuda_metrics)
            avg_reserved_memory = sum(m['reserved'] for m in cuda_metrics) / len(cuda_metrics)
            print(f"\nAverage CUDA memory allocated: {avg_allocated_memory:.2f} MB")
            print(f"Average CUDA memory reserved: {avg_reserved_memory:.2f} MB")
        
        # Print average scores
        print("\nAverage ROUGE Scores:")
        print(f"  ROUGE-1: {avg_rouge1:.4f}")
        print(f"  ROUGE-2: {avg_rouge2:.4f}")
        print(f"  ROUGE-L: {avg_rougeL:.4f}")
        print(f"  Average inference time: {avg_inference_time:.2f} seconds")
        
        # Per-label average scores
        print("\nAverage ROUGE-1 per label:")
        label_types = set()
        for r in results:
            label_types.update(r['rouge_scores']['per_label'].keys())
        
        for label in label_types:
            # Average only over the images that actually contain this label
            scores = [r['rouge_scores']['per_label'][label]['rouge1'] 
                    for r in results 
                    if label in r['rouge_scores']['per_label']]
            if scores:
                avg_score = sum(scores) / len(scores)
                print(f"  {label}: {avg_score:.4f} (from {len(scores)} images)")
        
        # Create summary output
        summary = {
            'model': CFG.model_name,
            'num_images_processed': len(results),
            'avg_scores': {
                'rouge1': avg_rouge1,
                'rouge2': avg_rouge2,
                'rougeL': avg_rougeL
            },
            'avg_inference_time': avg_inference_time,
            'per_label_avg_scores': {}
        }
        
        # Add per-label stats to summary (same per-label ROUGE-1 average as printed above)
        for label in label_types:
            scores = [r['rouge_scores']['per_label'][label]['rouge1'] 
                    for r in results 
                    if label in r['rouge_scores']['per_label']]
            if scores:
                summary['per_label_avg_scores'][label] = sum(scores) / len(scores)
        
        # Save summary
        with open("llama32_summary.json", 'w') as f:
            json.dump(summary, f, indent=2)
        
        print(f"\nSummary saved to llama32_summary.json")

# Entry point: run the Llama evaluation when this code is executed directly
if __name__ == "__main__":
    main()