In [2]:
!pip install -q transformers==4.45.0 bitsandbytes==0.44.1 accelerate
!pip install -q rouge-score psutil
!pip install -q triton flash-attn
import pandas as pd
import json
import os
import numpy as np
from rouge_score import rouge_scorer
import re
import glob
from PIL import Image
import torch
import time
import gc
import sys
import psutil

# Required packages are installed by the `!pip install` commands at the top of this cell.


# Configuration
class CFG:
    """Static settings for the Llama OCR benchmark run.

    The attributes are read as class attributes (``CFG.num_images`` etc.);
    the class is never instantiated in this file.
    """

    # Hugging Face repository id of the model to load
    llama_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    # Directory holding the input images
    image_dir = '/kaggle/input/form-ocr-dataset-1/training_data/images'
    # Directory holding one ground-truth JSON file per image (same base name)
    annotation_dir = '/kaggle/input/form-ocr-dataset-1/training_data/annotations'
    # Upper bound on how many images (in sorted filename order) are processed
    num_images = 50
    # CSV file that receives the OCR text and per-image metrics
    output_csv = 'llama_ocr_results.csv'
    # Passed to generate() as max_new_tokens
    max_tokens = 256
    # Passed to generate() as the sampling temperature
    temperature = 0.01
    # Hugging Face access token. Read from the environment so that no secret
    # has to be pasted into (and accidentally shared with) the notebook.
    hf_token = os.environ.get("HF_TOKEN", "")
    # NOTE(review): no code in this cell reads this flag; the model is always
    # loaded in 16-bit precision.
    use_4bit = False  # Set to False to avoid quantization issues
    detailed_metrics = True  # Track detailed performance metrics

# Configure Hugging Face token
# Export the token only when one is actually configured: writing an empty
# string would overwrite a valid HF_TOKEN already present in the environment.
if CFG.hf_token:
    os.environ["HF_TOKEN"] = CFG.hf_token

# Function to get current memory usage
def get_memory_usage():
    """Report memory figures for the current process, in MiB.

    Returns:
        dict with three keys:
            'ram_usage_mb': resident set size of this process.
            'gpu_memory_mb': value of torch.cuda.max_memory_allocated()
                (a peak counter, not the instantaneous usage).
            'gpu_memory_reserved_mb': value of
                torch.cuda.max_memory_reserved().
        Both GPU entries are 0 when CUDA is not available.
    """
    bytes_per_mb = 1024 * 1024

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss

    if torch.cuda.is_available():
        gpu_allocated_mb = torch.cuda.max_memory_allocated() / bytes_per_mb
    else:
        gpu_allocated_mb = 0

    if torch.cuda.is_available():
        gpu_reserved_mb = torch.cuda.max_memory_reserved() / bytes_per_mb
    else:
        gpu_reserved_mb = 0

    return {
        'ram_usage_mb': rss_bytes / bytes_per_mb,
        'gpu_memory_mb': gpu_allocated_mb,
        'gpu_memory_reserved_mb': gpu_reserved_mb,
    }

# Import required modules
from transformers import AutoProcessor, AutoModelForCausalLM, MllamaForConditionalGeneration


# Load model only once
def build_model():
    """Load the processor and the vision-language model named by CFG.llama_name.

    The checkpoint is an image+text model, so it is loaded through
    ``MllamaForConditionalGeneration``; this is the class that accepts the
    pixel inputs produced by the processor and routes them through the
    vision encoder.

    Returns:
        tuple: ``(processor, model, load_time, memory_metrics)`` where
        ``load_time`` is the wall-clock loading time in seconds and
        ``memory_metrics`` is the dict returned by ``get_memory_usage()``
        right after loading.
    """
    print('Loading Llama-3.2-11B-Vision-Instruct model...')
    start_time = time.time()

    # An empty token is passed as None so the Hub client can fall back to a
    # cached login or the HF_TOKEN environment variable.
    token = CFG.hf_token or None

    # Initialize processor
    processor = AutoProcessor.from_pretrained(
        CFG.llama_name,
        token=token
    )

    # Use 16-bit precision (more stable than 4-bit for this model)
    model = MllamaForConditionalGeneration.from_pretrained(
        CFG.llama_name,
        torch_dtype=torch.float16,
        device_map='auto',
        token=token
    ).eval()

    load_time = time.time() - start_time

    print(f"Loaded model in {load_time:.2f} seconds.")

    # Get initial memory metrics
    memory_metrics = get_memory_usage()
    print(f"Initial GPU memory usage: {memory_metrics['gpu_memory_mb']:.2f} MB")
    print(f"Initial RAM usage: {memory_metrics['ram_usage_mb']:.2f} MB")

    return processor, model, load_time, memory_metrics


# Process single image using Llama
def process_image(image_path, processor, model):
    """Run OCR on one image and collect timing/memory metrics.

    Args:
        image_path: Path of the image file to read.
        processor: Processor returned by ``build_model()``.
        model: Model returned by ``build_model()``.

    Returns:
        tuple: ``(text, metrics)``. On success ``text`` is the stripped model
        answer and ``metrics['success']`` is True. On any failure ``text`` is
        None, ``metrics['success']`` is False and ``metrics['error']`` holds
        the exception message. Exceptions are reported, not raised, so one bad
        image does not abort the whole batch.
    """
    # Created before the try block so the failure path can always report it.
    metrics = {
        'load_time_sec': None,
        'preprocessing_time_sec': None,
        'inference_time_sec': None,
        'total_time_sec': None,
        'gpu_memory_usage_mb': None,
        'ram_usage_mb': None,
        'image_dimensions': None,
        'success': False
    }
    model_inputs = None
    output = None

    try:
        # Start total timing
        total_start_time = time.time()

        # Load and preprocess image
        preprocess_start_time = time.time()

        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy that stays valid afterwards.
        with Image.open(image_path) as source_image:
            metrics['image_dimensions'] = f"{source_image.width}x{source_image.height}"
            if source_image.mode != 'RGB':
                print(f"Converting {source_image.mode} image to RGB")
            image = source_image.convert('RGB')

        # Record preprocessing time
        metrics['preprocessing_time_sec'] = time.time() - preprocess_start_time

        # Reset CUDA memory stats so the peak read later belongs to this image
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        # Let the processor's chat template place the image placeholder and
        # header tokens instead of hand-writing them.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Extract all the text from this image:"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

        # Start inference timing
        inference_start_time = time.time()

        # The template already contains the begin-of-text token, hence
        # add_special_tokens=False.
        model_inputs = processor(
            images=image,
            text=prompt,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)

        # Every processor output is forwarded: dropping the pixel tensors and
        # cross-attention mask would make the model answer without ever
        # seeing the image.
        output = model.generate(
            **model_inputs,
            max_new_tokens=CFG.max_tokens,
            temperature=CFG.temperature
        )

        # Record inference time
        metrics['inference_time_sec'] = time.time() - inference_start_time

        # generate() returns prompt + continuation; keep only the new tokens
        prompt_length = model_inputs['input_ids'].shape[-1]
        extracted_text = processor.decode(
            output[0][prompt_length:],
            skip_special_tokens=True
        )

        # Get memory metrics
        if CFG.detailed_metrics:
            mem_metrics = get_memory_usage()
            metrics['gpu_memory_usage_mb'] = mem_metrics['gpu_memory_mb']
            metrics['ram_usage_mb'] = mem_metrics['ram_usage_mb']

        # Record total time
        metrics['total_time_sec'] = time.time() - total_start_time
        metrics['success'] = True

        return extracted_text.strip(), metrics

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        metrics['error'] = str(e)
        return None, metrics

    finally:
        # Clean up on success and on failure alike, so a failed image does
        # not leave its tensors cached on the GPU for the next one.
        del model_inputs, output
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

# Main processing function for OCR
def process_ocr():
    """Run OCR over the configured images and persist results and metrics.

    Reads up to ``CFG.num_images`` image files (sorted by name) from
    ``CFG.image_dir``, loads the model once, processes every image, then
    writes:
        * ``CFG.output_csv`` - one row per image (text plus per-image metrics
          when ``CFG.detailed_metrics`` is set);
        * ``llama_performance_metrics.json`` - aggregate run metrics.

    Returns:
        pandas.DataFrame: the per-image results that were written to the CSV.
    """
    # Get images
    all_images = sorted([
        os.path.join(CFG.image_dir, f)
        for f in os.listdir(CFG.image_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    ])[:CFG.num_images]

    print(f"Found {len(all_images)} images to process")

    # Load model once and track loading metrics
    processor, model, model_load_time, initial_memory = build_model()

    results = []
    overall_metrics = {
        'model': CFG.llama_name,
        'total_images': len(all_images),
        'successful_images': 0,
        'model_load_time_sec': model_load_time,
        'total_execution_time_sec': 0,
        'average_inference_time_sec': 0,
        'initial_gpu_memory_mb': initial_memory['gpu_memory_mb'],
        'peak_gpu_memory_mb': initial_memory['gpu_memory_mb'],
        'initial_ram_memory_mb': initial_memory['ram_usage_mb'],
        'peak_ram_memory_mb': initial_memory['ram_usage_mb']
    }

    overall_start_time = time.time()
    inference_times = []

    for idx, img_path in enumerate(all_images, 1):
        img_filename = os.path.basename(img_path)
        print(f"Processing image {idx}/{len(all_images)}: {img_filename}")

        ocr_text, metrics = process_image(img_path, processor, model)

        # Update metrics
        if metrics.get('success', False):
            overall_metrics['successful_images'] += 1

            inference_time = metrics.get('inference_time_sec')
            if inference_time:
                inference_times.append(inference_time)

            # The memory entries exist but hold None when detailed metrics
            # are disabled, so `.get(key, 0)` alone would hand None to the
            # comparison; `or 0` covers both the missing and the None case.
            overall_metrics['peak_gpu_memory_mb'] = max(
                overall_metrics['peak_gpu_memory_mb'],
                metrics.get('gpu_memory_usage_mb') or 0
            )
            overall_metrics['peak_ram_memory_mb'] = max(
                overall_metrics['peak_ram_memory_mb'],
                metrics.get('ram_usage_mb') or 0
            )

        # Store results
        result_entry = {
            'image_id': img_filename,
            'ocr_text': ocr_text
        }

        # Add detailed metrics if enabled
        if CFG.detailed_metrics:
            result_entry.update(metrics)

        results.append(result_entry)

    # Calculate overall metrics
    overall_metrics['total_execution_time_sec'] = time.time() - overall_start_time
    if inference_times:
        overall_metrics['average_inference_time_sec'] = sum(inference_times) / len(inference_times)
        overall_metrics['min_inference_time_sec'] = min(inference_times)
        overall_metrics['max_inference_time_sec'] = max(inference_times)

    # Save detailed performance metrics
    with open('llama_performance_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(overall_metrics, f, indent=4)

    # Save results
    df = pd.DataFrame(results)
    df.to_csv(CFG.output_csv, index=False)
    print(f"Results saved to {CFG.output_csv}")

    # Print performance summary
    print("\n===== PERFORMANCE SUMMARY =====")
    print(f"Total images processed: {overall_metrics['total_images']}")
    print(f"Successful OCR extractions: {overall_metrics['successful_images']}")
    print(f"Model load time: {overall_metrics['model_load_time_sec']:.2f} seconds")
    print(f"Total execution time: {overall_metrics['total_execution_time_sec']:.2f} seconds")
    print(f"Average inference time: {overall_metrics['average_inference_time_sec']:.2f} seconds")
    print(f"Peak GPU memory usage: {overall_metrics['peak_gpu_memory_mb']:.2f} MB")
    print(f"Peak RAM usage: {overall_metrics['peak_ram_memory_mb']:.2f} MB")

    return df

# Function to extract all text from annotation file
def extract_text_from_annotation(annotation_file):
    """Collect every ``text`` value from an annotation JSON file.

    Two layouts are understood:
        * a top-level list of entries;
        * a top-level dict whose list-valued keys hold the entries.
    Only entries that are dicts with a non-null ``text`` key contribute;
    anything else is ignored rather than aborting the whole file.

    Args:
        annotation_file: Path of the JSON file to read.

    Returns:
        str: the text values joined by single spaces, or "" when the file
        cannot be read or parsed (the error is printed).
    """
    try:
        with open(annotation_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers malformed JSON and undecodable bytes.
        print(f"Error extracting text from {annotation_file}: {e}")
        return ""

    # Normalise both layouts to "a sequence of entry lists".
    if isinstance(data, list):
        entry_lists = [data]
    elif isinstance(data, dict):
        entry_lists = [value for value in data.values() if isinstance(value, list)]
    else:
        entry_lists = []

    all_texts = []
    for entries in entry_lists:
        for item in entries:
            # The dict check matters: `'text' in item` on a plain string is a
            # substring test and would be followed by a failing item['text'].
            if not isinstance(item, dict):
                continue
            text = item.get('text')
            if text is None:
                continue
            # str() keeps numeric values usable by join().
            all_texts.append(str(text))

    # Join all the text pieces
    return " ".join(all_texts)

# Function to clean text for ROUGE comparison
def clean_text(text):
    """Normalise text for ROUGE comparison.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        str: ``text`` with every whitespace run collapsed to one space and
        the ends stripped. Returns "" for None, the literal string "None",
        and float NaN (which is how an empty CSV cell is read back, and would
        otherwise become the bogus text "nan").
    """
    import math

    if text is None or text == "None":
        return ""
    if isinstance(text, float) and math.isnan(text):
        return ""
    # Remove extra whitespace, newlines and normalize
    return re.sub(r'\s+', ' ', str(text)).strip()

# Main function to calculate ROUGE scores
def calculate_rouge():
    """Score the saved OCR results against the ground-truth annotations.

    Reads ``CFG.output_csv``, and for every row that has OCR text and a
    matching ``<image base name>.json`` in ``CFG.annotation_dir`` computes
    ROUGE-1, ROUGE-2 and ROUGE-L (precision / recall / F1). Writes:
        * ``llama_rouge_scores.json`` - one entry per scored image;
        * ``llama_rouge_scores_summary.json`` - averages (only when at least
          one image was scored).

    Returns:
        dict | None: the summary dict, or None when no image could be scored.
    """
    # Load OCR results
    ocr_results_df = pd.read_csv(CFG.output_csv)
    print(f"Loaded {len(ocr_results_df)} OCR results")

    # Initialize ROUGE scorer
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # (key suffix used in the output, attribute name on the score object)
    score_fields = (('precision', 'precision'), ('recall', 'recall'), ('f1', 'fmeasure'))
    perf_metrics = ['inference_time_sec', 'total_time_sec', 'gpu_memory_usage_mb', 'ram_usage_mb']

    rouge_scores = []

    # Process each image that has OCR results
    for _, row in ocr_results_df.iterrows():
        image_filename = row['image_id']

        # A failed OCR is stored as an empty CSV cell, which pandas reads
        # back as NaN; treat it as "no text" instead of scoring "nan".
        raw_ocr_text = row['ocr_text']
        ocr_text = "" if pd.isna(raw_ocr_text) else clean_text(raw_ocr_text)

        # Skip if OCR failed
        if not ocr_text:
            print(f"Skipping {image_filename} - No OCR text available")
            continue

        # Find corresponding annotation file
        base_name = os.path.splitext(image_filename)[0]
        annotation_path = os.path.join(CFG.annotation_dir, f"{base_name}.json")

        if not os.path.exists(annotation_path):
            print(f"No annotation found for {image_filename}")
            continue

        # Extract text from annotation
        ground_truth = clean_text(extract_text_from_annotation(annotation_path))

        if not ground_truth:
            print(f"Empty ground truth for {image_filename}")
            continue

        # Calculate ROUGE scores
        scores = scorer.score(ground_truth, ocr_text)

        result_entry = {'image_file': image_filename}
        for rouge_type in rouge_types:
            for suffix, attr in score_fields:
                result_entry[f'{rouge_type}_{suffix}'] = getattr(scores[rouge_type], attr)
        result_entry['ground_truth_length'] = len(ground_truth)
        result_entry['ocr_text_length'] = len(ocr_text)

        # Add performance metrics if available in the OCR results. Missing
        # (NaN) cells are left out so they cannot turn the averages into NaN;
        # float() converts numpy scalars into plain JSON-serialisable floats.
        for metric in perf_metrics:
            if metric in row and pd.notna(row[metric]):
                result_entry[metric] = float(row[metric])

        rouge_scores.append(result_entry)

        print(f"Calculated ROUGE for {image_filename}")

    # Save ROUGE scores to JSON
    with open('llama_rouge_scores.json', 'w', encoding='utf-8') as f:
        json.dump(rouge_scores, f, indent=4)

    if not rouge_scores:
        print("No ROUGE scores calculated. Check your OCR results and annotation files.")
        return None

    # Calculate and print average scores
    avg_scores = {'model': 'Llama-3.2-Vision'}
    for rouge_type in rouge_types:
        for suffix, _ in score_fields:
            key = f'{rouge_type}_{suffix}'
            avg_scores[f'avg_{key}'] = float(np.mean([s[key] for s in rouge_scores]))
    avg_scores['num_images_processed'] = len(rouge_scores)

    # Add performance metrics if available
    for metric in perf_metrics:
        values = [s[metric] for s in rouge_scores if s.get(metric) is not None]
        if values:
            avg_scores[f'avg_{metric}'] = float(np.mean(values))
            avg_scores[f'max_{metric}'] = float(np.max(values))
            avg_scores[f'min_{metric}'] = float(np.min(values))

    print("\nAverage ROUGE Scores for Llama-3.2-Vision:")
    for metric, value in avg_scores.items():
        if isinstance(value, float):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")

    # Save summary
    with open('llama_rouge_scores_summary.json', 'w', encoding='utf-8') as f:
        json.dump(avg_scores, f, indent=4)

    # Create sorted lists for best/worst performing images
    sorted_by_f1 = sorted(rouge_scores, key=lambda x: x['rougeL_f1'], reverse=True)

    print("\nTop 5 images by ROUGE-L F1 score:")
    for i, score in enumerate(sorted_by_f1[:5]):
        print(f"{i+1}. {score['image_file']} - ROUGE-L F1: {score['rougeL_f1']:.4f}")

    print("\nBottom 5 images by ROUGE-L F1 score:")
    for i, score in enumerate(sorted_by_f1[-5:]):
        print(f"{i+1}. {score['image_file']} - ROUGE-L F1: {score['rougeL_f1']:.4f}")

    return avg_scores

# Main execution
if __name__ == "__main__":
    # Order matters: process_ocr() writes the results CSV that
    # calculate_rouge() then reads back and scores.
    for pipeline_stage in (process_ocr, calculate_rouge):
        pipeline_stage()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

2025-05-20 16:31:40.066749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747758700.276322      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747758700.335600      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Found 50 images to process
Loading Llama-3.2-11B-Vision-Instruct model...


preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Loaded model in 175.07 seconds.
Initial GPU memory usage: 13066.53 MB
Initial RAM usage: 7569.65 MB
Processing image 1/50: 0000971160.png
Converting L image to RGB
Processing image 2/50: 0000989556.png
Converting L image to RGB
Processing image 3/50: 0000990274.png
Converting L image to RGB
Processing image 4/50: 0000999294.png
Converting L image to RGB
Processing image 5/50: 0001118259.png
Converting L image to RGB
Processing image 6/50: 0001123541.png
Converting L image to RGB
Processing image 7/50: 0001129658.png
Converting L image to RGB
Processing image 8/50: 0001209043.png
Converting L image to RGB
Processing image 9/50: 0001239897.png
Converting L image to RGB
Processing image 10/50: 0001438955.png
Converting L image to RGB
Processing image 11/50: 0001456787.png
Converting L image to RGB
Processing image 12/50: 0001463282.png
Converting L image to RGB
Processing image 13/50: 0001463448.png
Converting L image to RGB
Processing image 14/50: 0001476912.png
Converting L image to RGB