In [1]:
import torch
import io
import os
import evaluate
from datasets import load_dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from tqdm.auto import tqdm

In [19]:
import os
import torch

# --- 1. Define Your Project Paths ---
# Hugging Face checkpoint name of the pretrained base model.
MODEL_NAME = "microsoft/trocr-base-printed"

# Fine-tuned model directory. The relative path is resolved against the
# current working directory, so run the notebook from its own folder.
relative_model_dir = "../output/printed_model/trocr-final-model"
abs_model_dir = os.path.abspath(relative_model_dir)
# Replace Windows backslashes with forward slashes.
MODEL_DIR = abs_model_dir.replace("\\", "/")

# Directory holding the parquet files, and the specific file to evaluate on.
relative_data_dir = "../../data/images/"
RAW_DATA_DIR = os.path.abspath(relative_data_dir)
TEST_PARQUET = "test_printed.parquet"  # or 'val_printed.parquet'
TEST_DATA_PATH = os.path.join(RAW_DATA_DIR, TEST_PARQUET)

# --- 2. Define Your Column Names ---
IMAGE_DATA_COLUMN = "image"
TEXT_LABEL_COLUMN = "text"

# --- 3. Setup Device ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 4. Sanity Checks ---
print(f"✅ Model path: {MODEL_DIR}")
print(f"✅ Data path: {TEST_DATA_PATH}")
print(f"✅ Using device: {device}")

# Check the *kind* of path, not just its existence: a file sitting at the
# model path (or a directory at the parquet path) would pass a plain
# os.path.exists() test and only fail later inside the loaders.
if not os.path.isdir(MODEL_DIR):
    raise FileNotFoundError(f"❌ Model directory not found: {MODEL_DIR}")

if not os.path.isfile(TEST_DATA_PATH):
    raise FileNotFoundError(f"❌ Test data not found: {TEST_DATA_PATH}")

✅ Model path: C:/Users/Shashwat Kumar/Desktop/Labs/Optical Character Recognition/ml_training/output/printed_model/trocr-final-model
✅ Data path: C:\Users\Shashwat Kumar\Desktop\Labs\Optical Character Recognition\data\images\test_printed.parquet
✅ Using device: cpu


In [20]:
print(f"Loading processor from Hugging Face: {MODEL_NAME}")
print(f"Loading fine-tuned model from: {MODEL_DIR}")

try:
    # The processor comes from the original Hugging Face checkpoint ...
    processor = TrOCRProcessor.from_pretrained(MODEL_NAME)

    # ... while the model weights are read from the local fine-tuned directory.
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)

    # Place the model on the device selected in the configuration cell.
    model.to(device)
except Exception as load_error:
    # Report the failure in plain words, then re-raise so the notebook stops here.
    print("ERROR: Could not load model. Did you unzip the folder correctly?")
    print(load_error)
    raise
else:
    print("...Model and processor loaded successfully.")

Loading processor from Hugging Face: microsoft/trocr-base-printed
Loading fine-tuned model from: C:/Users/Shashwat Kumar/Desktop/Labs/Optical Character Recognition/ml_training/output/printed_model/trocr-final-model
...Model and processor loaded successfully.


In [21]:
print(f"Loading test data from: {TEST_DATA_PATH}")
try:
    test_dataset = load_dataset(
        "parquet",
        data_files={"test": TEST_DATA_PATH},
        split="test",
    )
    print(f"Loaded {len(test_dataset)} total test samples.")
except Exception as load_error:
    # Point at the path that was tried, then re-raise so the notebook stops here.
    print(f"ERROR: Could not load test file. Check your path: {TEST_DATA_PATH}")
    print(load_error)
    raise

# --- Standardize Column Schemas ---
print("\nStandardizing column schemas...")

# Rename 'label' only when the expected text column is not already present.
needs_label_rename = (
    'label' in test_dataset.column_names
    and TEXT_LABEL_COLUMN not in test_dataset.column_names
)
if needs_label_rename:
    print("  - Renaming 'label' to 'text' in test set.")
    test_dataset = test_dataset.rename_column('label', TEXT_LABEL_COLUMN)

# Drop the 'source' column when the file carries one.
if 'source' in test_dataset.column_names:
    print("  - Removing 'source' column from test set for consistency.")
    test_dataset = test_dataset.remove_columns(['source'])

print("...Test data schema standardized.")

# --- Select a subset for fast evaluation ---
# Datasets larger than the cap are shuffled with a fixed seed and truncated;
# smaller ones are evaluated in full.
TEST_SUBSET_SIZE = 1000
if len(test_dataset) > TEST_SUBSET_SIZE:
    test_dataset = (
        test_dataset
        .shuffle(seed=42)
        .select(range(TEST_SUBSET_SIZE))
    )
    print(f"\n--- Using a {TEST_SUBSET_SIZE}-sample subset for evaluation ---")

Loading test data from: C:\Users\Shashwat Kumar\Desktop\Labs\Optical Character Recognition\data\images\test_printed.parquet


Generating test split: 0 examples [00:00, ? examples/s]

Loaded 297308 total test samples.

Standardizing column schemas...
  - Renaming 'label' to 'text' in test set.
...Test data schema standardized.

--- Using a 1000-sample subset for evaluation ---


In [22]:
def predict_batch(batch):
    """
    Run OCR on one batch from ``Dataset.map(..., batched=True)``.

    Each entry of ``batch[IMAGE_DATA_COLUMN]`` may be raw ``bytes``, a dict
    with a ``'bytes'`` key, or a ``PIL.Image.Image``. Entries of any other
    type, and entries that fail to decode, cannot be sent to the model; they
    receive an empty string as their prediction. This keeps
    ``predicted_text`` at exactly one value per input row, aligned with the
    label column. Dropping such rows instead would leave the new column
    shorter than the batch.

    Args:
        batch: Mapping of column name to the list of values for this batch.

    Returns:
        The same ``batch`` with a ``"predicted_text"`` list added, one string
        per input row.
    """
    raw_images = batch[IMAGE_DATA_COLUMN]

    # Start with an empty prediction for every row; readable rows are
    # overwritten below, unreadable ones keep the placeholder.
    predictions = [""] * len(raw_images)
    images_to_process = []
    readable_positions = []

    # 1. Decode every image, remembering which row each decoded image came from
    for position, image_data in enumerate(raw_images):
        try:
            if isinstance(image_data, bytes):
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
            elif isinstance(image_data, dict) and 'bytes' in image_data:
                image = Image.open(io.BytesIO(image_data['bytes'])).convert("RGB")
            elif isinstance(image_data, Image.Image):
                image = image_data.convert("RGB")
            else:
                continue  # Unknown type: keep the empty placeholder
        except Exception:
            continue  # Corrupt image: keep the empty placeholder

        images_to_process.append(image)
        readable_positions.append(position)

    skipped = len(raw_images) - len(images_to_process)
    if skipped:
        print(f"  - Warning: {skipped} of {len(raw_images)} images in this batch "
              "could not be read; empty predictions recorded.")

    # 2-4. Preprocess, generate and decode, only if something is decodable
    # (the processor cannot be called with an empty image list).
    if images_to_process:
        # Inputs are moved to the same device as the model
        inputs = processor(images=images_to_process, return_tensors="pt", padding=True).to(device)
        generated_ids = model.generate(**inputs, max_length=64)
        decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        # Put each decoded string back at the row its image came from
        for position, text in zip(readable_positions, decoded_texts):
            predictions[position] = text

    # 5. Add predictions to the batch
    batch["predicted_text"] = predictions
    return batch

print("Evaluation function defined.")

Evaluation function defined.


In [None]:
# Batch size used for inference on the local machine.
# Drop it to 4 or 1 if a memory error occurs.
EVAL_BATCH_SIZE = 8

print(
    f"Running evaluation on {len(test_dataset)} samples "
    f"(Batch Size: {EVAL_BATCH_SIZE})..."
)

# Run 'predict_batch' over the whole dataset, EVAL_BATCH_SIZE rows at a time.
results_dataset = test_dataset.map(predict_batch, batched=True, batch_size=EVAL_BATCH_SIZE)

print("...Evaluation complete.")

# Pull out the two columns for calculation: labels and predictions.
model_predictions = results_dataset["predicted_text"]
ground_truth_labels = results_dataset[TEXT_LABEL_COLUMN]

Running evaluation on 1000 samples (Batch Size: 8)...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]