# Import Libraries

In [None]:
import os
# Configure PyTorch's CUDA allocator through its environment variable. It is assigned
# here, ahead of the later cells that import torch.
# NOTE(review): presumably intended to reduce fragmentation-related CUDA OOMs on the
# Colab GPU — confirm against the PyTorch memory-management notes.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Install the third-party packages used by this notebook (transformers, datasets and
# evaluate are imported in the next cell; tensorboard is the `report_to` target).
# NOTE(review): jiwer is presumably required by the `cer` metric loaded later — confirm.
# NOTE(review): versions are unpinned, so a re-run may pull different releases.
!pip install transformers datasets evaluate jiwer tensorboard



In [None]:
import pandas as pd
from datasets import Dataset, load_dataset, DatasetDict
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    TrainingArguments,
    Trainer
)
from PIL import Image
import os
import torch
import io
import evaluate

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the run-configuration cell derives every data
# and output path from /content/drive/MyDrive/.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch

# --- GPU Check ---
# Query CUDA availability once, then report either the missing-GPU warning or the
# name of device 0.
gpu_available = torch.cuda.is_available()
if not gpu_available:
    print(f"WARNING: GPU not available. Check Runtime > Change runtime type.")
else:
    print(f"✅ GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

✅ GPU is available!
Device Name: Tesla T4


# Run Configuration

In [None]:
print("--- RUN CONFIGURATION: PRINTED MODEL ---")

# --- Column names shared by the collator and the schema-standardisation cell ---
IMAGE_DATA_COLUMN = "image"
TEXT_LABEL_COLUMN = "text"

# --- Model checkpoint and parquet file names ---
MODEL_NAME = "microsoft/trocr-small-printed"
TRAIN_PARQUET = "printed_streaming.parquet"
VAL_PARQUET = "val_printed.parquet"

# --- Project root on the mounted Drive ---
DRIVE_PATH = "/content/drive/MyDrive/"

# Only report whether the Drive root is visible; the cell keeps going either way.
if os.path.exists(DRIVE_PATH):
    print(f"Found Google Drive at: {DRIVE_PATH}")
else:
    print(f"ERROR: Google Drive path not found: {DRIVE_PATH}")
    print("Please check the 'drive.mount' cell and your folder names in Google Drive.")

# --- Directories derived from the Drive root ---
PROCESSED_DATA_DIR = os.path.join(DRIVE_PATH, "ml_training/output/processed_data/")
IMAGES_BASE_DIR = os.path.join(DRIVE_PATH, "data/images/")
OUTPUT_DIR = os.path.join(DRIVE_PATH, "ml_training/output/printed_model/")

# --- Final dataset paths ---
# Note: the validation parquet is resolved under IMAGES_BASE_DIR, not PROCESSED_DATA_DIR.
TRAIN_PARQUET_PATH = os.path.join(PROCESSED_DATA_DIR, TRAIN_PARQUET)
VAL_PARQUET_PATH = os.path.join(IMAGES_BASE_DIR, VAL_PARQUET)

# --- Summary of the resolved configuration ---
print(f"Model: {MODEL_NAME}")
print(f"Output: {OUTPUT_DIR}")
print(f"Training data: {TRAIN_PARQUET_PATH}")
print(f"Validation data: {VAL_PARQUET_PATH}")

--- RUN CONFIGURATION: PRINTED MODEL ---
Found Google Drive at: /content/drive/MyDrive/
Model: microsoft/trocr-small-printed
Output: /content/drive/MyDrive/ml_training/output/printed_model/
Training data: /content/drive/MyDrive/ml_training/output/processed_data/printed_streaming.parquet
Validation data: /content/drive/MyDrive/data/images/val_printed.parquet


# Model and Data Loading

## Load Processor and Model

In [None]:
# --- Load Processor & Model ---
# Any failure (download, deserialisation, or the config tweaks) is reported and re-raised.
try:
    processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
    print(f"Processor and Model loaded from {MODEL_NAME}")

    # Set model config for fine-tuning: copy the special-token ids from the tokenizer
    # and the vocabulary size from the decoder sub-config onto the top-level config.
    _tokenizer = processor.tokenizer
    _model_config = model.config
    _model_config.decoder_start_token_id = _tokenizer.cls_token_id
    _model_config.pad_token_id = _tokenizer.pad_token_id
    _model_config.vocab_size = _model_config.decoder.vocab_size

except Exception as load_error:
    print(f"Error loading processor/model: {load_error}")
    raise

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream 

Processor and Model loaded from microsoft/trocr-small-printed


In [None]:
try:
    # Training split: columns 'image', 'text', 'source'.
    # Passing split= returns the Dataset itself rather than a DatasetDict.
    print(f"Loading training data from: {TRAIN_PARQUET_PATH}")
    train_files = {"train": TRAIN_PARQUET_PATH}
    train_dataset = load_dataset("parquet", data_files=train_files, split="train")

    # Validation split: columns 'image' (struct) and 'label'.
    print(f"Loading validation data from: {VAL_PARQUET_PATH}")
    val_files = {"validation": VAL_PARQUET_PATH}
    val_dataset = load_dataset("parquet", data_files=val_files, split="validation")

    print("Successfully loaded datasets separately.")
    print(f"\nRaw train dataset: {train_dataset}")
    print(f"Raw val dataset: {val_dataset}")

except Exception as load_error:
    print(f"Error loading Parquet files: {load_error}")
    raise

Loading training data from: /content/drive/MyDrive/ml_training/output/processed_data/printed_streaming.parquet
Loading validation data from: /content/drive/MyDrive/data/images/val_printed.parquet
Successfully loaded datasets separately.

Raw train dataset: Dataset({
    features: ['image', 'text', 'source'],
    num_rows: 100000
})
Raw val dataset: Dataset({
    features: ['image', 'label'],
    num_rows: 267578
})


In [None]:
# --- Standardize Column Names ---
# Bring both splits to the same two-column layout: the image column plus TEXT_LABEL_COLUMN.
print("\nStandardizing column schemas...")

# The validation parquet stores its transcription under 'label'; rename it only when
# the target column is not already present.
val_columns = val_dataset.column_names
needs_label_rename = 'label' in val_columns and TEXT_LABEL_COLUMN not in val_columns
if needs_label_rename:
    print("  - Renaming 'label' to 'text' in validation set.")
    val_dataset = val_dataset.rename_column('label', TEXT_LABEL_COLUMN)

# 'source' exists only in the training parquet, so drop it to match validation.
has_source_column = 'source' in train_dataset.column_names
if has_source_column:
    print("  - Removing 'source' column from training set for consistency.")
    train_dataset = train_dataset.remove_columns(['source'])

print("\n...Standardization complete.")
print(f"Cleaned train dataset features: {train_dataset.features}")
print(f"Cleaned val dataset features: {val_dataset.features}")


Standardizing column schemas...
  - Renaming 'label' to 'text' in validation set.
  - Removing 'source' column from training set for consistency.

...Standardization complete.
Cleaned train dataset features: {'image': Value('binary'), 'text': Value('string')}
Cleaned val dataset features: {'image': Image(mode=None, decode=True), 'text': Value('string')}


In [None]:
# --- Subsample Training and Validation Set ---
# Tunable subset sizes. The training subset keeps the FIRST rows (no shuffle), exactly
# as before; the validation subset is a seeded random sample.
TRAIN_SUBSET_SIZE = 10000
VAL_SUBSET_SIZE = 1000

# Clamp to the dataset length so a smaller parquet does not make select() raise,
# and compute the reported percentage instead of hard-coding it.
full_train_size = len(train_dataset)
train_subset_size = min(TRAIN_SUBSET_SIZE, full_train_size)
train_dataset = train_dataset.select(range(train_subset_size))
train_subset_pct = 100 * train_subset_size / full_train_size if full_train_size else 0
print(f"\n--- USING {train_subset_pct:.0f}% TRAINING SUBSET: {len(train_dataset)} samples ---")

if len(val_dataset) > VAL_SUBSET_SIZE:
    print(f"\nValidation set is very large ({len(val_dataset)}).")
    # Fixed seed keeps the evaluation subset identical across runs.
    val_dataset = val_dataset.shuffle(seed=42).select(range(VAL_SUBSET_SIZE))
    print(f"Using a random subset of {len(val_dataset)} samples for faster validation.")
else:
    print(f"\nUsing full validation set of {len(val_dataset)} samples.")


--- USING 10% TRAINING SUBSET: 10000 samples ---

Validation set is very large (267578).
Using a random subset of 1000 samples for faster validation.


In [None]:
# Define the On-the-Fly Data Collator

class OnTheFlyDataCollator:
    """Collate raw dataset rows into model inputs at batch time.

    Each row is expected to carry an image under ``IMAGE_DATA_COLUMN`` and its
    transcription under ``TEXT_LABEL_COLUMN``. Images are decoded lazily here, so
    the dataset itself never has to hold processed pixel tensors.

    Rows with empty/non-string text, an unrecognised image payload, or an image
    that fails to decode are skipped with a warning.

    Args:
        processor: Callable processor (a ``TrOCRProcessor`` in this notebook) that
            accepts ``images=``/``text=`` and exposes ``tokenizer.pad_token_id``.
    """

    def __init__(self, processor):
        self.processor = processor

    def _load_image(self, image_data):
        """Return an RGB PIL image for a supported payload, or None for an unknown type."""
        if isinstance(image_data, bytes):
            # train_dataset: 'image' is raw encoded bytes
            return Image.open(io.BytesIO(image_data)).convert("RGB")
        if isinstance(image_data, dict) and 'bytes' in image_data:
            # val_dataset (struct form): 'image' is {'bytes': ...}
            return Image.open(io.BytesIO(image_data['bytes'])).convert("RGB")
        if isinstance(image_data, Image.Image):
            # val_dataset (auto-decoded): 'image' is already a PIL object
            return image_data.convert("RGB")
        return None

    def __call__(self, batch_of_examples):
        """Build one batch of model inputs.

        Args:
            batch_of_examples: List of dataset rows (dicts).

        Returns:
            The processor output for the usable rows. When it contains ``labels``,
            padding positions are replaced by -100 so the loss ignores them.

        Raises:
            ValueError: If no row in the batch is usable. The Trainer cannot consume
                an empty batch, so this fails with an explicit message instead of an
                obscure processor error.
        """
        images_to_process = []
        text_to_process = []

        for example in batch_of_examples:
            try:
                image_data = example[IMAGE_DATA_COLUMN]
                text = example[TEXT_LABEL_COLUMN]

                # Skip rows without a usable transcription
                if not isinstance(text, str) or len(text.strip()) == 0:
                    continue

                image = self._load_image(image_data)
                if image is None:
                    print(f"Warning: Skipping sample with unknown image data type: {type(image_data)}")
                    continue

                images_to_process.append(image)
                text_to_process.append(text)

            except Exception as e:
                print(f"Warning: Skipping corrupt sample. Error: {e}")

        if not images_to_process:
            raise ValueError(
                f"All {len(batch_of_examples)} sample(s) in this batch were skipped "
                "(empty text, unknown image type, or undecodable image). "
                "Filter unusable rows out of the dataset or use a larger batch size."
            )

        # Process the entire batch at once
        model_inputs = self.processor(
            images=images_to_process,
            text=text_to_process,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt" # Return PyTorch tensors
        )

        # Labels are padded to max_length; without masking, most of the loss would come
        # from predicting pad tokens. -100 is the ignore index of the model's loss.
        pad_token_id = self.processor.tokenizer.pad_token_id
        if pad_token_id is not None and "labels" in model_inputs:
            labels = model_inputs["labels"]
            labels[labels == pad_token_id] = -100

        return model_inputs

# Build the collator instance that is handed to the Trainer, bound to the loaded processor.
on_the_fly_collator = OnTheFlyDataCollator(processor)
print("Initialized on-the-fly data collator.")

Initialized on-the-fly data collator.


In [None]:
# --- Define Metrics ---
# Load the character-error-rate metric once; report and re-raise if it cannot be loaded.

try:
    cer_metric = evaluate.load("cer")
except Exception as metric_error:
    print(f"Error loading CER metric: {metric_error}")
    raise
else:
    print("\nLoaded CER metric from 'evaluate' library.")

def compute_metrics(pred):
    """Compute the character error rate (CER) for one evaluation pass.

    Accepts the prediction formats the Trainer can produce:
      * a 2-D array of token ids (already arg-maxed or generated),
      * a 3-D array of logits ``(batch, seq, vocab)``,
      * a tuple whose first element is one of the above (the plain ``Trainer``
        returns every model output, with logits first).

    When logits are given, the ids are teacher-forced predictions aligned with
    the labels, so positions the labels mark as padding are blanked out before
    decoding. The resulting CER is therefore a proxy, not free-running generation CER.

    Args:
        pred: Object with ``predictions`` and ``label_ids`` attributes
            (``EvalPrediction`` when called by the Trainer).

    Returns:
        dict: ``{"cer": <float>}``.
    """
    import numpy as np  # local import: the notebook's import cell does not load numpy

    pad_id = processor.tokenizer.pad_token_id

    # Work on a copy so the caller's label array is not modified (-100 = ignored position).
    raw_labels = np.asarray(pred.label_ids)
    labels_ids = np.where(raw_labels == -100, pad_id, raw_labels)

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)

    if pred_ids.ndim == 3:
        # Logits -> most likely token at every position.
        pred_ids = pred_ids.argmax(axis=-1)
        if pred_ids.shape == labels_ids.shape:
            # Predictions past the end of the reference are meaningless under teacher forcing.
            pred_ids = np.where(labels_ids == pad_id, pad_id, pred_ids)

    # The Trainer pads gathered predictions with -100, which the tokenizer cannot decode.
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}


Loaded CER metric from 'evaluate' library.


In [None]:
# Training hyper-parameters, grouped by concern. Every value matches the previous configuration.
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir=os.path.join(OUTPUT_DIR, "trocr-small-checkpoints"),
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    report_to="tensorboard",

    # --- Schedule and batch shape ---
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,   # effective train batch of 8 samples per optimizer step
    max_grad_norm=1.0,

    # --- Memory / throughput ---
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    gradient_checkpointing=True,
    eval_accumulation_steps=16,
    dataloader_num_workers=2,
    remove_unused_columns=False,     # the collator reads the raw 'image'/'text' columns

    # --- Evaluation, checkpointing, logging cadence ---
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,

    # --- Best-checkpoint selection (lower CER is better) ---
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)


In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce per-step logits to token ids before the Trainer caches them.

    Without this hook the Trainer keeps the full ``(batch, seq, vocab)`` logits of
    every evaluation sample until the end of the pass, which is memory-prohibitive
    for a large vocabulary, and ``compute_metrics`` would receive logits rather
    than decodable ids.

    Args:
        logits: Model outputs for one evaluation step; either the logits tensor or a
            tuple whose first element is the logits tensor.
        labels: Label tensor for the same step (may contain -100 / pad positions).

    Returns:
        Tensor of shape ``(batch, seq)`` with the arg-max token id per position.
        Positions the labels mark as ignored or padding are set to the pad id, since
        teacher-forced predictions there carry no meaning. The resulting CER is a
        teacher-forced proxy, not free-running generation CER.
    """
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    token_ids = logits.argmax(dim=-1)
    if labels is None:
        return token_ids
    pad_id = processor.tokenizer.pad_token_id
    ignored = (labels == -100) | (labels == pad_id)
    return token_ids.masked_fill(ignored, pad_id)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=on_the_fly_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
 # Start Training

print("\n--- Starting Training ---")

# Run the fine-tuning loop; on failure, print the error and re-raise so the cell
# still stops with the full traceback.
try:
    trainer.train()
except Exception as train_error:
    print(f"\n--- ERROR during training ---")
    print(train_error)
    raise
else:
    print("--- Training Complete ---")


--- Starting Training ---


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
