In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from torchvision import transforms
from datasets import Dataset
import evaluate
from PIL import Image
import os

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Dataset directories containing images
DATA_DIRS = {
    "train": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\train",
    "valid": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\valid",
    "test": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\test",
}

# Text annotation files
ANNOTATION_FILES = {
    "train": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\train.txt",
    "valid": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\valid.txt",
    "test": r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\split_dataset\test.txt",
}

# File containing all unique characters
CHAR_FILE = r"D:\Soy Vitou\GITHUB\DEEP-LEARNING\TEXT-RECOGNITION\DATASET-TESTING\annotation.txt"


In [3]:
def get_unique_chars(file_path):
    unique_chars = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split(" ", 1)
            if len(parts) == 2:
                unique_chars.update(parts[1])  # Extract characters from the text
    return sorted(unique_chars)

khmer_tokens = get_unique_chars(CHAR_FILE)
print(f"Total unique Khmer characters: {len(khmer_tokens)}")


Total unique Khmer characters: 14


In [4]:
# Load the TrOCR processor (pretrained model for OCR)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# Add new Khmer tokens to the tokenizer
new_token_count = processor.tokenizer.add_tokens(khmer_tokens)
print(f"Added {new_token_count} new tokens to tokenizer")

# Load TrOCR model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

# Resize embeddings to include new tokens
if new_token_count > 0:
    model.decoder.resize_token_embeddings(len(processor.tokenizer))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Added 14 new tokens to tokenizer


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

In [6]:
def transform_image(image_path):
    image = Image.open(image_path).convert("RGB")
    return processor(images=image, return_tensors="pt").pixel_values.squeeze()

def preprocess_data(df):
    images = []
    texts = []
    
    for _, row in df.iterrows():
        try:
            images.append(transform_image(row["image_path"]))
            texts.append(row["text"])
        except Exception as e:
            print(f"Error loading {row['image_path']}: {e}")
    
    return Dataset.from_dict({"image": images, "text": texts})

# Convert pandas DataFrame to Hugging Face Dataset format
train_dataset = preprocess_data(train_df)
valid_dataset = preprocess_data(valid_df)
test_dataset = preprocess_data(test_df)


In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr-khmer",
    eval_strategy="steps",
    save_steps=1000,
    eval_steps=200,
    logging_steps=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True if torch.cuda.is_available() else False,
    predict_with_generate=True,
)


In [8]:
from evaluate import load

cer_metric = load("cer")  # Character Error Rate (CER)

def compute_metrics(pred):
    pred_str = processor.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = processor.batch_decode(pred.label_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}


In [9]:
from transformers import default_data_collator

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Seq2SeqTrainer(


ValueError: You have to specify pixel_values