In [1]:
import torch
import os
import pandas as pd
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig, AutoTokenizer,  ViTFeatureExtractor,ViTImageProcessor
from datasets import Dataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
image_processor =  ViTImageProcessor.from_pretrained("microsoft/trocr-base-handwritten")
tokenizer = AutoTokenizer.from_pretrained('microsoft/trocr-base-handwritten')

In [4]:
nepali_characters = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'क्ष', 'त्र', 'ज्ञ', '१', '२', '३', '४', '५', '६', '७', '८', '९', '०','्','ँ','ः','।','ा','ि','ी','ु','ू','े','ै','ो','ौ','◌ं','ृ','?',',']

In [5]:
tokenizer.add_tokens(nepali_characters)
# Save the updated tokenizer
tokenizer.save_pretrained('updated_tokenizer')

('updated_tokenizer\\tokenizer_config.json',
 'updated_tokenizer\\special_tokens_map.json',
 'updated_tokenizer\\vocab.json',
 'updated_tokenizer\\merges.txt',
 'updated_tokenizer\\added_tokens.json',
 'updated_tokenizer\\tokenizer.json')

In [6]:
processor = TrOCRProcessor(image_processor=image_processor, tokenizer=tokenizer)

In [7]:
config = VisionEncoderDecoderConfig.from_pretrained("microsoft/trocr-base-handwritten")
config.decoder_start_token_id = processor.tokenizer.pad_token_id
config.pad_token_id = processor.tokenizer.pad_token_id

In [8]:
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten", config=config)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model.decoder.resize_token_embeddings(len(tokenizer))

Embedding(50336, 1024)

In [10]:
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [11]:
def load_data(images_path, csv_path):
    data = pd.read_csv(csv_path)
    images = []
    texts = []
    for index, row in data.iterrows():
        image_path = os.path.join(images_path, row['image'])
        text = row['word']
        
        image = Image.open(image_path).convert("RGB")
        images.append(image)
        texts.append(text)
    
    return images, texts

In [12]:
train_images, train_texts = load_data("ocr_dataset", "ocr_dataset/final_label.csv")
val_images, val_texts = load_data("ocr_dataset", "ocr_dataset/val_label.csv")
train_encodings = processor(images=train_images, text=train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = processor(images=val_images, text=val_texts, padding=True, truncation=True, return_tensors="pt")
train_dataset = Dataset.from_dict({
    'pixel_values': train_encodings['pixel_values'],
    'labels': train_encodings['labels']
})

val_dataset = Dataset.from_dict({
    'pixel_values': val_encodings['pixel_values'],
    'labels': val_encodings['labels']
})

Unused or unrecognized kwargs: padding, truncation.
Unused or unrecognized kwargs: padding, truncation.


In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./nepali_models",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True  # Use 16-bit precision training if supported
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.726958
2,0.853000,1.674618
3,0.565600,1.673369
4,0.552900,1.647445
5,0.545600,1.689838
6,0.539300,1.629272
7,0.539300,1.625648
8,0.533200,1.607369
9,0.523500,1.602397
10,0.508500,1.604841


TrainOutput(global_step=4190, training_loss=0.5748614673113766, metrics={'train_runtime': 27682.1567, 'train_samples_per_second': 1.209, 'train_steps_per_second': 0.151, 'total_flos': 2.503762249072509e+19, 'train_loss': 0.5748614673113766, 'epoch': 10.0})

In [14]:
model_path = "./best_nepali_model"
trainer.save_model(model_path)