In [None]:
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import glob as glob
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import opendatasets as od
from torchsummary import summary

from PIL import Image
from zipfile import ZipFile
from tqdm.notebook import tqdm
from dataclasses import dataclass
from torch.utils.data import Dataset
from urllib.request import urlretrieve
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    XLMRobertaTokenizer,
    AutoTokenizer,
    default_data_collator,
    get_scheduler
)

In [None]:
# Mount Google Drive at /content/drive so checkpoints and the final model can be
# written there; force_remount=True is passed so an existing mount is refreshed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Fetch the Kaggle "cyrillic-handwriting-dataset" with opendatasets.
# NOTE(review): presumably this asks for Kaggle credentials and extracts into
# ./cyrillic-handwriting-dataset relative to the working directory — confirm
# that this matches DATA_ROOT.
od.download(
    "https://www.kaggle.com/datasets/constantinwerner/cyrillic-handwriting-dataset")

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# --- Run configuration -------------------------------------------------------
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 4
# Number of passes over the training subset.
EPOCHS = 10
# Learning rate handed to AdamW (and to the training arguments).
LEARNING_RATE = 5e-5
# Directory that holds train.tsv / test.tsv and the train/ and test/ image folders.
DATA_ROOT = '/content/cyrillic-handwriting-dataset'
# Pretrained checkpoint used for both the processor and the model.
MODEL_NAME = "microsoft/trocr-base-stage1"
# Character set of the dataset. NOTE(review): not referenced by any other cell
# visible in this notebook.
ALPHABET = " %(),-./0123456789:;?[]«»АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё"
# Encoder layers whose index is below this value are frozen before training.
freeze_up_to_layer = 10
# Gradient-accumulation factor used when computing the scheduler's step count.
accumulation_steps = 4

In [None]:
# The annotation files have no header row: the first column is the image file
# name and the second its transcription. Paths are built from DATA_ROOT so the
# dataset location is configured in exactly one place.
data_train = pd.read_csv(
    os.path.join(DATA_ROOT, 'train.tsv'),
    sep='\t',
    header=None,
    names=['file_name', 'text'],
)
data_test = pd.read_csv(
    os.path.join(DATA_ROOT, 'test.tsv'),
    sep='\t',
    header=None,
    names=['file_name', 'text'],
)

In [None]:
# Discard rows with a missing file name or transcription, then renumber the
# remaining rows from zero.
data_train = (
    data_train
    .dropna(subset=['file_name', 'text'])
    .reset_index(drop=True)
)
data_test = (
    data_test
    .dropna(subset=['file_name', 'text'])
    .reset_index(drop=True)
)

In [None]:
# Train on a subset only. `.loc` slices by label and includes both end points,
# so with the freshly reset 0..n-1 index this keeps the first 5000 rows.
data_train = data_train.loc[0:4999]
# Last expression of the cell: show the (truncated) frame.
data_train

Unnamed: 0,file_name,text
0,aa1.png,Молдова
1,aa1007.png,продолжила борьбу
2,aa101.png,разработанные
3,aa1012.png,Плачи
4,aa1013.png,Гимны богам
...,...,...
4995,ab6247.png,Ивол
4996,ab6248.png,ками
4997,ab6249.png,Есть перекрытия
4998,ab6250.png,чать


In [None]:
# Augmentation pipeline for training images: fixed resize followed by random
# rotation, colour jitter and blur.
# The pipeline deliberately ends with a PIL image (no ToTensor): the TrOCR
# processor does its own resize / rescale / normalise, and handing it a float
# tensor that is already scaled to [0, 1] leads to a second 1/255 rescale on
# current image processors.
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=.5, contrast=.5, saturation=.5, hue=.3),
    transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)),
])

In [None]:
class ClassyDataset(Dataset):
    """Image/transcription pairs prepared for a vision-encoder-decoder OCR model.

    Each item is a dict with:
      * ``pixel_values`` - the processor output for one image, batch axis removed;
      * ``labels`` - token ids of the transcription, padded or truncated to
        ``max_target_length``, with padding positions replaced by -100.

    Args:
        root_dir: Directory that contains the image files.
        df: DataFrame with ``file_name`` and ``text`` columns.
        processor: Callable image processor that also exposes ``.tokenizer``.
        train_transforms: Callable applied to the RGB image before the processor.
        max_target_length: Fixed length of every label sequence.
    """

    def __init__(self, root_dir, df, processor, train_transforms, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length
        self.train_transforms = train_transforms

    def __len__(self):
        """Return the number of rows in the annotation frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            A dict with ``pixel_values`` and ``labels`` tensors, or ``None``
            when the image cannot be read or processed (the failure is
            printed). Callers are expected to filter out ``None`` samples.
        """
        # Positional lookup: keeps working when the frame was filtered or
        # sliced and no longer carries a default 0..n-1 index.
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]

        try:
            image_path = os.path.join(self.root_dir, file_name)
            image = Image.open(image_path).convert('RGB')

            image = self.train_transforms(image)
            pixel_values = self.processor(image, return_tensors='pt').pixel_values

            # Pad AND truncate to a fixed length. Without truncation a long
            # transcription yields a longer label list than its neighbours
            # and the batch can no longer be stacked.
            labels = self.processor.tokenizer(
                text,
                padding='max_length',
                max_length=self.max_target_length,
                truncation=True
            ).input_ids

            # Replace padding token ID with -100 for loss masking
            pad_token_id = self.processor.tokenizer.pad_token_id
            labels = [-100 if token == pad_token_id else token for token in labels]

            return {
                # Drop only the leading batch axis added by the processor.
                "pixel_values": pixel_values.squeeze(0),
                "labels": torch.tensor(labels)
            }

        except FileNotFoundError as e:
            print(f"FileNotFoundError at index {idx}: {e}")
            return None

        except Exception as e:
            # Best effort: a single unreadable sample must not stop training.
            print(f"Unexpected error at index {idx}: {e}")
            return None

In [None]:
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)

# Validation images get the same fixed resize as the training images but none
# of the random augmentations (rotation, colour jitter, blur), so the reported
# CER is deterministic and reflects clean inputs. The output is a PIL image,
# which the processor accepts directly.
eval_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
])

train_dataset = ClassyDataset(
    root_dir=os.path.join(DATA_ROOT, 'train/'),
    df=data_train,
    processor=processor,
    train_transforms=train_transforms
)
valid_dataset = ClassyDataset(
    root_dir=os.path.join(DATA_ROOT, 'test/'),
    df=data_test,
    processor=processor,
    train_transforms=eval_transforms
)

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the pretrained encoder-decoder checkpoint and place it on the selected
# device in one step (`.to` returns the module itself).
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME).to(device)
# Last expression of the cell: print the module tree.
model

Downloading:   0%|          | 0.00/4.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [None]:
def get_layers(model: torch.nn.Module):
    """Return the leaf modules of ``model`` in depth-first order.

    A module without children is its own single leaf; otherwise the leaves of
    each child are concatenated in child order.
    """
    submodules = list(model.children())
    if not submodules:
        return [model]

    leaves = []
    for submodule in submodules:
        leaves.extend(get_layers(submodule))
    return leaves

# Display the flat list of leaf modules of the loaded model (long output).
get_layers(model)


[Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16)),
 Dropout(p=0.0, inplace=False),
 Linear(in_features=768, out_features=768, bias=False),
 Linear(in_features=768, out_features=768, bias=False),
 Linear(in_features=768, out_features=768, bias=False),
 Dropout(p=0.0, inplace=False),
 Linear(in_features=768, out_features=768, bias=True),
 Dropout(p=0.0, inplace=False),
 Linear(in_features=768, out_features=3072, bias=True),
 GELUActivation(),
 Linear(in_features=3072, out_features=768, bias=True),
 Dropout(p=0.0, inplace=False),
 LayerNorm((768,), eps=1e-12, elementwise_affine=True),
 LayerNorm((768,), eps=1e-12, elementwise_affine=True),
 Linear(in_features=768, out_features=768, bias=False),
 Linear(in_features=768, out_features=768, bias=False),
 Linear(in_features=768, out_features=768, bias=False),
 Dropout(p=0.0, inplace=False),
 Linear(in_features=768, out_features=768, bias=True),
 Dropout(p=0.0, inplace=False),
 Linear(in_features=768, out_features=3072, bias=True),
 GELUAc

In [None]:
# Parameter count over every tensor registered on the model.
total_params = sum(map(torch.numel, model.parameters()))
print(f"{total_params:,} total parameters.")
# Same count restricted to tensors that will receive gradients.
total_trainable_params = sum(
    map(torch.numel, filter(lambda tensor: tensor.requires_grad, model.parameters()))
)
print(f"{total_trainable_params:,} training parameters.")

384,864,768 total parameters.
384,864,768 training parameters.


In [None]:
# Freeze every encoder layer whose index is below `freeze_up_to_layer`;
# the remaining layers stay trainable.
for position, block in enumerate(model.encoder.encoder.layer):
    if position >= freeze_up_to_layer:
        continue
    # Module.requires_grad_ switches the flag on all parameters of the layer.
    block.requires_grad_(False)

In [None]:
# Recount the trainable parameters now that part of the encoder is frozen.
total_trainable_params = sum(
    map(torch.numel, filter(lambda tensor: tensor.requires_grad, model.parameters()))
)
print(f"{total_trainable_params:,} training parameters.")

314,009,088 training parameters.


In [None]:
# Special-token ids come from the tokenizer and the vocabulary size from the
# decoder config; the remaining entries are generation settings stored on the
# model config. Values are applied in the listed order.
for option, setting in {
    "decoder_start_token_id": processor.tokenizer.cls_token_id,
    "pad_token_id": processor.tokenizer.pad_token_id,
    "vocab_size": model.config.decoder.vocab_size,
    "eos_token_id": processor.tokenizer.sep_token_id,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}.items():
    setattr(model.config, option, setting)

In [None]:
# AdamW over the parameters that still require gradients; frozen encoder
# layers are left out of the optimizer entirely.
optimizer = optim.AdamW(
    params=list(filter(lambda tensor: tensor.requires_grad, model.parameters())),
    weight_decay=0.0005,
    lr=LEARNING_RATE,
)

In [None]:
# Character-error-rate metric loaded through the `evaluate` library; the
# evaluation callback below calls its `compute` method.
cer_metric = evaluate.load('cer')


def compute_cer(pred):
    """Compute the character error rate for one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids with -100 at ignored
            positions).

    Returns:
        ``{"cer": value}`` as produced by ``cer_metric.compute``.

    Neither ``pred.predictions`` nor ``pred.label_ids`` is modified: the -100
    placeholders are replaced on copies so the caller's arrays stay intact.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some configurations return (ids, extras); only the ids are decoded.
        pred_ids = pred_ids[0]

    # -100 is not a valid token id and cannot be decoded. It marks ignored
    # label positions and can also pad predictions gathered from batches of
    # different lengths, so map it to the pad token in both arrays.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_token_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    # Print a handful of examples for a quick qualitative check.
    print(f"Predicted: {pred_str[:5]}, Actual: {label_str[:5]}")
    return {"cer": cer}

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

In [None]:
# Trainer configuration: generate during evaluation, evaluate every 500
# optimizer steps, log and checkpoint once per epoch to Google Drive.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='steps',
    eval_steps=500,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Mixed precision needs a CUDA device; fall back to full precision on the
    # CPU path that `device` allows instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    output_dir='/content/drive/MyDrive',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    report_to='tensorboard',
    num_train_epochs=EPOCHS,
    # Same constant the scheduler step count is derived from, so the two
    # cannot drift apart.
    gradient_accumulation_steps=accumulation_steps,
    # NOTE(review): the optimizer is built separately and handed to the
    # trainer, so this value presumably only matters if that changes — confirm.
    learning_rate=LEARNING_RATE,
)

In [None]:
# Optimizer updates for the whole run: full effective batches per epoch
# (batch size x accumulation steps) multiplied by the number of epochs.
num_training_steps = (len(train_dataset) // (BATCH_SIZE * accumulation_steps)) * EPOCHS
# Linear warm-up over the first tenth of those updates.
num_warmup_steps = int(num_training_steps * 0.1)
scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=num_warmup_steps,
)

In [None]:
def custom_data_collator(features):
    """Collate a batch after discarding samples that are ``None``.

    The remaining samples are passed, in their original order, to
    ``default_data_collator``.
    """
    usable = []
    for sample in features:
        if sample is None:
            continue
        usable.append(sample)
    return default_data_collator(usable)

# Assemble the trainer from the pieces defined above.
# - `tokenizer` receives the processor's feature extractor; the training log
#   shows that this is what gets written alongside each checkpoint.
# - The externally built AdamW optimizer and linear scheduler are supplied
#   through `optimizers`.
# - `custom_data_collator` removes samples the dataset returned as None.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=custom_data_collator,
    optimizers=(optimizer, scheduler)
)

Using amp half precision backend


In [None]:
# Run fine-tuning; `res` keeps whatever `trainer.train()` returns.
res = trainer.train()



***** Running training *****
  Num examples = 5000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 3120


Step,Training Loss,Validation Loss,Cer
500,3.2657,2.577405,0.699271
1000,0.8894,2.310483,0.672963
1500,0.7011,2.476035,0.630869
2000,0.496,2.305044,0.610631
2500,0.3768,2.465654,0.603278
3000,0.3279,2.230413,0.581962


Saving model checkpoint to /content/drive/MyDrive/checkpoint-312
Configuration saved in /content/drive/MyDrive/checkpoint-312/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-312/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-312/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'остань', 'пое', 'Отихою', '1 киаса'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-624
Configuration saved in /content/drive/MyDrive/checkpoint-624/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-624/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-624/preprocessor_config.json
Saving model checkpoint to /content/drive/MyDrive/checkpoint-936
Configuration saved in /content/drive/MyDrive/checkpoint-936/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-936/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-936/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'остагось', 'пого', 'Отичкое', '1 кого'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-1248
Configuration saved in /content/drive/MyDrive/checkpoint-1248/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-1248/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-1248/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'осталовь', 'поеф', 'Стихные', 'К клосо'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-1560
Configuration saved in /content/drive/MyDrive/checkpoint-1560/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-1560/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-1560/preprocessor_config.json
Saving model checkpoint to /content/drive/MyDrive/checkpoint-1872
Configuration saved in /content/drive/MyDrive/checkpoint-1872/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-1872/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-1872/preprocessor_config.json
Deleting older checkpoint [/content/drive/MyDrive/checkpoint-312] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'остаюсь', 'пое', 'Отнююе', '1 класса'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-2184
Configuration saved in /content/drive/MyDrive/checkpoint-2184/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-2184/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-2184/preprocessor_config.json
Saving model checkpoint to /content/drive/MyDrive/checkpoint-2496
Configuration saved in /content/drive/MyDrive/checkpoint-2496/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-2496/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-2496/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'могато', 'поча', 'Оппическое', '1 класо'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-2808
Configuration saved in /content/drive/MyDrive/checkpoint-2808/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-2808/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-2808/preprocessor_config.json
Deleting older checkpoint [/content/drive/MyDrive/checkpoint-1248] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1544
  Batch size = 4


Predicted: ['ибо', 'осталось', 'поеф', 'Опическ', '1 класса'], Actual: ['ибо', 'осталось', 'поле', 'Оптическое', '1 класса']


Saving model checkpoint to /content/drive/MyDrive/checkpoint-3120
Configuration saved in /content/drive/MyDrive/checkpoint-3120/config.json
Model weights saved in /content/drive/MyDrive/checkpoint-3120/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/checkpoint-3120/preprocessor_config.json
Deleting older checkpoint [/content/drive/MyDrive/checkpoint-1560] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
# Persist the results to Google Drive.
# torch.save on the module pickles the whole model object (not just a
# state_dict), so the loading side must unpickle it the same way.
torch.save(model, '/content/drive/MyDrive/model.pth')
# The processor is written in the Hugging Face directory format.
processor.save_pretrained('/content/drive/MyDrive/processor')

Feature extractor saved in /content/drive/MyDrive/processor/preprocessor_config.json
tokenizer config file saved in /content/drive/MyDrive/processor/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/processor/special_tokens_map.json


In [None]:
# Reload the fine-tuned model and its processor for inference.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The file holds a fully pickled model object, so:
# - map_location places the tensors on the current device; without it a
#   GPU-saved model cannot be loaded on a CPU-only runtime;
# - weights_only=False is required because recent PyTorch releases default to
#   weights-only loading, which rejects a pickled module.
# NOTE(security): full unpickling executes arbitrary code from the file — only
# load checkpoints that you created yourself.
model = torch.load(
    '/content/drive/MyDrive/model.pth',
    map_location=device,
    weights_only=False,
).to(device)
processor = TrOCRProcessor.from_pretrained('/content/drive/MyDrive/processor')

loading feature extractor configuration file /content/drive/MyDrive/processor/preprocessor_config.json
Feature extractor ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "processor_class": "TrOCRProcessor",
  "resample": 2,
  "size": 384
}

Didn't find file /content/drive/MyDrive/processor/added_tokens.json. We won't load it.
loading file /content/drive/MyDrive/processor/vocab.json
loading file /content/drive/MyDrive/processor/merges.txt
loading file /content/drive/MyDrive/processor/tokenizer.json
loading file None
loading file /content/drive/MyDrive/processor/special_tokens_map.json
loading file /content/drive/MyDrive/processor/tokenizer_config.json


In [None]:
#def preprocess_image(image_path):
    #image = Image.open(image_path).convert('RGB')
    #transform = transforms.Compose([
        #transforms.Resize((224, 224)),
        #transforms.ToTensor()
    #])
    #image = transform(image)
    #return image

#def get_text_from_image(image_path):
    #image = preprocess_image(image_path)
    #pixel_values = processor(images=[image], return_tensors='pt').pixel_values.to(device)

    #model.eval()
    #with torch.no_grad():
        #output = model.generate(pixel_values=pixel_values)

    #decoded_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    #return decoded_text


In [None]:
#image_path = '/content/drive/MyDrive/image.png'
#
#text_output = get_text_from_image(image_path)
#print("Predicted Text:", text_output)