# Transformer for letter recognition
Fine-tuning ViT for handwritten letter recognition.

About:

1. [ViT paper](https://arxiv.org/pdf/2010.11929.pdf), [ViT Github](https://github.com/google-research/vision_transformer), [ViT on huggingface🤗](https://huggingface.co/docs/transformers/model_doc/vit)


## Imports

In [2]:
!pip3 install transformers tokenizers datasets evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m81.9/84.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m81.9/84.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m647.4 kB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import random
import warnings
import shutil

import zipfile
from google.colab import drive

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from PIL import Image

import datasets
from datasets import load_dataset, ClassLabel

from transformers import ViTFeatureExtractor
from transformers import ViTForImageClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

import evaluate

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing

import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader


warnings.filterwarnings("ignore")
%matplotlib inline

In [4]:
# Create the working directory (a no-op when it already exists) and make it the
# current directory; the last expression echoes the resulting absolute path.
os.makedirs('/content/data', exist_ok=True)
os.chdir('/content/data')
os.path.abspath('.')

'/content/data'

In [5]:
# Mount Google Drive and unpack the dataset archive into the working directory.
# (`drive`, `os` and `zipfile` are imported in the notebook's import cell.)
drive.mount('/content/drive')

archive_path = '/content/drive/MyDrive/part_1.zip'
if os.path.exists(archive_path):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('/content/data')
elif not os.path.isdir('/content/data/part_1'):
    # Fail here with a clear message instead of later, when the CSV inside
    # /content/data/part_1 cannot be read.
    raise FileNotFoundError(
        f"Dataset archive not found at {archive_path} and no extracted copy "
        "exists in /content/data/part_1"
    )

Mounted at /content/drive


In [6]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [7]:
# Load the `accuracy` metric through the `evaluate` library; `compute_metrics` below calls it.
# NOTE(review): the second positional argument ('multiclass') looks like a metric configuration
# name — confirm it is required for this metric.
accuracy = evaluate.load('accuracy', 'multiclass')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and, when CUDA is available, every CUDA device), and
    asks cuDNN for deterministic kernels so repeated runs give the same numbers.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter cannot be changed after start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Seeds the CPU generator, which the original version left unseeded.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Reproducibility requires the autotuner off and deterministic kernels on.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

In [9]:
# Seed all generators with the default seed (42) before the data is split and the model is created.
seed_everything()

In [10]:
# Root directory that the relative paths stored in the CSV are resolved against.
line_name = "/content/data"

# One row per letter image: path, the letter itself and the word it belongs to.
df = pd.read_csv('/content/data/part_1/all_files_final.csv', delimiter=',')

# Drop the first character of every stored path and prepend the data root.
# NOTE(review): this assumes every stored path starts with a single leading character
# (presumably '.') that must be stripped — confirm against the CSV.
full_paths = [f"{line_name}{path[1:]}" for path in df["new_path"]]
df["new_path"] = full_paths

# Keep only the rows whose label is exactly one Cyrillic letter.
df = df.dropna(subset=['letter'])
is_cyrillic_letter = df['letter'].str.fullmatch(r'[А-Яа-яЁё]')
df = df[is_cyrillic_letter]
df.head()

Unnamed: 0,id,letter_position,category,letter,new_path,word_id,word_true
0,0,0,surnames,б,/content/data/part_1/surnames/baulina_0/b_0.jpg,0,баулина
1,1,1,surnames,а,/content/data/part_1/surnames/baulina_0/a_1.jpg,0,баулина
2,2,2,surnames,у,/content/data/part_1/surnames/baulina_0/u_2.jpg,0,баулина
3,3,3,surnames,л,/content/data/part_1/surnames/baulina_0/l_3.jpg,0,баулина
4,4,4,surnames,и,/content/data/part_1/surnames/baulina_0/i_4.jpg,0,баулина


There is a known [issue](https://github.com/huggingface/transformers/issues/21638) with image sizes. Let's use `PIL` to open the images and then convert them to RGB, as suggested in [this discussion](https://stackoverflow.com/questions/75168665/unsupported-number-of-image-dimensions-while-using-image-utils-from-transforme).

In [11]:
class LettersDataset(Dataset):
    """Dataset that serves one letter image per row of a pandas DataFrame.

    Every row must provide a ``new_path`` column (path of the image file)
    and a ``labels`` column (encoded class id).
    """

    def __init__(self, data, feature_extractor, transform=None):
        """Keep references to the table, the feature extractor and the transform.

        Args:
            data: DataFrame with ``new_path`` and ``labels`` columns.
            feature_extractor: Object handed to ``transform`` as its third argument.
            transform: Optional callable ``transform(image, label, feature_extractor)``;
                when set, its return value is what ``__getitem__`` returns.
        """
        self.feature_extractor = feature_extractor
        self.transform = transform
        self.data = data

    def __len__(self):
        """Return the number of rows (examples) in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Open the image of row ``idx`` and return it together with its label.

        Returns:
            ``transform(image, label, feature_extractor)`` when a transform is
            set, otherwise the tuple ``(image, label)``.
        """
        row = self.data.iloc[idx]
        image = Image.open(row.new_path)
        label = row.labels
        if not self.transform:
            return image, label
        return self.transform(image, label, self.feature_extractor)

In [12]:
# Filtering: keep only the rows whose image file actually exists on disk.
file_exists = df['new_path'].apply(os.path.exists)
df = df[file_exists].reset_index(drop=True)

# Encode every letter as an integer class id, stored in the `labels` column.
le = preprocessing.LabelEncoder()
df['labels'] = le.fit_transform(df['letter'])

# Hold out 20% of the rows for testing. Rows are individual letter images,
# so the letters of one word may end up in both splits.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

In [14]:
# Preview the table again to check the newly added `labels` column.
df.head()

Unnamed: 0,id,letter_position,category,letter,new_path,word_id,word_true,labels
0,0,0,surnames,б,/content/data/part_1/surnames/baulina_0/b_0.jpg,0,баулина,1
1,1,1,surnames,а,/content/data/part_1/surnames/baulina_0/a_1.jpg,0,баулина,0
2,2,2,surnames,у,/content/data/part_1/surnames/baulina_0/u_2.jpg,0,баулина,19
3,3,3,surnames,л,/content/data/part_1/surnames/baulina_0/l_3.jpg,0,баулина,11
4,4,4,surnames,и,/content/data/part_1/surnames/baulina_0/i_4.jpg,0,баулина,8


In [15]:
# Letters in encoder order: the position of a letter in this array is its integer class id.
# Note that 'ё' comes last in this ordering (after 'я'), as the output below shows.
le.classes_

array(['а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м',
       'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ',
       'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё'], dtype=object)

## ViT Model

In [19]:
# Name of the pretrained checkpoint to fine-tune, and the preprocessing object
# loaded from that same checkpoint (it produces the `pixel_values` fed to the model).
vit_model_name = 'google/vit-base-patch16-224'
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_name)

In [20]:
def process_example(image, label, feature_extractor):
    """Turn one (image, label) pair into the inputs expected by the model.

    Args:
        image: Image object exposing ``convert``; it is converted to RGB first.
        label: Class id stored under the ``labels`` key of the result.
        feature_extractor: Callable applied to the RGB image with
            ``return_tensors='pt'``.

    Returns:
        The feature extractor's output with an added ``labels`` entry.
    """
    rgb_image = image.convert('RGB')
    encoded = feature_extractor(rgb_image, return_tensors='pt')
    encoded['labels'] = label
    return encoded

In [36]:
# Wrap both splits in LettersDataset; `process_example` is applied to every
# (image, label) pair on access, using the shared feature extractor.
train_dataset = LettersDataset(train_df, vit_feature_extractor, transform=process_example)
test_dataset = LettersDataset(test_df, vit_feature_extractor, transform=process_example)

In [22]:
# Shape of a single preprocessed item. The leading dimension of size 1 is the
# batch axis added by the feature extractor; `collate_fn` removes it later.
train_dataset[0]['pixel_values'].shape  # batch_size, num_channels, height, width

torch.Size([1, 3, 224, 224])

In [23]:
def collate_fn(batch):
    """Merge a list of dataset items into one batch dictionary.

    Args:
        batch: List of mappings, each with a ``pixel_values`` tensor whose
            first axis has size 1 and a scalar ``labels`` entry.

    Returns:
        Dict with ``pixel_values`` (items stacked along a new first axis after
        the per-item leading axis is dropped) and ``labels`` (1-D tensor).
    """
    images = []
    targets = []
    for example in batch:
        # Index 0 drops the per-item batch axis before stacking.
        images.append(example['pixel_values'][0])
        targets.append(example['labels'])
    return {
        'pixel_values': torch.stack(images, 0),
        'labels': torch.tensor(targets),
    }

In [24]:
# Sanity check: collate the first three training items and print the batch shapes.
batch = [train_dataset[i] for i in range(3)]
collated = collate_fn(batch)
for key in ('pixel_values', 'labels'):
    print(collated[key].shape)

torch.Size([3, 3, 224, 224])
torch.Size([3])


In [25]:
# Distinct encoded class ids present in the data; their count sizes the classifier head.
labels = df.labels.unique()

# String-keyed mappings between class ids and letters (later cells look classes up with str(id)).
id2label = {str(idx): letter for idx, letter in enumerate(le.classes_)}  # to convert id of class to real label
label2id = {letter: idx_str for idx_str, letter in id2label.items()}

# Load the pretrained checkpoint; its classifier head has a different number of
# outputs, so mismatched shapes are ignored and the head is newly initialised.
model = ViTForImageClassification.from_pretrained(
    vit_model_name,
    num_labels=labels.shape[0],
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([33]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([33, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Distinct encoded class ids found in `df` (their count was passed as `num_labels` above).
labels

array([ 1,  0, 19, 11,  8, 13, 18, 28, 31, 14,  5, 10, 20, 16,  2,  3, 12,
       17, 30, 24, 22,  7, 32,  4, 15,  6, 23, 27, 21, 25,  9, 29, 26])

In [27]:
# Move the model to the selected device; the cell output is the module's printed structure.
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [28]:
def compute_metrics(p):
    """Compute accuracy for a prediction object.

    Args:
        p: Object with ``predictions`` (per-class scores, classes on axis 1)
            and ``label_ids`` (reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=p.label_ids)

In [None]:
# Hyper-parameters of the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./vit-base-letters",  # checkpoints are written here
    per_device_train_batch_size=64,
    num_train_epochs=4,
    # Mixed precision needs a CUDA device; use full precision on a CPU-only
    # runtime instead of failing (the notebook already falls back to CPU for `device`).
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    learning_rate=2e-4,
    # Save a checkpoint every 200 steps and keep only the two most recent ones.
    save_total_limit=2,
    save_strategy='steps',
    save_steps=200,
    # The dataset yields ready-made model inputs, so no column pruning should be applied.
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
)


In [None]:
from torch import nn, exp

class Fl_Trainer(Trainer):
    """``Trainer`` that optimises a focal loss instead of the model's own loss.

    For every example the loss is ``alpha * (1 - p_t) ** gamma * CE``, where
    ``CE`` is the example's cross-entropy and ``p_t = exp(-CE)`` is the
    probability the model assigns to the true class. The result is averaged
    over the examples of the batch.
    """

    def __init__(self, alpha=0.25, gamma=2, **kwargs):
        """Create the trainer.

        Args:
            alpha: Constant factor applied to every example's loss.
            gamma: Focusing exponent; larger values down-weight examples that
                are already classified with high confidence.
            **kwargs: Forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(**kwargs)
        self.alpha = alpha
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the focal loss of the batch.

        Args:
            model: Model to call with ``inputs`` (after ``labels`` is removed).
            inputs: Mapping of model inputs; must contain ``labels``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by the training loop; ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Keep only the examples with a real target (-100 marks an ignored label).
        mask = (labels != -100)
        valid_logits = logits[mask]  # (num_valid_examples, num_classes)
        valid_labels = labels[mask]  # (num_valid_examples,)

        if valid_labels.numel() == 0:
            # Nothing to learn from: return a zero that is still attached to the
            # graph, because the mean of an empty tensor would be NaN.
            focal_loss = logits.sum() * 0.0
        else:
            # Focal loss computed on the valid examples only.
            ce_loss = nn.CrossEntropyLoss(reduction='none')(valid_logits, valid_labels)
            pt = exp(-ce_loss)
            focal_loss = (self.alpha * (1 - pt)**self.gamma * ce_loss).mean()

        return (focal_loss, outputs) if return_outputs else focal_loss


# Build the focal-loss trainer with its default alpha and gamma.
trainer = Fl_Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # Without an evaluation set `compute_metrics` could never run; registering the
    # held-out split makes `trainer.evaluate()` usable. No evaluation strategy is
    # configured in `training_args`, so training itself is unaffected.
    eval_dataset=test_dataset,
    # The feature extractor is passed in the tokenizer slot.
    tokenizer=vit_feature_extractor,
)

In [None]:
# Resume only when a previous run left a `checkpoint-<step>` folder in the output
# directory; asking to resume without one makes `train` raise on a first run.
output_dir = training_args.output_dir
has_checkpoint = os.path.isdir(output_dir) and any(
    name.startswith('checkpoint-') and name[len('checkpoint-'):].isdigit()
    for name in os.listdir(output_dir)
)
trainer.train(resume_from_checkpoint=has_checkpoint)

# Persist the fine-tuned model to Google Drive for the evaluation section.
trainer.save_model('/content/drive/MyDrive/models/vit-base-letters_2')

Step,Training Loss
4700,0.0046
4800,0.0048
4900,0.0054
5000,0.0054
5100,0.0044
5200,0.0046
5300,0.0051
5400,0.0049
5500,0.0042
5600,0.0051


### Evaluation

In [16]:
# Directory the fine-tuned model was saved to by `trainer.save_model(...)` in the training section.
new_vit_model_name = '/content/drive/MyDrive/models/vit-base-letters_2'
# The preprocessing object is re-created from the base checkpoint, the same one used for training.
vit_model_name = 'google/vit-base-patch16-224'
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_name)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [17]:
# Distinct encoded class ids; their count must match the classifier head of the saved model.
labels = df.labels.unique()

# Load the fine-tuned checkpoint. `ignore_mismatched_sizes` is deliberately NOT set here:
# if the number of classes in `df` differs from the saved head, loading should fail
# loudly rather than silently replace the trained classifier with a fresh one.
new_model = ViTForImageClassification.from_pretrained(
    new_vit_model_name,
    num_labels=labels.shape[0],
    id2label={str(i): c for i, c in enumerate(le.classes_)},  # to convert id of class to real label
    label2id={c: str(i) for i, c in enumerate(le.classes_)},
)

In [29]:
# sample: classify one test image and compare the prediction with its true label
sample = test_dataset[0]  # fetched once, so the image is opened and preprocessed a single time

# Run on whatever device the model currently lives on, without tracking gradients.
model_device = next(new_model.parameters()).device
with torch.no_grad():
    outputs = new_model(sample['pixel_values'].to(model_device))
logits = outputs.logits
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", new_model.config.id2label[str(predicted_class_idx)])
print('Actual class:', new_model.config.id2label[str(sample['labels'])])

Predicted class: а
Actual class: а


In [45]:
# Move the fine-tuned model to the selected device; the cell output is the module's printed structure.
new_model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [47]:
# `torch`, `DataLoader` and `tqdm` come from the notebook's import cell, and the batch
# collation reuses `collate_fn` defined in the "ViT Model" section, so nothing is
# re-imported or redefined here.

# Select the GPU when available and switch the model to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
new_model.to(device)
new_model.eval()

# Iterate over the test split in its original order (shuffle=False), so that the
# i-th collected prediction belongs to the i-th row of `test_df`.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Collected results, stored as letters rather than class ids.
actual_labels = []
predicted_labels = []
id_to_letter = new_model.config.id2label  # keyed by the class id as a string

# Predictions
with torch.no_grad():
    for batch in tqdm(test_loader):
        pixel_values = batch['pixel_values'].to(device)   # [B, 3, 224, 224]
        # Named `batch_labels` so the notebook-level `labels` array is not overwritten.
        batch_labels = batch['labels']                    # [B]

        outputs = new_model(pixel_values)
        predicted_indices = outputs.logits.argmax(dim=-1).cpu().tolist()

        predicted_labels.extend(id_to_letter[str(i)] for i in predicted_indices)
        actual_labels.extend(id_to_letter[str(i)] for i in batch_labels.tolist())


100%|██████████| 937/937 [06:26<00:00,  2.42it/s]


In [48]:
# Number of collected predictions (one per item served by `test_loader`).
len(predicted_labels)

29961

In [49]:
# Per-letter precision, recall and F1 on the test split (printed because the report is a formatted string).
print(classification_report(actual_labels, predicted_labels))

              precision    recall  f1-score   support

           а       0.99      0.99      0.99      4975
           б       0.98      0.98      0.98       451
           в       0.99      0.99      0.99      1562
           г       0.98      0.98      0.98       494
           д       0.99      0.97      0.98       710
           е       0.99      0.99      0.99      1935
           ж       0.98      0.98      0.98       126
           з       0.99      0.99      0.99       253
           и       0.95      0.98      0.97      2411
           й       0.93      0.76      0.84       212
           к       0.99      1.00      0.99      1667
           л       0.99      0.98      0.98      1695
           м       0.99      0.98      0.98       821
           н       0.98      0.98      0.98      2402
           о       0.99      0.98      0.99      2319
           п       0.97      0.97      0.97       429
           р       0.99      0.99      0.99      1722
           с       0.98    

In [50]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

# The i-th collected label must belong to the i-th row of `test_df`; this holds only
# when the predictions were produced over the whole test split without shuffling.
assert len(actual_labels) == len(predicted_labels) == len(test_df), \
    "predictions are not aligned with test_df"

# word_id -> {letter_position: letter}, for the ground truth and for the predictions.
# Only letters that landed in the test split are present, so each "word" here is the
# subset of a word's letters found in `test_df`, not necessarily the complete word.
words_true, words_pred = defaultdict(dict), defaultdict(dict)
for i, (w, p) in enumerate(zip(test_df['word_id'], test_df['letter_position'])):
    words_true[w][p], words_pred[w][p] = actual_labels[i], predicted_labels[i]


def assemble(d):
    """Join the letters of one word in ascending order of letter position."""
    return ''.join(v for _, v in sorted(d.items()))


true_words = [assemble(d) for d in words_true.values()]
pred_words = [assemble(d) for d in words_pred.values()]

# A word counts as correct only when every one of its test letters was predicted correctly.
print(f"Word-level accuracy: {accuracy_score(true_words, pred_words):.4f}")

Word-level accuracy: 0.9718
