In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from PIL import Image
import random

import os
import json
import cv2

import torch
from torch.utils.data import random_split, DataLoader, Dataset
from torchvision import datasets, transforms

from transformers import TrOCRProcessor, VisionEncoderDecoderModel, default_data_collator
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



Завиксируем всю случайность!

In [2]:
seed = 23
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.determenistic = True

Соберем наш датасет из полученных файлов

In [3]:
df1 = pd.read_csv('anno_first.csv')
df2 = pd.read_csv('anno_second.csv')
df3 = pd.read_csv('anno_basketball.csv')
df4 = pd.read_csv('anno_fiba.csv')
#df5 = pd.read_csv('anno_ncaa.csv')
df = pd.concat([df1, df2, df4], axis=0)
df.shape

(88984, 3)

Разобьем датасет на train test и eval части

In [4]:
diff_df, eval_df = train_test_split(df, test_size=0.2)
train_df, test_df = train_test_split(diff_df, test_size=0.2)
# we reset the indices to start from zero
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)

In [5]:
store_pathes = []

class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=3):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      
        file_name = self.df['file_name'][idx]
        text = str(self.df['text'][idx])
        store_pathes.append(file_name)
   
        image = Image.open(file_name).convert("RGB")
        image = image.resize((64, 64))
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text, 
                                          padding="max_length", 
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

Создадим наши датасеты

In [6]:
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
train_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=train_df,
                           processor=processor)
test_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=test_df,
                           processor=processor)
eval_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=test_df,
                           processor=processor)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [7]:
print("Number of training examples:", len(train_dataset))
print("Number of testing examples:", len(test_dataset))
print("Number of validation examples:", len(eval_dataset))

Number of training examples: 56949
Number of testing examples: 14238
Number of validation examples: 14238


Загрузим предъобученный трансформер

In [8]:
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Сконфигурируем нашу модель

In [9]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 4
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 10

Сконфигурируем цикл обучения

In [10]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    num_train_epochs=10,
    evaluation_strategy="steps",
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    fp16=True, 
    output_dir="./",
    logging_steps=4000,
    save_steps=40000,
    eval_steps=20000,
)

Оопределим метрику

In [11]:
import evaluate

cer_metric = evaluate.load("cer")

In [12]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

Переопределим оптимизатор

In [13]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters())

Запустим цикл обучения

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator,
)
trainer.train()

model.save_pretrained('./model')



  0%|          | 0/189830 [00:00<?, ?it/s]

{'loss': 0.6694, 'learning_rate': 4.894774271716799e-05, 'epoch': 0.21}
{'loss': 0.3131, 'learning_rate': 4.7894695253648e-05, 'epoch': 0.42}
{'loss': 0.2273, 'learning_rate': 4.6841384396565354e-05, 'epoch': 0.63}
{'loss': 0.1881, 'learning_rate': 4.578860032660802e-05, 'epoch': 0.84}
{'loss': 0.1683, 'learning_rate': 4.473528946952537e-05, 'epoch': 1.05}




  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 4.082074165344238, 'eval_cer': 0.26678765880217786, 'eval_runtime': 2710.7428, 'eval_samples_per_second': 5.252, 'eval_steps_per_second': 1.751, 'epoch': 1.05}
{'loss': 0.147, 'learning_rate': 4.3682505399568034e-05, 'epoch': 1.26}
{'loss': 0.1419, 'learning_rate': 4.2629457936048044e-05, 'epoch': 1.48}
{'loss': 0.1286, 'learning_rate': 4.15761470789654e-05, 'epoch': 1.69}
{'loss': 0.133, 'learning_rate': 4.0522836221882736e-05, 'epoch': 1.9}
{'loss': 0.1115, 'learning_rate': 3.947005215192541e-05, 'epoch': 2.11}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 4.892834186553955, 'eval_cer': 0.1948363678941514, 'eval_runtime': 2710.2859, 'eval_samples_per_second': 5.253, 'eval_steps_per_second': 1.751, 'epoch': 2.11}
{'loss': 0.1052, 'learning_rate': 3.8416741294842757e-05, 'epoch': 2.32}
{'loss': 0.0955, 'learning_rate': 3.7363693831322767e-05, 'epoch': 2.53}
{'loss': 0.0953, 'learning_rate': 3.6310646367802777e-05, 'epoch': 2.74}
{'loss': 0.0945, 'learning_rate': 3.525707211715746e-05, 'epoch': 2.95}
{'loss': 0.0756, 'learning_rate': 3.420402465363747e-05, 'epoch': 3.16}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 5.505166053771973, 'eval_cer': 0.3093495696973245, 'eval_runtime': 2750.3964, 'eval_samples_per_second': 5.177, 'eval_steps_per_second': 1.726, 'epoch': 3.16}
{'loss': 0.0792, 'learning_rate': 3.3151240583680136e-05, 'epoch': 3.37}
{'loss': 0.0771, 'learning_rate': 3.209766633303482e-05, 'epoch': 3.58}
{'loss': 0.0776, 'learning_rate': 3.104435547595217e-05, 'epoch': 3.79}
{'loss': 0.0756, 'learning_rate': 2.9991308012432178e-05, 'epoch': 4.0}
{'loss': 0.0563, 'learning_rate': 2.8937733761786863e-05, 'epoch': 4.21}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 2.794046401977539, 'eval_cer': 0.11978221415607986, 'eval_runtime': 2690.793, 'eval_samples_per_second': 5.291, 'eval_steps_per_second': 1.764, 'epoch': 4.21}
{'loss': 0.0581, 'learning_rate': 2.788468629826687e-05, 'epoch': 4.43}
{'loss': 0.0609, 'learning_rate': 2.683137544118422e-05, 'epoch': 4.64}
{'loss': 0.0608, 'learning_rate': 2.577832797766423e-05, 'epoch': 4.85}
{'loss': 0.0492, 'learning_rate': 2.4725017120581572e-05, 'epoch': 5.06}
{'loss': 0.0438, 'learning_rate': 2.367170626349892e-05, 'epoch': 5.27}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 5.273472785949707, 'eval_cer': 0.229436215678239, 'eval_runtime': 2683.3841, 'eval_samples_per_second': 5.306, 'eval_steps_per_second': 1.769, 'epoch': 5.27}
{'loss': 0.0448, 'learning_rate': 2.2618922193541592e-05, 'epoch': 5.48}
{'loss': 0.0452, 'learning_rate': 2.1565347942896277e-05, 'epoch': 5.69}
{'loss': 0.0451, 'learning_rate': 2.0511773692250963e-05, 'epoch': 5.9}
{'loss': 0.0393, 'learning_rate': 1.9458989622293634e-05, 'epoch': 6.11}
{'loss': 0.0317, 'learning_rate': 1.840567876521098e-05, 'epoch': 6.32}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 5.03128719329834, 'eval_cer': 0.3169603653181898, 'eval_runtime': 2671.6253, 'eval_samples_per_second': 5.329, 'eval_steps_per_second': 1.776, 'epoch': 6.32}
{'loss': 0.0309, 'learning_rate': 1.7352367908128326e-05, 'epoch': 6.53}
{'loss': 0.0297, 'learning_rate': 1.6299057051045675e-05, 'epoch': 6.74}
{'loss': 0.0348, 'learning_rate': 1.524600958752568e-05, 'epoch': 6.95}
{'loss': 0.0206, 'learning_rate': 1.4192435336880367e-05, 'epoch': 7.16}
{'loss': 0.0236, 'learning_rate': 1.3139651266923036e-05, 'epoch': 7.38}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 5.167418479919434, 'eval_cer': 0.16017797552836485, 'eval_runtime': 2691.8952, 'eval_samples_per_second': 5.289, 'eval_steps_per_second': 1.763, 'epoch': 7.38}
{'loss': 0.0197, 'learning_rate': 1.2086340409840384e-05, 'epoch': 7.59}
{'loss': 0.0224, 'learning_rate': 1.1033292946320392e-05, 'epoch': 7.8}
{'loss': 0.019, 'learning_rate': 9.97998208923774e-06, 'epoch': 8.01}
{'loss': 0.0133, 'learning_rate': 8.926934625717748e-06, 'epoch': 8.22}
{'loss': 0.0159, 'learning_rate': 7.873623768635094e-06, 'epoch': 8.43}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 6.0792388916015625, 'eval_cer': 0.1537966161231778, 'eval_runtime': 2706.3699, 'eval_samples_per_second': 5.261, 'eval_steps_per_second': 1.754, 'epoch': 8.43}
{'loss': 0.0156, 'learning_rate': 6.820312911552441e-06, 'epoch': 8.64}
{'loss': 0.0107, 'learning_rate': 5.767002054469789e-06, 'epoch': 8.85}
{'loss': 0.0109, 'learning_rate': 4.7134278038244746e-06, 'epoch': 9.06}
{'loss': 0.0093, 'learning_rate': 3.660116946741822e-06, 'epoch': 9.27}
{'loss': 0.0095, 'learning_rate': 2.6070694832218302e-06, 'epoch': 9.48}


  0%|          | 0/4746 [00:00<?, ?it/s]

{'eval_loss': 6.954314231872559, 'eval_cer': 0.11480592471166794, 'eval_runtime': 2692.7972, 'eval_samples_per_second': 5.287, 'eval_steps_per_second': 1.762, 'epoch': 9.48}
{'loss': 0.0096, 'learning_rate': 1.5540220197018386e-06, 'epoch': 9.69}
{'loss': 0.0044, 'learning_rate': 5.007111626191856e-07, 'epoch': 9.9}
{'train_runtime': 102249.0808, 'train_samples_per_second': 5.57, 'train_steps_per_second': 1.857, 'train_loss': 0.0809435841758755, 'epoch': 10.0}


In [None]:
url = "crops\\8\\ballerTV_137example.jpg"
image = Image.open(url).convert("RGB")

pixel_values = processor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values.cuda())

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f'Номер на футболке: {generated_text}')

Номер на футболке: 8


Определим eval_dataloader и Загрузим обученную модель

In [45]:
eval_dataloader = DataLoader(eval_dataset, batch_size=32)
model = VisionEncoderDecoderModel.from_pretrained('checkpoint-80000')

Посчитаем точность обученной модели на eval датасете

In [None]:
from sklearn.metrics import accuracy_score

# Define the device to run the evaluation on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the model to evaluation mode
model.eval()
model.to(device)
# Evaluate the model on the eval dataset
y_true = []
y_pred = []

with torch.no_grad():
    for i, batch in tqdm(enumerate(eval_dataloader)):
        #url = eval_df['file_name'][i]
        target_text = processor.batch_decode(batch['labels'], skip_special_tokens=True)
        #image = Image.open(url).convert("RGB")        
        pixel_values = batch['pixel_values']
        generated_ids = model.generate(pixel_values.cuda())       

        # Make a prediction
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        x = [] 
        for j in generated_text:       
            if j.isdigit():
                x.append(int(j))
            else:
                x.append(1000)        

        # Save the true and predicted labels
        y_true.append(target_text)
        y_pred.append(x)  

        if (i % 100  == 0) & (i > 2):
            accuracy = accuracy_score(y_true, y_pred)
            print("Accuracy:", accuracy) 

In [46]:
df10 = pd.read_csv('anno_00new.csv')

eval_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=df10,
                           processor=processor)

print("Number of validation examples:", len(eval_dataset))

Number of validation examples: 23894


Посчитаем точность обученной модели на eval датасете

In [47]:
from sklearn.metrics import accuracy_score

# Define the device to run the evaluation on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the model to evaluation mode
model.eval()
model.to(device)
# Evaluate the model on the eval dataset
y_true = []
y_pred = []
xx = []
with torch.no_grad():
    for i in tqdm(range(len(eval_dataset)), nrows=2):
        url = df10['file_name'][i]
        xx.append(url)
        target_text = int(df10['text'][i])
        image = Image.open(url).convert("RGB")   

        pixel_values = processor(image, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values.cuda(), max_new_tokens=3)

        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        #pixel_values = batch['pixel_values']
        #generated_ids = model.generate(pixel_values.cuda())       

        # Make a prediction
        #generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        x = []        
        if generated_text.isdigit():
            
            x = int(generated_text)
        else:
            x = 1000    
        
        # Save the true and predicted labels
        y_true.append(target_text)
        y_pred.append(x)

        if (i % 400  == 0) & (i > 2):
            accuracy = accuracy_score(y_true, y_pred)
            print("Accuracy:", accuracy) 

print("Accuracy:", accuracy)                   

  0%|          | 0/23894 [00:00<?, ?it/s]

Accuracy: 0.6982543640897756


KeyboardInterrupt: 