In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from PIL import Image
import random

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split, DataLoader, Dataset
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter

from transformers import TrOCRProcessor, VisionEncoderDecoderModel, default_data_collator
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import albumentations as A


Завиксируем всю случайность!

In [2]:
seed = 23
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.determenistic = True

Соберем наш датасет из полученных файлов

In [3]:
basketball_df = pd.read_csv('for_train\\train_bascetball.csv')
streetball_df = pd.read_csv('for_train\\train_streetball.csv')
volleyball_df = pd.read_csv('for_train\\train_volleyball.csv')

res_df = pd.concat([basketball_df, streetball_df, volleyball_df], axis=0)
res_df.shape

(150630, 3)

Разобьем датасет на train test и eval части

In [4]:
diff_df, test_df, = train_test_split(res_df, test_size=0.2, shuffle=True)
train_df, eval_df = train_test_split(diff_df, test_size=0.1)
# we reset the indices to start from zero
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)

In [5]:
img_size = 386

transform_train = A.Compose([A.Resize(img_size, img_size),
                A.RandomResizedCrop(img_size, img_size, scale=(0.9, 1), p=1), 
				A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10, val_shift_limit=10, p=1),
				A.RandomBrightnessContrast(brightness_limit=(-0.2,0.2), contrast_limit=(-0.2, 0.2), p=1),
				A.CLAHE(clip_limit=(1,4), p=1),
                A.ChannelShuffle(p=1.0),
                A.ColorJitter(always_apply=False, p=1.0, brightness=(0.8, 1.2), contrast=(0.8, 1.2), saturation=(0.8, 1.2), hue=(-0.2, 0.2)),
                #A.imgaug.transforms.IAASharpen(alpha=(0.2, 0.3), lightness=(0.5, 0.7), p=1),
                #A.Cutout(max_h_size=int(img_size * 0.05), max_w_size=int(img_size * 0.05), num_holes=5, p= 0.5),				
               ])

transform_test = A.Compose([A.Resize(img_size, img_size)])

In [6]:
class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=3, transforms = None):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      
        file_name = self.df['file_name'][idx]
        text = str(self.df['text'][idx])
   
        image = Image.open(file_name).convert("RGB")
        
        if self.transforms:
            image = self.transforms(image=np.array(image))["image"]
        
        #image = image.resize((64, 64))
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text, 
                                          padding="max_length", 
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

Создадим наши датасеты

In [7]:
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
train_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=train_df,
                           processor=processor,
                           transforms=transform_train)
test_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=test_df,
                           processor=processor,
                           transforms=transform_test)
eval_dataset = IAMDataset(root_dir='C:\\Users\\Mytre\\OneDrive\\Документы\\Data\\Work\\',
                           df=eval_df,
                           processor=processor,
                           transforms=transform_test)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [8]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(eval_dataset))
print("Number of testing examples:", len(test_dataset))

Number of training examples: 108453
Number of validation examples: 12051
Number of testing examples: 30126


In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=4)
test_dataloader = DataLoader(test_dataset, batch_size=32)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tensor_board = SummaryWriter()

# model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
model = VisionEncoderDecoderModel.from_pretrained('model_best1')
model.to(device)

optimizer = AdamW(model.parameters(), lr=0.00001)
scheduler = StepLR(optimizer, step_size=1, gamma=0.998)

In [11]:
#model = VisionEncoderDecoderModel.from_pretrained('model')
#model.to(device)
print(device)

cuda


Сконфигурируем нашу модель

In [12]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 4
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 10

Оопределим метрику

In [13]:
acc_metric = evaluate.load("accuracy")
cer_metric = evaluate.load("cer")

In [14]:
def compute_acc(pred_ids, label_ids):
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    #label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    #label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    x = [] 
    for j in pred_str:       
        if j.isdigit():
            x.append(int(j))
        else:
            x.append(1000)
    label_ids = [int(x) for x in label_ids]       
    acc = acc_metric.compute(predictions=x, references=label_ids)

    return acc

def compute_cer(pred_ids, label_ids):
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return cer

In [15]:
def drow_graf(acc, loss):

    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))

    # Plot the data on the first subplot
    ax1.plot(list(range(len(acc))), acc, color='green')
    ax1.plot(list(range(len(acc))), acc, color='red')
    ax1.set_title('Validation accuracy')

    # Plot the data on the second subplot
    ax2.plot(list(range(len(loss))), loss)
    ax2.set_title('Training loss')

    # Add a title to the figure
    fig.suptitle('Metrics')

    # Display the plot
    plt.show()

In [16]:
def check_val_acc(j, step_val, counter, accuracy):
    model.eval()
    graf_acc = []
    valid_acc = []
    valid_cer = []
    valid_acc_batch = []
    valid_cer_batch = []

    
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # run batch generation          
            target_text = processor.batch_decode(batch['labels'], skip_special_tokens=True)
            #target_text = [int(x) for x in target_text]
       
            outputs = model.generate(batch["pixel_values"].to(device), max_new_tokens=4)
            # compute metrics
            
            acc = compute_acc(pred_ids=outputs, label_ids=target_text)
            cer = compute_cer(pred_ids=outputs, label_ids=batch['labels'])           
            #print(outputs)
            valid_cer.append(cer)       
            valid_acc.append(acc['accuracy'])  
            valid_acc_batch.append(acc['accuracy']) 
            valid_cer_batch.append(cer)

            if (j % 100 == 0) & (j > 2 ):
               
                tensor_board.add_scalar('Validation accuracy:', np.mean(valid_acc_batch), global_step=step_val)
                tensor_board.add_scalar('Validation cer:', np.mean(valid_cer_batch), global_step=step_val)
                tensor_board.flush()
                step_val += 1

            if (j % 2000 == 0) & (j > 2 ):
                print(f'Validation accuracy: {np.mean(valid_acc_batch)}')
                print(f'Validation cer:: {np.mean(valid_cer_batch)}')
                print()
                mean_acc =  np.mean(valid_acc)   
                graf_acc.append(mean_acc)

                valid_acc_batch = []
                valid_cer_batch
                

                #drow_graf(graf_acc, graf_loss)
            j += 1 
            
        counter += 1    
        if np.mean(valid_acc) > accuracy:
            model.save_pretrained('model_best1')    
            accuracy = np.mean(valid_acc)
            counter = 0
            print('Model saved!!!')

        #if counter > 5:
            #print('Early stopping!!!')
            #print(f'Result validation accuracy: {accuracy}')
            #break

        mean_acc =  np.mean(valid_acc)   
        graf_acc.append(mean_acc)
        print("Total validation accuracy:", mean_acc)
        print("Total validation cer:", np.mean(valid_cer))

    return j, step_val, counter, accuracy

In [None]:
step = 0
step_val = 0
accuracy = 0.
counter = 0
graf_loss = []
graf_acc = []
i, j = 0, 0

for epoch in range(100):  # loop over the dataset multiple times
    # train
    train_loss = []
    acc_batch = []
    train_loss_batch = []
    
    valid_acc = []
    valid_cer = []
    valid_acc_batch = []
    valid_cer_batch = []

    model.train()
    current_lr = optimizer.param_groups[0]['lr']
    
    print(f'Epoch {epoch + 1} start, lr={current_lr}')

    for batch in tqdm(train_dataloader):
        # get the inputs       
    
        #break
        # forward + backward + optimize
        target_text = processor.batch_decode(batch['labels'], skip_special_tokens=True)        
        #outputs_acc = model.generate(batch["pixel_values"].to(device), max_new_tokens=4)
                                   
        for k,v in batch.items():
            batch[k] = v.to(device) 

        outputs = model(**batch)
       
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        #print(target_text)
        #break
        #acc = compute_acc(pred_ids=outputs_acc, label_ids=target_text)
        #acc_train = compute_acc(pred_ids=outputs_acc, label_ids=target_text)
        #acc_batch.append(acc_train['accuracy'])

        train_loss.append(loss.item())
        train_loss_batch.append(loss.item())

        if (i % 100 == 0) & (i > 2 ):
            
            tensor_board.add_scalar('Train loss:', np.mean(train_loss_batch), global_step=step)   
            #tensor_board.add_scalar('Train accuracy:', np.mean(acc_batch), global_step=step)
            tensor_board.flush()
            step += 1

            if np.mean(valid_acc) > accuracy:
                model.save_pretrained('model_best')    
                accuracy = np.mean(valid_acc)
                counter = 0
                print(f'Model saved after {step} steps!!!')


        if (i % 4500 == 0): # & (i > 2 ):
            print(f'Train loss: {np.mean(train_loss_batch)}')
            
            mean_loss = np.mean(train_loss)
            graf_loss.append(mean_loss)

            scheduler.step()
            current_lr = optimizer.param_groups[0]['lr']
            
            #drow_graf(graf_acc, graf_loss)
            print(f'lr={current_lr}')

            j, step_val, counter, accuracy = check_val_acc(j, step_val, counter, accuracy)
        i += 1

    mean_loss = np.mean(train_loss)   
    print(f"Loss after epoch {epoch + 1}:", mean_loss)
    graf_loss.append(mean_loss)
    # evaluate
    '''
    model.eval()

    valid_acc = []
    valid_cer = []
    valid_acc_batch = []
    valid_cer_batch = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # run batch generation          
            target_text = processor.batch_decode(batch['labels'], skip_special_tokens=True)
            #target_text = [int(x) for x in target_text]
       
            outputs = model.generate(batch["pixel_values"].to(device), max_new_tokens=4)
            # compute metrics
            
            acc = compute_acc(pred_ids=outputs, label_ids=target_text)
            cer = compute_cer(pred_ids=outputs, label_ids=batch['labels'])           
            #print(outputs)
            valid_cer.append(cer)       
            valid_acc.append(acc['accuracy'])  
            valid_acc_batch.append(acc['accuracy']) 
            valid_cer_batch.append(cer)

            if (j % 100 == 0) & (j > 2 ):
               
                tensor_board.add_scalar('Validation accuracy:', np.mean(valid_acc_batch), global_step=step_val)
                tensor_board.add_scalar('Validation cer:', np.mean(valid_cer_batch), global_step=step_val)
                tensor_board.flush()
                step_val += 1

            if (j % 2000 == 0) & (j > 2 ):
                print(f'Validation accuracy: {np.mean(valid_acc_batch)}')
                print(f'Validation cer:: {np.mean(valid_cer_batch)}')
                print()
                mean_acc =  np.mean(valid_acc)   
                graf_acc.append(mean_acc)

                valid_acc_batch = []
                valid_cer_batch
                

                #drow_graf(graf_acc, graf_loss)
            j += 1 
            
        counter += 1    
        if np.mean(valid_acc) > accuracy:
            model.save_pretrained('model_best1')    
            accuracy = np.mean(valid_acc)
            counter = 0
            print('Model saved!!!')

        if counter > 5:
            print('Early stopping!!!')
            print(f'Result validation accuracy: {accuracy}')
            break

        mean_acc =  np.mean(valid_acc)   
        graf_acc.append(mean_acc)
        print("Total validation accuracy:", mean_acc)
        print("Total validation cer:", np.mean(valid_cer))

        #drow_graf(graf_acc, graf_loss)
'''

In [17]:
model = VisionEncoderDecoderModel.from_pretrained('model_best1')
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=76

Проветим обученную модель на тестовом датасете

In [19]:
from sklearn.metrics import accuracy_score

torch.cuda.empty_cache()
# Define the device to run the evaluation on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the model to evaluation mode
model.eval()
model.to(device)
# Evaluate the model on the eval dataset
diff_acc = []
acc = []
i = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader):
    
        target_text = processor.batch_decode(batch['labels'], skip_special_tokens=True)
        target_text = [int(x) for x in target_text]
           
        pixel_values = batch['pixel_values']
        generated_ids = model.generate(pixel_values.cuda(), max_new_tokens=4)       
       
        # Make a prediction
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        x = [] 
        for j in generated_text:       
            if j.isdigit():
                x.append(int(j))
            else:
                x.append(1000)        
        
        bach_acc = accuracy_score(target_text, x)
        # Save the true and predicted labels
        diff_acc.append(bach_acc)  
        acc.append(bach_acc)

        if (i % 100  == 0) & (i > 2):
            accuracy = np.mean(diff_acc)
            print("Accuracy:", accuracy) 
        i += 1

    print(f"Total accuracy: {np.mean(acc)}")    

  0%|          | 0/942 [00:00<?, ?it/s]

Accuracy: 0.9551361386138614
Accuracy: 0.9539800995024875
Accuracy: 0.9536960132890365
Accuracy: 0.9532418952618454
Accuracy: 0.9520958083832335
Accuracy: 0.9528910149750416
Accuracy: 0.9536376604850214
Accuracy: 0.9533785892634207
Accuracy: 0.9532117092119867
Total accuracy: 0.95348991507431
