Entraîner TrOCR sur un ensemble de données personnalisé de chèques pourrait pour réduire les erreurs.

In [1]:
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q jiwer
!pip install -q datasets
!pip install -q evaluate
!pip install -q -U accelerate
!pip install -q matplotlib
!pip install -q protobuf==3.20.1
!pip install -q tensorboard

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incomp

transformers : Il s'agit de la bibliothèque Hugging Face transformers qui nous donne accès à des centaines de modèles basés sur des transformateurs, y compris le modèle TrOCR.

sentencepiece : Il s'agit de la bibliothèque de tokenisation sentencepiece nécessaire pour convertir les mots en tokens et nombres. C'est également une partie de la famille Hugging Face.

jiwer : La bibliothèque jiwer nous donne accès à plusieurs métriques de reconnaissance vocale et de langage. Celles-ci incluent le WER (Word Error Rate) et le CER (Character Error Rate). Nous utiliserons la métrique CER pour évaluer le modèle pendant l'entraînement.

In [23]:
import os
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import glob as glob
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import random
from PIL import Image
from zipfile import ZipFile
from tqdm.notebook import tqdm
from dataclasses import dataclass
from torch.utils.data import Dataset
from urllib.request import urlretrieve
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    GenerationConfig
)

VisionEncoderDecoderModel : Nous avons besoin de cette classe pour définir différents modèles TrOCR.

TrOCRProcessor : TrOCR attend que le jeu de données suive un processus de normalisation particulier. Cette classe veillera à ce que les images soient correctement normalisées et traitées.

Seq2SeqTrainer : Ceci est nécessaire pour initialiser l'API Trainer.

Seq2SeqTrainingArguments : Pendant l'entraînement, l'API Trainer attend plusieurs arguments.

La classe Seq2SeqTrainingArguments initialise tous les arguments requis avant de les passer à l'API.

transforms : Le module de transformations Torchvision est nécessaire pour appliquer des augmentations de données aux images.


In [3]:
# the seed for reproducibility across different runs and define the computation device.

def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/TrOCR

/content/drive/MyDrive/TrOCR


In [6]:
!ls fr

train  valid


In [7]:
@dataclass(frozen=True)
class TrainingConfig:
    BATCH_SIZE:    int = 8
    EPOCHS:        int = 50
    LEARNING_RATE: float = 0.0005

@dataclass(frozen=True)
class DatasetConfig:
    DATA_ROOT:     str = 'fr'

@dataclass(frozen=True)
class ModelConfig:
    MODEL_NAME: str = 'microsoft/trocr-base-handwritten'

In [8]:
def is_image_file(filename):
    # Check if the file extension is a common image format
    return filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))

In [9]:
# Load the CSV into a DataFrame
dataset_path=DatasetConfig.DATA_ROOT

train_df = pd.read_csv(f"{dataset_path}/train/train.csv")

# Rename columns
train_df.rename(columns={'file-name': 'file_name'}, inplace=True)

valid_df = pd.read_csv(f"{dataset_path}/valid/valid.csv")

# Rename columns
valid_df.rename(columns={'file-name': 'file_name'}, inplace=True)

print("Valid DataFrame:")
print(valid_df.head())

print("Train DataFrame:")
print(train_df.head())

Valid DataFrame:
    file_name                                              text
0  000000.png                                millions de dinars
1  000040.png                                       Vingt mille
2  000077.png                                   cent dix dinars
3  000137.png                                   SIX CENT TRENTE
4  000391.png  cent vingt-trois mille quatre cent cinquante-six
Train DataFrame:
    file_name                     text
0  000225.png                cinquante
1  000258.png                    douze
2  000158.png                       un
3  000109.png  Vingt six milles dinars
4  000023.png       cent Vingts Dinars


In [10]:
# defining the augmentations

class VerySmallRotation(object):
    def __init__(self, max_degrees=1):
        self.max_degrees = max_degrees

    def __call__(self, img):
        angle = random.uniform(-self.max_degrees, self.max_degrees)
        return F.rotate(img, angle, fill=(255, 255, 255))

class MinimalShear(object):
    def __init__(self, max_shear=0.02):
        self.max_shear = max_shear

    def __call__(self, img):
        shear = random.uniform(-self.max_shear, self.max_shear)
        return F.affine(img, angle=0, translate=(0, 0), scale=1, shear=[shear, 0], fill=(255, 255, 255))

train_transforms = transforms.Compose([
    transforms.RandomApply([
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    ], p=0.3),
    transforms.RandomApply([
        transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 0.5)),
    ], p=0.1),
    VerySmallRotation(max_degrees=1),
    MinimalShear(max_shear=0.02),
    transforms.RandomApply([
        transforms.Lambda(lambda x: F.adjust_gamma(x, gamma=random.uniform(0.9, 1.1))),
    ], p=0.2),
])

In [11]:
class CustomOCRDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # The image file name.
        file_name = self.df['file_name'][idx]
        # The text (label).
        text = self.df['text'][idx]
        # Read the image, apply augmentations, and get the transformed pixels.
        image = Image.open(os.path.join(self.root_dir, file_name)).convert('RGB')
        image = train_transforms(image)
        pixel_values = self.processor(image, return_tensors='pt').pixel_values
        # Pass the text through the tokenizer and get the labels,
        # i.e. tokenized labels.
        labels = self.processor.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_target_length
        ).input_ids
        # We are using -100 as the padding token.
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

The __init__() method accepts the root directory path, the DataFrame, TrOCR processor, and the maximum label length as parameters.

The __getitem__() method first reads the label and image from the disk. It then passes the image through the transforms to apply the augmentations. The TrOCRProcessor returns the normalized pixel values in PyTorch tensor format. Next, the text labels are passed through the tokenizer. If a label is shorter than 128 characters, it is padded with -100 to a length of 128. If it is longer than 128 characters, then the characters are truncated.  Finally, it returns the pixel values and the labels as a dictionary.

In [12]:
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)
train_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, 'train/'),
    df=train_df,
    processor=processor
)
valid_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, 'valid/'),
    df=valid_df,
    processor=processor
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



In [13]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(valid_dataset))

Number of training examples: 340
Number of validation examples: 61


In [14]:
# Charger le modèle pré-entraîné
model = VisionEncoderDecoderModel.from_pretrained(ModelConfig.MODEL_NAME)
model.to(device)
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [26]:
# Set special tokens used for creating the decoder_input_ids from the labels.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [16]:
optimizer = optim.AdamW(
    model.parameters(), lr=TrainingConfig.LEARNING_RATE, weight_decay=0.005
)

In [17]:
cer_metric = evaluate.load('cer')

def compute_cer(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

In [35]:
# Définir les arguments d'entraînement
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    fp16=True,
    output_dir='seq2seq_model_printed/',
    logging_steps=2,
    save_steps=100,
    eval_steps=100,
    num_train_epochs=TrainingConfig.EPOCHS,  # Add this line to set the number of epochs
)



In [36]:
# Initialize trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=default_data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [37]:
from accelerate import Accelerator
accelerator = Accelerator()

In [38]:
res = trainer.train()

Epoch,Training Loss,Validation Loss,Cer
1,0.7309,0.90581,0.241643
2,0.9145,1.582401,0.385097
3,0.5612,0.625773,0.205432
4,0.3395,0.987646,0.310585
5,0.0646,1.028094,0.279248
6,0.2563,0.908263,0.277159
7,0.1363,0.990564,0.29805
8,0.3099,0.917186,0.248607
9,0.9118,1.330019,0.393454
10,0.0697,0.889377,0.284819


Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 64, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max

RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 2139796928 vs 2139796816

In [39]:
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)
trained_model = VisionEncoderDecoderModel.from_pretrained('seq2seq_model_printed/checkpoint-'+str(res.global_step)).to(device)



OSError: seq2seq_model_printed/checkpoint-129 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
def read_and_show(image_path):
    """
    :param image_path: String, path to the input image.


    Returns:
        image: PIL Image.
    """
    image = Image.open(image_path).convert('RGB')
    return image

In [None]:
def ocr(image, processor, model):
    """
    :param image: PIL Image.
    :param processor: Huggingface OCR processor.
    :param model: Huggingface OCR model.


    Returns:
        generated_text: the OCR'd text string.
    """
    # We can directly perform OCR on cropped images.
    pixel_values = processor(image, return_tensors='pt').pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

In [None]:
def eval_new_data(
    data_path=os.path.join(DatasetConfig.DATA_ROOT, 'valid', '*'),
    num_samples=60
):
    image_paths = glob.glob(data_path)
    image_paths = [path for path in image_paths if is_image_file(path)]  # Filter out non-image files
    for i, image_path in tqdm(enumerate(image_paths), total=len(image_paths)):
        if i == num_samples:
            break
        image = read_and_show(image_path)
        text = ocr(image, processor, trained_model)
        plt.figure(figsize=(7, 4))
        plt.imshow(image)
        plt.title(text)
        plt.axis('off')
        plt.show()

eval_new_data(
    data_path=os.path.join(DatasetConfig.DATA_ROOT, 'valid', '*'),
    num_samples=60
)
