<a href="https://colab.research.google.com/github/Shakib-IO/Human_AI_gsoc_2024/blob/main/TrOCR_IAM_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

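 ##### TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models

 This notebook evaluates the pre-trained `microsoft/trocr-base-handwritten` checkpoint on the IAM handwriting test set and reports the character error rate (CER).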

 [Paper](https://arxiv.org/pdf/2109.10282.pdf) | [Github](https://github.com/microsoft/unilm/tree/master/trocr)

In [2]:
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install -q datasets jiwer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os

import pandas as pd
from PIL import Image
from datasets import load_metric
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel

In [5]:
# Load the ground-truth table: parsed column 0 is named "file_name", column 1 "text",
# and the third parsed column (2) is discarded.
df = (
  pd.read_fwf('/content/drive/MyDrive/IAM_Dataset/gt_test.txt', header=None)
  .rename(columns={0: "file_name", 1: "text"})
  .drop(columns=2)
)
df.head()

Unnamed: 0,file_name,text
0,c04-110-00.jpg,Become a success with a disc and hey presto ! ...
1,c04-110-01.jpg,"assuredness "" Bella Bella Marie "" ( Parlophone..."
2,c04-110-02.jpg,I don't think he will storm the charts with th...
3,c04-110-03.jpg,"CHRIS CHARLES , 39 , who lives in Stockton-on-..."
4,c04-116-00.jpg,He is also a director of a couple of garages ....


In [6]:
# Create a Dataset class
class IAM_Dataset(Dataset):
  """Map-style dataset pairing line images with their ground-truth text.

  Each item is a dict with two tensors:
    * ``pixel_values``: the processor's output for one image, with the leading
      batch dimension removed.
    * ``labels``: ``max_target_length`` token ids in which every padding
      position is replaced by ``-100`` (the evaluation cells map ``-100`` back
      to the pad id before decoding).

  Args:
    root_dir: Directory holding the image files named in ``df['file_name']``.
    df: DataFrame with ``file_name`` and ``text`` columns. Rows are addressed
      by position, so any index (e.g. after filtering) is accepted.
    processor: Callable image processor that also exposes a ``tokenizer``
      attribute (a ``TrOCRProcessor`` in this notebook).
    max_target_length: Length every label sequence is padded or truncated to.
  """

  def __init__(self, root_dir, df, processor, max_target_length=128):
    self.root_dir = root_dir
    self.df = df
    self.processor = processor
    self.max_target_length = max_target_length

  def __len__(self):
    """Return the number of (image, text) rows."""
    return len(self.df)

  def _load_image(self, file_name):
    """Open ``file_name`` under ``root_dir`` and return it converted to RGB."""
    # The context manager closes the underlying file once the pixels are converted.
    with Image.open(os.path.join(self.root_dir, file_name)) as image:
      return image.convert("RGB")

  def __getitem__(self, idx):
    """Return the encoded image and label tensor for the row at position ``idx``."""
    # Positional lookup: label-based `df[col][idx]` fails on a non-default index.
    file_name = self.df['file_name'].iloc[idx]
    text = self.df['text'].iloc[idx]
    # Some names arrive clipped to "...jp"; restore the missing "g".
    # NOTE(review): presumably the fixed-width parse clips the column -- confirm.
    if file_name.endswith('jp'):
      file_name = file_name + 'g'

    # Image resize and normalize (done by the processor)
    image = self._load_image(file_name)
    pixel_values = self.processor(image, return_tensors="pt").pixel_values

    # Pad AND truncate so every label tensor has exactly max_target_length
    # entries; without truncation a long line produces a longer tensor and the
    # default batch collation cannot stack it with the others.
    token_ids = self.processor.tokenizer(
      text,
      padding="max_length",
      truncation=True,
      max_length=self.max_target_length,
    ).input_ids
    labels = torch.tensor(token_ids)
    labels = labels.masked_fill(labels == self.processor.tokenizer.pad_token_id, -100)

    return {"pixel_values": pixel_values.squeeze(0), "labels": labels}




In [7]:
# Load the processor that matches the checkpoint evaluated in this notebook, then wrap the
# ground-truth frame and the image directory in the dataset.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
test_set = IAM_Dataset(
  root_dir="/content/drive/MyDrive/IAM_Dataset/image/",
  df=df,
  processor=processor,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [8]:
# Batch the test set eight samples at a time.
test_dataloader = DataLoader(dataset=test_set, batch_size=8)

In [9]:
# Pull a single batch so its contents can be inspected in the next cells.
first_batch_iter = iter(test_dataloader)
batch = next(first_batch_iter)

In [10]:
# Show the shape of every tensor in the batch.
for name, tensor in batch.items():
  print(name, tensor.shape)

pixel_values torch.Size([8, 3, 384, 384])
labels torch.Size([8, 128])


In [11]:
# Sample some labels
# masked_fill returns a new tensor, so batch['labels'] keeps its -100 markers;
# the previous in-place assignment rewrote the batch through the alias.
labels = batch['labels'].masked_fill(batch['labels'] == -100, processor.tokenizer.pad_token_id)
label_str = processor.batch_decode(labels, skip_special_tokens=True)
label_str

["Become a success with a disc and hey presto! You're a star.... Rolly sings with",
 'assuredness " Bella Bella Marie " ( Parlophone ), a lively song that changes tempo mid-way',
 "I don't think he will storm the charts with this one, but it's a good start.",
 'CHRIS CHARLES, 39, who lives in Stockton-on-Tees, is an accountant.',
 'He is also a director of a couple of garages. And he finds time as well to be a lyric',
 'writer. He writes with Tolchard Evans, composer of " Lady of Spain " and other big hits.',
 'Tolch, as he is known in Tin Pan Alley, likes songs with a month in the title. He wrote',
 '" My September Love, " the big David Whitfield hit of 1956.']

In [12]:
# Run evaluation: use the GPU when one is available, otherwise the CPU,
# and move the pre-trained checkpoint onto that device.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.to(device)

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [13]:
# Load the character error rate (CER) metric.
# The metric is implemented by a builder script downloaded at load time; passing
# trust_remote_code=True states that explicitly and silences the warning that
# the library otherwise prints for this call.
cer = load_metric("cer", trust_remote_code=True)

  cer = load_metric("cer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [14]:
# Transcribe the test set batch by batch and accumulate predictions/references in the CER metric.
pad_id = processor.tokenizer.pad_token_id
for batch in tqdm(test_dataloader):
  # generate token ids for the batch of images and decode them to text
  generated_ids = model.generate(batch["pixel_values"].to(device))
  pred_str = processor.batch_decode(generated_ids, skip_special_tokens=True)

  # put the pad id back where the dataset wrote -100, then decode the references
  reference_ids = batch["labels"]
  reference_ids[reference_ids == -100] = pad_id
  label_str = processor.batch_decode(reference_ids, skip_special_tokens=True)

  # add batch to metric
  cer.add_batch(predictions=pred_str, references=label_str)

final_score = cer.compute()

  0%|          | 0/365 [00:00<?, ?it/s]



In [15]:
# Report the aggregate character error rate computed above.
print(f"Character error rate on test set: {final_score}")

Character error rate on test set: 0.03746350692349094
