In [1]:
%pip install transformers torch torchvision pillow opencv-python

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.20.1-cp311-cp311-win_amd64.whl.metadata (6.2 kB)
Collecting pillow
  Downloading pillow-11.1.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting opencv-python
  Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Using cached huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloadi

In [16]:
import cv2
import re
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch
import os

In [3]:
def preprocess_image(image_path, target_size=600):
    """Load an image from disk and binarize it for OCR.

    The image is converted to grayscale, rescaled (aspect ratio preserved) so
    that its longer side is approximately ``target_size`` pixels, and then
    binarized with a Gaussian adaptive threshold.

    Args:
        image_path: Path of the image file to load.
        target_size: Desired length, in pixels, of the longer image side
            after resizing. Defaults to 600.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold``.

    Raises:
        ValueError: If ``target_size`` is not positive, or if the file exists
            but cannot be decoded as an image.
        FileNotFoundError: If ``image_path`` does not point to a file.
    """
    if target_size <= 0:
        raise ValueError(f"target_size must be positive, got {target_size!r}")
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; without
    # this check the error would only surface as an opaque cv2.error below.
    if image is None:
        raise ValueError(f"Could not decode image file: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape
    # A single factor for both axes keeps the aspect ratio intact.
    scaling_factor = target_size / max(height, width)
    resized = cv2.resize(gray, None, fx=scaling_factor, fy=scaling_factor, interpolation=cv2.INTER_LINEAR)
    # Gaussian adaptive threshold: 11x11 neighbourhood, constant 2 subtracted
    # from the weighted mean.
    processed = cv2.adaptiveThreshold(resized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return processed

In [9]:
def extract_text_with_trocr(processor, model, image):
    """Recognize the text in a single grayscale image with a TrOCR model.

    Args:
        processor: Object that turns a PIL image into ``pixel_values`` when
            called and decodes generated ids via ``batch_decode``.
        model: Model whose ``generate`` method produces token ids from
            ``pixel_values``.
        image: Single-channel (grayscale) image array; it is expanded to three
            channels before being handed to the processor.

    Returns:
        The decoded text of the first (and only) generated sequence, with
        special tokens removed.
    """
    # The input is single-channel; replicate it into three RGB channels
    # before building the PIL image.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    pil_image = Image.fromarray(rgb_image)
    pixel_values = processor(pil_image, return_tensors="pt").pixel_values
    # Keep the input on the same device as the model so the function also
    # works when the model has been moved to a GPU.
    pixel_values = pixel_values.to(model.device)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

In [6]:
# The processor and the model are loaded from the same checkpoint; naming it
# once keeps the two from drifting apart when the checkpoint is changed.
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"

processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activ

In [19]:
# Folder scanned for input images, and the text file the OCR results go to.
image_folder, output_file = "./input/dates/", "./output/results.txt"

In [20]:
# Run OCR over every image in `image_folder` and write one record per image
# to `output_file`.
image_extensions = ('.png', '.jpg', '.jpeg')

# open() creates the file but not its parent directory.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the records in the output file reproducible.
image_names = sorted(
    name for name in os.listdir(image_folder)
    if name.lower().endswith(image_extensions)
)

# The file is opened once in "w" mode so that re-running this cell replaces the
# previous results instead of appending duplicates. UTF-8 is explicit so that
# non-ASCII recognized text cannot fail on the platform's default encoding.
with open(output_file, "w", encoding="utf-8") as f:
    for file_name in image_names:
        image_path = os.path.join(image_folder, file_name)

        preprocessed_image = preprocess_image(image_path)

        recognized_text = extract_text_with_trocr(processor, model, preprocessed_image)

        f.write(f"File: {file_name}\n")
        f.write(f"Recognized Text: {recognized_text}\n")
        f.write("-" * 40 + "\n")

print(f"Results written to {output_file}")

Results written to ./output/results.txt
