# Handwriting recognition

## Data collection

In [None]:
# Linux command to extract the word images into data/words
# NOTE(review): `tar -C` presumably requires data/words to exist beforehand — confirm
!tar -xf data/IAM_Words/words.tgz -C data/words
# Linux command to move data/IAM_Words/words.txt into data/
!mv data/IAM_Words/words.txt data


In [6]:
# Preview the first 20 lines of the annotation file to inspect its format.
!head -20 data/words.txt

#--- words.txt ---------------------------------------------------------------#
#
# iam database word information
#
# format: a01-000u-00-00 ok 154 1 408 768 27 51 AT A
#
#     a01-000u-00-00  -> word id for line 00 in form a01-000u
#     ok              -> result of word segmentation
#                            ok: word was correctly
#                            er: segmentation of word can be bad
#
#     154             -> graylevel to binarize the line containing this word
#     1               -> number of components for this word
#     408 768 27 51   -> bounding box around this word in x,y,w,h format
#     AT              -> the grammatical tag for this word, see the
#                        file tagset.txt for an explanation
#     A               -> the transcription for this word
#
a01-000u-00-00 ok 154 408 768 27 51 AT A
a01-000u-00-01 ok 154 507 766 213 48 NN MOVE


## Imports

In [None]:
import keras
from keras.layers import StringLookup
from keras import ops
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os

# Fix the random seeds for reproducibility: NumPy's global RNG (used later by
# np.random.shuffle when shuffling the samples) and Keras' own seeding helper.
np.random.seed(42)
keras.utils.set_random_seed(42)

2025-09-07 22:55:55.553233: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-07 22:55:55.553527: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-07 22:55:55.599219: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-07 22:55:59.238056: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

## Dataset splitting

In [10]:
# Root folder holding words.txt and the extracted words/ image tree.
base_path = "data"
words_list = []

# Read the annotation file inside a context manager so the file handle is
# closed as soon as the lines have been loaded (the bare open().readlines()
# form leaves the handle open until garbage collection).
with open(f"{base_path}/words.txt", "r") as words_file:
    words = words_file.readlines()

for line in words:
    # Skip the commented header lines, and blank lines which would otherwise
    # fail on the field access below.
    if line.startswith("#") or not line.strip():
        continue
    # The second space-separated field is the segmentation status; entries
    # flagged "err" are dropped and everything else is kept.
    if line.split(" ")[1] != "err":
        words_list.append(line)

print(len(words_list))

# Shuffle in place so the positional train/validation/test split is random.
np.random.shuffle(words_list)

96456


### We will split the dataset into three subsets with a 90:5:5 ratio (train:validation:test).

In [None]:
# First cut: the leading 90% of the shuffled lines is the training set, the
# trailing 10% is held out.
split_idx = int(0.9 * len(words_list))
train_samples, holdout_samples = words_list[:split_idx], words_list[split_idx:]

# Second cut: the held-out part is halved into validation and test sets.
val_split_idx = int(0.5 * len(holdout_samples))
validation_samples = holdout_samples[:val_split_idx]
test_samples = holdout_samples[val_split_idx:]

# Sanity check: the three subsets must account for every sample.
assert len(train_samples) + len(validation_samples) + len(test_samples) == len(words_list)

for subset_name, subset in (
    ("training", train_samples),
    ("validation", validation_samples),
    ("test", test_samples),
):
    print(f"Total {subset_name} samples: {len(subset)}")

Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


## Data input pipeline
We start building our data input pipeline by first preparing the image paths.

In [16]:
# Root of the extracted image tree: <base_path>/words
base_image_path = os.path.join(base_path, "words")


def get_image_paths_and_labels(samples):
    """Resolve annotation lines to image paths, dropping zero-byte images.

    Each annotation line starts with a word id such as ``a01-000u-00-00``.
    The matching image lives at
    ``<base_image_path>/a01/a01-000u/a01-000u-00-00.png``.

    Args:
        samples: iterable of raw annotation lines (strings).

    Returns:
        A ``(paths, lines)`` tuple of equal-length lists: the image paths whose
        file size is non-zero, and the corresponding annotation lines cut at
        their first newline.

    Raises:
        OSError: propagated from ``os.path.getsize`` when an image file
            cannot be accessed.
    """
    kept_paths, kept_lines = [], []
    for raw_line in samples:
        # First space-separated field is the word id, e.g. a01-000u-00-00.
        word_id = raw_line.strip().split(" ")[0]
        id_parts = word_id.split("-")
        form_prefix, form_number = id_parts[0], id_parts[1]  # a01, 000u
        candidate = os.path.join(
            base_image_path,
            form_prefix,
            f"{form_prefix}-{form_number}",
            f"{word_id}.png",
        )  # data/words/a01/a01-000u/a01-000u-00-00.png
        if os.path.getsize(candidate) == 0:
            # Empty file: nothing to decode, drop the sample.
            continue
        kept_paths.append(candidate)
        kept_lines.append(raw_line.split("\n")[0])

    return kept_paths, kept_lines


# Resolve paths/labels for the three subsets, in train -> validation -> test order.
(
    (train_img_paths, train_labels),
    (validation_img_paths, validation_labels),
    (test_img_paths, test_labels),
) = (
    get_image_paths_and_labels(subset)
    for subset in (train_samples, validation_samples, test_samples)
)

Then we prepare the ground-truth labels.

In [17]:
# Derive the cleaned training transcriptions, the character vocabulary and the
# longest transcription length from the training annotation lines.

# The transcription is the last space-separated field of each line:
# "a01-000u-00-00 ok 154 408 768 27 51 AT A" -> "A"
train_labels_cleaned = [entry.split(" ")[-1].strip() for entry in train_labels]

# Every distinct character seen in the training transcriptions, sorted.
characters = sorted({symbol for word in train_labels_cleaned for symbol in word})

# Longest transcription; 0 when there are no training labels.
max_len = max((len(word) for word in train_labels_cleaned), default=0)

print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))

# Check some label samples.
train_labels_cleaned[:10]

Maximum length:  21
Vocab size:  78


['the',
 'boy',
 'By',
 'to',
 'must',
 'position',
 'opposed',
 'was',
 'Dr.',
 'National']

Now we clean the validation and the test labels as well.

In [18]:
def clean_labels(labels):
    """Extract the transcription from each annotation line.

    Args:
        labels: iterable of annotation lines whose last space-separated field
            is the transcription.

    Returns:
        A list holding, for each line, its last space-separated field with
        surrounding whitespace stripped.
    """
    return [entry.split(" ")[-1].strip() for entry in labels]


# Apply the same transcription extraction to the validation and test lines.
validation_labels_cleaned, test_labels_cleaned = map(
    clean_labels, (validation_labels, test_labels)
)

## Building the character vocabulary
Keras provides different preprocessing layers to deal with different modalities of data; the Keras guide on working with preprocessing layers gives a comprehensive introduction. Our example involves preprocessing labels at the character level. This means that if there are two labels, e.g. "cat" and "dog", then our character vocabulary should be {a, c, d, g, o, t} (without any special tokens). We use the `StringLookup` layer for this purpose.

In [None]:
# The model can't work directly with characters.
# It needs numbers (encoding) during training,
# and then numbers are converted back to characters (decoding) for the result.

# Module-level shorthand, reused by the dataset pipeline for parallel map
# calls and prefetching.
AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers, using the sorted training-set characters as
# the vocabulary.
# NOTE(review): mask_token=None presumably means no index is reserved for a
# mask token — confirm against the StringLookup documentation.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters. The vocabulary is taken from
# char_to_num so that both directions share the same index assignment.
num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

## Resizing images without distortion
Instead of square images, many OCR models work with rectangular images. This will become clearer in a moment when we visualize a few samples from the dataset. While aspect-unaware resizing of square images does not introduce a significant amount of distortion, this is not the case for rectangular images. However, resizing images to a uniform size is a requirement for mini-batching, so we need to perform our resizing such that the following criteria are met:

* Aspect ratio is preserved.
* Content of the images is not affected.

In [20]:
def distortion_free_resize(image, img_size):
    """Resize an image to a fixed size without stretching it.

    The image is first resized with its aspect ratio preserved, then padded
    (using ``tf.pad``'s default fill value) on the height and width axes up to
    the requested size. Finally the first two axes are swapped and the result
    is flipped left-right.

    Args:
        image: rank-3 image tensor (the padding spec covers three axes, the
            last of which is left unpadded).
        img_size: ``(width, height)`` pair giving the target size.

    Returns:
        The resized, padded, transposed and flipped image tensor.
    """
    target_w, target_h = img_size
    resized = tf.image.resize(
        image, size=(target_h, target_w), preserve_aspect_ratio=True
    )

    # How many rows / columns are still missing after the aspect-preserving resize.
    resized_shape = ops.shape(resized)
    extra_rows = target_h - resized_shape[0]
    extra_cols = target_w - resized_shape[1]

    # Split the padding between both sides; when the amount is odd, the extra
    # row/column goes to the top/left side.
    rows_bottom = extra_rows // 2
    rows_top = extra_rows - rows_bottom
    cols_right = extra_cols // 2
    cols_left = extra_cols - cols_right

    padded = tf.pad(
        resized,
        paddings=[
            [rows_top, rows_bottom],
            [cols_left, cols_right],
            [0, 0],
        ],
    )

    # Swap the height and width axes, then mirror the result.
    return tf.image.flip_left_right(ops.transpose(padded, (1, 0, 2)))

## Putting the utilities together

In [None]:
# Number of samples per batch produced by prepare_dataset.
batch_size = 64
# Value used by vectorize_label to right-pad encoded labels up to max_len.
padding_token = 99
# Target image size; passed as (image_width, image_height) to preprocess_image.
image_width = 128
image_height = 32


def preprocess_image(image_path, img_size=(image_width, image_height)):
    """Load a PNG file and turn it into a fixed-size float image.

    Args:
        image_path: path of the PNG file to read.
        img_size: ``(width, height)`` target size forwarded to
            ``distortion_free_resize``.

    Returns:
        The resized image cast to float32 and divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    # Decode with a single channel requested.
    decoded = tf.image.decode_png(raw_bytes, 1)
    fitted = distortion_free_resize(decoded, img_size)
    return ops.cast(fitted, tf.float32) / 255.0


def vectorize_label(label):
    """Encode a transcription as integers, right-padded to ``max_len``.

    Args:
        label: the transcription string (UTF-8).

    Returns:
        The ``char_to_num`` encoding of the label's characters, padded at the
        end with ``padding_token`` until its length is ``max_len``.
    """
    symbols = tf.strings.unicode_split(label, input_encoding="UTF-8")
    encoded = char_to_num(symbols)
    # Number of trailing padding entries needed to reach max_len.
    missing = max_len - ops.shape(encoded)[0]
    return tf.pad(encoded, paddings=[[0, missing]], constant_values=padding_token)


def process_images_labels(image_path, label):
    """Build one dataset element from an image path and its transcription.

    Args:
        image_path: path of the image file.
        label: transcription string for that image.

    Returns:
        A dict with the preprocessed image under ``"image"`` and the
        vectorized label under ``"label"``.
    """
    return {
        "image": preprocess_image(image_path),
        "label": vectorize_label(label),
    }


def prepare_dataset(image_paths, labels):
    """Create the batched input pipeline for a set of images and labels.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of transcriptions aligned with ``image_paths``.

    Returns:
        A ``tf.data.Dataset`` that maps every (path, label) pair through
        ``process_images_labels``, then batches by ``batch_size``, caches and
        prefetches.
    """
    pairs = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    elements = pairs.map(process_images_labels, num_parallel_calls=AUTOTUNE)
    batched = elements.batch(batch_size)
    return batched.cache().prefetch(AUTOTUNE)