In [1]:
import tensorflow as tf
# Enable memory growth on every visible GPU. This is best effort: a failure is
# reported but does not stop the notebook (e.g. machines without a usable GPU setup).
try:
    for gpu in tf.config.experimental.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as error:
    # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate
    # and tells the reader why memory growth was not enabled.
    print(f"Could not enable GPU memory growth: {error}")

from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

from model import train_model
from mconfigs import ModelConfigs

import os
from tqdm import tqdm



This code is designed to prepare a dataset for a machine-learning model. The dataset includes images of handwritten text along with the corresponding transcriptions. Initially, the code specifies the locations of the text file and the folder containing the images. It then initializes three variables: Dataset, vocab, and max_len.

Next, the code opens and reads all the lines from the text file. For each line, it checks if the line starts with "#" or if the third field is "err"; if either condition is met, the line is skipped. Otherwise, it uses the first field of the line (the sample ID) to construct the folder path where the image is stored: the first three characters of the ID give the top-level folder, and the first two hyphen-separated parts of the ID give the subfolder. It also generates a file name by appending ".png" to the ID. The text written in the image, which is the last field in the line, is also extracted; the newline character at the end is removed and the "|" word separators are replaced with spaces. If the resulting image file does not exist on disk, the line is reported and skipped.

Finally, the code adds the image path and the corresponding text label to the Dataset. It also updates the vocab with all the characters in the label and keeps track of the length of the longest label encountered so far.

In [2]:
sentences_txt_path = os.path.join("Datasets", "IAM_Sentences", "ascii", "sentences.txt")
sentences_folder_path = os.path.join("Datasets", "IAM_Sentences", "sentences")

# dataset: [image_path, label] pairs; vocab: every character seen in a label;
# max_len: length of the longest label
dataset, vocab, max_len = [], set(), 0

# Read the annotation file inside a context manager so the handle is always closed
with open(sentences_txt_path, "r") as annotations_file:
    words = annotations_file.readlines()

for line in tqdm(words):
    # Lines starting with "#" carry no sample
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    # Skip blank/truncated lines (fewer than three fields) instead of raising
    # IndexError, and skip entries whose third field is "err"
    if len(line_split) < 3 or line_split[2] == "err":
        continue

    # The first field identifies the sample and determines where its image lives
    sample_id = line_split[0]
    folder1 = sample_id[:3]
    folder2 = "-".join(sample_id.split("-")[:2])
    file_name = sample_id + ".png"

    # The label is the last field; "|" is replaced with " " in the label
    label = line_split[-1].rstrip("\n").replace("|", " ")

    rel_path = os.path.join(sentences_folder_path, folder1, folder2, file_name)
    if not os.path.exists(rel_path):
        print(f"File not found: {rel_path}")
        continue

    dataset.append([rel_path, label])
    vocab.update(label)
    max_len = max(max_len, len(label))

100%|██████████| 16777/16777 [00:02<00:00, 7036.54it/s]



Next, the dataset must be preprocessed and prepared for use in the model. This involves tasks such as reading images, converting the images of handwritten sentences into a suitable format, indexing labels, padding labels, and dividing the dataset into training and validation sets. Similarly to before, I am performing these steps using a custom DataProvider object.

In [3]:
# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

# Save vocab and maximum text length to configs.
# `vocab` is a set, and the iteration order of a set of strings can differ
# between interpreter runs; sorting keeps the character order (and therefore
# the label indices derived from it) reproducible across kernel restarts.
configs.vocab = "".join(sorted(vocab))
configs.max_text_length = max_len
configs.save()

In [4]:
# Build the pipeline pieces first, then hand them to the DataProvider
image_preprocessors = [ImageReader(CVImage)]
sample_transformers = [
    # Resize to the configured width/height while keeping the aspect ratio
    ImageResizer(configs.width, configs.height, keep_aspect_ratio=True),
    # Index labels against the saved vocabulary
    LabelIndexer(configs.vocab),
    # Pad labels to the longest label length, using len(configs.vocab) as the padding value
    LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
]

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=image_preprocessors,
    transformers=sample_transformers,
)

In [5]:
# Split the data provider into the two providers used for training and validation below
TRAIN_SPLIT = 0.9  # value passed as `split` to DataProvider.split
train_data_provider, val_data_provider = data_provider.split(split=TRAIN_SPLIT)

Additionally, when training machine learning models, it's crucial to apply data augmentation techniques to achieve better training results. I am using RandomBrightness, RandomErodeDilate, and RandomSharpen augmentors on the training data provider object.

In [6]:
# Augment the training data only: random brightness, erode/dilate and sharpen
training_augmentors = [RandomBrightness(), RandomErodeDilate(), RandomSharpen()]
train_data_provider.augmentors = training_augmentors


Defining Model Architecture:

Once the Dataset is prepared, the next step is to design and train a machine-learning model using TensorFlow and the CTC loss function. This involves selecting an appropriate model architecture and choosing hyperparameters such as the learning rate and batch size. Here are a few recommendations for creating the model:

Define the Input and Output Layers:
The input layer of the model should be a 4D tensor with dimensions [batch size, height, width, channels], where the batch size is the number of images in a batch, height and width are the dimensions of the images, and channels represent the number of color channels in the images (1 for grayscale, 3 for RGB; this notebook uses 3).

Add the CNN Layers:
The CNN layers of the model should be responsible for extracting features from the images. A typical architecture for the CNN layers includes a combination of convolutional, pooling, and fully-connected (dense) layers.

Add the RNN Layers:
The RNN layers of the model should be responsible for processing the sequence of features and predicting the characters in the text. A common type of RNN to use for this task is a long short-term memory (LSTM) network.

Compile the Model:
Once the CNN and RNN layers have been defined, the model can be compiled using the CTC loss function and an optimizer such as Adam.

By following these steps, you can design a robust machine learning model for recognizing handwritten text from images.

Once we have our model, it can be trained on the training set using an optimization algorithm such as Adam. During training, the model will make predictions on the input data. Utilizing the CTC loss function, the model will update to reduce the disparity between the estimated values and the actual labels.

We will use the following code to create the model, compile it, define the optimizer, loss, metrics, and callbacks, and initiate the training process:

In [7]:
# Show the TensorFlow version followed by the version of the bundled Keras API
tensorflow_version = tf.__version__
print(tensorflow_version)
keras_version = tf.keras.__version__
print(keras_version)

2.16.1
3.3.3


In [8]:
# Build the TensorFlow model: input is (height, width, 3), output_dim is the vocabulary size
image_shape = (configs.height, configs.width, 3)
vocab_size = len(configs.vocab)
model = train_model(input_dim=image_shape, output_dim=vocab_size)

The following Variables were used a Lambda layer's call (tf.nn.convolution), but
are not present in its tracked objects:
  <tf.Variable 'conv2d/kernel:0' shape=(3, 3, 3, 32) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.nn.convolution_1), but
are not present in its tracked objects:
  <tf.Variable 'conv2d_1/kernel:0' shape=(3, 3, 32, 32) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.nn.convolution_2), but
are not present in its tracked objects:
  <tf.Variable 'conv2d_2/kernel:0' shape=(1, 1, 3, 32) dtype=float32>
It is possible that this is in



The following Variables were used a Lambda layer's call (tf.nn.convolution_8), but
are not present in its tracked objects:
  <tf.Variable 'conv2d_8/kernel:0' shape=(3, 3, 32, 64) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.nn.convolution_9), but
are not present in its tracked objects:
  <tf.Variable 'conv2d_9/kernel:0' shape=(3, 3, 64, 64) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.nn.convolution_10), but
are not present in its tracked objects:
  <tf.Variable 'conv2d_10/kernel:0' shape=(1, 1, 32, 64) dtype=float32>
It is possible that th

AttributeError: module 'tensorflow.python.keras.layers' has no attribute 'LSTM'

In [None]:
# Assemble optimizer, loss and metrics, then compile the model and print its summary
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=configs.learning_rate)
ctc_loss = CTCloss()
error_rate_metrics = [
    CERMetric(vocabulary=configs.vocab),
    WERMetric(vocabulary=configs.vocab),
]

model.compile(optimizer=adam_optimizer, loss=ctc_loss, metrics=error_rate_metrics)
model.summary(line_length=110)

In [None]:
# Define callbacks.
# The Keras callbacks are taken from the public `tf.keras.callbacks` namespace, the
# same `tf.keras` API the optimizer is created from, instead of the private legacy
# `tensorflow.python.keras` package; mixing the two Keras implementations is a
# known source of breakage.
keras_callbacks = tf.keras.callbacks
monitored_metric = "val_CER"  # every monitoring callback below watches this metric

# Stop when the monitored metric has not improved (decreased) for 20 epochs
earlystopper = keras_callbacks.EarlyStopping(
    monitor=monitored_metric, patience=20, verbose=1, mode="min"
)
# Keep only the best model seen so far
checkpoint = keras_callbacks.ModelCheckpoint(
    f"{configs.model_path}/model.h5.keras",
    monitor=monitored_metric,
    verbose=1,
    save_best_only=True,
    mode="min",
)
trainLogger = TrainLogger(configs.model_path)
tb_callback = keras_callbacks.TensorBoard(f"{configs.model_path}/logs", update_freq=1)
# Multiply the learning rate by 0.9 after 5 epochs without improvement
reduceLROnPlat = keras_callbacks.ReduceLROnPlateau(
    monitor=monitored_metric, factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto"
)
# NOTE(review): this path ("model.h5") differs from the checkpoint path
# ("model.h5.keras") — confirm Model2onnx does not need the checkpoint file itself.
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

In [None]:
# Train the model
import inspect

# `workers` exists only in older Keras `Model.fit` signatures; newer Keras
# releases do not accept it and raise a TypeError. Pass it only when the
# installed `fit` actually declares the parameter.
fit_options = {}
if "workers" in inspect.signature(model.fit).parameters:
    fit_options["workers"] = configs.train_workers

model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    **fit_options,
)