# Preprocessing

## Create a dataset with the images path and the corresponding labels

In [1]:
! pip install pandas tensorflow



In [2]:
! pip install imgaug



In [2]:
import pandas as pd
import os

# Relative path to the dataset ground-truth file
# (records of image name and label text, parsed by get_dataframe)
iam_lines_path = os.path.join('../IAM-data', 'iam_lines_gt.txt')

# Relative path to save the preprocessed data
# NOTE(review): not read anywhere in the visible cells - confirm it is still needed
preprocessed_data_path = os.path.join('../IAM-data', 'preprocessed_data')

In [3]:
def get_dataframe(iam_lines_path: str) -> pd.DataFrame:
    """
    Extract the image path and the label from the IAM lines file and
    create a pandas dataframe

    The file is read as a sequence of records: one line with the image
    path, one line with the label, then an empty line. If several
    non-empty lines follow an image path, the last one is used as label.

    Parameters
    ----------
    iam_lines_path : str
        Path to the IAM lines file

    Returns
    -------
    df : pandas dataframe
        Dataframe with the columns 'image_path' and 'label',
        one row per record, in file order
    """
    data = []
    image_path = None
    label = None

    with open(iam_lines_path, 'r') as f:
        lines = f.readlines()

    # The extra '' acts as a final record separator, so the last record is
    # kept even when the file does not end with an empty line.
    for line in lines + ['']:
        line = line.strip()
        if line:
            if not image_path:
                image_path = line
            else:
                label = line
        elif image_path and label:
            # an empty line closes the current record
            data.append((image_path, label))
            image_path = None
            label = None

    return pd.DataFrame(data, columns=['image_path', 'label'])


In [4]:
# Parse the ground-truth file and preview the first rows
df = get_dataframe(iam_lines_path)
df.head()

Unnamed: 0,image_path,label
0,a03-017-07.png,into the pro-communist north and the
1,a03-017-05.png,"to 1958 kept the kingdom in peace, though"
2,a03-017-08.png,pro-western centre and south.
3,a03-017-02.png,in Phnom Penh indicate that he still regards
4,a03-017-06.png,at the cost of virtual partition of the country


## Crop the images to the specified size
- read the images from the specified filepath into a list
- pad them (centered, fill value 255) to the specified size or to the maximum height and width of the images
- convert the list to a numpy array

In [5]:
import imgaug.augmenters as iaa
import imgaug as ia
import imageio.v2 as imageio
import numpy as np
from typing import List

In [6]:
def read_images(filepath: str) -> List[np.ndarray]:
    """
    Read all images found in the specified directory

    Files that cannot be read are reported on stdout and skipped.

    Parameters
    ----------
    filepath : str
        Path to the directory containing the images

    Returns
    -------
    images : list
        The images as numpy arrays, in the order returned by os.listdir
    """
    images = []
    for file_name in os.listdir(filepath):
        try:
            image = imageio.imread(os.path.join(filepath, file_name))
        except Exception:
            # Best effort: report the unreadable file and keep going.
            # Exception (instead of a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            print('Error reading image: {}'.format(file_name))
        else:
            images.append(image)

    return images

In [7]:
# Read every image from the dataset folder and inspect the first one
# NOTE(review): this path has no '../' prefix, unlike the other IAM-data
# paths in this notebook - confirm the working directory
images = read_images('IAM-data/img')
print(type(images[0]))
print(images[0].shape)
print(images[0].dtype)


<class 'numpy.ndarray'>
(149, 1824)
uint8


In [8]:
# Find the maximum width and height of the images
# (element-wise maximum over all image shapes)
max_shape = np.max([list(img.shape) for img in images], axis=0)

# index 0 is the first array axis, i.e. the number of rows (image height)
print(max_shape[0])

342


In [9]:
def crop_images(images: List[np.ndarray],
                crop_size: tuple[int, int] = None) -> List[np.ndarray]:
    """
    Pad the images to a common size.
    The padding is centered and uses the value 255, so the content of
    each image keeps its aspect ratio.

    Parameters
    ----------
    images : list
        List of images as numpy arrays (first axis: height, second: width)
    crop_size : tuple, optional
        Target size as (height, width).
        If None, the maximum height and the maximum width found among
        the images are used

    Returns
    -------
    cropped_images : list
        List of padded images; empty if no images were given
    """
    if not images:
        return []

    # Use the maximum height and width of the images if no size is specified
    if crop_size is None:
        crop_size = np.max([list(img.shape[:2]) for img in images], axis=0)

    # numpy shapes are (rows, cols), i.e. (height, width)
    h, w = int(crop_size[0]), int(crop_size[1])

    # NOTE(review): PadToFixedSize only pads; images larger than the target
    # are presumably returned unchanged - confirm against the imgaug docs
    pad = iaa.Sequential([iaa.PadToFixedSize(
        width=w, height=h, position="center", pad_cval=255)])

    return [pad.augment_image(img) for img in images]

In [1]:
# Bring all images to one common shape and check the first result
cropped_images = crop_images(images)

print(cropped_images[0].shape)

NameError: name 'crop_images' is not defined

In [None]:
def list_to_ndarray(images: List[np.ndarray]) -> np.ndarray:
    """
    Stack a list of images into a single numpy array
    All images are expected to share one shape (see crop_images)

    Parameters
    ----------
    images : list
        The images to stack

    Returns
    -------
    images : numpy array
        Array built from the list, with the list index as first axis
    """
    return np.array(images)

In [None]:
# Stack the equally sized images into one array and check its shape
images = list_to_ndarray(cropped_images)

print(images.shape)

# Use TensorFlow

In [2]:
import tensorflow as tf
import os
import numpy as np
from tensorflow import keras

In [18]:
def load_images(img_path: str, max_images: int = 20) -> list[tf.Tensor]:
    """
    Load and decode the images from the specified directory
    :param img_path: str
        Path to the directory containing the images
    :param max_images: int
        Maximum number of images to load; pass None to load every file.
        Defaults to 20, the limit that was previously hard-coded
    :return: list[tf.Tensor]
        Images as a list of float32 tensors with one channel,
        scaled by 1/255
    """
    images = []

    # NOTE(review): os.listdir gives no ordering guarantee - callers that
    # pair these images with labels should align them by file name
    for file in os.listdir(img_path):
        if max_images is not None and len(images) >= max_images:
            break
        file_path = os.path.join(img_path, file)
        image = tf.io.read_file(file_path)
        image = tf.image.decode_png(image, channels=1)
        image = tf.cast(image, tf.float32) / 255.0
        images.append(image)

    return images


def resize_images(images: list[tf.Tensor],
                  size: tuple[int, int] = None) -> list[tf.Tensor]:
    """
    Resize the images to one common shape with tf.image.resize_with_pad
    :param images: list[tf.Tensor]
        List of images as a tf.Tensor
    :param size: tuple[int, int]
        Target size; index 0 is used as height and index 1 as width.
        If None, the element-wise maximum of all image shapes is used
    :return: list[tf.Tensor]
        Resized images, in the same order as the input
    """
    # fall back to the largest extent found among the images
    if size is None:
        size = np.max([list(img.shape) for img in images], axis=0)

    target_h, target_w = size[0], size[1]

    return [
        tf.image.resize_with_pad(img, target_height=target_h, target_width=target_w)
        for img in images
    ]


def load_labels(iam_lines_path: str,
                max_labels: int = 20) -> tuple[list[str], list[str], int]:
    """
    Load the image names and labels from the specified filepath
    :param iam_lines_path: str
        Path to the labels file. One line contains the image path,
        the next line the label, then an empty line
    :param max_labels: int
        Maximum number of records to load; pass None to load all of them.
        Defaults to 20
    :return: tuple[list[str], list[str], int]
        List of image names, list of labels (same length and order)
        Maximum length of the returned labels
    """
    image_names = []
    labels = []
    image_path = None
    label = None
    max_label_length = 0

    with open(iam_lines_path, 'r') as f:
        lines = f.readlines()

    # The extra '' acts as a final record separator, so the last record is
    # kept even when the file does not end with an empty line.
    for line in lines + ['']:
        if max_labels is not None and len(labels) >= max_labels:
            break
        line = line.strip()
        if line:
            if not image_path:
                image_path = line
            else:
                label = line
        elif image_path and label:
            # Name and label are stored together once the record is
            # complete, so both lists always have the same length.
            image_names.append(image_path)
            labels.append(label)
            max_label_length = max(max_label_length, len(label))
            image_path = None
            label = None

    return image_names, labels, max_label_length

#
# def encode_labels(labels: tf.constant, vocab: set[str],
#                   max_num_words: int) -> list[list[int]]:
#     """
#     Encode the labels as a tf.Tensor
#     :param labels: tf.constant
#         List of labels
#     :param vocab: set[str]
#         Vocabulary of the labels
#     :param max_num_words: int
#         Maximum number of words in the vocabulary
#     :return: list[list[int]]
#         Encoded labels as a list
#     """
#     layer = keras.layers.StringLookup(vocabulary=list(vocab))
#     encoded_labels = layer(labels)
#
#     return encoded_labels


In [None]:
def get_tf_dataset(images: list[tf.Tensor], images_names: list[str], labels: list[str])\
        -> tf.data.Dataset:
    """
    Placeholder for building a tf.data.Dataset object.
    Not implemented yet: nothing is built and None is returned
    :param images: list[tf.Tensor]
        List of images
    :param images_names: list[str]
        List of image names
    :param labels: list[str]
        List of labels
    :return: tf.data.Dataset
        Dataset object (intended; currently None)
    """
    # TODO: build the dataset from the arguments and return it
    return None


In [4]:
def get_labels_tf(labels: list[str], max_len: int) -> tf.data.Dataset:
    """
    Get the labels as a dataset of fixed-length character sequences
    :param labels: list[str]
        List of labels
    :param max_len: int
        Maximum label length; shorter labels are padded with spaces at
        the end, longer labels are cut off after max_len characters
    :return: tf.data.Dataset
        Dataset with one element per label, each element holding the
        max_len characters of the padded label
    """
    # Pad with plain string operations: pad_sequences is meant for numeric
    # sequences and does not accept a string pad value with its default dtype
    padded_labels = [label[:max_len].ljust(max_len) for label in labels]
    data = tf.data.Dataset.from_tensor_slices([list(label) for label in padded_labels])
    return data

In [19]:
# Load the images and bring them to one common size
image_tensors = load_images('../IAM-data/img')
print("Number of images found: ", len(image_tensors))
image_tensors = resize_images(image_tensors)
print("images resized")

# load_labels returns (image_names, labels, max_label_length); the vocabulary
# is not part of its result, so it is built here from the label characters
image_names, labels, max_label_len = load_labels('../IAM-data/iam_lines_gt.txt')
vocab = sorted({char for label in labels for char in label})
print("Number of labels", len(labels))
print("vocab: ", vocab)
print("max_label_len: ", max_label_len)
print("vocab size: ", len(vocab))


Number of images found:  20
images resized
Number of labels 20
vocab:  [' ', '"', '(', ')', ',', '-', '.', '1', '2', '3', '4', '5', '7', '8', '9', 'A', 'C', 'E', 'F', 'G', 'H', 'L', 'M', 'O', 'P', 'S', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
max_label_len:  54
vocab size:  49


In [21]:
# Import from the public Keras API; keras.src is an internal package whose
# layout is not guaranteed to stay stable between releases
from tensorflow.keras.layers import StringLookup

AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(vocab), mask_token=None)

# Mapping integers back to original characters.
# Reuses the vocabulary of char_to_num so both layers agree on the indices.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

In [22]:
# Split every label into characters and map them to integer ids, then
# right-pad each id sequence with the value 99 up to max_label_len
encoded_labels = char_to_num(tf.strings.unicode_split(labels, input_encoding="UTF-8"))
padded_labels = [
    tf.pad(
        encoded,
        paddings=[[0, max_label_len - tf.shape(encoded)[0]]],
        constant_values=99,
    )
    for encoded in encoded_labels
]

In [23]:
# create a tf.data.Dataset
# each element is an (image, padded label) pair taken from the two lists
data = tf.data.Dataset.from_tensor_slices((image_tensors, padded_labels))

In [27]:
# only 20 samples
data

<_TensorSliceDataset element_spec=(TensorSpec(shape=(224, 2077, 1), dtype=tf.float32, name=None), TensorSpec(shape=(54,), dtype=tf.int64, name=None))>

# TODO
- shuffle the data
- split the data
- data augmentation - make sure not to augment the test data
- if imgaug works with tensors - then reuse the code from task 1
- create batches
- model implementation + architecture (check paper)
- optuna implementation
- train the model

In [6]:
import tensorflow as tf
import numpy as np

# Minimal from_tensor_slices example with two lists of equal length:
# list1 holds ndarrays, list2 holds plain strings
list1 = [np.array([1, 2, 3]), np.array([4, 5, 6]), np.array([7, 8, 9])]
list2 = ["ana", "are", "mere"]

# Convert the pair of lists to a TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((list1, list2))

# Iterate over the elements in the dataset
# (each element pairs one entry of list1 with the matching entry of list2)
for element in dataset:
    print(element)


(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 2, 3])>, <tf.Tensor: shape=(), dtype=string, numpy=b'ana'>)
(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([4, 5, 6])>, <tf.Tensor: shape=(), dtype=string, numpy=b'are'>)
(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([7, 8, 9])>, <tf.Tensor: shape=(), dtype=string, numpy=b'mere'>)
