In [1]:

from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras
from pathlib import Path

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import cv2 as cv
import pandas as pd
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Show the TensorFlow version this notebook is running against.
print(tf.__version__)

2.10.0


In [3]:
# Training labels: drop the 'Unnamed: 0' column and order the rows by the 'Image' column.
short_training_set = pd.read_csv('TrainLables.csv').drop(['Unnamed: 0'],axis=1).sort_values(by=['Image'])

# The sorted label list repeated six times, back to back.
# NOTE(review): presumably "Processed_Images" holds six variants of every source image — confirm.
ll = 6*list(short_training_set['Text'])

# File names in the processed-image directory, in sorted order.
list_paths = sorted(os.listdir("Processed_Images"))

# Prefix every file name with its directory to get a usable relative path.
full_path = ["Processed_Images/"+i for i in list_paths]

# Pair the k-th sorted path with the k-th repeated label.
# NOTE(review): the pairing is purely positional; it is only correct if the sorted file
# names cycle through the source images in the same order as the sorted labels — verify.
long_training_set = pd.DataFrame({'Image':full_path,"Text":ll})

# Test labels, prepared the same way as the training labels.
test_set = pd.read_csv('TestLabels.csv').drop(['Unnamed: 0'],axis=1).sort_values(by=['Image'])

# File names in the processed test-image directory, in sorted order.
list_paths_test = sorted(os.listdir("Processed_Test_Images"))

full_path_test = ["Processed_Test_Images/"+i for i in list_paths_test]

# Overwrite the 'Image' column with the sorted file paths.
# NOTE(review): positional again — assumes sorted file names line up with rows sorted by 'Image'.
test_set['Image'] = full_path_test

# Unique characters that occur in the training labels, as a sorted list.

characters = set(char for label in short_training_set['Text'] for char in label)
characters = sorted(list(characters))

# Length (in characters) of the longest training label; test labels are not considered here.
max_length = max([len(label) for label in short_training_set['Text']])
print('The maximum line length is {} characters'.format(max_length))

The maximum line length is 128 characters


In [4]:
# Report the number of rows in each of the three frames.
print(f"There are {len(short_training_set)} image in the short trainging set")
print(f"There are {len(long_training_set)} image in the long trainging set")
print(f"There are {len(test_set)} image in the test set")

There are 1400 image in the short trainging set
There are 8400 image in the long trainging set
There are 233 image in the test set


In [5]:
def train_val(df):
    """Split a labelled frame into training and validation arrays.

    Args:
        df: DataFrame with an "Image" column (inputs) and a "Text" column (targets).

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)`` of NumPy arrays. 30% of the
        rows go to validation; the split uses ``random_state=42``.
    """
    # NOTE(review): rows are split individually, so if several rows are variants of the
    # same source image they can end up on both sides of the split — confirm this is intended.
    parts = train_test_split(df["Image"], df['Text'], test_size=0.3, random_state=42)

    # train_test_split yields X_train, X_val, y_train, y_val in that order.
    return tuple(np.array(list(part)) for part in parts)

# Train/validation arrays come from the long (six-fold) training frame.
X_train_path, X_val_Path, y_train, y_val = train_val(long_training_set)

# Test paths and labels as plain Python lists, in the frame's row order.
X_test_path = [*test_set['Image']]
y_test = [*test_set['Text']]

In [6]:
# Report the size of each split produced above.
print(f"There are {len(X_train_path)} image in the trainging set")
print(f"There are {len(X_val_Path)} image in the validation set")
print(f"There are {len(test_set)} image in the test set")

There are 5880 image in the trainging set
There are 2520 image in the validation set
There are 233 image in the test set


In [7]:
# Mapping characters to integers
char_to_num = layers.StringLookup(
    vocabulary=list(characters), mask_token=None
)

# Mapping integers back to original characters
num_to_char = layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

In [8]:
def distortion_free_resize(image, img_size):
    """Resize an image to a fixed canvas without changing its aspect ratio.

    The image is scaled to fit inside the target size, padded up to exactly
    that size, then transposed and flipped left-to-right.

    Args:
        image: image tensor with (height, width, channels) axes.
        img_size: ``(width, height)`` of the target canvas.

    Returns:
        Tensor whose first axis is the canvas width and second axis the canvas
        height (the spatial axes are swapped by the final transpose).
    """
    target_w, target_h = img_size
    image = tf.image.resize(image, size=(target_h, target_w), preserve_aspect_ratio=True)

    # Space left over on each axis after the aspect-preserving resize.
    resized_shape = tf.shape(image)
    pad_height = target_h - resized_shape[0]
    pad_width = target_w - resized_shape[1]

    # Split each amount between both sides; when it is odd the extra pixel
    # goes to the top / left side.
    pad_height_bottom = pad_height // 2
    pad_height_top = pad_height - pad_height_bottom
    pad_width_right = pad_width // 2
    pad_width_left = pad_width - pad_width_right

    image = tf.pad(
        image,
        paddings=[
            [pad_height_top, pad_height_bottom],
            [pad_width_left, pad_width_right],
            [0, 0],
        ],
    )

    # Swap the two spatial axes, then mirror along the (new) second axis.
    return tf.image.flip_left_right(tf.transpose(image, perm=[1, 0, 2]))

In [9]:
# Number of samples per batch produced by prepare_dataset.
batch_size = 64
# Value vectorize_label appends to bring every label to a fixed length.
# NOTE(review): must not collide with a real character id from char_to_num — confirm
# the vocabulary is small enough that id 99 is never assigned to a character.
padding_token = 99
# Target canvas size (pixels) that preprocess_image uses as its default img_size.
image_width = 2882
image_height = 46


def preprocess_image(image_path, img_size=(image_width, image_height)):
    """Load a PNG file and turn it into a normalised, fixed-size tensor.

    Args:
        image_path: path of the PNG file to read.
        img_size: ``(width, height)`` passed on to ``distortion_free_resize``.

    Returns:
        float32 tensor: the resized/padded image with its values divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_png(raw_bytes, channels=1)  # single channel (grayscale)
    # Fit onto the fixed canvas, keeping the aspect ratio through padding.
    canvas = distortion_free_resize(pixels, img_size)
    return tf.cast(canvas, tf.float32) / 255.0


def vectorize_label(label):
    """Encode a text label as a fixed-length vector of character ids.

    The label is split into Unicode characters, each character is mapped to
    its integer id with ``char_to_num``, and the sequence is right-padded with
    ``padding_token`` up to ``max_length`` entries.

    Args:
        label: scalar string tensor holding the transcription.

    Returns:
        1-D integer tensor of length ``max_length``.
    """
    char_ids = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    # Pad to the dataset-derived max_length (the same bound the decoding helpers
    # slice to) rather than a hard-coded width, so the two cannot drift apart.
    # NOTE(review): max_length is computed from the training labels only; a longer
    # label would make pad_amount negative — confirm no test label exceeds it.
    pad_amount = max_length - tf.shape(char_ids)[0]
    return tf.pad(char_ids, paddings=[[0, pad_amount]], constant_values=padding_token)
    


def process_images_labels(image_path, label):
    """Turn one (path, text) pair into the feature dict the model consumes.

    Returns:
        Dict with the preprocessed image under "image" and the encoded,
        padded label under "label".
    """
    return {
        "image": preprocess_image(image_path),
        "label": vectorize_label(label),
    }


def prepare_dataset(image_paths, labels):
    """Build a batched, cached and prefetched ``tf.data`` pipeline.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of transcriptions, aligned with ``image_paths``.

    Returns:
        Dataset yielding dicts with "image" and "label" entries, ``batch_size``
        samples at a time. Elements keep the order of the inputs; no shuffling
        is applied here.
    """
    samples = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    samples = samples.map(process_images_labels, num_parallel_calls=tf.data.AUTOTUNE)
    # Batch first, then cache whole batches and overlap loading with consumption.
    batches = samples.batch(batch_size)
    return batches.cache().prefetch(tf.data.AUTOTUNE)

In [10]:
# One pipeline per split, built in train / validation / test order.
train_ds, validation_ds, test_ds = (
    prepare_dataset(paths, texts)
    for paths, texts in (
        (X_train_path, y_train),
        (X_val_Path, y_val),
        (X_test_path, y_test),
    )
)

# Creating the model class

In [None]:
class CTCLayer(keras.layers.Layer):
    """Pass-through layer that registers the CTC loss on the model.

    ``call`` computes ``keras.backend.ctc_batch_cost`` between the labels and
    the predictions, adds it with ``add_loss`` and returns the predictions
    unchanged.
    """

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Attach the CTC loss for this batch and return ``y_pred`` as is.

        Both sequence lengths are taken from the tensor shapes, so every
        sample is given the full width of ``y_pred`` (axis 1) as input length
        and the full width of ``y_true`` (axis 1) as label length.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # (batch, 1) column of ones used to repeat each scalar length per sample.
        per_sample = tf.ones(shape=(batch_len, 1), dtype="int64")
        input_length = time_steps * per_sample
        label_length = label_steps * per_sample

        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))

        # The layer's output is just the predictions it was given.
        return y_pred


def build_model():
    """Build and compile the OCR network.

    The network takes two inputs, "image" and "label", and runs the image
    through one convolution block, a dense projection, a bidirectional LSTM
    and a softmax layer ("dense3"). A ``CTCLayer`` on top receives the labels
    and the softmax output. The model is compiled with Adam on an
    exponentially decaying learning rate and without a ``loss`` argument; the
    loss is the one ``CTCLayer`` registers through ``add_loss``.

    Returns:
        The compiled ``keras.models.Model`` named "Arabic_OCR".
    """
    # Inputs to the model: the preprocessed image and its encoded label.
    input_img = keras.Input(shape=(image_width, image_height, 1), name="image")
    labels = keras.layers.Input(name="label", shape=(None,))

    # First conv block.
    x = keras.layers.Conv2D(
        64,
        (3, 3),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)
    x = keras.layers.MaxPooling2D((2, 2), name="pool1")(x)
    x= keras.layers.BatchNormalization()(x)
    # After the 2x2 pooling both spatial sizes are halved; fold the halved height
    # and the 64 channels into one feature axis so axis 1 becomes the sequence axis.
    new_shape = ((image_width // 2), (image_height // 2) * 64)
    x = keras.layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = keras.layers.Dense(16, activation="relu", name="dense2")(x)
    x= keras.layers.BatchNormalization()(x)
    # Recurrent part: one bidirectional LSTM that returns the full sequence.
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, dropout=0.35))(x)
    # Per-step class probabilities: vocabulary size plus two extra units.
    # NOTE(review): presumably the extra units leave room for the CTC blank — confirm.
    x = keras.layers.Dense(
        len(char_to_num.get_vocabulary()) + 2, activation="softmax", name="dense3"
    )(x)
    # Registers the CTC loss and passes the softmax output through.
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model.
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="Arabic_OCR"
    )
    # Optimizer: Adam with an exponentially decaying learning rate.
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0001,
    decay_steps=10000,
    decay_rate=0.9)
    opt = keras.optimizers.Adam(learning_rate=lr_schedule)
    # Compile the model and return. No loss is passed here because CTCLayer adds it.
    model.compile(optimizer=opt)
    return model

In [None]:
# Build a model instance and print its layer-by-layer summary.
model = build_model()
model.summary()

In [None]:
# Materialise the validation batches once; the edit-distance callback reads these lists.
validation_images, validation_labels = [], []

for batch in validation_ds:
    validation_images += [batch["image"]]
    validation_labels += [batch["label"]]

In [None]:
def _to_sparse_without(dense, ignore_value):
    """Build a SparseTensor from ``dense`` that omits entries equal to ``ignore_value``."""
    dense = tf.cast(dense, tf.int64)
    # tf.where returns the kept positions in row-major order, as SparseTensor expects.
    kept = tf.where(tf.not_equal(dense, ignore_value))
    return tf.SparseTensor(
        indices=kept,
        values=tf.gather_nd(dense, kept),
        dense_shape=tf.shape(dense, out_type=tf.int64),
    )


def calculate_edit_distance(labels, predictions):
    """Mean (unnormalised) edit distance between labels and decoded predictions.

    Args:
        labels: batch of encoded labels, right-padded with ``padding_token``.
        predictions: batch of per-step class probabilities from the
            prediction model.

    Returns:
        Scalar tensor: the edit distance averaged over the batch.
    """
    # Drop the label padding explicitly. tf.sparse.from_dense only removes zeros,
    # so it would keep every padding_token as if it were a character (and would
    # silently drop genuine id 0 entries).
    sparse_labels = _to_sparse_without(labels, padding_token)

    # Every sample uses the full number of prediction steps as its input length.
    input_len = np.ones(predictions.shape[0]) * predictions.shape[1]

    predictions_decoded = keras.backend.ctc_decode(
        predictions, input_length=input_len, greedy=False, beam_width=100,
    )[0][0][:, :max_length]
    # ctc_decode fills the unused tail of each decoded row with -1; remove it too,
    # otherwise padding on both sides is counted as mismatching characters.
    sparse_predictions = _to_sparse_without(predictions_decoded, -1)

    # Per-sample edit distances, then their mean.
    edit_distances = tf.edit_distance(
        sparse_predictions, sparse_labels, normalize=False
    )
    return tf.reduce_mean(edit_distances)


class EditDistanceCallback(keras.callbacks.Callback):
    """Print the mean validation edit distance at the end of every epoch.

    The callback reads the module-level ``validation_images`` and
    ``validation_labels`` lists, one entry per validation batch.
    """

    def __init__(self, pred_model):
        super().__init__()
        # Model used to turn image batches into per-step predictions.
        self.prediction_model = pred_model

    def on_epoch_end(self, epoch, logs=None):
        """Score every stored validation batch and print the average."""
        edit_distances = [
            calculate_edit_distance(
                labels, self.prediction_model.predict(images)
            ).numpy()
            for images, labels in zip(validation_images, validation_labels)
        ]

        print(
            f"Mean edit distance for epoch {epoch + 1}: {np.mean(edit_distances):.4f}"
        )

In [None]:
epochs = 30  # To get good results this should be at least 50.

# Fresh model for this training run.
model = build_model()
# Inference view of the same network: image in, softmax output of "dense3" out
# (it shares its layers, and therefore its weights, with `model`).
prediction_model = keras.models.Model(
    model.get_layer(name="image").input, model.get_layer(name="dense3").output
)
edit_distance_callback = EditDistanceCallback(prediction_model)

# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=False: the weights of the last epoch run are kept.
stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
)

# Train the model. The early-stopping callback is passed to fit() here; building
# it without listing it in `callbacks` has no effect on training.
# NOTE: Keras ignores `shuffle` when the input is a tf.data.Dataset, so batches
# are consumed in the order prepare_dataset produced them.
history = model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=epochs,
    callbacks=[edit_distance_callback, stopping],
    shuffle=True,
)

In [None]:
def decode_batch_predictions(pred):
    """Decode a batch of per-step predictions into text.

    Args:
        pred: batch of per-step class probabilities from the prediction model.

    Returns:
        List with one decoded string per sample. The list is also printed.
    """
    # Every sample uses the full number of prediction steps as its input length.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]

    # Greedy search. For complex tasks, beam search can be used instead.
    decoded = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    decoded = decoded[:, :max_length]

    output_text = []
    for sequence in decoded:
        # Keep only the positions that are not the -1 filler, then map ids to characters.
        keep = tf.where(tf.math.not_equal(sequence, -1))
        chars = num_to_char(tf.gather(sequence, keep))
        output_text.append(tf.strings.reduce_join(chars).numpy().decode("utf-8"))

    print(output_text)
    return output_text

In [None]:
#  Show the first 16 images of the first test batch with their predicted text.
for batch in test_ds.take(1):
    batch_images = batch["image"]
    _, ax = plt.subplots(4, 4, figsize=(20, 8))

    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    for i in range(16):
        # Undo the transpose + flip applied in distortion_free_resize (flip first,
        # then transpose), rescale to 0-255 and drop the channel axis.
        img = tf.transpose(tf.image.flip_left_right(batch_images[i]), perm=[1, 0, 2])
        img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8)[:, :, 0]

        title = f"Prediction: {pred_texts[i]}"
        print(pred_texts[i]=='')  # True when the prediction decoded to an empty string

        panel = ax[i // 4, i % 4]
        panel.imshow(img, cmap="gray")
        panel.set_title(title)
        panel.axis("off")

plt.show()

In [None]:
# Validation and training loss, leaving out the first two epochs and the last one.
plt.plot(history.history['val_loss'][2:-1])
plt.plot(history.history['loss'][2:-1])

In [None]:
# Validation and training loss, leaving out the first epoch and the last one.
plt.plot(history.history['val_loss'][1:-1])
plt.plot(history.history['loss'][1:-1])

In [None]:
# Validation loss (first curve) and training loss (second curve) over all recorded epochs.
plt.plot(history.history['val_loss'])
plt.plot(history.history['loss'])