In [None]:
import os 
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path 
from collections import Counter
import string
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers

In [None]:

# Image geometry and batching used throughout the notebook
image_height = 50
image_width = 200
batch_size = 16

# Folder that holds the CAPTCHA images (labels are derived from the file names)
data_dir = Path("./captcha_images_v2/")

# Sorted string paths of every PNG found directly inside data_dir
image_paths = sorted(str(png_file) for png_file in data_dir.glob("*.png"))

# The label of an image is its file name up to the first ".png"
labels = [os.path.basename(path).split(".png")[0] for path in image_paths]

print("Number of images found: ", len(image_paths))
print("Number of labels found: ", len(labels))

# Candidate alphabet: lowercase ASCII letters followed by the ten digits
characters = string.ascii_lowercase + string.digits

print("symbols that could be detected : ", characters)
print("number of symbols : ", len(characters))

In [None]:
# Length of the longest label
max_length = max(map(len, labels))

# Vocabulary: every distinct character that occurs in any label, sorted
all_possible_characters = sorted({symbol for label in labels for symbol in label})

# Forward (character -> index) and reverse (index -> character) lookup tables
char_to_int = {symbol: index for index, symbol in enumerate(all_possible_characters)}
int_to_char = dict(enumerate(all_possible_characters))

# Define a function to preprocess an image
def preprocess_image(image_path):
    """Load one PNG from disk as a single-channel image of the model's input size.

    Args:
        image_path: Path of the PNG file to read.

    Returns:
        The decoded image, resized to (image_height, image_width) with one
        (grayscale) channel.
    """
    raw_bytes = tf.io.read_file(image_path)
    # channels=1 forces a grayscale decode
    grayscale = tf.image.decode_png(raw_bytes, channels=1)
    return tf.image.resize(grayscale, (image_height, image_width))

# Preprocess images and labels
images = [preprocess_image(image_path) for image_path in image_paths]
encoded_labels = [[char_to_int[char] for char in label] for label in labels]

# Create TensorFlow Datasets
dataset = tf.data.Dataset.from_tensor_slices((images, encoded_labels))

# Shuffle ONCE, in a fixed order. By default shuffle() reshuffles on every
# iteration; combined with take()/skip() below that would put different
# samples in the training and validation splits on each pass, so validation
# samples would leak into training. A fixed seed with
# reshuffle_each_iteration=False keeps the two splits disjoint and reproducible.
dataset = dataset.shuffle(
    buffer_size=len(images),
    seed=42,
    reshuffle_each_iteration=False,
)

# Split the dataset into training and validation sets (adjust as needed)
train_size = int(0.8 * len(image_paths))
train_dataset = (
    dataset.take(train_size)
    # Reshuffle only the training samples at the start of every epoch
    # (buffer size must be positive, hence the max()).
    .shuffle(buffer_size=max(train_size, 1))
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
validation_dataset = (
    dataset.skip(train_size)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
def visualize_random_samples(dataset, int_to_char, num_samples=5):
    """Show the first image of up to ``num_samples`` batches with its label.

    Args:
        dataset: Iterable yielding ``(image_batch, label_batch)`` pairs.
        int_to_char: Mapping from integer class index to character, used to
            turn the encoded label back into text.
        num_samples: Maximum number of batches to draw one sample from. Fewer
            figures are shown if the dataset runs out of batches first.
    """
    # zip() against range() stops at whichever side is exhausted first, so a
    # dataset with fewer than num_samples batches does not raise StopIteration.
    for _, (image, label) in zip(range(num_samples), dataset):
        # Decode the label of the first sample in the batch
        text = ''.join(int_to_char[int(x)] for x in label[0].numpy())

        # Display the image and label; the image has a single channel, so it
        # is drawn with a gray colormap like in decode_and_visualize_samples.
        plt.figure(figsize=(4, 2))
        plt.imshow(image[0, :, :, 0], cmap='gray')
        plt.title("Label: " + text)
        plt.axis('off')
        plt.show()


# Preview the first sample of five validation batches together with its label
visualize_random_samples(validation_dataset, int_to_char, num_samples=5)

In [None]:
def build_model():
    """Build and compile the CNN + bidirectional-LSTM OCR network.

    The input image is rescaled to [0, 1] and its height and width axes are
    swapped so that the image width becomes the sequence (time) axis. Three
    Conv/BatchNorm/MaxPool stages follow, then the feature map is flattened
    per time step, projected by a dense layer, passed through a bidirectional
    LSTM and finally through a per-time-step softmax.

    Returns:
        A ``keras.Model`` named "ocr_model", compiled with the Adam optimizer
        and ``ctc_loss``.
    """
    input_data = layers.Input(shape=(image_height, image_width, 1), name='input_image')
    # Standardize values to be in the [0, 1] range
    x = layers.Rescaling(1./255)(input_data)

    # Swap height and width -> (None, image_width, image_height, 1).
    # Permute is the built-in equivalent of tf.transpose(perm=[0, 2, 1, 3]);
    # unlike a Lambda wrapping a Python lambda it serializes cleanly.
    x = layers.Permute((2, 1, 3), name="transpose")(x)

    # Convolutional layers
    x = layers.Conv2D(64, (3, 3), activation="relu", kernel_initializer=tf.keras.initializers.he_normal(), padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2, 2), name="pool1")(x)

    x = layers.Conv2D(128, (3, 3), activation="relu", kernel_initializer=tf.keras.initializers.he_normal(), padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2, 2), name="pool2")(x)

    x = layers.Conv2D(256, (3, 3), activation="relu", kernel_initializer=tf.keras.initializers.he_normal(), padding="same")(x)
    x = layers.BatchNormalization()(x)
    # Pool only along the first (width / time) axis
    x = layers.MaxPooling2D((2, 1), name="pool3")(x)

    # Width was halved three times (// 8) and height twice (// 4); merge the
    # remaining height and the 256 channels into one feature vector per step.
    x = layers.Reshape(target_shape=(image_width // 8, (image_height // 4) * 256), name="reshape")(x)
    x = layers.Dense(128, activation="relu", kernel_initializer=tf.keras.initializers.he_normal())(x)
    x = layers.Dropout(0.2)(x)

    # Recurrent layers (Bidirectional LSTM)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)

    # Output layer (CTC): one class per symbol in `characters` plus one extra
    # class for the CTC blank.
    # NOTE(review): labels are encoded with char_to_int, which is built from
    # the characters found in the labels rather than from `characters` --
    # confirm that every label index stays below len(characters).
    output = layers.Dense(len(characters) + 1, activation='softmax')(x)
    model = keras.models.Model(
        inputs=input_data, outputs=output, name="ocr_model"
    )
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss=ctc_loss)
    return model
# CTC loss function passed to model.compile() in build_model
def ctc_loss(y_true, y_pred):
    """Compute the per-sample CTC loss for a batch.

    The sequence lengths are derived from the runtime shapes of the tensors
    rather than from the notebook-level ``batch_size`` / ``max_length``
    constants, so the loss also works for a final batch that is smaller than
    ``batch_size``.

    Args:
        y_true: Integer label tensor of shape (batch, label_length).
        y_pred: Softmax output of the model, shape (batch, time_steps, classes).

    Returns:
        Tensor of shape (batch, 1) with the CTC loss of each sample.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_len = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Every sample uses the full prediction length and the full label width
    input_length = time_steps * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_len * tf.ones(shape=(batch_len, 1), dtype="int64")

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Build and compile the OCR model, then print its layer summary
model=build_model()
model.summary()

In [None]:
num_epochs = 50

# Stop early on the VALIDATION loss: validation data is supplied to fit(), and
# restore_best_weights should bring back the weights that generalize best,
# not the ones with the lowest training loss.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=1,
        restore_best_weights=True,
    ),
]

history = model.fit(
    train_dataset,
    epochs=num_epochs,
    validation_data=validation_dataset,
    callbacks=callbacks,
)

In [None]:

# Zero-based index of the first epoch that reached the lowest validation loss
best_epoch = history.history['val_loss'].index(min(history.history['val_loss']))
def plot_training_history(history, best_epoch):
    """Plot training and validation loss per epoch and mark the best epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the 'loss' and 'val_loss' lists.
        best_epoch: Zero-based index of the epoch to annotate (shown to the
            reader as ``best_epoch + 1``).
    """
    train_losses = history.history['loss']
    val_losses = history.history['val_loss']
    min_val_loss = min(val_losses)

    # High-DPI figure for a sharper image
    fig, ax = plt.subplots(figsize=(12, 8), dpi=150)

    # Plot training & validation loss values
    ax.plot(train_losses, label='Training Loss', linewidth=2)
    ax.plot(val_losses, label='Validation Loss', linewidth=2)
    ax.set_title('Model Loss over Epochs', fontsize=16)
    ax.set_xlabel('Epoch', fontsize=14)
    ax.set_ylabel('Loss', fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(True)

    # Place the annotation text relative to the marked point (in points, not
    # data units) and on the side that has room. A fixed data-space offset of
    # "best_epoch - 5" lands left of the axes whenever best_epoch < 5.
    text_on_right = best_epoch < len(val_losses) / 2
    ax.annotate(
        f'Lowest Validation Loss: {min_val_loss:.4f}\nEpoch: {best_epoch + 1}',
        xy=(best_epoch, min_val_loss),
        xytext=(40 if text_on_right else -40, 40),
        textcoords='offset points',
        ha='left' if text_on_right else 'right',
        arrowprops=dict(facecolor='black', arrowstyle='->'),
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white')
    )

    fig.tight_layout()
    plt.show()
# Draw the loss curves and annotate the epoch with the lowest validation loss
plot_training_history(history, best_epoch)

In [None]:
def decode_and_visualize_samples(model, dataset, int_to_char, num_samples=5):
    """Predict, CTC-decode and display the first image of several batches.

    Args:
        model: Trained model whose ``predict`` returns per-time-step class
            probabilities of shape (batch, time_steps, classes).
        dataset: Iterable yielding ``(image_batch, label_batch)`` pairs.
        int_to_char: Mapping from integer class index to character. Decoded
            indices that have no entry (including the -1 padding produced by
            ``ctc_decode``) are skipped.
        num_samples: Number of subplot rows; at most this many batches are
            read. Rows left over when the dataset is shorter are blanked.
    """
    # squeeze=False keeps `axes` a 2-D array even when num_samples == 1
    fig, axes = plt.subplots(num_samples, 1, figsize=(4, 2 * num_samples), squeeze=False)
    axes = axes[:, 0]

    shown = 0
    # zip() stops when either the axes or the dataset run out
    for ax, (image, _) in zip(axes, dataset):
        # Make predictions using the model
        predictions = model.predict(image)

        # Take batch size and sequence length from the predictions themselves
        # so partial batches and other input widths decode correctly.
        n_items, n_steps = predictions.shape[0], predictions.shape[1]
        decoded, _ = keras.backend.ctc_decode(
            predictions,
            input_length=tf.fill((n_items,), n_steps),
            greedy=True,
        )

        # First sample of the batch, truncated to the longest known label
        first_sequence = decoded[0][0, :max_length].numpy()
        decoded_labels = [
            int_to_char[int(x)] for x in first_sequence if int(x) in int_to_char
        ]

        # Display the image and decoded label
        ax.imshow(image[0, :, :, 0], cmap='gray')
        ax.set_title("Decoded: " + ''.join(decoded_labels))
        ax.axis('off')
        shown += 1

    # Hide any subplot that received no image
    for ax in axes[shown:]:
        ax.axis('off')

    # Adjust spacing and display the grid
    plt.tight_layout()
    plt.show()

# Example usage: decode and display the first sample of five validation batches
decode_and_visualize_samples(model, validation_dataset, int_to_char, num_samples=5)