In [31]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter
import os
import math

In [32]:
# Folder holding the CAPTCHA images; the label of each image is its file name.
BASE_DIR = Path('/kaggle/input/captcha-dataset')

# Paths are converted to plain strings and sorted so the listing order is stable.
IMAGES = sorted(str(image_file) for image_file in BASE_DIR.glob('*.jpg'))
# Label = last path component with everything from the first '.jpg' onwards removed.
LABELS = [img.rsplit(os.path.sep, 1)[-1].split('.jpg')[0] for img in IMAGES]
# Every distinct character that occurs in at least one label.
CHARACTERS = {char for LABEL in LABELS for char in LABEL}

# Quick sanity report of what was found on disk.
print(f'Number of Images found: {len(IMAGES)}')
print(f'Number of Labels found: {len(LABELS)}')
print(f'Number of Unique Characters found: {len(CHARACTERS)}')
print(f'Character List: {CHARACTERS}')

Number of Images found: 113062
Number of Labels found: 113062
Number of Unique Characters found: 60
Character List: {'T', 'R', '3', 'd', '7', '9', 'M', 'V', 'k', '8', 'f', 'p', 'n', 'x', '1', 'B', 'O', 'D', 'N', 'b', 'H', 'c', 'a', 'v', 'Q', 'i', 'A', 'G', 'r', 'C', 'z', 'Y', 'P', 't', 'Z', 'e', '2', 'y', 'I', '4', 'l', 'F', 'E', 'm', 'h', 'S', '6', 'q', 'X', 'J', '5', 'g', 'K', 'L', 'w', 'U', 'j', 'u', 'W', 's'}


In [33]:

# Number of samples per batch fed to the model.
BATCH_SIZE = 64
# NOTE(review): not referenced in the visible cells - presumably meant as a shuffle buffer size.
BUFFER_SIZE = 1000

# Size every image is resized to before being fed to the network.
IMG_WIDTH = 200
IMG_HEIGHT = 40
# NOTE(review): not referenced in the visible cells.
downsample_factor = 4

# Length of the longest label and number of distinct label characters.
max_length = max(map(len, LABELS))
MAX_CHARS = len(CHARACTERS)
max_length

5

In [34]:
# Character <-> integer-id lookup tables.
#
# CHARACTERS is a set, and the iteration order of a set of strings can differ
# between interpreter sessions. Building the vocabulary from a sorted list pins
# the id of every character, so predictions of a model saved in one session
# decode to the same characters in any later session.
char_to_num = layers.StringLookup(vocabulary=sorted(CHARACTERS),
                                  num_oov_indices=0,
                                  mask_token=None)

# Inverse table: maps integer ids back to characters using the exact same vocabulary.
num_to_char = layers.StringLookup(vocabulary=char_to_num.get_vocabulary(),
                                  mask_token=None,
                                  num_oov_indices=0,
                                  invert=True)

In [35]:
def split_data(images, labels, train_size=0.8, shuffle=True, seed=None):
    """Split parallel image/label sequences into training and validation parts.

    Args:
        images: Sequence (list, tuple or array) of samples.
        labels: Sequence with one entry per element of ``images``.
        train_size: Fraction of the samples assigned to the training part;
            must lie in ``[0, 1]``.
        shuffle: When true, samples are randomly permuted before splitting.
        seed: Optional integer. When given, the permutation is drawn from a
            dedicated generator seeded with it, so the split is reproducible.
            When ``None`` the global NumPy generator is used, as before.

    Returns:
        Tuple ``(x_train, x_valid, y_train, y_valid)`` of NumPy arrays.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length or
            ``train_size`` is outside ``[0, 1]``.
    """
    # Fancy indexing below needs arrays; this also lets callers pass plain lists.
    images = np.asarray(images)
    labels = np.asarray(labels)

    size = len(images)
    if len(labels) != size:
        raise ValueError(
            f"images and labels must have the same length, got {size} and {len(labels)}"
        )
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(f"train_size must be within [0, 1], got {train_size}")

    indices = np.arange(size)
    if shuffle:
        # np.random (module) and RandomState both expose an in-place shuffle().
        rng = np.random if seed is None else np.random.RandomState(seed)
        rng.shuffle(indices)

    train_samples = int(size * train_size)
    train_idx, valid_idx = indices[:train_samples], indices[train_samples:]

    return images[train_idx], images[valid_idx], labels[train_idx], labels[valid_idx]

In [36]:
def encode_single_sample(image_path, label):
    """Load one image from disk and encode its label for training.

    Args:
        image_path: Path of a JPEG file (string tensor).
        label: Label text of that image (string tensor).

    Returns:
        Tuple ``(image, one_hot_label)``: the grayscale float32 image resized
        to ``IMG_HEIGHT`` x ``IMG_WIDTH`` and then transposed so the width axis
        comes first, and the label one-hot encoded per character with depth
        ``MAX_CHARS + 1``.
    """
    raw_bytes = tf.io.read_file(image_path)
    # Single channel => grayscale.
    pixels = tf.io.decode_jpeg(raw_bytes, channels=1)
    # Integer pixel values become floats scaled into [0, 1].
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    pixels = tf.image.resize(pixels, [IMG_HEIGHT, IMG_WIDTH])
    # (height, width, channels) -> (width, height, channels), matching the model input.
    pixels = tf.transpose(pixels, perm=[1, 0, 2])

    # Split the label into characters, map them to ids, then one-hot encode.
    characters = tf.strings.unicode_split(label, input_encoding="UTF-8")
    char_ids = char_to_num(characters)
    one_hot_label = tf.one_hot(indices=char_ids, depth=MAX_CHARS+1)

    return pixels, one_hot_label

In [37]:
# split_data shuffles with NumPy's global random generator. Seeding it here
# makes the train/validation membership identical on every run, so a model
# reloaded in a later session is not validated on images it was trained on.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

X_train, X_valid, y_train, y_valid = split_data(np.array(IMAGES), np.array(LABELS))

In [38]:
def make_dataset(image_paths, labels, training=False):
    """Build the batched tf.data input pipeline for one data split.

    Args:
        image_paths: Array of image file paths.
        labels: Array of label strings, aligned with ``image_paths``.
        training: When true, the samples are reshuffled on every pass using a
            buffer of ``BUFFER_SIZE`` elements, so batches differ between epochs.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` encoded
        ``(image, label)`` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    if training:
        # Shuffle the (path, label) strings before decoding: cheap, and the
        # buffer holds file names rather than decoded images.
        dataset = dataset.shuffle(BUFFER_SIZE)
    return (
        dataset.map(encode_single_sample,
                    num_parallel_calls=tf.data.AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


train_dataset = make_dataset(X_train, y_train, training=True)
# Validation data is deliberately left unshuffled.
valid_dataset = make_dataset(X_valid, y_valid)

In [40]:


def _conv_bn_relu(tensor, filters, conv_name, tag):
    """Apply a 3x3 'same' convolution, batch normalisation and ReLU.

    ``conv_name`` names the convolution; ``tag`` prefixes the ``_bn`` and
    ``_relu`` layer names (the two are passed separately because they do not
    always match in this model).
    """
    tensor = layers.Conv2D(filters=filters, kernel_size=(3, 3), padding='same', name=conv_name)(tensor)
    tensor = layers.BatchNormalization(name=f"{tag}_bn")(tensor)
    return layers.ReLU(name=f"{tag}_relu")(tensor)


def _dense_bn_relu_dropout(tensor, units, tag):
    """Apply Dense -> batch normalisation -> ReLU -> Dropout(0.5)."""
    tensor = layers.Dense(units, name=tag)(tensor)
    tensor = layers.BatchNormalization(name=f"{tag}_bn")(tensor)
    tensor = layers.ReLU(name=f"{tag}_relu")(tensor)
    return layers.Dropout(0.5)(tensor)


# Images arrive width-first: (IMG_WIDTH, IMG_HEIGHT, 1).
input_layer = layers.Input(shape=(IMG_WIDTH, IMG_HEIGHT, 1), name='image')

# Stage 1: one convolution, then 2x2 pooling.
x = _conv_bn_relu(input_layer, 32, 'conv_1', 'conv_1')
x = layers.MaxPooling2D(pool_size=(2, 2), name='pool_1')(x)

# Stage 2: two convolutions with 64 filters, then 2x2 pooling.
x = _conv_bn_relu(x, 64, 'conv_2', 'conv_3')
x = _conv_bn_relu(x, 64, 'conv_4', 'conv_4')
x = layers.MaxPooling2D(pool_size=(2, 2), name='pool_2')(x)

# Stage 3: two convolutions with 128 filters, then 2x2 pooling.
x = _conv_bn_relu(x, 128, 'conv_5', 'conv_5')
x = _conv_bn_relu(x, 128, 'conv_6', 'conv_6')
x = layers.MaxPooling2D(pool_size=(2, 2), name='pool_3')(x)

# Stage 4: two convolutions with 256 filters, collapsed to one vector per image.
x = _conv_bn_relu(x, 256, 'conv_7', 'conv_7')
x = _conv_bn_relu(x, 256, 'conv_8', 'conv_8')
x = layers.GlobalAveragePooling2D()(x)

# Expand the pooled vector so it can be reshaped into one feature row per
# character position.
# NOTE(review): the reason for the "+ 4" is not evident from the visible cells.
new_shape = (max_length , len(CHARACTERS) + 4)
x = _dense_bn_relu_dropout(x, max_length * (len(CHARACTERS) + 4), 'dense_3')
x = layers.Reshape(target_shape=new_shape, name='reshape')(x)

# Dense layers from here on act on the last axis of the reshaped tensor.
x = _dense_bn_relu_dropout(x, 64, 'dense_4')

# Softmax over len(CHARACTERS) + 1 classes, the same depth as the one-hot labels.
x = layers.Dense(len(CHARACTERS) + 1, name='dense_prediction')(x)
x = layers.BatchNormalization(name="prediction_bn")(x)
out = layers.Activation('softmax', name='prediction')(x)

model = tf.keras.Model(input_layer, out, name="OCR_Model")

optimizer = tf.keras.optimizers.Adam(1e-2)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)
model.summary()




In [42]:
import keras

# Stop once validation loss has not improved for 13 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=13,
    restore_best_weights=True,
)
# Keep only the checkpoint with the lowest validation loss on disk.
model_checkpoint = keras.callbacks.ModelCheckpoint(
    '/kaggle/working/captcha_model.keras',
    save_best_only=True,
    monitor='val_loss',
)
# Multiply the learning rate by 0.25 after 4 epochs without validation-loss improvement.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.25,
    patience=4,
    min_lr=1e-12,
)

# Train for up to 100 epochs; early stopping normally ends the run sooner.
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=100,
    callbacks=[early_stopping, model_checkpoint, reduce_lr],
)

Epoch 1/100
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 59ms/step - accuracy: 0.0175 - auc: 0.5130 - loss: 4.1453 - val_accuracy: 0.0220 - val_auc: 0.5354 - val_loss: 4.5427 - learning_rate: 0.0100
Epoch 2/100
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 48ms/step - accuracy: 0.1012 - auc: 0.8346 - loss: 3.2677 - val_accuracy: 0.2938 - val_auc: 0.9448 - val_loss: 2.1663 - learning_rate: 0.0100
Epoch 3/100
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 49ms/step - accuracy: 0.3330 - auc: 0.9528 - loss: 2.0083 - val_accuracy: 0.5178 - val_auc: 0.9765 - val_loss: 1.4100 - learning_rate: 0.0100
Epoch 4/100
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 50ms/step - accuracy: 0.4666 - auc: 0.9692 - loss: 1.5653 - val_accuracy: 0.5995 - val_auc: 0.9826 - val_loss: 1.1383 - learning_rate: 0.0100
Epoch 5/100
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 49ms/step - accuracy: 0.535