In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
image_folder = '/content/drive/MyDrive/samples/'

In [6]:
import os
import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Collect image paths and labels
image_paths = []
labels = []

image_files = os.listdir(image_folder)

for image_file in image_files:
  image_path = os.path.join(image_folder, image_file)
  image_paths.append(image_path)
  label = image_file.split('.')[0]
  labels.append(label)

# Create DataFrame
images_df = pd.DataFrame({"image_path": image_paths, "label": labels})
print(images_df.head())

# Extract unique characters from labels
characters = '0123456789abcdefghijklmnopqrstuvwxyz'

# Create StringLookup layers for char-to-num and num-to-char conversion
char_to_num = layers.StringLookup(vocabulary=list(characters), num_oov_indices=0, mask_token=None)
num_to_char = layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

# Display character-to-number mapping
print(pd.DataFrame({"char": char_to_num.get_vocabulary(), "num": np.arange(1, len(char_to_num.get_vocabulary()) + 1)}))

# Function to encode a single sample (image, label)
def encode_single_sample(image_path, label):
    # Load and preprocess the image
    image = tf.io.read_file(image_path)
    image = tf.io.decode_png(image, channels=1)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [50, 200])
    image = tf.transpose(image, perm=[1, 0, 2])

    # Encode label
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    return image, label

# Function to process dataset
def process_dataset(X, y):
    images = []
    labels = []
    for img_path, label in zip(X, y):
        img, lbl = encode_single_sample(img_path, label)
        images.append(img.numpy())
        labels.append(lbl.numpy())
    return np.array(images), np.array(labels)


                                 image_path  label
0  /content/drive/MyDrive/samples/byfgn.png  byfgn
1  /content/drive/MyDrive/samples/253dc.png  253dc
2  /content/drive/MyDrive/samples/6fgdw.png  6fgdw
3  /content/drive/MyDrive/samples/ewyg7.jpg  ewyg7
4  /content/drive/MyDrive/samples/33n73.png  33n73
   char  num
0     0    1
1     1    2
2     2    3
3     3    4
4     4    5
5     5    6
6     6    7
7     7    8
8     8    9
9     9   10
10    a   11
11    b   12
12    c   13
13    d   14
14    e   15
15    f   16
16    g   17
17    h   18
18    i   19
19    j   20
20    k   21
21    l   22
22    m   23
23    n   24
24    o   25
25    p   26
26    q   27
27    r   28
28    s   29
29    t   30
30    u   31
31    v   32
32    w   33
33    x   34
34    y   35
35    z   36


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(np.array(image_paths), np.array(labels), test_size=0.1, random_state=42)

In [9]:
X_train_processed, y_train_processed = process_dataset(X_train, y_train)
X_test_processed, y_test_processed = process_dataset(X_test, y_test)

In [10]:
X_train_input = {"Input": X_train_processed, "Label": y_train_processed}
X_test_input = {"Input": X_test_processed, "Label": y_test_processed}

In [13]:
from tensorflow.keras.backend import ctc_batch_cost, ctc_decode
class CTCLayer(layers.Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = ctc_batch_cost

    def call(self, y_true, y_pred):
        batch_length = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_length, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_length, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred

In [17]:
from tensorflow.keras import layers, models, optimizers, losses
input_layer = layers.Input(shape=(200, 50, 1), name="Input", dtype="float32")
label_layer = layers.Input(shape=(None,), name="Label", dtype="float32")

conv2_1 = layers.Conv2D(filters=32, kernel_size=(3, 3), activation="relu", kernel_initializer="he_normal", padding="same")(input_layer)
max2_1 = layers.MaxPooling2D(strides=(2, 2))(conv2_1)

conv2_2 = layers.Conv2D(filters=64, kernel_size=(3, 3), activation="relu", kernel_initializer="he_normal", padding="same")(max2_1)
max2_2 = layers.MaxPooling2D(strides=(2, 2))(conv2_2)

reshape_layer = layers.Reshape(target_shape=((200 // 4), (50 // 4) * 64))(max2_2)
dense_1 = layers.Dense(units=64, activation="relu")(reshape_layer)
drop_1 = layers.Dropout((0.2))(dense_1)

bilstm_1 = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(drop_1)
bilstm_2 = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(bilstm_1)

output_layer = layers.Dense(len(characters) + 1, activation="softmax", name="Output")(bilstm_2)

output = CTCLayer(name="ctc_loss")(label_layer, output_layer)

model = models.Model(inputs=[input_layer, label_layer], outputs=output, name="OCR")
model.compile(optimizer=optimizers.Adam())

In [18]:
model.summary()

In [21]:
history = model.fit(
    X_train_input,
    y_train_processed,
    validation_data=(X_test_input, y_test_processed),
    epochs=100,
    batch_size=32
)

Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 914ms/step - loss: 1828.7715 - val_loss: 534.7390
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 810ms/step - loss: 555.4443 - val_loss: 503.9787
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 711ms/step - loss: 532.7684 - val_loss: 499.2783
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 785ms/step - loss: 527.3724 - val_loss: 497.1676
Epoch 5/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 715ms/step - loss: 525.9396 - val_loss: 496.8898
Epoch 6/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 787ms/step - loss: 525.2247 - val_loss: 496.2593
Epoch 7/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 779ms/step - loss: 525.1682 - val_loss: 495.7740
Epoch 8/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 772ms/step - loss: 523.3863 - val_loss: 

In [22]:
model.save('/content/drive/MyDrive/captcha_model.keras')

In [23]:
preds = model.predict(X_test_input)
input_length = np.ones(preds.shape[0]) * preds.shape[1]
results = ctc_decode(preds, input_length=input_length, greedy=True)[0][0][:, :5]
pred_texts = []
for result in results:
    res = tf.strings.reduce_join(num_to_char(result+1)).numpy().decode("utf-8")
    pred_texts.append(res)
pred_df = pd.DataFrame({
    "image_path": X_test,
    "label": y_test,
    "pred": pred_texts
})
pred_df.head()

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 409ms/step


Unnamed: 0,image_path,label,pred
0,/content/drive/MyDrive/samples/nn6mg.png,nn6mg,nn6mg
1,/content/drive/MyDrive/samples/823p2.png,823p2,823p2
2,/content/drive/MyDrive/samples/n7ebx.png,n7ebx,n7ebx
3,/content/drive/MyDrive/samples/7wyp4.png,7wyp4,7wyp4
4,/content/drive/MyDrive/samples/f75cx.png,f75cx,f75cx
