In [1]:
!gdown 'https://drive.google.com/uc?id=1dCTlUjfHp9aiyX_yrFugmWlaG_I-9Gst'

Downloading...
From (original): https://drive.google.com/uc?id=1dCTlUjfHp9aiyX_yrFugmWlaG_I-9Gst
From (redirected): https://drive.google.com/uc?id=1dCTlUjfHp9aiyX_yrFugmWlaG_I-9Gst&confirm=t&uuid=dfc4d72e-509a-4f9d-8d12-1c4bc173094d
To: /content/words_captcha.zip
100% 4.57G/4.57G [00:50<00:00, 90.5MB/s]


In [2]:
import zipfile

with zipfile.ZipFile("words_captcha.zip", "r") as zip_ref:
    zip_ref.extractall()

# **Data preprocessing:**

In [3]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

label_file = "words_captcha/spec_train_val.txt"
labels_df = pd.read_csv(label_file, sep=" ", header=None, names=["filename", "label"])
labels_df = labels_df[labels_df["label"].apply(lambda x: isinstance(x, str))]

end_token = "E"
pad_token = "P"
all_words = [word for word in labels_df["label"].values]
char_sequences = [list(word) for word in all_words]

tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(char_sequences)
tokenizer.word_index[end_token] = len(tokenizer.word_index) + 1
tokenizer.word_index[pad_token] = 0

end_index = tokenizer.word_index[end_token]
pad_index = tokenizer.word_index[pad_token]

label_indices = tokenizer.texts_to_sequences(char_sequences)
label_indices = [label + [end_index] for label in label_indices]

max_length = 6
label_indices = np.array([np.pad(label, (0, max_length - len(label)), 'constant', constant_values=pad_index) for label in label_indices])

labels_df["encoded_label"] = list(label_indices)

train_df = labels_df.iloc[:100000]
val_df = labels_df.iloc[100000:120000]
test_filenames = [f"a{i}.png" for i in range(120000, 140000)]

print(f"Character to index mapping: {tokenizer.word_index}")
print(f"End token index: {end_index}")
print(f"Pad token index: {pad_index}")
print(f"Training set: {len(train_df)} images")
print(f"Validation set: {len(val_df)} images")
print(f"Testing set: {len(test_filenames)} images")


Character to index mapping: {'e': 1, 'a': 2, 's': 3, 'o': 4, 'i': 5, 't': 6, 'r': 7, 'l': 8, 'n': 9, 'd': 10, 'c': 11, 'p': 12, 'm': 13, 'u': 14, 'h': 15, 'g': 16, 'b': 17, 'f': 18, 'y': 19, 'k': 20, 'w': 21, 'v': 22, 'j': 23, 'x': 24, 'z': 25, 'q': 26, 'E': 27, 'P': 0}
End token index: 27
Pad token index: 0
Training set: 100000 images
Validation set: 19939 images
Testing set: 20000 images


In [4]:
print(labels_df[["filename", "label", "encoded_label"]].head())

example_label = labels_df.iloc[0]
print(f"Original Label: {example_label['label']}")
print(f"Encoded Label: {example_label['encoded_label']}")


  filename label           encoded_label
0       a0  thus   [6, 15, 14, 3, 27, 0]
1       a1   www  [21, 21, 21, 27, 0, 0]
2       a2  tied    [6, 5, 1, 10, 27, 0]
3       a3   ids    [5, 10, 3, 27, 0, 0]
4       a4   jam   [23, 2, 13, 27, 0, 0]
Original Label: thus
Encoded Label: [ 6 15 14  3 27  0]


# **Load image and create dataset:**

In [5]:
import tensorflow as tf
import numpy as np
import os

image_size = (224, 224)
input_shape = image_size + (3,)
vocab_size = 28
max_seq_length = 6

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, image_size)
    img = tf.keras.applications.resnet50.preprocess_input(img)
    return img

def create_dataset(filenames, labels, batch_size=32):
    image_paths = [os.path.join("words_captcha", filename + ".png") for filename in filenames]

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    def process_path_and_labels(image_path, label):
        img = load_image(image_path)
        label = tf.pad(label, [[0, max_seq_length - tf.shape(label)[0]]], constant_values=0)
        return img, label

    dataset = dataset.map(process_path_and_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

train_labels = [label.tolist() for label in train_df["encoded_label"].values]
val_labels = [label.tolist() for label in val_df["encoded_label"].values]

train_labels_tensor = tf.convert_to_tensor(train_labels, dtype=tf.int32)
val_labels_tensor = tf.convert_to_tensor(val_labels, dtype=tf.int32)

train_dataset = create_dataset(train_df["filename"].values, train_labels_tensor, batch_size=32)
val_dataset = create_dataset(val_df["filename"].values, val_labels_tensor, batch_size=32)

for images, labels in train_dataset.take(1):
    print(f"Batch image shape: {images.shape}")
    print(f"Batch label shape: {labels.shape}")


Batch image shape: (32, 224, 224, 3)
Batch label shape: (32, 6)


# **Build model and train:**

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Reshape, GlobalAveragePooling2D, Lambda
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model

def create_resnet_feature_extractor():
    image_input = Input(shape=input_shape)
    resnet_model = ResNet50(include_top=False, weights='imagenet', input_tensor=image_input)
    resnet_output = GlobalAveragePooling2D()(resnet_model.output)
    return Model(inputs=image_input, outputs=resnet_output)

def attention_layer(query, value):
    attention_output = Attention()([query, value])
    return attention_output

def create_rnn_decoder(features_dim, vocab_size, max_seq_length):
    input_features = Input(shape=(None, features_dim))
    lstm_output = LSTM(512, return_sequences=True)(input_features)
    output = Dense(vocab_size, activation='softmax')(lstm_output)
    return Model(inputs=input_features, outputs=output)

def create_model(vocab_size, max_seq_length):
    image_input = Input(shape=input_shape)
    resnet_feature_extractor = create_resnet_feature_extractor()
    cnn_features = resnet_feature_extractor(image_input)
    cnn_features_expanded = Lambda(lambda x: tf.expand_dims(x, axis=1))(cnn_features)
    cnn_features_tiled = Lambda(lambda x: tf.tile(x, [1, max_seq_length, 1]))(cnn_features_expanded)
    attended_features = attention_layer(cnn_features_tiled, cnn_features_tiled)
    rnn_decoder = create_rnn_decoder(features_dim=cnn_features.shape[-1], vocab_size=vocab_size, max_seq_length=max_seq_length)
    lstm_output = rnn_decoder(attended_features)
    model = Model(inputs=image_input, outputs=lstm_output)
    return model

model = create_model(vocab_size, max_seq_length)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [7]:
import tensorflow as tf

learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

def compute_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    loss = loss_object(y_true, y_pred)
    return tf.reduce_sum(loss)

@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = compute_loss(targets, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_model(dataset, epochs):
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        total_loss = 0.0
        step = 0
        for images, labels in dataset:
            loss = train_step(images, labels)
            total_loss += loss
            step += 1
            if step % 500 == 0:
                print(f"Step {step}, Loss: {loss.numpy():.4f}")
        print(f"Epoch {epoch+1} Loss: {total_loss / step:.4f}")

train_model(train_dataset, epochs=10)


Epoch 1/10
Step 500, Loss: 218.4377
Step 1000, Loss: 74.6333
Step 1500, Loss: 34.4089
Step 2000, Loss: 33.0413
Step 2500, Loss: 16.3476
Step 3000, Loss: 22.9522
Epoch 1 Loss: 96.6465
Epoch 2/10
Step 500, Loss: 7.9130
Step 1000, Loss: 12.6176
Step 1500, Loss: 3.4035
Step 2000, Loss: 6.6950
Step 2500, Loss: 6.5239
Step 3000, Loss: 6.6139
Epoch 2 Loss: 8.5561
Epoch 3/10
Step 500, Loss: 4.5941
Step 1000, Loss: 1.3063
Step 1500, Loss: 2.8815
Step 2000, Loss: 2.0712
Step 2500, Loss: 18.3237
Step 3000, Loss: 4.9365
Epoch 3 Loss: 4.5622
Epoch 4/10
Step 500, Loss: 1.6156
Step 1000, Loss: 1.6294
Step 1500, Loss: 0.6198
Step 2000, Loss: 3.6863
Step 2500, Loss: 1.9806
Step 3000, Loss: 2.2964
Epoch 4 Loss: 3.2009
Epoch 5/10
Step 500, Loss: 6.1899
Step 1000, Loss: 0.6492
Step 1500, Loss: 11.4415
Step 2000, Loss: 1.0438
Step 2500, Loss: 1.1099
Step 3000, Loss: 2.0040
Epoch 5 Loss: 2.4218
Epoch 6/10
Step 500, Loss: 0.3569
Step 1000, Loss: 1.4825
Step 1500, Loss: 1.2036
Step 2000, Loss: 1.4215
Step 250

In [8]:
model.save_weights('model_weights.weights.h5')

# **Validation output:**

In [9]:
import numpy as np

def predict_on_val_dataset(val_dataset, tokenizer, model, display_samples=50):
    correct_predictions = 0
    total_predictions = 0
    display_count = 0

    for images, labels in val_dataset:
        predictions = model(images, training=False)
        predicted_indices = np.argmax(predictions.numpy(), axis=-1)

        for i in range(len(predicted_indices)):
            predicted_word = ''
            for idx in predicted_indices[i]:
                if idx == tokenizer.word_index['E']:
                    break
                predicted_word += tokenizer.index_word.get(idx, '')

            true_word = ''
            for idx in labels[i].numpy():
                if idx == tokenizer.word_index['E']:
                    break
                true_word += tokenizer.index_word.get(idx, '')

            if predicted_word == true_word:
                correct_predictions += 1
            total_predictions += 1

            if display_count < display_samples:
                print(f"Sample {display_count + 1}:")
                print(f"  Predicted: {predicted_word}")
                print(f"  True Label: {true_word}\n")
                display_count += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

accuracy = predict_on_val_dataset(val_dataset, tokenizer, model)
print(f"Validation accuracy: {accuracy * 100:.2f}%")


Sample 1:
  Predicted: yale
  True Label: yale

Sample 2:
  Predicted: leon
  True Label: leon

Sample 3:
  Predicted: radar
  True Label: radar

Sample 4:
  Predicted: sold
  True Label: sold

Sample 5:
  Predicted: six
  True Label: six

Sample 6:
  Predicted: cnn
  True Label: cnn

Sample 7:
  Predicted: rugs
  True Label: rugs

Sample 8:
  Predicted: bat
  True Label: bat

Sample 9:
  Predicted: ref
  True Label: ref

Sample 10:
  Predicted: edit
  True Label: edit

Sample 11:
  Predicted: lil
  True Label: lil

Sample 12:
  Predicted: trio
  True Label: trio

Sample 13:
  Predicted: mail
  True Label: mail

Sample 14:
  Predicted: sheet
  True Label: sheet

Sample 15:
  Predicted: tile
  True Label: tile

Sample 16:
  Predicted: pat
  True Label: pat

Sample 17:
  Predicted: gmc
  True Label: gmc

Sample 18:
  Predicted: dodge
  True Label: dodge

Sample 19:
  Predicted: tune
  True Label: tune

Sample 20:
  Predicted: sas
  True Label: sas

Sample 21:
  Predicted: acre
  True Lab

# **Predict test set:**

In [10]:
def predict_on_test_set(test_filenames, tokenizer, model, batch_size=64, output_file="Lab12-2_113062624.txt"):
    results = []

    batches = [test_filenames[i:i + batch_size] for i in range(0, len(test_filenames), batch_size)]

    for batch in batches:
        batch_images = []
        for filename in batch:
            image_path = os.path.join("words_captcha", filename)
            img = load_image(image_path)
            batch_images.append(img)

        batch_images = np.stack(batch_images, axis=0)

        predictions = model(batch_images, training=False)
        predicted_indices = np.argmax(predictions.numpy(), axis=-1)

        for i, predicted_idx in enumerate(predicted_indices):
            base_filename = batch[i].split('.')[0]
            predicted_word = ''

            for idx in predicted_idx:
                if idx == tokenizer.word_index['E']:
                    break
                predicted_word += tokenizer.index_word.get(idx, '')

            results.append(f"{base_filename} {predicted_word}")

    with open(output_file, "w") as f:
        for line in results:
            f.write(line + "\n")

    print(f"Results written to {output_file}")

predict_on_test_set(test_filenames, tokenizer, model, batch_size=32)


Results written to Lab12-2_113062624.txt


# **Report:**

**Data preprocessing:**
This code processes text labels for a captcha recognition task. It starts by loading a file containing filenames and corresponding text labels into a DataFrame. Non-string labels are filtered out to ensure the data is clean and consistent. The characters in the labels are then tokenized at a character level using Keras' `Tokenizer`. Two special tokens are introduced: `"E"` to signify the end of a sequence and `"P"` for padding, with `"E"` assigned a unique index and `"P"` set to 0.

Next, the text labels are converted into sequences of indices based on the tokenizer's vocabulary. An `"E"` token is appended to the end of each sequence to mark the sequence's termination. These sequences are then padded to a fixed length (6 in this case) using the padding index (`0`) to ensure uniformity. Finally, the processed labels are stored back in the DataFrame under a new column called `encoded_label`, and the dataset is split into training, validation, and test subsets.

**Load image and create dataset:**
This code prepares datasets for training and validation in a captcha recognition task. It defines a function to load and preprocess images, resizing them to \(224 \times 224\) pixels and applying ResNet50 preprocessing. Another function is used to create TensorFlow datasets from filenames and labels by mapping image paths to processed images and padding the labels to a fixed length (6).

The script then converts encoded labels from pandas DataFrames (`train_df` and `val_df`) into TensorFlow tensors. These tensors, along with filenames, are passed to the dataset creation function to generate batched and prefetched datasets for efficient training. Finally, the script verifies the datasets by printing the shapes of one batch of images and labels.

**Build model and train:**This code trains a hybrid model combining ResNet50, Attention, and LSTM to recognize CAPTCHA character sequences. ResNet50 extracts image features, which are expanded and enhanced using an Attention mechanism. An LSTM-based decoder generates character sequences, with a Dense softmax layer predicting each character. The model is trained using a custom loop with Sparse Categorical Crossentropy loss and the Adam optimizer, processing CAPTCHA images and their corresponding encoded labels.

**Validation and test set prediction:**This code contains two functions for evaluating the CAPTCHA recognition model.

The `predict_on_val_dataset` function evaluates the model on a validation dataset, comparing predicted words with true labels. For each batch of images, it processes predictions, converts character indices back to words, and computes accuracy. Optionally, it displays a few predictions alongside their true labels for visual inspection.

The `predict_on_test_set` function handles predictions on a test set. It processes images in batches, predicts their corresponding words, and writes the results to a specified output file. Each prediction includes the image filename and the predicted word, saved in a text format suitable for submission or further analysis.