This notebook creates a baseline model that predicts every captcha's text using the most frequent character in the dataset. Any more elaborate estimator must achieve a better test score than this model.

## Import statements

In [24]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from math import *

import keras
import keras.backend as K
from keras.layers import *
from keras.models import Model, Sequential
from keras.optimizers import Adam, RMSprop
from keras.callbacks import *
from sklearn.model_selection import train_test_split
from keras.losses import categorical_crossentropy
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.utils import to_categorical

## Load preprocessed data

In [21]:
# Load the preprocessed arrays. `np.load` on an .npz archive returns a lazy
# NpzFile that keeps the underlying file open, so the arrays are read inside
# a `with` block to guarantee the handle is closed once they are in memory.
with np.load('preprocessed-data.npz') as data:
    X, y, ids, alphabet = data['X'], data['y'], data['ids'], data['alphabet']

## Evaluate baseline model

We are going to take the most frequent char in the dataset

In [22]:
# Collapse axis 1 and then axis 0 of `y` to get one total per class index,
# and keep the index with the largest total.
ch_label = y.sum(axis=1).sum(axis=0).argmax()

In [20]:
# Display the class index selected as the most frequent one.
ch_label

13

In [23]:
# Look up the entry of `alphabet` at the selected class index.
alphabet[ch_label]

'n'

So, to evaluate the baseline, we predict the same captcha text for every sample: 'nnnnn'

In [54]:
# Constant baseline prediction: every position of every captcha is the one-hot
# vector of `ch_label`. The shape is taken from `y` instead of hard-coding the
# number of classes (36) and the captcha length (5), so this cell keeps working
# if the preprocessing changes either of them.
# NOTE(review): float32 is used to mirror what keras' `to_categorical` returns
# by default — confirm against the installed Keras version if dtype matters.
y_pred = np.zeros(y.shape, dtype=np.float32)
y_pred[..., ch_label] = 1.0

In [66]:
# Per-position accuracy: a position counts as a hit when its whole one-hot
# vector equals the predicted one; the score is the mean of hits over samples.
for pos in range(5):
    hits = (y[:, pos, :] == y_pred[:, pos, :]).all(axis=1)
    print('Score for the {}th char predictions are: {}'.format(pos, hits.mean()))

Score for the 0th char predictions are: 0.09345794392523364
Score for the 1th char predictions are: 0.09532710280373832
Score for the 2th char predictions are: 0.10093457943925234
Score for the 3th char predictions are: 0.10093457943925234
Score for the 4th char predictions are: 0.11401869158878504


In [82]:
# A captcha is fully matched only when every position agrees with the
# prediction; the printed value is the mean of that flag over all samples.
print('Number of captcha texts fully matched: {}'.format(
    (y == y_pred).all(axis=2).all(axis=1).mean()))

Number of captcha texts fully matched: 0.0


In [83]:
# `np.any` over the positions is True when one OR MORE chars match, so this is
# the share of captchas with at least 1 matched char. The value is a mean over
# samples (a fraction), not a count, and the label now says so.
print('Fraction of captcha texts with at least 1 char matched: {}'.format(
    np.mean(np.any(np.all(y == y_pred, axis=2), axis=1))))

Number of captcha texts with only 1 char matched: 0.4130841121495327


In [91]:
# The comparison is `>= 2`, i.e. captchas with two OR MORE matched chars, and
# the printed value is a mean over samples (a fraction), not a count. The
# label is corrected to describe what is actually computed.
print('Fraction of captcha texts with at least 2 chars matched: {}'.format(
    np.mean(np.sum(np.all(y == y_pred, axis=2), axis=1) >= 2)))

Number of captcha texts with only 2 char matched: 0.08130841121495327


In [95]:
# The comparison is `>= 3`, i.e. captchas with three OR MORE matched chars,
# and the printed value is a mean over samples (a fraction), not a count. The
# label is corrected to describe what is actually computed.
print('Fraction of captcha texts with at least 3 chars matched: {}'.format(
    np.mean(np.sum(np.all(y == y_pred, axis=2), axis=1) >= 3)))

Number of captcha texts with only 3 char matched: 0.010280373831775701


In [97]:
# The comparison is `>= 4`, i.e. captchas with four OR MORE matched chars, and
# the printed value is a mean over samples (a fraction), not a count. The
# label is corrected to describe what is actually computed.
print('Fraction of captcha texts with at least 4 chars matched: {}'.format(
    np.mean(np.sum(np.all(y == y_pred, axis=2), axis=1) >= 4)))

Number of captcha texts with only 4 char matched: 0.0


In [93]:
# Count the matched positions of each captcha, then average that count over
# all samples.
print('Number of chars matched on each captcha text on average {}'.format(
    (y == y_pred).all(axis=2).sum(axis=1).mean()))

Number of chars matched on each captcha text on average 0.5046728971962616
