In [None]:
!pip install seaborn==0.11.1
!pip install keract

In [None]:
import io
import time
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate, GridSearchCV, cross_val_predict, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix, accuracy_score
from sklearn.utils.multiclass import unique_labels
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.data.experimental import AUTOTUNE

from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Input, Flatten, Dense, Dropout, BatchNormalization, Conv2D, MaxPool2D, ReLU, ELU
from keras.models import Model
from keras.optimizers import Adam, Nadam
from keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau, LearningRateScheduler, ModelCheckpoint
from keras.utils import to_categorical

import keract

<img style="margin-left:0" src="https://i.dlpng.com/static/png/1280814-best-25-number-writing-practice-ideas-on-pinterest-writing-writing-numbers-png-1280_752_preview.png" width="600px">

**Digit Recognizer** is a Kaggle competition on the MNIST handwritten digit dataset. The goal is to perform OCR on a set of handwritten digits and come up with the solution that provides the best accuracy.

- **Github**: https://github.com/roma-glushko/kaggle-digit-recognizer
- **Experiment Notes**: https://github.com/roma-glushko/kaggle-digit-recognizer/blob/master/experiments.md

In [None]:
import tensorflow as tf

# Prefer a TPU when one is attached; otherwise fall back to the default
# distribution strategy so the notebook still runs on CPU/GPU machines.
try:
    tpu_cluster = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu_cluster.master())

    tf.config.experimental_connect_to_cluster(tpu_cluster)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster)
except Exception as tpu_error:
    # `except Exception` keeps the best-effort fallback, but unlike a bare
    # `except:` it does not swallow KeyboardInterrupt / SystemExit, and the
    # reason for falling back is reported instead of being hidden.
    print('TPU is not available, using the default strategy:', tpu_error)
    tpu_strategy = tf.distribute.get_strategy()
print('Number of replicas:', tpu_strategy.num_replicas_in_sync)

In [None]:
# Single seed reused for the `random_state=` arguments further down the notebook
RANDOM_SEED = 20210102

# seed the global NumPy and TensorFlow random generators
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Labelled training data (contains a `label` column, dropped/extracted below)
train_df = pd.read_csv('../input/digit-recognizer/train.csv')
# Competition test file; predictions for it are written to the submission CSVs
submission_df = pd.read_csv('../input/digit-recognizer/test.csv')

# Dataset Overview

Let's take a look at the dataset at hand:

In [None]:
# size, dtypes and memory footprint of the labelled training frame
train_df.info()

In [None]:
# same summary for the frame we have to predict for the submission
submission_df.info()

The Kaggle MNIST data is split differently than <a href="http://yann.lecun.com/exdb/mnist/">the original MNIST</a>. We have
- 42,000 examples in the training set (60,000 in the original training set)
- 28,000 in the test set (10,000 in the original test set)

In [None]:
# pixel columns only: `drop` returns a new frame, so `train_df` itself stays untouched
train_image_df = train_df.drop(columns=['label'])

# target digits, aligned row by row with `train_image_df`
train_label_df = train_df['label']

Let's visualize a couple of samples:

In [None]:
plt.figure(figsize=(10, 5))

# Iterate over the pixel rows themselves: iterating a DataFrame (as the previous
# `zip(train_image_df[0:30], ...)` did) yields column names, not images.
sample_images = train_image_df.values[:30]
sample_labels = train_label_df.values[:30]

for index, (image, label) in enumerate(zip(sample_images, sample_labels)):
    ax = plt.subplot(3, 10, index + 1)
    ax.axis('off')

    # each row holds a flattened 28x28 picture
    ax.imshow(np.reshape(image, (28, 28)), cmap='binary')

    ax.set_title('Label: %i\n' % label, fontsize=10)

## Digit Pixel Distribution

In [None]:
# per-column summary statistics (label column and every pixel column)
train_df.describe()

Each digit picture consists of **28x28=784** pixels; each pixel takes a value in the 0 to 255 range (grayscale, one channel).

## Label Distribution

In [None]:
# one bin per digit; the trailing `;` hides the Axes repr
sns.histplot(x=train_label_df, bins=10);

In [None]:
# absolute and relative label frequencies, displayed together as a tuple
train_label_df.value_counts(), train_label_df.value_counts(normalize=True)

The label distribution is nearly uniform (~10% for each digit). Therefore, we can assume that the dataset is **balanced**.

## Dataset Processing

Let's perform a pixel normalization and then split our full training set into train, validation and test sets:

In [None]:
# Cast to float32 and divide by 255.
# NOTE: both names are rebound in place, so re-running this cell divides the values by 255 again.
train_image_df = train_image_df.astype('float32') / 255.0
submission_df = submission_df.astype('float32') / 255.0

In [None]:
# `train_test_split` only stratifies when `stratify=` is passed explicitly, so pass the
# labels to keep the digit proportions identical in every split.

# 80% / 20%: training pool vs. validation set
X_train, X_val, y_train, y_val = train_test_split(
    train_image_df, train_label_df,
    test_size=0.2,
    stratify=train_label_df,
    random_state=RANDOM_SEED,
)

# 15% of the remaining training pool becomes the hold-out test set
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=RANDOM_SEED,
)

In [None]:
# overlay the label histograms of the validation and test splits on one plot
sns.histplot(data=y_val, bins=10)
sns.histplot(data=y_test, bins=10);

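Eventually, we get **validation and test sets distributed in the same way** as the original dataset. This is what we expect when splitting a large, balanced dataset; note that Sklearn's `train_test_split` only guarantees it through **stratified splitting** when the `stratify=` argument is passed.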

## PCA and Manifold Learning

We can immediately recognize that **some of the pixels always contain a zero value** across all samples (pixel0, pixel1, pixel2, pixel3, pixel4, etc.). From the image previews, we can see that these pixels form the border of the canvas. **Each digit** is centered on the canvas, so most of the images have **empty borders** which don't bring **any new information** to the classification model.

Another thing to notice: all of the digits are **formed by lines/curves** and they occupy specific regions of the canvas (compared to a landscape picture, which takes up the whole canvas). Also, **pixels in the same neighborhood** correlate (digit strokes are more than one pixel wide). So there should be fewer **degrees of freedom** because of that, and we most likely don't need the whole 28x28 dimension to classify images.

This is the reason why we would like to perform **principal component analysis**.

In [None]:
from sklearn.decomposition import PCA

# full PCA (all components kept) on the normalized pixel columns
pc_analyser = PCA(random_state=RANDOM_SEED)

pc_analyser.fit(train_image_df)

# running total of the explained variance ratio, component by component
accumulated_variance_ratio = np.cumsum(pc_analyser.explained_variance_ratio_)

plt.plot(accumulated_variance_ratio)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

# argmax of the boolean mask is the first index reaching 95%; +1 turns the index into a count
print('95%% of variance can be covered by %d components' % (np.argmax(accumulated_variance_ratio >= 0.95) + 1))

As we expected, we need **only 154 components** to **cover 95% of digit variance** which is 5 times less. Great 🙌

In [None]:
# refit with the 154 components reported above; rebinds `pc_analyser`
pc_analyser = PCA(n_components=154, random_state=RANDOM_SEED)

# PCA projection of the pixel columns (fed to t-SNE further below)
reduced_train_image_df = pc_analyser.fit_transform(train_image_df)

In [None]:
# separate 2-component PCA used only for the scatter plot below
pca_analyzer = PCA(
    n_components=2,
    random_state=RANDOM_SEED
)

x2d = pca_analyzer.fit_transform(train_image_df)

# put the two components next to the labels (column-wise concat aligns on the index)
pca_df = pd.DataFrame(data=x2d, columns=['pc1', 'pc2'])
pca_df = pd.concat([pca_df, train_label_df], axis=1)

plt.figure(figsize=(10, 10))
sns.scatterplot(data=pca_df, x='pc1', y='pc2', hue='label', alpha=0.3, palette='muted');

Even a 2-component PCA shows that there are regions in the space with higher densities of particular digits:
- blue 0s region
- orange 1s region
- green 2s region
- red 3s region
- violet 4s region
- magenta 6s region

5s, 7s, 8s, 9s regions are harder to find on the plot.


PCA is a linear approach, and we also want to try a non-linear visualization approach.

t-SNE is a good candidate to reveal non-linear dependencies in data. t-SNE tries to preserve the distances between a point and its neighbors in the original space when it embeds them in the reduced space. However, this does not necessarily mean preserving the position of points, but rather their structure in the space. As a result, similar/close points get mapped closer to each other and dissimilar points stay apart.

In [None]:
# t-SNE is run on the 154-component PCA projection rather than on the raw pixel columns
X_tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=50,
    early_exaggeration=12,
    random_state=RANDOM_SEED,
    n_jobs=-1,
).fit_transform(reduced_train_image_df)

# NOTE: `pca_df` is rebound here and now holds the t-SNE embedding plus labels
pca_df = pd.DataFrame(data=X_tsne, columns=['comp1', 'comp2'])
pca_df = pd.concat([pca_df, train_label_df], axis=1)

In [None]:
# scatter of the two t-SNE components, colored by digit label
plt.figure(figsize=(10, 10))
sns.scatterplot(data=pca_df, x='comp1', y='comp2', hue='label', alpha=0.3, palette='muted');

**t-SNE discriminates our digits well**. This means we should try to **use non-linear classification algorithms** to draw decision boundaries between points.

Also, we cannot use t-SNE embeddings as model features without hacky approaches, as t-SNE was designed to visualize relationships and not for **feature representation**.

More Info:
- https://datascience.stackexchange.com/a/25928/57207
- https://stackoverflow.com/a/54266231/4371397

# Classification 🧪

Since our dataset is **balanced** and we need to assign digit class to image representation, this is a **multiclass classification problem**.
We can simply use **accuracy** as metric to maximize during experiments.

In [None]:
def plot_confusion_matrix_by_predictions(y_true, y_predicted, *, labels=None,
                          sample_weight=None, normalize=None,
                          display_labels=None, include_values=True,
                          xticks_rotation='horizontal',
                          values_format=None, colorbar=False,
                          cmap='rocket_r', ax=None):
    """
    Plot a confusion matrix from already computed predictions (no estimator needed).

    `labels`, `sample_weight` and `normalize` are forwarded to `confusion_matrix`;
    `include_values`, `cmap`, `ax`, `xticks_rotation` and `values_format` are forwarded
    to `ConfusionMatrixDisplay.plot`. `colorbar` is accepted but currently not used.

    Returns whatever `ConfusionMatrixDisplay.plot` returns.
    """
    matrix = confusion_matrix(
        y_true, y_predicted,
        labels=labels, sample_weight=sample_weight, normalize=normalize,
    )

    # tick labels: explicit `display_labels` first, then `labels`, then the labels found in the data
    if display_labels is None:
        display_labels = unique_labels(y_true, y_predicted) if labels is None else labels

    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=display_labels)

    return display.plot(
        include_values=include_values,
        values_format=values_format,
        xticks_rotation=xticks_rotation,
        cmap=cmap,
        ax=ax,
    )


In [None]:
def score_classification_model(model, X_train, y_train):
    """
    Score a scikit-learn style classifier with 5-fold cross-validation and on its own training data.

    Prints train / CV accuracy and classification reports and draws the two
    confusion matrices side by side. As a side effect `model` is fitted on the
    whole `(X_train, y_train)` set.

    Returns a tuple `(y_train_predicted, cv_y_predicted)`: predictions of the
    fitted model on `X_train` and the out-of-fold predictions.
    """
    cv_scores = cross_validate(
        model, X_train, y_train,
        scoring=['accuracy'],
        cv=5,
        n_jobs=-1, verbose=0
    )

    # out-of-fold predictions, needed for the CV report and confusion matrix
    cv_y_predicted = cross_val_predict(
        model, X_train, y_train,
        cv=5,
        n_jobs=-1
    )

    cv_accuracy, accuracy_std = cv_scores['test_accuracy'].mean(), cv_scores['test_accuracy'].std()

    model.fit(X_train, y_train)

    y_train_predicted = model.predict(X_train)

    train_accuracy = accuracy_score(y_train, y_train_predicted)

    print('[Train] Accuracy: %.4f' % (train_accuracy))
    print('Train Set Report:')
    print(classification_report(y_train, y_train_predicted, digits=3))

    print('[CV] Accuracy: %.4f (%.4f)' % (cv_accuracy, accuracy_std))
    print('CV Report:')
    print(classification_report(y_train, cv_y_predicted, digits=3))

    # display confusion matrices

    _, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 10))

    # Reuse the predictions computed above instead of `sklearn.metrics.plot_confusion_matrix`:
    # that function is deprecated (removed in scikit-learn 1.2) and would run
    # `model.predict(X_train)` a second time.
    ax0.set_title('Train Confusion Matrix')
    plot_confusion_matrix_by_predictions(
        y_train, y_train_predicted,
        cmap=plt.cm.Blues,
        ax=ax0,
    )

    ax1.set_title('CV Confusion Matrix')
    plot_confusion_matrix_by_predictions(
        y_train, cv_y_predicted,
        cmap=plt.cm.Blues,
        ax=ax1,
    )

    return y_train_predicted, cv_y_predicted

## SVM

The very first model we are going to try is **RBF-kerneled SVM classifier on PCA processed dataset**:

In [None]:
# PCA to 154 components followed by an RBF-kernel SVM.
# NOTE(review): the origin of gamma=0.06 is not shown in this notebook.
ksvm_pipeline = Pipeline([
    ('pca', PCA(n_components=154, random_state=RANDOM_SEED)),
    ('classifier', SVC(kernel='rbf', gamma=0.06)),
])

In [None]:
# score_classification_model(ksvm_pipeline, X_train, y_train);

In [None]:
# The SVM experiment is switched off by default because fitting is slow.
# Previously only the `predict` line was commented out, so this cell failed on a
# fresh kernel with a NameError (`y_test_pred` was never assigned).
RUN_SVM_EVALUATION = False

if RUN_SVM_EVALUATION:
    # fit here so the cell does not depend on the commented-out scoring cell above
    ksvm_pipeline.fit(X_train, y_train)
    y_test_pred = ksvm_pipeline.predict(X_test)

    print('Test Report')
    print(classification_report(y_test, y_test_pred, digits=4))
    plot_confusion_matrix_by_predictions(y_test, y_test_pred)

SVM is pretty **slow** on this amount of data.

### Submission

In [None]:
# Switched off by default (slow). Previously only the `predict` line was commented
# out, so `y_submission` was undefined here and the cell raised a NameError on a
# fresh kernel.
RUN_SVM_SUBMISSION = False

if RUN_SVM_SUBMISSION:
    # refit so this cell works regardless of which SVM cells were executed before
    ksvm_pipeline.fit(X_train, y_train)
    y_submission = ksvm_pipeline.predict(submission_df)

    # Kaggle expects 1-based image ids next to the predicted labels
    submission_label_df = pd.DataFrame({
        'ImageId': list(range(1, len(y_submission) + 1)),
        'Label': y_submission
    })

    submission_label_df.to_csv('./ksvm_submission.csv', index=False, header=True)

I submitted only one prediction from the SVM model and got **0.97610**. In general, this is a pretty decent result. I did not spend a lot of time playing with SVM, as this task is mainly for NN models, which can achieve even higher accuracy.

## Multilayer Perceptron

In [None]:
def plot_training_history(training_history, metrics=['loss', 'accuracy'], best_epoch_metric='val_accuracy', figsize=(15, 5)):
    """
    Plot a Keras training history (one plot per metric) and build a text report.

    Each plot shows the train and `val_` curves of one metric with the y-axis
    limited to [0, 1] and a dashed vertical line at the best epoch, i.e. the epoch
    where `best_epoch_metric` is the highest.

    Returns a string table with the train / validation value of every metric at
    the best epoch.
    """
    training_history_df = pd.DataFrame(training_history.history)

    # position of the best epoch (works on the raw values, independent of the index)
    best_epoch = int(np.argmax(training_history_df[best_epoch_metric].to_numpy()))

    # squeeze=False always returns a 2D array of axes, so a single metric works as well
    _, axes_grid = plt.subplots(1, len(metrics), figsize=figsize, squeeze=False)

    for ax, metric in zip(axes_grid[0], metrics):
        training_history_df[[metric, 'val_' + metric]].plot(
            title=metric,
            grid=True,
            ax=ax
        )
        # set the limits on the axes being drawn; `plt.gca()` pointed at whichever
        # axes happened to be current instead
        ax.set_ylim(0, 1)

        ax.axvline(
            best_epoch,
            ls="--",
            c="k",
            lw=1,
        )

    digits = 4
    headers = ['Train', 'Validation']
    width = max(len(headers[1]), digits)

    head_fmt = '{:>{width}s}' + ' {:>9} ' * len(headers)
    report = head_fmt.format('', *headers, width=width)
    report += '\n\n'
    row_fmt = ' {:>9}' + ' {:>9.{digits}f}' * 2 + '\n'

    for metric in metrics:
        report += row_fmt.format(
            metric,
            training_history_df[metric].iloc[best_epoch],
            training_history_df['val_' + metric].iloc[best_epoch],
            digits=digits,
        )

    return report

In [None]:
def plot_misclassified_samples(y_test, y_test_pred, true_label, pred_label, images=None):
    """
    Show up to 30 samples whose true label is `true_label` but were predicted as `pred_label`.

    `y_test` and `y_test_pred` are label sequences aligned position by position with
    `images`. `images` holds one flattened 28x28 picture per row; when omitted, the
    notebook-level `X_test` is used (the previous, hard-wired behavior).
    """
    if images is None:
        images = X_test

    # Work on plain arrays: indexing a pandas Series with `[idx]` is label-based and
    # would pick the wrong rows for a shuffled index.
    true_values = np.asarray(y_test)
    predicted_values = np.asarray(y_test_pred)
    image_values = np.asarray(images)

    missclass_label_indices = np.flatnonzero(
        (true_values == true_label) & (predicted_values == pred_label)
    )

    fig = plt.figure(figsize=(20, 10))
    fig.subplots_adjust(hspace=0.4, wspace=0.4)

    for i, idx in enumerate(missclass_label_indices[:30]):
        ax = fig.add_subplot(3, 10, i + 1)
        ax.axis('off')

        # caption below the picture: position in the test arrays, predicted and actual label
        ax.text(0.5, -0.35, 'ID = ' + str(idx), fontsize=10, ha='center', transform=ax.transAxes)
        ax.text(0.5, -0.6, 'pred = ' + str(predicted_values[idx]), fontsize=10, ha='center', transform=ax.transAxes)
        ax.text(0.5, -0.8, 'act = ' + str(true_values[idx]), fontsize=10, ha='center', transform=ax.transAxes)
        ax.imshow(np.reshape(image_values[idx], (28, 28)), cmap='binary')

Neural Networks are great models to tackle complex non-linear problems where a bunch of data is available for learning from.
We will be using **Keras** as a NN framework.

Before we start building any models, we need to make sure our labels are **onehot encoded** so we can use cross entropy loss function during network training:

In [None]:
# number of target classes passed to `to_categorical` and used as the output layer size
NUM_CLASSES = 10

# one-hot encode the integer labels of every split
y_train_onehot = to_categorical(y_train, NUM_CLASSES)
y_val_onehot = to_categorical(y_val, NUM_CLASSES)
y_test_onehot = to_categorical(y_test, NUM_CLASSES)

### Architecture

In [None]:
with tpu_strategy.scope():
    # flat vector of 28x28 pixels; `(784,)` is a proper one-element shape tuple
    input_layer = Input((784,))

    # hidden block 1: Dense -> BatchNorm -> ReLU -> Dropout
    mlp = Dense(200)(input_layer)
    mlp = BatchNormalization()(mlp)
    mlp = ReLU()(mlp)
    mlp = Dropout(0.2)(mlp)

    # hidden block 2 consumes the output of block 1. It used to be wired to
    # `input_layer`, which silently dropped block 1 from the model.
    mlp = Dense(200)(mlp)
    mlp = BatchNormalization()(mlp)
    mlp = ReLU()(mlp)
    mlp = Dropout(0.2)(mlp)

    output_layer = Dense(NUM_CLASSES, activation='softmax')(mlp)

    mlp_model = Model(input_layer, output_layer, name='MLP')

    mlp_model.compile(
        loss='categorical_crossentropy',
        # `learning_rate=` replaces the deprecated `lr=` alias (same spelling as in the CNN below)
        optimizer=Adam(learning_rate=0.0005),
        metrics=['accuracy']
    )

mlp_model.summary()

### Training

In [None]:
# Stop after 10 epochs without an improvement of at least 0.001 in the monitored
# quantity (Keras' default, since `monitor` is not set) and roll back to the best weights.
early_stopping = EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

# train on the flat pixel rows with one-hot targets; validate on the validation split
mlp_train_history = mlp_model.fit(
    X_train, y_train_onehot, 
    validation_data=(X_val, y_val_onehot),
    batch_size=32, 
    epochs=50, 
    shuffle=True,
    callbacks=[early_stopping],
)

In [None]:
# draws the loss/accuracy curves and prints the best-epoch report returned by the helper
print(plot_training_history(mlp_train_history))

### Evaluation

In [None]:
# loss and accuracy on the hold-out test split
mlp_model.evaluate(X_test, y_test_onehot)

In [None]:
# back from one-hot / probability vectors to integer class labels
y_test_true = np.argmax(y_test_onehot, axis=-1)
y_test_pred = np.argmax(mlp_model.predict(X_test), axis=-1)

In [None]:
# per-class precision/recall/F1 plus the confusion matrix of the MLP on the test split
print('Test Report')
print(classification_report(y_test_true, y_test_pred, digits=4))
plot_confusion_matrix_by_predictions(y_test_true, y_test_pred)

In [None]:
# test digits that are 9 but were predicted as 7
plot_misclassified_samples(y_test_true, y_test_pred, 9, 7)

### Submission

In [None]:
# most probable class for every row of the (already normalized) submission frame
y_submission = np.argmax(mlp_model.predict(submission_df), axis=-1)

# image ids are 1-based positions of the rows
submission_label_df = pd.DataFrame({
    'ImageId': list(range(1, len(y_submission) + 1)), 
    'Label': y_submission
})

submission_label_df.to_csv('./mlp_submission.csv', index=False, header=True)

I did not play with MLP a lot either. I tried up to ten different architectures and stopped at the current one. It gives:
- **~0.9787** on the validation set
- **0.97614** on the Kaggle submission

This is an almost invisible improvement over what was achieved with the SVM model. I believe we could do better with an MLP model.

## CNN

I was in a bit of a hurry to start playing with **Convolutional Neural Networks** 🙌

CNNs are a proven leader in Computer Vision tasks. Convolutional layers are what makes the difference. These are learnable NN blocks that are capable of extracting features from images and capturing low- and high-level details and patterns across the spatial dimensions.

This is a section of the notebook where I spent most of the time and burnt most of GPU resources 🔥

I have tried 40 different CNN architectures, tens of learning rates and batch sizes, a few optimizers. See full list of my experiments: https://github.com/roma-glushko/kaggle-digit-recognizer/blob/master/experiments.md

As a result, I kept only the final solution that brought me to the **Top 7%** with an accuracy score of **0.99657**.

The solution is an **ensemble of 10 NNs, each with 3 double-convolutional layers + a 128-unit Dense layer, with BatchNorms after each layer and 0.4 Dropouts at the end of each layer**. On top of that, I used data augmentation with a slight degree of random zooming and rotation to make the models generalize better.

Also, using **Adam(learning_rate=1e-3)** boosted the model accuracy (I tried Nadam as well).

An ensemble of NN models helps a lot with measuring the real impact of changes during an experiment, as NN experiments stay stochastic even when the random seed is fixed.

An idea of this architecture comes from the following experiments:
- https://www.kaggle.com/cdeotte/how-to-choose-cnn-architecture-mnist
- https://www.kaggle.com/cdeotte/25-million-images-0-99757-mnist

In our LeNet-like architecture, the first layer is a Conv2D layer; that's why we need to reshape our flat input into 28x28 matrices to be able to feed it to our CNNs:

In [None]:
# Fed to the CNNs' `predict` method. The trailing channel axis makes the arrays match the
# models' declared `(28, 28, 1)` input instead of relying on Keras to add it implicitly.
X_test2d = X_test.to_numpy().reshape((-1, 28, 28, 1))
X_submission2d = submission_df.to_numpy().reshape((-1, 28, 28, 1))

For the final submission, we need to train our ensemble on the whole training dataset. Let's prepare it here:

### Data Augmentation

We are going to apply just slight changes to the original images: random zooming, rotation and height/width shifts.

Data augmentation can hurt in this task. We can imagine that a huge rotation can turn 9s into 6s and vice versa.

In [None]:
# image shape handed to the augmentation helper (last axis is a single channel)
IMAGE_SHAPE = (28, 28, 1)
# global batch size: 16 samples for each replica of the distribution strategy
batch_size = 16 * tpu_strategy.num_replicas_in_sync

print('Num of Replicas: {}'.format(tpu_strategy.num_replicas_in_sync))
print('Batch Size: {}'.format(batch_size))

In [None]:
def get_transofmation_matrix(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """
    Build the 3x3 matrix that transforms pixel indices.

    Adapted from https://www.kaggle.com/cdeotte/rotation-augmentation-gpu-tpu-0-96

    `rotation` and `shear` are given in degrees. The result is the product
    (rotation · shear) · (zoom · shift).
    NOTE(review): every argument is presumably a length-1 float32 tensor, as produced
    by `tf.random.normal([1])` in the caller — confirm before reusing elsewhere.
    """
    def as_3x3(entries):
        # nine length-1 tensors, listed row by row, become one 3x3 matrix
        return tf.reshape(tf.concat(entries, axis=0), (3, 3))

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # degrees -> radians
    rotation_rad = math.pi * rotation / 180.
    shear_rad = math.pi * shear / 180.

    cos_rotation, sin_rotation = tf.math.cos(rotation_rad), tf.math.sin(rotation_rad)
    cos_shear, sin_shear = tf.math.cos(shear_rad), tf.math.sin(shear_rad)

    rotation_matrix = as_3x3([
        cos_rotation,  sin_rotation, zero,
        -sin_rotation, cos_rotation, zero,
        zero,          zero,         one,
    ])

    shear_matrix = as_3x3([
        one,  sin_shear, zero,
        zero, cos_shear, zero,
        zero, zero,      one,
    ])

    zoom_matrix = as_3x3([
        one / height_zoom, zero,             zero,
        zero,              one / width_zoom, zero,
        zero,              zero,             one,
    ])

    shift_matrix = as_3x3([
        one,  zero, height_shift,
        zero, one,  width_shift,
        zero, zero, one,
    ])

    rotation_and_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_and_shift = K.dot(zoom_matrix, shift_matrix)

    return K.dot(rotation_and_shear, zoom_and_shift)

def augment_image(
    image_shape, 
    rotation_range, 
    shear_range, 
    height_zoom_range, 
    width_zoom_range, 
    height_shift_range, 
    width_shift_range
):
    """
    Create a `(image, label) -> (image, label)` function that randomly transforms one image.

    The returned function is meant for `tf.data.Dataset.map`. Rotation, shear and the
    shifts are drawn as `<range> * normal sample`; the zoom factors are drawn as
    `<zoom range> + normal sample / 10`. The label is passed through untouched.
    Only `image_shape[0]` is used as the side length, so images are presumably
    expected to be square.
    """
    def transform_image(image, label):
        # input image - one image of shape `image_shape`, not a batch
        # output - image randomly rotated, sheared, zoomed, and shifted

        image_width = image_shape[0]
        xdim = image_width % 2 # 1 for odd side lengths, 0 for even ones; shifts the lower clip bound below

        # one random draw per transformation parameter (each a length-1 float32 tensor)
        rotation = rotation_range * tf.random.normal([1], dtype='float32')
        shear = shear_range * tf.random.normal([1], dtype='float32') 
        h_zoom = height_zoom_range + tf.random.normal([1], dtype='float32') / 10.
        w_zoom = width_zoom_range + tf.random.normal([1], dtype='float32') / 10.
        h_shift = height_shift_range * tf.random.normal([1], dtype='float32') 
        w_shift = width_shift_range * tf.random.normal([1], dtype='float32') 

        transformation_matrix = get_transofmation_matrix(rotation, shear, h_zoom, w_zoom, h_shift, w_shift) 

        # LIST DESTINATION PIXEL INDICES
        # (coordinates relative to the image center, with a row of ones so shifts can be applied by the matrix)
        x = tf.repeat(tf.range(image_width // 2, -image_width // 2, -1), image_width)
        y = tf.tile(tf.range(-image_width // 2, image_width // 2), [image_width])
        z = tf.ones([image_width * image_width], dtype='int32')
        idx = tf.stack([x, y, z])

        # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
        # (cast back to integer indices and clip them so they stay inside the image)
        idx2 = K.dot(transformation_matrix, tf.cast(idx, dtype='float32'))
        idx2 = K.cast(idx2, dtype='int32')
        idx2 = K.clip(idx2, -image_width // 2 + xdim + 1, image_width // 2)

        # FIND ORIGIN PIXEL VALUES           
        # (convert center-relative coordinates back to array indices and gather the source pixels)
        idx3 = tf.stack([image_width // 2 - idx2[0, ], image_width // 2 - 1 + idx2[1, ]])
        d = tf.gather_nd(image, tf.transpose(idx3))

        return tf.reshape(d, image_shape), label
    
    return transform_image

In [None]:
# Augmentation function used in the `tf.data` pipelines below.
# Rotation and shear scales are in degrees (converted to radians inside the matrix helper);
# the zoom values are the centers of the random zoom factors.
image_augmentor = augment_image(
    IMAGE_SHAPE,
    rotation_range=5,
    shear_range=5,
    height_zoom_range=1.1, 
    width_zoom_range=1.1, 
    height_shift_range=1., 
    width_shift_range=1.,
)

In [None]:
# will be feed to ImageDataGenerator
# X_train3d = X_train.to_numpy().reshape(-1, 28, 28, 1)
# X_val3d = X_val.to_numpy().reshape(-1, 28, 28, 1)


# data_augmentator = ImageDataGenerator(
#     rotation_range = 10,  
#     zoom_range = 0.1, 
#     width_shift_range = 0.1, 
#     height_shift_range = 0.1
# )

# X_augmentation = X_train3d[:100, ]
# y_augmentation = y_train_onehot[:100, ]

# plt.figure(figsize=(15, 4.5))

# for i in range(30):  
#     plt.subplot(3, 10, i + 1)
    
#     augmented_image, _ = data_augmentator.flow(X_augmentation, y_augmentation).next()
    
#     plt.imshow(augmented_image[0].reshape((28, 28)), cmap='binary')
#     plt.axis('off')
        
# plt.subplots_adjust(wspace=-0.1, hspace=-0.1)
# plt.show()

Let's have a quick preview of augmented images:

In [None]:
X_train3d = X_train.to_numpy().reshape(-1, 28, 28, 1)

# One training image, augmented over and over again; 50 variants are enough for the preview.
# No `shuffle` here: the stream contains a single repeated image, so a 4048-element
# shuffle buffer only forced thousands of extra augmentations before the first plot.
augmented_image_batch = (
    tf.data.Dataset
        .from_tensor_slices((X_train3d[11:12, :], y_train_onehot[11:12, :]))
        .map(image_augmentor, num_parallel_calls=AUTOTUNE)
        .repeat()
        .take(50)
)

plt.figure(figsize=(15, 4.5))

# 5 x 10 grid of augmented variants
for i, (augmented_image, label) in enumerate(augmented_image_batch):
    plt.subplot(5, 10, i + 1)

    plt.imshow(tf.reshape(augmented_image, (28, 28)), cmap='binary')
    plt.axis('off')

plt.subplots_adjust(wspace=-0.1, hspace=-0.1)
plt.show()

### Architecture

In [None]:
def build_cnn_model():
    """
    Build and compile one CNN of the ensemble.

    Three convolutional stages (32, 64 and 128 filters), each made of two 3x3
    convolutions and one strided 5x5 convolution, every convolution followed by
    batch normalization and every stage closed by a 0.4 dropout. On top: a
    128-unit dense block and a softmax layer with `NUM_CLASSES` outputs.

    Returns the model compiled with categorical cross-entropy and Adam(1e-3).
    """
    def conv_bn(tensor, filters, kernel_size, strides=1):
        # ReLU convolution with 'same' padding followed by batch normalization
        tensor = Conv2D(filters, kernel_size, strides=strides, activation='relu', padding='same')(tensor)
        return BatchNormalization()(tensor)

    cnn_input_layer = Input((28, 28, 1))
    cnn = cnn_input_layer

    # layers 1-3: same structure, growing number of filters
    for filters in (32, 64, 128):
        cnn = conv_bn(cnn, filters, 3)
        cnn = conv_bn(cnn, filters, 3)
        # the strided convolution halves the spatial resolution
        cnn = conv_bn(cnn, filters, 5, strides=2)
        cnn = Dropout(0.4)(cnn)

    cnn = Flatten()(cnn)

    # layer 4: dense block
    cnn = Dense(128, kernel_initializer='he_normal')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = ReLU()(cnn)
    cnn = Dropout(0.4)(cnn)

    cnn_output_layer = Dense(NUM_CLASSES, activation='softmax')(cnn)

    cnn_model = Model(cnn_input_layer, cnn_output_layer, name='CNN')
    cnn_model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )

    return cnn_model

In [None]:
def split_training_set(X, y, test_size=0.1, random_state=None):
    """
    Split `(X, y)` into train / validation parts and wrap them into `tf.data` datasets.

    `X` is a DataFrame of flattened 28x28 images and `y` holds the integer labels.
    `random_state` seeds the split; when omitted, the notebook-wide `RANDOM_SEED`
    is used (the previous, hard-coded behavior). Pass different values to give
    every ensemble member its own split.

    Returns `(train_dataset, validation_dataset)`:
    - the train dataset is augmented, repeated indefinitely, shuffled and batched,
      so `fit` needs `steps_per_epoch`;
    - the validation dataset is batched and cached, without augmentation.
    """
    if random_state is None:
        random_state = RANDOM_SEED

    X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state
    )

    # local names deliberately differ from the notebook-level X_train3d / y_train_onehot
    train_images = X_train_part.to_numpy().reshape((-1, 28, 28, 1))
    val_images = X_val_part.to_numpy().reshape((-1, 28, 28, 1))
    train_targets = to_categorical(y_train_part, NUM_CLASSES)
    val_targets = to_categorical(y_val_part, NUM_CLASSES)

    # convert in-memory train and validation sets into TF Datasets to take advantage of TPU acceleration

    X_train_dataset = (
        tf.data.Dataset
            .from_tensor_slices((train_images, train_targets))
            .map(image_augmentor, num_parallel_calls=AUTOTUNE)
            .repeat()
            .shuffle(4048)
            .batch(batch_size)
            .prefetch(AUTOTUNE)
    )

    X_val_dataset = (
        tf.data.Dataset
            .from_tensor_slices((val_images, val_targets))
            .batch(batch_size)
            .cache()
            .prefetch(AUTOTUNE)
    )

    return X_train_dataset, X_val_dataset

In [None]:
early_stopping = EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

# exponential decay: the learning rate shrinks by 5% every epoch, starting from 1e-3
learning_rate_scheduler = LearningRateScheduler(lambda epoch: 1e-3 * 0.95 ** epoch)

n_estimators = 10
validation_fraction = 0.1

# The training dataset repeats forever, so `steps_per_epoch` defines an epoch. Derive it
# from the rows that really end up in the training part (`train_test_split` puts
# ceil(n * fraction) rows into the validation part) so that one epoch is one pass over
# the training data; it used to be computed from the full dataset size.
n_samples = train_image_df.shape[0]
n_train_samples = n_samples - math.ceil(n_samples * validation_fraction)
step_per_epoch = n_train_samples // batch_size

cnn_ensamble = []
training_histories = []


for i in range(n_estimators):

    # model variables have to be created inside the strategy scope
    with tpu_strategy.scope():
        model = build_cnn_model()

    # keeps the best weights (by Keras' default monitored quantity) of every member on disk
    model_saver = ModelCheckpoint(filepath='model_best_MNIST_{}.h5'.format(i),
                                   save_best_only=True,
                                   verbose=2)

    X_train_dataset, X_val_dataset = split_training_set(train_image_df,
                                                        train_label_df,
                                                        test_size=validation_fraction)

    print("CNN {}: Training...".format(i))

    # `shuffle=True` was dropped: Keras ignores it for `tf.data.Dataset` input, and the
    # shuffling is already done inside the dataset pipeline
    cnn_train_history = model.fit(
        X_train_dataset,
        validation_data=X_val_dataset,
        steps_per_epoch=step_per_epoch,
        epochs=200,
        callbacks=[early_stopping, learning_rate_scheduler, model_saver],
        verbose=1,
    )

    cnn_ensamble.append(model)
    training_histories.append(cnn_train_history)

    # report the metrics of the epoch with the best validation accuracy
    train_history_df = pd.DataFrame(cnn_train_history.history)
    best_epoch = np.argmax(train_history_df['val_accuracy'])
    print("CNN {}: Train Accuracy: {:0.4f} Validation Accuracy: {:0.4f}".format(
        i,
        train_history_df['accuracy'][best_epoch],
        train_history_df['val_accuracy'][best_epoch]
    ))

### Evaluation

In [None]:
def evaluate_ensamble(ensamble_list, X_test):
    """
    Soft-voting prediction of an ensemble.

    Adds up the class probabilities returned by `predict` of every model in
    `ensamble_list` and returns, for each row of `X_test`, the index of the class
    with the highest total.
    """
    summed_probabilities = np.zeros((X_test.shape[0], NUM_CLASSES))

    for member in ensamble_list:
        summed_probabilities = np.add(summed_probabilities, member.predict(X_test))

    return summed_probabilities.argmax(axis=1)

In [None]:
# one set of curves and one best-epoch report per ensemble member
for training_history in training_histories:
    print(plot_training_history(training_history))

In [None]:
# integer labels of the test split vs. the ensemble's summed-probability vote
y_test_true = np.argmax(y_test_onehot, axis=-1)
y_test_pred = evaluate_ensamble(cnn_ensamble, X_test2d)

print('Test Report')
print(classification_report(y_test_true, y_test_pred, digits=4))

# enlarge the figure that holds the confusion matrix
matrix = plot_confusion_matrix_by_predictions(y_test_true, y_test_pred)
matrix.figure_.set_figheight(10)
matrix.figure_.set_figwidth(10)

In [None]:
# test digits that are 0 but were predicted as 8 by the ensemble
plot_misclassified_samples(y_test_true, y_test_pred, 0, 8)

In [None]:
from keract import get_activations, display_activations, display_heatmaps

# a single test image with an explicit channel axis
# NOTE(review): index 3201 is hard-coded; the reason for picking this sample is not shown here
sample_image = X_test2d[3201].reshape((28, 28, 1))

# activations of the first ensemble member for a batch containing just that image
activations = get_activations(cnn_ensamble[0], np.array([sample_image]))

display_activations(activations, cmap='binary');

### Submission

In [None]:
# ensemble vote for every image of the competition test file
y_submission = evaluate_ensamble(cnn_ensamble, X_submission2d)

# image ids are 1-based positions of the rows
submission_label_df = pd.DataFrame({
    'ImageId': list(range(1, len(y_submission) + 1)), 
    'Label': y_submission
})

submission_label_df.to_csv('./cnn_ensamble_submission.csv', index=False, header=True)

# Summary

A fundamental computer vision problem such as MNIST can be pretty challenging, especially if you want to understand what is the most precise architecture you can build from the existing blocks.

We tried 3 models and ended up using a CNN ensemble to climb to the Top 6% (accuracy score 0.99657). Thanks to the awesome Kaggle community that shared a lot of useful information and knowledge.

## References:

- https://www.kaggle.com/jedrzejdudzicz/mnist-dataset-100-top-6
- https://www.kaggle.com/cdeotte/how-to-choose-cnn-architecture-mnist
- https://www.kaggle.com/cdeotte/25-million-images-0-99757-mnist
- https://www.kaggle.com/c/digit-recognizer/discussion/61480
- https://www.kaggle.com/dingli/digits-recognition-with-cnn-keras