In [None]:
!pip install visualkeras  

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import visualkeras

from sklearn.model_selection import train_test_split

Read the training and testing dataset files:

In [None]:
# Read the digit-recognizer training and testing CSV files into DataFrames.
Train = pd.read_csv(
    '../input/digit-recognizer/train.csv'
)
Test = pd.read_csv(
    '../input/digit-recognizer/test.csv'
)

In [None]:
Train.head()

# Data Exploration:

The test set should have 28000 instances and 784 columns (pixels), while the training set should have 42000 instances and 785 columns (one more, because the first column is the label):

In [None]:
Train.shape, Test.shape

Defining the dataset to be used in the modeling and its label:

In [None]:
# Separate the target column from the pixel features.
Y_train = Train['label']
X_train = Train.drop(columns=['label'])

# Pass the labels by keyword: in current seaborn releases the first positional
# parameter of countplot is `data`, not the variable to count.
sns.countplot(x=Y_train)

Above we can see that the classes in our label are quite balanced, so class imbalance should not be a problem for classification.

In [None]:
X_train.shape

Also, let's see if there is any null value in both datasets:

In [None]:
X_train.isna().sum().sum()

In [None]:
Test.isna().sum().sum()

As we are dealing with images we have to apply Min-Max Scaling on both datasets:

In [None]:
# Divide both feature sets by 255 in one step.
# NOTE: this rebinds the same names, so re-running the cell scales the data again.
X_train, Test = X_train / 255, Test / 255

Now it's time to reshape the images. Each row in the CSV file is a flattened image, so it will be converted to a square image of dimensions 28x28. However, that's not all: the network expects the images as a 4-dimensional array, i.e.:

(N° images, height, width, color channels):

In [None]:
# Turn every flattened row into a (28, 28, 1) array; -1 lets NumPy infer the
# number of images from the data.
X_train = np.reshape(X_train.values, (-1, 28, 28, 1))
Test = np.reshape(Test.values, (-1, 28, 28, 1))

Just to confirm that both datasets have changed their shape properly:

In [None]:
X_train.shape, Y_train.shape

In [None]:
Test.shape

In [None]:
X_train[0].shape

Let's look at one image from the training dataset. In order to plot it with matplotlib we first have to reshape it to a two-dimensional array, i.e. (28x28); np.squeeze is useful for this task:

In [None]:
# Index of the training image to preview; used for both the label and the pixels
# so the two can never get out of sync.
sample_idx = 34

print('Image class is: ', Y_train[sample_idx])
# Drop the trailing channel axis so imshow receives a 2-D (28, 28) array.
plt.imshow(np.squeeze(X_train[sample_idx], axis=-1))

Our label can be used as is or one-hot encoded. I will one-hot encode it using the to_categorical function; keep in mind that with one-hot encoded labels the model must use "categorical_crossentropy" as its loss function:

In [None]:
# Import from the public `keras.utils` namespace: the `keras.utils.np_utils`
# module is not available in newer Keras releases.
from keras.utils import to_categorical

# One-hot encode the digit labels into 10 columns (one per class).
Y_train = to_categorical(Y_train, num_classes=10)

Time now to split our data into training and validation sets; the validation set will take 10% of the total instances:

In [None]:
# Hold out 10% of the labelled data for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    random_state=42,
)

In [None]:
X_train.shape, Y_train.shape

In [None]:
X_val.shape, Y_val.shape

# Modeling

This step will consider the evaluation of one robust model with four optimizers and the best one will be chosen.

Let's import some libraries which will be used next:

In [None]:
import tensorflow as tf

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.layers import BatchNormalization
from tensorflow.keras.optimizers import RMSprop,Adam,SGD,Adadelta
from keras.preprocessing.image import ImageDataGenerator

ImageDataGenerator function will be used to increase the number of images by changing a bit of their characteristics in order to make our model even more robust to changes in typical images.

In [None]:
# Augmentation settings, grouped by purpose and handed to the generator at once.
augmentation_options = dict(
    # Normalisation / whitening switches: all disabled.
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    # Small random geometric perturbations.
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    # Flips are disabled.
    horizontal_flip=False,
    vertical_flip=False,
)
datagen = ImageDataGenerator(**augmentation_options)

Let's define a callback which stops the training once the validation accuracy exceeds 99.5%; the check is done at the end of each epoch:

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Keras callback that stops training once validation accuracy exceeds 99.5%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's validation accuracy and request a stop if it is high enough.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Metrics reported for the epoch; may be None or lack 'val_accuracy'.
    """
    # Guard against a missing metric so the comparison never sees None.
    val_accuracy = (logs or {}).get('val_accuracy')
    if val_accuracy is not None and val_accuracy > 0.995:
      print("\nReached 99.5% accuracy so cancelling training!")
      self.model.stop_training = True

## Adam:

Time to build our network. It will consist of the following layers: 3 Conv2D, 3 MaxPooling2D, 3 BatchNormalization, 1 Flatten, 2 Dense and 1 Dropout, with the following characteristics:

In [None]:
# Input size is taken from the training array: (height, width, 1 channel).
input_dims = (X_train.shape[1], X_train.shape[2], 1)

# Three Conv2D -> MaxPool2D -> BatchNormalization stages followed by a dense
# classifier head with a 10-way softmax output.
model = Sequential([
    Conv2D(64, (3, 3), strides=1, padding='Same', activation='relu', input_shape=input_dims),
    MaxPool2D(2, 2),
    BatchNormalization(),
    Conv2D(128, (3, 3), strides=1, padding='Same', activation='relu'),
    MaxPool2D(2, 2),
    BatchNormalization(),
    Conv2D(128, (3, 3), strides=1, padding='Same', activation='relu'),
    MaxPool2D(2, 2),
    BatchNormalization(),
    Flatten(),
    Dense(1024, activation="relu"),
    Dropout(0.2),
    Dense(10, activation="softmax"),
])

optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

The following code trains our model with the Adam optimizer; its four metrics (accuracy, loss, validation accuracy, validation loss) are then shown as a dataframe so they can be compared easily with the other runs:

In [None]:
# datagen.fit is only required when featurewise centering/normalisation or ZCA
# whitening is enabled (all are off for this generator); kept as in the original flow.
datagen.fit(X_train)
callbacks = myCallback()
# Model.fit accepts the augmenting generator directly; fit_generator is
# deprecated and no longer available in recent Keras releases.
history = model.fit(datagen.flow(X_train, Y_train, batch_size=32),
                    validation_data=(X_val, Y_val), epochs=5, verbose=1,
                    callbacks=[callbacks])

In [None]:
pd.DataFrame(history.history)

## Stochastic Gradient Descent:

In [None]:
# Recompile the existing `model` object with plain SGD; the network is not
# rebuilt in this cell.
optimizer2 = SGD(learning_rate=0.01)
model.compile(
    optimizer=optimizer2,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Model.fit accepts the augmenting generator directly; fit_generator is
# deprecated and no longer available in recent Keras releases.
history2 = model.fit(datagen.flow(X_train, Y_train, batch_size=32),
                     validation_data=(X_val, Y_val), epochs=5, verbose=1,
                     callbacks=[callbacks])

In [None]:
pd.DataFrame(history2.history)

## RMSprop:

In [None]:
# Recompile the existing `model` object with RMSprop; the network is not
# rebuilt in this cell.
optimizer3 = RMSprop(
    learning_rate=0.001,
    rho=0.9,
    momentum=0.0,
    epsilon=1e-07,
)
model.compile(
    optimizer=optimizer3,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Model.fit accepts the augmenting generator directly; fit_generator is
# deprecated and no longer available in recent Keras releases.
history3 = model.fit(datagen.flow(X_train, Y_train, batch_size=32),
                     validation_data=(X_val, Y_val), epochs=5, verbose=1,
                     callbacks=[callbacks])

In [None]:
pd.DataFrame(history3.history)

## Adadelta:

In [None]:
# Recompile the existing `model` object with Adadelta; the network is not
# rebuilt in this cell.
optimizer4 = Adadelta(
    learning_rate=0.001,
    rho=0.95,
    epsilon=1e-07,
)
model.compile(
    optimizer=optimizer4,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Model.fit accepts the augmenting generator directly; fit_generator is
# deprecated and no longer available in recent Keras releases.
history4 = model.fit(datagen.flow(X_train, Y_train, batch_size=32),
                     validation_data=(X_val, Y_val), epochs=5, verbose=1,
                     callbacks=[callbacks])

In [None]:
pd.DataFrame(history4.history)

## Comparing performance of the four models:

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(20,17))

# (training history, line colour) for each optimizer, in legend order.
optimizer_runs = [
    (history, "purple"),
    (history2, "orange"),
    (history3, "green"),
    (history4, "blue"),
]
optimizer_names = ["Adam", "SGD", "RMSprop", "Adadelta"]

# One entry per subplot: (axes, history key, y-axis limits, title).
panels = [
    (axs[0, 0], "loss", [-0.1, 1.1], 'Optimizers Train-Loss'),
    (axs[0, 1], "val_loss", [-0.1, 1.1], 'Optimizers Val-Loss'),
    (axs[1, 0], "accuracy", [0.6, 1.1], 'Optimizers Train-Accuracy'),
    (axs[1, 1], "val_accuracy", [0.6, 1.1], 'Optimizers Val-Accuracy'),
]

# Draw the same metric for every optimizer on each panel instead of repeating
# the plotting code four times.
for panel_ax, metric, y_limits, title in panels:
    for run, colour in optimizer_runs:
        panel_ax.plot(run.history[metric], c=colour)
    panel_ax.set_ylim(y_limits)
    panel_ax.legend(optimizer_names)
    panel_ax.set_title(title)

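Above we can see that all four runs perform outstandingly, reaching just under 99.5% validation accuracy. Keep in mind that the same `model` object was recompiled for each optimizer without being rebuilt, so every run after Adam continues from the weights left by the previous one rather than starting from scratch. Given this plot, choosing the best model does not make much difference, but Adadelta is chosen to continue from now on.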

In [None]:
model.summary()

In [None]:
visualkeras.layered_view(model)

Unfortunately, this function, which draws the layers of our model, does not explain what each color means, but by looking at the summary of the network you can understand it perfectly!

The following function will plot the metrics of the best model chosen:

In [None]:
import matplotlib.pyplot as plt

def metrics_plot(history):
  """Plot training vs. validation accuracy, then training vs. validation loss.

  Args:
    history: Object whose `.history` mapping holds the per-epoch lists
      'accuracy', 'val_accuracy', 'loss' and 'val_loss'.

  The accuracy curves are drawn on the current figure and the loss curves on a
  newly created one; both are shown with a single `plt.show()` call.
  """
  recorded = history.history
  # Look up all four series first so a missing key fails before anything is drawn.
  series = {key: recorded[key] for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')}
  epochs = range(len(series['accuracy']))

  # (training key, validation key, training label, validation label, title)
  panels = (
      ('accuracy', 'val_accuracy', 'Training accuracy', 'Validation accuracy',
       'Training and validation accuracy'),
      ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
       'Training and validation loss'),
  )

  for position, (train_key, val_key, train_label, val_label, heading) in enumerate(panels):
    if position:
      # Every panel after the first gets its own figure.
      plt.figure()
    plt.plot(epochs, series[train_key], 'r', label=train_label)
    plt.plot(epochs, series[val_key], 'b', label=val_label)
    plt.title(heading)
    plt.legend()

  plt.show()

In [None]:
# Report the last-epoch metrics of the Adadelta run (history4): it is the
# optimizer selected in the comparison and the one `model` was trained with last.
print('Train accuracy: ', history4.history['accuracy'][-1])
print('Val accuracy: ', history4.history['val_accuracy'][-1])
print('Train loss: ', history4.history['loss'][-1])
print('Val loss: ', history4.history['val_loss'][-1])

In [None]:
# Plot the curves of the Adadelta run (history4), the optimizer selected in the
# comparison and the one `model` was trained with last.
metrics_plot(history4)

Let's predict the classes of the validation dataset which contains 4200 instances and look at the first 5 predictions:

In [None]:
# The prediction array is stored inside a 1-element tuple; the cells that follow
# read it back as `prediction_val[0]`.
prediction_val = (model.predict(X_val),)
print(prediction_val[0][:5])

As we can see above, the prediction for each instance is a vector of 10 class probabilities (the softmax output), so we need to get the index with the highest probability, which in simple words corresponds to the predicted class:

In [None]:
# For every validation prediction keep the index of its largest entry.
class_pred_val = list(map(np.argmax, prediction_val[0]))
print(class_pred_val[:5])

Nice! Our predictions are now integers from 0 to 9. As we want to compare them with the actual labels, we have to convert the one-hot encoded labels back to integers too:

In [None]:
# Convert each one-hot validation label back to the index of its largest entry.
val_labels = list(map(np.argmax, Y_val))
print(val_labels[:5])

The following code plots the confusion matrix of our predictions against the actual labels as a heat map:

In [None]:
from sklearn.metrics import confusion_matrix

# Heat map of true vs. predicted classes on the validation set.
f, ax = plt.subplots(figsize=(15, 15))
confusion_mtx = confusion_matrix(val_labels, class_pred_val)
sns.set(font_scale=1.4)
# fmt="d" prints the cell counts as plain integers; the default annotation
# format (".2g") would render counts of 100 or more in scientific notation.
sns.heatmap(confusion_mtx, annot=True, fmt="d", linewidths=0.01,
            cmap="Greens", linecolor="gray", ax=ax)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix Validation set")
plt.show()

Also, the classification report is useful to know for each class the recall, precision and F1-score, and obviously the accuracy:

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the validation set, as plain text.
report = classification_report(
    val_labels,
    class_pred_val,
)
print(report)

# Prediction on the test set:

Let's predict the classes for the testing dataset using our model:

In [None]:
# Run the trained network on the reshaped test images.
results = model.predict(Test)

In [None]:
results[0:5]

Convert the predicted probability vectors into predicted class labels:

In [None]:
# For every test prediction keep the index of its largest entry.
pred_test = list(map(np.argmax, results))
print(pred_test[:5])

The following is to create a dataframe with the results and saving as csv file:

In [None]:
# ImageId is 1-based with one row per prediction; derive its range from the
# number of predictions instead of hard-coding the test-set size.
image_ids = pd.Series(range(1, len(pred_test) + 1), name='ImageId')
submission = pd.DataFrame(pred_test, index=image_ids, columns=['Label'])
submission.to_csv("mnist_prediction_submission.csv")

In [None]:
submission.head(10)