In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**According to Pythagoras, " Numbers rule the universe ".**

Hence, in this notebook our goal is to correctly identify the numbers (digits) from a dataset of tens of thousands of handwritten images.

**IMPORTING LIBRARIES:**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf

# Seed NumPy's global random generator. Only NumPy is seeded in this cell;
# no TensorFlow seed is set here.
np.random.seed(2)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from tensorflow.keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau

**LOADING THE DATA:**

In [None]:
# Read training and test data files
# train.csv carries a 'label' column next to the pixel columns (it is split off
# further down); test.csv is used as features only.
train_df = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
test_df = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")
# Last expression of the cell: shows the first rows of the training frame.
train_df.head()

In [None]:
# Report (rows, columns) of the training frame, then of the test frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Separate the target column from the pixel columns.
train_y = train_df['label'].astype('float32')
# Train and test pixels share the same float32 dtype. Casting the training
# pixels to an integer type would make the later division by 255.0 promote
# them to float64 while the test pixels stay float32.
train_x = train_df.drop(columns=['label']).astype('float32')
test_x = test_df.astype('float32')
# Last expression of the cell: shapes of the three frames for a quick sanity check.
train_x.shape, train_y.shape, test_x.shape

In [None]:
# Bar chart of how many samples each label has. The Series is passed as the
# keyword `x`: newer seaborn releases make `x`/`y` keyword-only and interpret a
# positional argument as `data`, which would not produce per-label counts.
g = sns.countplot(x=train_y)

# Same information as a table (count per label).
train_y.value_counts()

**NORMALIZATION & RESHAPE:**

* **NORMALIZATION:**

We perform a grayscale normalization to reduce the effect of illumination differences. Moreover, the CNN converges faster on [0..1] data than on [0..255].

* **RESHAPE:**

Train and test images (28px x 28px) are stored in a pandas DataFrame as 1D vectors of 784 values. We reshape all data to 28x28x1 3D matrices. Keras requires an extra dimension at the end, which corresponds to channels. MNIST images are grayscale, so they use only one channel.


In [None]:
# Turn every flat pixel row into a 28 x 28 x 1 image (height, width, channel)
# and divide by 255.0 in the same statement; training set first, then test set.
train_x = train_x.values.reshape(-1, 28, 28, 1) / 255.0
test_x = test_x.values.reshape(-1, 28, 28, 1) / 255.0
# Last expression of the cell: the resulting array shapes.
train_x.shape, test_x.shape

**ONE HOT ENCODING:**

One hot encoding is one method of converting data to prepare it for an algorithm and get a better prediction. With one-hot encoding, we convert each categorical value into a new categorical column and assign a binary value of 1 or 0 to those columns. Each integer value is represented as a binary vector.

Here, the labels are the 10 digits from 0 to 9. We need to encode these labels as one-hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0]).

In [None]:
# One-hot encode the labels into 10 columns, e.g. 2 -> [0,0,1,0,0,0,0,0,0,0].
train_y = tf.keras.utils.to_categorical(train_y, num_classes=10)
# Last expression of the cell: shape of the encoded label matrix.
train_y.shape

In [None]:
# Original (integer) labels of the first 5 rows, for comparison with the encoding.
print(train_df['label'].head(5))

In [None]:
# The same first 5 labels after one-hot encoding (one row per sample).
print(train_y[:5])

**SPLIT THE TRAIN AND THE VALIDATION SET:**

In [None]:
# Set the random seed
# Passed as `random_state` to train_test_split below so the split is repeatable.
random_seed = 2

In [None]:
# Hold out 10% of the samples for validation; the remaining 90% is used for fitting.
# `random_seed` fixes the shuffling so the split is the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.1,
    random_state=random_seed,
)

Here, a small fraction (10%) becomes the validation set on which the model is evaluated, and the rest (90%) is used to train the model.

**VISUALIZATION OF THE DIGITS BY PLOTTING IMAGES:**
    

In [None]:
rows, cols = 5, 6  # grid layout: 5 rows by 6 columns

# Each grid cell gets 2 x 2 inches.
f = plt.figure(figsize=(2 * cols, 2 * rows))

# Draw the first rows*cols training images, one per grid cell, without axes.
for position in range(1, rows * cols + 1):
    f.add_subplot(rows, cols, position)  # subplot positions are 1-based
    plt.imshow(X_train[position - 1].reshape([28, 28]), cmap="Blues")
    plt.axis("off")

# Write the whole grid to disk.
plt.savefig("digits.png")

**VISUALIZATION OF THE FIRST NUMBER:**

In [None]:
# view the first number
plt.imshow(X_train[0].reshape(28, 28))  # (rows, columns)
# Y_train is one-hot encoded at this point, so the digit is recovered with
# argmax; formatting Y_train[0] directly would print the whole 10-element vector.
plt.title(f"Digit: {np.argmax(Y_train[0])}")
plt.show()

**VISUALIZATION OF THE FIRST NUMBER IN MORE DETAIL:**

In [None]:
def visualize_input(img, ax):
    """Draw ``img`` in grayscale on ``ax`` and write every pixel value on top of it.

    Parameters
    ----------
    img : 2-D array
        Image to display. It is indexed as ``img[row][col]`` and must provide
        ``shape`` and ``max()``.
    ax : axes-like object
        Drawing target; its ``imshow`` and ``annotate`` methods are called.

    Notes
    -----
    Each value is rounded to two decimals and centred on its pixel. The text is
    white on pixels below ``img.max() / 2.5`` and black on all others.
    """
    ax.imshow(img, cmap='gray')
    n_rows, n_cols = img.shape
    cutoff = img.max() / 2.5
    for row in range(n_rows):
        for col in range(n_cols):
            value = img[row][col]
            text_color = 'white' if value < cutoff else 'black'
            # xy is (horizontal, vertical), i.e. (column, row).
            ax.annotate(str(round(value, 2)), xy=(col, row),
                        horizontalalignment='center',
                        verticalalignment='center',
                        color=text_color)

# One 15 x 15 inch figure with a single axes, large enough for the per-pixel text.
fig, ax = plt.subplots(figsize=(15, 15))
# The stored image is 28 x 28 x 1; drop the channel axis before drawing.
visualize_input(X_train[0].reshape(28, 28), ax)

**MODEL** - **CNN**

* **Convolutional (Conv2D) layer**:

Convolutional (Conv2D) layer is like a set of learnable filters. Each filter transforms a part of the image (defined by the kernel size) using the kernel filter. The kernel filter matrix is applied on the whole image. Filters can be seen as a transformation of the image.

* **Pooling(MaxPool2D) layer:**

This layer simply acts as a downsampling filter. It looks at each 2x2 block of neighboring pixels and picks the maximal value. Pooling layers are used to reduce computational cost, and to some extent also reduce overfitting.

* **Dropout:**

Dropout is a regularization method, where a proportion of nodes in the layer are randomly ignored (setting their outputs to zero) for each training sample. This randomly drops a proportion of the network and forces the network to learn features in a distributed way.

* **relu:**

'relu' is the rectifier (activation function max(0,x)). The rectifier activation function is used to add non linearity to the network.

* **Flatten layer:**

The Flatten layer is used to convert the final feature maps into a single 1D vector. This flattening step is needed so that we can make use of fully connected layers after some convolutional/maxpool layers.

* **Dense layer:**

A Dense layer is just an artificial neural network (ANN) classifier. Finally, the last layer (Dense(10, activation="softmax")) outputs a probability distribution over the classes.


In [None]:
# Build the CNN layer by layer: three convolutional stages followed by a dense head.
model = tf.keras.models.Sequential()

# Stage 1: two 3x3 convolutions with 32 filters on the 28 x 28 x 1 input, then 2x2 pooling.
model.add(tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)))
model.add(tf.keras.layers.Conv2D(32, (3,3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2,2))

# Stage 2: two 'Same'-padded 3x3 convolutions with 64 filters, 2x2 pooling, 25% dropout.
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu', padding='Same'))
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu', padding='Same'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2)))
model.add(tf.keras.layers.Dropout(0.25))

# Stage 3: same layout as stage 2.
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu', padding='Same'))
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu', padding='Same'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2)))
model.add(tf.keras.layers.Dropout(0.25))

# Classifier head: flatten, two 256-unit ReLU layers, 50% dropout,
# then a 10-way softmax (one output per digit).
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.50))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.summary()

**SETTING THE OPTIMIZER AND ANNEALER:**


* **Optimizer:**

This function will iteratively improve parameters (filters kernel values, weights and bias of neurons) in order to minimise the loss.

* **RMSprop():**

RMSprop (with default values) is a very effective optimizer. The RMSProp update adjusts the Adagrad method in a very simple way in an attempt to reduce its aggressive, monotonically decreasing learning rate.

* **Annealing method of the learning rate (LR):**

The LR is the step by which the optimizer walks through the 'loss landscape'. It's better to have a decreasing learning rate during training to efficiently reach the global minimum of the loss function.

* **ReduceLROnPlateau function:**

Reduce learning rate when a metric has stopped improving. Models often benefit from reducing the learning rate by a factor of 2-10 once learning stagnates. This callback monitors a quantity and if no improvement is seen for a 'patience' number of epochs, the learning rate is reduced.

With the ReduceLROnPlateau function from Keras.callbacks, let us choose to reduce the LR by half if the accuracy is not improved after 3 epochs.



In [None]:
# Define the optimizer
# `learning_rate` is the supported argument name (`lr` is a deprecated alias).
# The former `decay=0.0` argument was a no-op and is dropped because newer
# Keras optimizers reject the `decay` keyword.
optimizer = RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-08)

In [None]:
# Attach optimizer, loss and metric to the model. Categorical cross-entropy is
# used because the labels were one-hot encoded; accuracy is tracked for reporting.
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Set a learning rate annealer
# Halve the learning rate when validation accuracy has not improved for 3 epochs,
# but never go below 1e-5. The monitored name must match the key Keras logs for
# the "accuracy" metric, which is 'val_accuracy' (the same key the history plots
# read); with 'val_acc' the metric is never found and the LR is never reduced.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

In [None]:
# Training length (number of passes over the data) and mini-batch size.
epochs, batch_size = 15, 86

**DATA AUGMENTATION:**

Data augmentation is a technique to artificially create new training data from existing training data. This is done by applying domain-specific techniques to examples from the training data that create new and different training examples.

The idea is to alter the training data with small transformations to reproduce the variations occurring when someone is writing a digit, in order to avoid the overfitting problem.

**For example:**

The number is not centered 

The scale is not the same (some who write with big/small numbers) 

The image is rotated

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Augmentation generator. Only the four geometric transforms listed first are
# active; every centering / normalisation / whitening / flip option is switched off.
datagen = ImageDataGenerator(
        # --- active transforms ---
        rotation_range=10,        # random rotation, in degrees
        zoom_range=0.1,           # random zoom
        width_shift_range=0.1,    # random horizontal shift (fraction of total width)
        height_shift_range=0.1,   # random vertical shift (fraction of total height)
        # --- disabled options ---
        featurewise_center=False,             # would set the input mean to 0 over the dataset
        samplewise_center=False,              # would set each sample mean to 0
        featurewise_std_normalization=False,  # would divide inputs by the dataset std
        samplewise_std_normalization=False,   # would divide each input by its own std
        zca_whitening=False,                  # would apply ZCA whitening
        horizontal_flip=False,                # no horizontal flips
        vertical_flip=False)                  # no vertical flips


# NOTE(review): presumably only needed for the featurewise/ZCA options, which are
# all disabled here - confirm before removing.
datagen.fit(X_train)

Thus, by applying above transformations to our training data, we can easily double or triple the number of training examples and create a very robust model.

In [None]:
# Fit the model
# `Model.fit` accepts the augmenting generator directly; `fit_generator` is
# deprecated and no longer available in recent Keras releases.
# steps_per_epoch makes one epoch cover roughly one pass over the training set.
final = model.fit(datagen.flow(X_train, Y_train, batch_size=batch_size),
                  epochs=epochs,
                  validation_data=(X_val, Y_val),
                  verbose=2,
                  steps_per_epoch=X_train.shape[0] // batch_size,
                  callbacks=[learning_rate_reduction])

**MODEL EVALUATION:**

In [None]:
# Accuracy (left panel) and loss (right panel) per epoch, training vs. validation.
f, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(20, 7))

# (axes, title, y-label, history keys to draw); each key doubles as the legend label.
curve_panels = [
    (accuracy_ax, "Accuracy Curve", "Accuracy", ("accuracy", "val_accuracy")),
    (loss_ax, "Loss Curve", "Loss", ("loss", "val_loss")),
]

for panel_ax, panel_title, panel_ylabel, history_keys in curve_panels:
    for history_key in history_keys:
        panel_ax.plot(final.epoch, final.history[history_key], label=history_key)
    panel_ax.set_title(panel_title, fontsize=18)
    panel_ax.set_xlabel("Epochs", fontsize=15)
    panel_ax.set_ylabel(panel_ylabel, fontsize=15)
    panel_ax.grid(alpha=0.3)
    panel_ax.legend()

plt.show()


In [None]:
# Plot the loss and accuracy curves for training and validation,
# stacked vertically: loss on top, accuracy below.
fig, ax = plt.subplots(2, 1)

# Top panel: loss. The line is drawn through ax[0].plot, so no extra `axes=`
# keyword is needed.
ax[0].plot(final.history['loss'], color='g', label="Training loss")
ax[0].plot(final.history['val_loss'], color='r', label="validation loss")
ax[0].set_ylabel("Loss")
legend = ax[0].legend(loc='best', shadow=True)

# Bottom panel: accuracy.
ax[1].plot(final.history['accuracy'], color='g', label="Training accuracy")
ax[1].plot(final.history['val_accuracy'], color='r', label="Validation accuracy")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Accuracy")
legend = ax[1].legend(loc='best', shadow=True)

Hence, we can observe that the model reaches almost 99% accuracy on the validation dataset after 2 epochs. The validation accuracy is greater than the training accuracy almost every time during the training. That means that our model does not overfit the training set.

Therefore,our model is very well trained👍

**CONFUSION MATRIX:**

A Confusion matrix is an N x N matrix used for evaluating the performance, where N is the number of target classes. The matrix compares the actual target values with those predicted by the machine learning model


In [None]:
# Look at confusion matrix 
#from sklearn.metrics import confusion_matrix
#import intertools
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import confusion_matrix
#import itertools



def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Purples):
    """Plot a confusion matrix as a colour-coded grid with the value of each cell.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum so each cell shows the fraction
        of that true class. Rows without samples are shown as 0.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the cells.

    The plot is drawn on the current matplotlib figure; nothing is returned.
    """
    cm = np.asarray(cm)

    # Normalise before drawing so the colours and the printed numbers describe
    # the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for ratios, plain integers for counts.
    cell_format = '.2f' if np.issubdtype(cm.dtype, np.floating) else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Class probabilities (softmax outputs) for every validation image.
Y_pred = model.predict(X_val)
# Collapse each probability row to the index of its most likely class.
Y_pred_classes = Y_pred.argmax(axis=1)
# Collapse the one-hot validation labels back to class indices.
Y_true = Y_val.argmax(axis=1)
# Count (true class, predicted class) pairs and draw them for digits 0-9.
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)
plot_confusion_matrix(confusion_mtx, classes=range(10))

We can observe that our CNN performs very well on all digits with few errors considering the size of the validation set.

However, it seems that our CNN has a little trouble with the digit 4, which is sometimes misclassified as 9. Sometimes it is very difficult to tell the difference between 4 and 9 when curves are smooth.

**INVESTIGATING THE ERRORS:**

In [None]:

# Collect the misclassified validation samples.

# Boolean mask: True wherever the predicted class differs from the true class.
errors = Y_pred_classes != Y_true

# Restrict predictions, probabilities, labels and images to those samples.
Y_pred_classes_errors = Y_pred_classes[errors]
Y_pred_errors = Y_pred[errors]
Y_true_errors = Y_true[errors]
X_val_errors = X_val[errors]

def display_errors(errors_index, img_errors, pred_errors, obs_errors, nrows=2, ncols=2):
    """Show misclassified images in a grid with their predicted and true labels.

    Parameters
    ----------
    errors_index : sequence of int
        Positions (into the three ``*_errors`` arrays) of the samples to show,
        in display order. At most ``nrows * ncols`` of them are drawn.
    img_errors : array
        Misclassified images; each entry must be reshapeable to 28 x 28.
    pred_errors : array
        Predicted label of every misclassified image.
    obs_errors : array
        True label of every misclassified image.
    nrows, ncols : int, default 2
        Grid size; the default 2 x 2 grid shows 4 images.

    If fewer indices than grid cells are given, the remaining cells are left
    blank instead of raising an IndexError.
    """
    # squeeze=False keeps `ax` two-dimensional even for a single row or column.
    fig, ax = plt.subplots(nrows, ncols, sharex=True, sharey=True, squeeze=False)
    for n, cell in enumerate(ax.flat):  # row-major order: left to right, top to bottom
        if n >= len(errors_index):
            cell.axis("off")
            continue
        error = errors_index[n]
        cell.imshow((img_errors[error]).reshape((28,28)))
        cell.set_title("Predicted label :{}\nTrue label :{}".format(pred_errors[error],obs_errors[error]))

# Probability the model assigned to the (wrong) class it predicted
Y_pred_errors_prob = np.max(Y_pred_errors, axis=1)

# Probability the model assigned to the true class of each misclassified sample.
# Direct (row, true-class) indexing avoids building an N x N intermediate matrix.
true_prob_errors = Y_pred_errors[np.arange(len(Y_true_errors)), Y_true_errors]

# Difference between the probability of the predicted label and the true label:
# the larger it is, the more confidently wrong the model was.
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors

# Indices of the errors, sorted by increasing delta
sorted_dela_errors = np.argsort(delta_pred_true_errors)

# Top 4 errors: display_errors draws a 2 x 2 grid and uses the first entries it
# is given, so exactly the 4 largest deltas are selected here.
most_important_errors = sorted_dela_errors[-4:]

# Show the top 4 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)

**PREDICTION:**

In [None]:
# Class probabilities (softmax outputs) for every test image.
test_probabilities = model.predict(test_x)

# Keep the most likely class per image and wrap it as the "Label" column.
results = pd.Series(np.argmax(test_probabilities, axis=1), name="Label")

**SUBMISSION:**

In [None]:
# ImageId is a 1-based row number. It is derived from the number of predictions
# rather than hard-coded, so both columns always have the same length.
image_ids = pd.Series(range(1, len(results) + 1), name="ImageId")
submission = pd.concat([image_ids, results], axis=1)
#submission
submission.to_csv("submission.csv", index=False)
print("Successfully Completed!")

**According to Shakuntala Devi, " Numbers have life, they are not just symbols on paper ".**

Hence, in this notebook we have successfully identified the numbers (digits) from a dataset of tens of thousands of handwritten images.