# 1. Frame The Problem And Look At The Big Picture

Try to build a classifier for the MNIST dataset that achieves over 97% accuracy
on the test set.
<br>
My solution will be used to classify new images of handwritten digits.
<br>
This is a supervised learning task, specifically a multiclass classification task.
<br>
Performance will be measured through accuracy. We need more than 97% accuracy.
<br>

# 2. Get the Data

We need the whole MNIST dataset.
<br>
We can get the data from `sklearn.datasets` (via `fetch_openml`).
<br>
It takes almost 500 MB of space.
<br>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

In [52]:
def GetData(random_state=None):
    """Download MNIST from OpenML and return a shuffled train/test split.

    The samples are shuffled; the first 60,000 shuffled samples form the
    training set and the remaining ones form the test set. All returned
    objects have a fresh 0..n-1 index.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the shuffle. ``None`` (the default) uses NumPy's global
        random state, exactly as before; pass an integer to get the same
        split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are
        DataFrames of pixel columns and the ``y`` parts are integer Series.
    """
    from sklearn.datasets import fetch_openml

    # as_frame=True makes the DataFrame/Series return type explicit; the
    # positional .iloc indexing below relies on it.
    mnist = fetch_openml('mnist_784', as_frame=True)
    X, y = mnist['data'], mnist['target'].astype(int)

    n_train = 60000
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Size the permutation from the data instead of hard-coding 70000.
    shuffle_index = rng.permutation(len(X))
    train_idx, test_idx = shuffle_index[:n_train], shuffle_index[n_train:]

    # Index features and labels the same (positional) way so they can never
    # get out of step, and reset the index without in-place edits on slices.
    X_train = X.iloc[train_idx].reset_index(drop=True)
    X_test = X.iloc[test_idx].reset_index(drop=True)
    y_train = y.iloc[train_idx].reset_index(drop=True)
    y_test = y.iloc[test_idx].reset_index(drop=True)
    return X_train, X_test, y_train, y_test

In [84]:
# Fetch MNIST and unpack the shuffled train/test features and labels
X_train, X_test, y_train, y_test = GetData()

# 3. Explore the Data

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) per pixel column
X_train.describe()

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.194783,0.0963,0.03845,0.013233,0.01475,0.002,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.93817,4.207777,2.366263,1.40801,1.675882,0.3466,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,252.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [12]:
# Print the index range, column count, dtypes and memory usage of the training frame
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60000 entries, 37237 to 6813
Columns: 784 entries, pixel1 to pixel784
dtypes: int64(784)
memory usage: 359.3 MB


In [15]:
# Count the missing values in each pixel column
X_train.isna().sum()

pixel1      0
pixel2      0
pixel3      0
pixel4      0
pixel5      0
           ..
pixel780    0
pixel781    0
pixel782    0
pixel783    0
pixel784    0
Length: 784, dtype: int64

# 4. Prepare the Data

In [99]:
def PrepareData(data, is_train_dataset):
    """Downcast the feature columns and impute any missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix to prepare. It is not modified; a new frame is
        returned.
    is_train_dataset : bool
        Set to True when ``data`` is the training dataset. The flag is
        currently unused (the same steps run for every dataset) and is kept
        so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A frame with the same columns and row index as ``data``. Without
        missing values every column is downcast to the smallest integer
        dtype that holds its values; with missing values the frame holds
        the KNN-imputed values.
    """
    # Downcast each column to reduce memory usage. ``apply`` builds a new
    # frame, so the caller's DataFrame is left untouched.
    data = data.apply(pd.to_numeric, downcast='integer')

    # Handle the missing values in the data (only when there are any).
    if data.isnull().values.any():
        from sklearn.impute import KNNImputer
        knn_imputer = KNNImputer(n_neighbors=3)
        # NOTE(review): the imputer is fit on whatever frame is passed in,
        # so a test set would be imputed from its own rows rather than from
        # the training data.
        data = pd.DataFrame(
            knn_imputer.fit_transform(data),
            columns=data.columns,
            index=data.index,  # keep the caller's row labels
        )

    return data

In [86]:
# Run the preparation step on the training features and keep the returned frame
X_train = PrepareData(X_train, is_train_dataset=True)

# 5. Short-listing the Promising Models

In [87]:
## Function to show the cross validation scores summary
def Cross_Val_Score_Summary(cv_scores):
    """Print the cross-validation scores with their mean and standard deviation.

    The scores are printed as given; the mean and the standard deviation are
    rounded to three decimal places. Nothing is returned.
    """
    print(f"The Cross Validation Scores are: {cv_scores}")

    mean_score = round(np.mean(cv_scores), 3)
    print(f"Mean: {mean_score}")

    std_score = round(np.std(cv_scores), 3)
    print(f"Standard Deviation: {std_score}")

In [88]:
# Baseline candidate: k-nearest-neighbours classifier with default hyperparameters
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

In [89]:
# Score the baseline model with 5-fold stratified cross-validation on the
# training set, using accuracy as the metric, then print a summary.
from sklearn.model_selection import cross_val_score, StratifiedKFold
stratified_cv = StratifiedKFold(n_splits=5)
cv_scores_raw = cross_val_score(knn_clf, X_train, y_train, cv=stratified_cv, scoring='accuracy')
Cross_Val_Score_Summary(cv_scores_raw)

The Cross Validation Scores are: [0.9715     0.96975    0.97083333 0.9715     0.96766667]
Mean: 0.97
Standard Deviation: 0.001


With the default hyperparameters, the untuned model already reaches a cross-validation accuracy of about 97%.

# 6. Fine Tune the System

In [90]:
# Set up an exhaustive search over the number of neighbours and the vote
# weighting scheme (6 x 2 = 12 combinations), scored by accuracy with the
# same stratified folds used for the baseline.
from sklearn.model_selection import GridSearchCV
parameter_grid = {
    'n_neighbors': [2, 3, 4, 5, 6, 7],
    'weights': ['uniform', 'distance']
}

grid_search = GridSearchCV(estimator=knn_clf, param_grid=parameter_grid, cv=stratified_cv, scoring='accuracy')

In [91]:
# Run the grid search on the training data
grid_search.fit(X_train, y_train)

In [92]:
# Best mean cross-validated accuracy found by the search
grid_search.best_score_

0.97325

In [93]:
# Hyperparameter combination that produced the best score
grid_search.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

# 7. Testing the Model

In [94]:
# Build the final model with the hyperparameters reported by the grid search
# (n_neighbors=4, weights='distance') and fit it on the full training set.
knn_clf_fine_tuned = KNeighborsClassifier(n_neighbors=4, weights='distance')
knn_clf_fine_tuned.fit(X_train, y_train)

In [95]:
# Run the same preparation step on the test features; the flag is passed by
# keyword so the call reads the same way as the training-set call.
X_test = PrepareData(X_test, is_train_dataset=False)

In [96]:
# Predict the digit labels for the prepared test features
y_predicted_test = knn_clf_fine_tuned.predict(X_test)

In [98]:
# Compare the predictions with the true test labels and report the accuracy
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, y_predicted_test)
print(f"Test Accuracy Score is: {test_accuracy}")

Test Accuracy Score is: 0.9737
