In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import joblib

In [2]:
# Read in the data. "label" is the numeric label (0-9). The other columns hold the pixel intensity (0-255) of the image
# at the designated pixel. The MNIST data is already quite clean and well pre-processed, so we can just feed the data into
# our model.
#
# Features and labels are derived from the raw frames without mutating them, and both label arrays are kept
# one-dimensional so that y_train and y_test have the same layout.

train_df = pd.read_csv("mnist_train.csv")
y_train = train_df["label"].to_numpy()
X_train = train_df.drop(columns = "label").to_numpy()

test_df = pd.read_csv("mnist_test.csv")
y_test = test_df["label"].to_numpy()
X_test = test_df.drop(columns = "label").to_numpy()

In [3]:
# `shift` is imported from the public scipy.ndimage namespace; the old
# scipy.ndimage.interpolation module path is deprecated.
from scipy.ndimage import shift


def shift_digit(image, dx, dy):
    """Translate a flattened 28x28 digit image and return it flattened again.

    Image coordinates are used, i.e. the origin is the top left corner of the
    image: a positive dx moves the digit to the right and a positive dy moves
    it down.

    Parameters
    ----------
    image : array-like with 784 elements
        Pixel values of one digit, in row-major order.
    dx : int or float
        Shift along the x axis (columns).
    dy : int or float
        Shift along the y axis (rows).

    Returns
    -------
    numpy.ndarray
        A new 1-D array of 784 elements. Pixels vacated by the shift are
        filled with 0 (mode="constant", cval=0); the input is not modified.
    """
    grid = np.asarray(image).reshape(28, 28)
    # scipy expects the offsets in axis order, i.e. (rows, columns) == (dy, dx).
    shifted = shift(grid, [dy, dx], cval = 0, mode = "constant")
    return shifted.reshape(-1)


# Augment the data, by shifting each image four pixels to the right, four pixels to the left, four pixels down,
# then four pixels up. This will lead to better generalization and accuracy.
#
# The pieces are collected as NumPy arrays and concatenated once at the end, rather than converting every pixel
# to a Python object and appending row by row. The sample order is: all original images, then one full shifted
# copy of the training set per (dx, dy) pair, in the order listed below.
X_parts = [X_train]
y_parts = [y_train]
for dx, dy in ((4,0), (-4, 0), (0, 4), (0, -4)):
    X_parts.append(np.array([shift_digit(image, dx, dy) for image in X_train]))
    # A shifted image keeps the label of the image it was derived from.
    y_parts.append(y_train)

X_train_expanded = np.concatenate(X_parts)
y_train_expanded = np.concatenate(y_parts)

# Every image must still have exactly one label after augmentation.
assert len(X_train_expanded) == len(y_train_expanded)

In [4]:
# # Testing shift_digit function
# original = X_train_expanded[1]
# shifted = X_train_expanded[60001]

# plt.imshow(original.reshape(28,28), cmap = "Greys")
# plt.show()
# plt.imshow(shifted.reshape(28,28), cmap = "Greys")
# plt.show()

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 250 trees trained on the augmented training set.
# random_state is fixed so that repeated runs build the same forest;
# n_jobs is the number of parallel jobs passed to scikit-learn.
image_clf = RandomForestClassifier(
    n_estimators = 250,
    n_jobs = 10,
    random_state = 0,
)
image_clf.fit(X_train_expanded, y_train_expanded)

RandomForestClassifier(n_estimators=250, n_jobs=10, random_state=0)

In [9]:
# Predict once and derive the accuracy from those predictions; calling
# image_clf.score(X_test, y_test) here would run the whole forest over X_test a second time.
y_test_pred = image_clf.predict(X_test)

# np.ravel flattens y_test so that a column-shaped (n, 1) label array is compared
# element by element with the 1-D predictions instead of being broadcast against them.
accuracy = np.mean(y_test_pred == np.ravel(y_test))
print (accuracy)

0.97


In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions (cv = 2) for the original, un-augmented training images,
# compared against the training labels to build the confusion matrix.
y_train_pred = cross_val_predict(
    image_clf,
    X_train,
    y_train,
    n_jobs = 8,
    cv = 2,
)
conf_matrix = confusion_matrix(y_true = y_train, y_pred = y_train_pred)
print (conf_matrix)

[[5844    1   10    3    5    8   17    1   31    3]
 [   1 6635   38   18   11    2    8   12   11    6]
 [  30   13 5739   32   32    4   21   42   37    8]
 [   9   12   92 5789    6   69    9   45   71   29]
 [  11    9   14    0 5637    0   26   12   21  112]
 [  26   14   10   67   10 5175   50    7   35   27]
 [  28   10    4    0   10   44 5803    0   19    0]
 [   7   23   70    7   39    2    0 6018   13   86]
 [  11   36   32   52   24   50   33    5 5541   67]
 [  24   11   19   83   78   20    3   54   44 5613]]


## Random Forest Classifier Model Evaluation:

Model_v3 utilizes a RandomForestClassifier with 250 decision trees. It yields a marginally better accuracy score on the test data (97% as opposed to model_v2's 96.72%). It is also MUCH faster, since it utilizes decision trees instead of the Nearest Neighbors algorithm.

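The confusion matrix below was computed from 2-fold cross-validated predictions on the original (un-augmented) training images. Rows are the true digit (0-9); columns, also from 0-9, represent the classified digit: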


    [[5844    1   10    3    5    8   17    1   31    3]
     [   1 6635   38   18   11    2    8   12   11    6]
     [  30   13 5739   32   32    4   21   42   37    8]
     [   9   12   92 5789    6   69    9   45   71   29]
     [  11    9   14    0 5637    0   26   12   21  112]
     [  26   14   10   67   10 5175   50    7   35   27]
     [  28   10    4    0   10   44 5803    0   19    0]
     [   7   23   70    7   39    2    0 6018   13   86]
     [  11   36   32   52   24   50   33    5 5541   67]
     [  24   11   19   83   78   20    3   54   44 5613]]

As evidenced by the confusion matrix, model performance is worst on 9's, 3's, and 8's: reading along each row, only about 94-95% of these digits are classified correctly. Typical confusions include 6's classified as 5's, 8's classified as 3's, and 9's classified as 4's. These all make sense since these numbers are very structurally similar, so this type of error will always exist to some degree (and sometimes even humans can't tell the difference).

In [12]:
# Persist the trained classifier to a file called "KNN_MNIST_ImageClassifier_v3".
# Note: the file name carries a "KNN" prefix although image_clf is a RandomForestClassifier.
joblib.dump(value = image_clf, filename = "KNN_MNIST_ImageClassifier_v3")


['KNN_MNIST_ImageClassifier_v3']