In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# List the mounted Kaggle input directory so the available data files
# are visible in the cell output.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

The data files train.csv and test.csv contain gray-scale images of hand-drawn digits, from zero through nine.

Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels. Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.

The training data set, (train.csv), has 785 columns. The first column, called "label", is the digit that was drawn by the user. The rest of the columns contain the pixel-values of the associated image.

Each pixel column in the training set has a name like pixelx, where x is an integer between 0 and 783, inclusive. To locate this pixel on the image, suppose that we have decomposed x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. Then pixelx is located on row i and column j of a 28 x 28 matrix, (indexing by zero).

For example, pixel31 indicates the pixel that is in the fourth column from the left, and the second row from the top (31 = 1 * 28 + 3, so i = 1 and j = 3).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the Kaggle training and test sets, then report some basic facts about them.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# First training row, to see the column layout.
display(train.head(1))

# True if any column of the training set contains a missing value.
has_missing = train.isna().sum().any()
display(has_missing)

# Dimensions of the test set, followed by those of the training set.
display(test.shape)
train.shape

In [None]:
# Show one example digit: take a single training row, skip the label column,
# and reshape the remaining 784 pixel values into a 28x28 image.
# (Author's note: there are no NaNs in the training data set.)
example_row = 4
X_disp = np.array(train.iloc[example_row, 1:]).reshape(28, 28)

plt.figure()
plt.axis('off')
plt.imshow(X_disp)
# The label lives in column 0; column 1 is pixel0, not the digit.
plt.title('Digit is {}'.format(train.iloc[example_row, 0]));


In [None]:
# Display one example of each digit, 0 through 9, in a single row of panels.
fig, axes = plt.subplots(1, 10, figsize=(20, 4))
for digit, ax in enumerate(axes):
    # idxmax on the boolean mask gives the index *label* of the first matching
    # row, so the row is fetched with .loc rather than positionally with .iloc.
    idx = (train['label'] == digit).idxmax()
    row = train.loc[idx]
    # Everything after the first column is pixel data for the 28x28 image.
    ax.imshow(np.array(row.iloc[1:]).reshape(28, 28))
    ax.set_title('Digit is {}'.format(row['label']))
    ax.axis('off')

In [None]:
# Chance-level baseline: score uniformly random guesses against known labels.
# The test frame carries only pixel columns (it is passed straight to
# pca.transform further down), so its first column is not a label; the
# labelled training set is used as the reference instead.
true_labs = train['label']

# randint's upper bound is exclusive, so 10 is required for guesses to cover
# all digits 0-9. The seeded generator keeps the baseline reproducible, and
# the size follows the data instead of a hard-coded row count.
base_pred = np.random.RandomState(27).randint(0, 10, size=len(true_labs))
accuracy_score(true_labs, base_pred)

In [None]:
#Separate training data into train and validation
X = train.iloc[:,1:]
y = train.iloc[:,0]

X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.25,random_state = 27)

A simple random forest fitted on all of the training data has 93.8% accuracy on the training data set. Next I will simplify the model. The images of the digits show that most pixels are zero-valued (blank background), so the number of features can likely be reduced without compromising the model. At the very least, this should improve the speed of the model.

In [None]:
# Histogram of the pixel intensities of one image, to see what proportion of
# pixels have high intensity versus background.
ax = X.iloc[1, :].hist(bins=10)
ax.set_xlabel("pixel intensity")
ax.set_ylabel("number of pixels")
ax.set_title("Pixel intensities of one training image");

As expected, a large proportion of the pixel values are zero, so it is likely that PCA can reduce the number of components without compromising accuracy. The next step is to determine the optimum number of components.

In [None]:
# Explained-variance ratio captured by a range of PCA sizes.
# The leading principal components do not depend on how many are kept, so a
# single fit at the largest size followed by a cumulative sum yields every
# entry, instead of refitting the decomposition once per candidate size.
n_components = [1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 500, X_train.shape[1]]

pca = PCA(n_components=max(n_components))
pca.fit(X_train)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

# Entry k-1 of the cumulative sum is the variance ratio of the first k components.
var_ratio = cumulative_ratio[np.array(n_components) - 1]

In [None]:
# Explained variance ratio against number of PCA components (log-scaled x axis).
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(n_components, var_ratio, 'k.-')
ax.set_xscale("log")

# Tick at the component counts that were actually evaluated; evenly spaced
# linear ticks bunch together at the right-hand end of a log axis.
ax.set_xticks(n_components)
ax.set_xticklabels(n_components, rotation=45)
ax.minorticks_off()
ax.set_yticks(np.linspace(0.2, 1.0, 9))

# Reference line at the 95% variance level used to choose the component count.
ax.axhline(0.95, color='r', linestyle='--', linewidth=1)

ax.set_xlabel("number of PCA components", size=20)
ax.set_ylabel("variance ratio", size=20);

~200 components capture 95% of the variance. I experimented with scaling the data, but this increased the number of components required for 95% variance and reduced model accuracy.

In [None]:
# Project the data onto the first 200 principal components.
# The PCA is fitted on the training split only and then applied unchanged to
# the validation and test sets.
N_PCA_COMPONENTS = 200

# A fixed seed keeps the projection reproducible across runs in case sklearn
# selects its randomized SVD solver for this component count.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=27)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
test_pca = pca.transform(test)

I will try random forest and k-nearest neighbours (kNN) to predict the digits, using the validation data set to optimise the hyperparameters. Having tried both, kNN performs better than random forest, so I will continue with kNN.

In [None]:
# Baseline kNN with scikit-learn's default hyperparameters, fitted on the
# whole PCA-reduced training split.
knn = KNeighborsClassifier()
knn.fit(X_train_pca, y_train)

# Score on the first 2000 validation images only.
knn_predict = knn.predict(X_valid_pca[:2000])
accuracy = accuracy_score(y_valid[:2000], knn_predict)

message = 'The unoptimised knn model predicts the validation data set with an accuracy of {0:.3f}'
display(message.format(accuracy))

Having experimented with tuned and untuned models, I have so far achieved better performance with an untuned model trained on more data - I have been using less data with the tuned model for the sake of speed. This could be because this data set requires more training data, or because a training set of only 500 images is too small to tune the model optimally. For this final run I will tune the model with 1000 data points and then give the tuned model the full training data set. I will submit whichever of the tuned and untuned models has the higher accuracy on 2000 images from the validation data set.

In [None]:
# KNN tuning

# Hyperparameter grid.
# NOTE(review): scikit-learn documents leaf_size and algorithm as affecting the
# speed and memory of the neighbour search rather than which neighbours are
# found, so they mainly multiply the search cost - consider dropping them.
params = {'n_neighbors': [5, 7, 9, 11, 13, 15],
          'leaf_size': [5, 10, 15, 20, 25, 30],  # small leaf sizes slow the model down too much
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          }

# Search on the first 1000 training points only, to keep the grid search fast.
knn_tuned = GridSearchCV(knn, param_grid=params)
knn_tuned.fit(X_train_pca[:1000], y_train[:1000])

knn_params = knn_tuned.best_params_
print("Optimum Hyper Parameters:\n", knn_params)

# Evaluate on the same 2000-image validation subset used for the untuned model.
y_valid_subset = y_valid[:2000]
prediction = knn_tuned.predict(X_valid_pca[:2000])
accuracy = accuracy_score(y_valid_subset, prediction)

print("Accuracy:{:.3f}".format(accuracy))
# The predictions cover only the 2000-image subset, so the true labels must be
# sliced to match or confusion_matrix raises on inconsistent lengths.
# Arguments follow sklearn's (y_true, y_pred) order: rows are true digits,
# columns are predicted digits.
print("Confusion Matrix:\n", confusion_matrix(y_valid_subset, prediction))

In [None]:
# Construct the kNN model from the optimum hyperparameters and train it on the
# full training split.
# Calling fit on the GridSearchCV object again would repeat the entire grid
# search on all of the training data instead of reusing the parameters already
# found, so a plain classifier is built from knn_params. The name knn_tuned is
# kept because later cells predict with it.
knn_tuned = KNeighborsClassifier(**knn_params).fit(X_train_pca, y_train)

# Score on the same 2000 validation images as the untuned model, for a fair comparison.
knn_tuned_predict = knn_tuned.predict(X_valid_pca[:2000])
accuracy = accuracy_score(y_valid[:2000], knn_tuned_predict)
display('The optimised knn model predicts the validation data set with an accuracy of {0:.3f}'.format(accuracy))

In [None]:
# Create submission files

def write_submission(predictions, filename):
    """Write predicted digits to a Kaggle-style submission CSV.

    Parameters
    ----------
    predictions : array-like
        Predicted label for each test image, in test-set row order.
    filename : str
        Path of the CSV file to create.

    Returns
    -------
    pandas.DataFrame
        The table that was written: a 1-based ``ImageId`` column and a
        ``Label`` column holding ``predictions``.
    """
    submission = pd.DataFrame({"ImageId": list(range(1, len(predictions) + 1)),
                               "Label": predictions})
    submission.to_csv(filename, index=False, header=True)
    return submission


# kNN (untuned): each file is written from the predictions of its own model.
knn_predict = knn.predict(test_pca)
submissions = write_submission(knn_predict, "kNN_NMIST.csv")

# kNN (tuned)
knn_tuned_predict = knn_tuned.predict(test_pca)
submissions = write_submission(knn_tuned_predict, "kNN_Tuned_NMIST.csv")