In [16]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Relative locations of the three CSV inputs used by this notebook.
train_path, test_path, sub_path = (
    './Data/train.csv',
    './Data/test.csv',
    './Data/sample_submission.csv',
)

# Read the files in the same order as the paths above: training set,
# test set, and the sample submission that is later used as a template.
train_data, test_data, submission = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sub_path)
)

In [2]:
# Prepare the training and test data as NumPy arrays for scikit-learn.

# Training rows containing any missing value are discarded. Rebinding (rather
# than dropna(inplace=True)) keeps this cell idempotent on re-run.
train_data = train_data.dropna()
train_features = train_data.drop(columns=['label'])

X_train = train_features.to_numpy()
Y_train = train_data['label'].to_numpy()

# Test rows must NOT be dropped: the submission file needs exactly one
# prediction per test row, in the original order. Removing rows here would
# shift every later prediction onto the wrong ImageId. Missing test values are
# instead filled with the per-column mean of the training features.
# NOTE(review): mean imputation is a neutral default; confirm it suits this data.
X_test = test_data.fillna(train_features.mean()).to_numpy()

In [3]:
# Tune the hyperparameter k (n_neighbors) and check the performance of the
# base k-NN model with an outer stratified cross-validation loop.

param_grid = {'n_neighbors': [3, 4, 5, 6]}
GSCV_kNN = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)

# Stratified k-fold is used so that each fold keeps a class distribution
# similar to that of the full training set.
stratified_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=43)
for fold, (train_idx, val_idx) in enumerate(stratified_fold.split(X_train, Y_train)):
    X_train_, Y_train_ = X_train[train_idx], Y_train[train_idx]
    X_test_, Y_test_ = X_train[val_idx], Y_train[val_idx]

    # The grid search is re-run on this fold's training part only, so the
    # held-out part below is never seen during tuning.
    GSCV_kNN.fit(X_train_, Y_train_)
    pred = GSCV_kNN.predict(X_test_)

    # Report per-class metrics and the confusion matrix for this fold.
    print(f"Classification report for Fold {fold + 1}:")
    print(classification_report(Y_test_, pred, digits=3))

    print(f"Confusion Matrix for Fold {fold + 1}:")
    print(confusion_matrix(Y_test_, pred), end="\n\n")

    # Release the per-fold copies before the next iteration allocates new ones.
    del X_train_, X_test_, Y_train_, Y_test_

# The loop leaves GSCV_kNN fitted on only the last fold's training subset.
# Refit on the complete training set so that any later prediction uses a model
# tuned and trained on all available labelled data.
GSCV_kNN.fit(X_train, Y_train)

Classification report for Fold 1:
              precision    recall  f1-score   support

           0      0.976     0.998     0.987       827
           1      0.951     0.994     0.972       937
           2      0.977     0.959     0.968       835
           3      0.958     0.961     0.959       870
           4      0.984     0.957     0.970       814
           5      0.961     0.962     0.961       759
           6      0.986     0.990     0.988       827
           7      0.954     0.967     0.960       880
           8      0.984     0.930     0.956       813
           9      0.946     0.949     0.948       838

    accuracy                          0.967      8400
   macro avg      0.968     0.967     0.967      8400
weighted avg      0.967     0.967     0.967      8400

Confusion Matrix for Fold 1:
[[825   0   0   0   0   0   0   1   1   0]
 [  0 931   2   1   0   0   1   2   0   0]
 [  6   8 801   2   0   0   1  15   2   0]
 [  0   3   5 836   0  10   1   6   6   3]
 [  1 

In [8]:
# Predict a label for every row of the test feature matrix, using GSCV_kNN in
# whatever state it was most recently fitted.
prediction = GSCV_kNN.predict(X_test)

In [18]:
# Build the submission file: one predicted Label per ImageId.

# Flatten to a 1-D array; this also works if the cell is re-run after
# `prediction` has already been turned into a one-column DataFrame.
labels = np.asarray(prediction).ravel()

# pd.concat(axis=1) aligns on the index, so a length mismatch would silently
# produce NaN / shifted labels instead of failing. Check explicitly.
if len(labels) != len(submission):
    raise ValueError(
        f"Got {len(labels)} predictions for {len(submission)} submission rows; "
        "the test set and sample submission must have the same number of rows."
    )

# Kept as a DataFrame under the same name, as the original cell did.
prediction = pd.DataFrame(labels, columns=['Label'])

# Pair predictions with ImageId positionally and write ImageId as the index.
sub = pd.DataFrame(
    {'Label': labels},
    index=pd.Index(submission['ImageId'].to_numpy(), name='ImageId'),
)
sub.to_csv('prediction.csv')