In [8]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Locations of the raw train/test CSV files, relative to the notebook.
train_path = './Data/train.csv'
test_path = './Data/test.csv'
#sub_path = './Data/sample_submission.csv'

# Read both CSV files into DataFrames (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
#submission = pd.read_csv(sub_path)


In [9]:
# Prepare the raw arrays used for modelling (no estimator is fitted in this cell).

# Remove every row that contains a missing value, modifying both frames in place.
# NOTE(review): dropping rows from test_data changes its row count; confirm this
# cannot misalign predictions with the ids used when a submission is written.
train_data.dropna(axis=0, inplace=True)
test_data.dropna(axis=0, inplace=True)

# Split the training frame into the 'label' column (target) and all other
# columns (features); the test frame is used as features in its entirety.
Y_train = train_data['label'].values
X_train = train_data.drop(columns=['label']).values
X_test = test_data.values

In [1]:
# Alternative inputs: load the data produced by the compression algorithm.
# Running this cell overwrites X_train, Y_train and X_test.

import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Paths to the compressed datasets.
compressed_training_path = './Data/compressed_train.csv'
# Fixed: this previously pointed at compressed_train.csv as well, so "X_test" was
# the training file itself, with its label column still attached.
# NOTE(review): the test file name is assumed by analogy with train.csv/test.csv
# used for the raw data -- confirm it matches the compression script's output.
compressed_testing_path = './Data/compressed_test.csv'

compressed_training_data = np.loadtxt(compressed_training_path, delimiter=',', dtype='int')

# Column 0 of the training file is taken as the label; the rest are features.
X_train = compressed_training_data[:, 1:]
Y_train = compressed_training_data[:, 0]
X_test = np.loadtxt(compressed_testing_path, delimiter=',', dtype='int')

# Fail early with a clear message if the test matrix cannot be fed to a model
# trained on X_train (e.g. a label column is still present in the test file).
if X_test.ndim != 2 or X_test.shape[1] != X_train.shape[1]:
    raise ValueError(
        f'X_test has shape {X_test.shape} but X_train has '
        f'{X_train.shape[1]} feature columns; check {compressed_testing_path}'
    )

# Show the first ten training samples as text grids.
print('X train samples')
for row in X_train[:10]:
    for pos, cell in enumerate(row):
        # A newline starts every run of 9 values; a second newline is added at
        # every 81st value, which leaves a blank line before each sample.
        lead = ''
        if pos % 9 == 0:
            lead += '\n'
        if pos % 81 == 0:
            lead += '\n'
        # Each value is centred in a 3-character field, with no trailing newline.
        print(lead + f'{cell:^3}', end='')

# Show the matching labels, one per line.
print('\nY train samples')
for label in Y_train[:10]:
    print(label)

X train samples


 0  0  0  0  0  0  0  0  0 
 0  0  0  0  0  0  1 12  0 
 0  0  0  0  0 11 12  0  0 
 0  0  0  0 11 12  0  0  0 
 0  0  0 11  1  0  0  0  0 
 0  0  0  1 12  0  0  0  0 
 0  0 11  6  0  0  0  0  0 
 0  0 13 12  0  0  0  0  0 
 0  0  0  0  0  0  0  0  0 

 0  0  0  0  0  0  0  0  0 
 0  0 11 11  1 10  0  0  0 
 0 11  1  1  8  1  1  0  0 
 0 11 12  0  0 13  1  6  0 
 0  7  6  0  0  0  1  6  0 
 0 13  1 10  0  0  1 12  0 
 0  0  1  1  9  1  1  0  0 
 0  0  0  8  8 12  0  0  0 
 0  0  0  0  0  0  0  0  0 

 0  0  0  0  0  0  0  0  0 
 0  0  0  0  6  0  0  0  0 
 0  0  0  0  1  0  0  0  0 
 0  0  0  0  1  0  0  0  0 
 0  0  0  0  1  0  0  0  0 
 0  0  0  0  1  0  0  0  0 
 0  0  0  7  1  0  0  0  0 
 0  0  0 13  8  0  0  0  0 
 0  0  0  0  0  0  0  0  0 

 0  0  0  0  0  0  0  0  0 
 0  0 10  0  0 11  0  0  0 
 0  0  6  0  0  7  6  0  0 
 0  0  6  0  0  7  6  0  0 
 0  0 13  0  0 13 10  0  0 
 0  0  7 10 11  9  1  0  0 
 0  0  0  8  8  0  7  0  0 
 0  0  0  0  0  0  7  0  0 

In [2]:
# Tune the hyperparameter k (n_neighbors) and check the performance of the base model.

param_grid = {'n_neighbors': [1, 3, 4, 5, 6]}
GSCV_kNN = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)

# Outer stratified k-fold: every fold keeps roughly the same class proportions
# as Y_train. The fixed random_state makes the shuffled split reproducible.
stratified_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=43)
for fold, (fit_idx, holdout_idx) in enumerate(stratified_fold.split(X_train, Y_train)):
    # Grid-search k on the in-fold samples only, then score the selected model
    # on the held-out samples it has never seen.
    GSCV_kNN.fit(X_train[fit_idx], Y_train[fit_idx])
    holdout_labels = Y_train[holdout_idx]
    pred = GSCV_kNN.predict(X_train[holdout_idx])

    # Per-fold diagnostics for the held-out samples.
    print(f"Classification report for Fold {fold + 1}:")
    print(classification_report(holdout_labels, pred, digits=3))

    print(f"Confusion Matrix for Fold {fold + 1}:")
    print(confusion_matrix(holdout_labels, pred), end="\n\n")

# The loop leaves GSCV_kNN fitted on the last fold's training split only.
# Refit on all training data so the model used for the final test predictions
# has seen every labelled sample.
GSCV_kNN.fit(X_train, Y_train)
print(f"Selected hyperparameters on the full training set: {GSCV_kNN.best_params_}")

Classification report for Fold 1:
              precision    recall  f1-score   support

           0      0.761     0.926     0.836       827
           1      0.700     0.986     0.819       937
           2      0.859     0.685     0.762       835
           3      0.727     0.663     0.694       870
           4      0.754     0.790     0.771       814
           5      0.752     0.636     0.690       759
           6      0.899     0.878     0.888       827
           7      0.814     0.830     0.822       880
           8      0.763     0.598     0.670       813
           9      0.766     0.710     0.737       838

    accuracy                          0.774      8400
   macro avg      0.779     0.770     0.769      8400
weighted avg      0.779     0.774     0.770      8400

Confusion Matrix for Fold 1:
[[766   8   1   9  13  11   6   2  10   1]
 [  3 924   3   1   1   1   2   1   0   1]
 [ 40  57 572  43  26  13  29  13  35   7]
 [ 44  60  33 577   6  55  12  18  51  14]
 [ 12 

In [None]:
# Predict a label for every row of X_test. GridSearchCV.predict uses the best
# estimator selected by the most recent GSCV_kNN.fit(...) call, so the result
# reflects whatever data that last fit was given.
prediction = GSCV_kNN.predict(X_test)

In [18]:
# Write the predictions to 'prediction.csv': one row per test sample, indexed by
# ImageId, with the predicted class in a 'Label' column.

# `submission` was never defined (its loading code is commented out in the first
# cell), so this cell raised NameError. Load the sample submission here to get
# the ImageId column.
sub_path = './Data/sample_submission.csv'
submission = pd.read_csv(sub_path)

prediction = pd.DataFrame(prediction, columns=['Label'])

# pd.concat(axis=1) aligns on the index and would silently pad with NaN if the
# two inputs had different lengths, so fail loudly instead.
if len(prediction) != len(submission):
    raise ValueError(
        f'{len(prediction)} predictions but {len(submission)} ImageIds in {sub_path}'
    )

sub = pd.concat([submission.ImageId, prediction], axis=1).set_index('ImageId')
sub.to_csv('prediction.csv')