In [1]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Relative locations of the training and test CSV files.
train_path = './Data/train.csv'
test_path = './Data/test.csv'

# Read both files into DataFrames in one pass.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [2]:
# Clean both datasets and split the training set into features and labels.
# (Nothing is fitted here; this cell only prepares the arrays used later.)

# Discard every row that contains a missing value.
train_data = train_data.dropna()
test_data = test_data.dropna()

# The 'label' column is the target; every other column is a feature.
Y_train = train_data['label'].values
X_train = train_data.drop('label', axis=1).values

# The test frame is used whole as the feature matrix.
X_test = test_data.values

In [None]:
# Tune the hyperparameter k and check the performance of the base kNN model.

# Candidate neighbour counts: every integer from 3 through 10.
param_grid = {'n_neighbors': list(range(3, 11))}
GSCV_kNN = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid)

# Stratified splitting is used so the label classes are distributed evenly
# across the 10 folds; the fixed seed makes the shuffle repeatable.
stratified_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=43)
for fold, indi in enumerate(stratified_fold.split(X_train, Y_train)):
  # Each split yields (training row indices, held-out row indices).
  fit_rows, eval_rows = indi

  # Run the grid search on this fold's training rows, then predict the
  # held-out rows with the fitted search object.
  GSCV_kNN.fit(X_train[fit_rows], Y_train[fit_rows])
  Y_eval = Y_train[eval_rows]
  pred = GSCV_kNN.predict(X_train[eval_rows])

  # Per-fold summary: precision/recall/F1 table followed by the confusion matrix.
  print(f"Classification report for Fold {fold + 1}:")
  print(classification_report(Y_eval, pred, digits=3))

  print(f"Confusion Matrix for Fold {fold + 1}:")
  print(confusion_matrix(Y_eval, pred), end="\n\n")

  # Drop the per-fold temporaries so they do not linger in the notebook namespace.
  del fit_rows, eval_rows, Y_eval

In [None]:
# Build the bagging classifier on top of the grid-searched kNN estimator.
#
# oob_score=True requests an out-of-bag accuracy estimate once the ensemble is fitted.
# random_state pins the bootstrap sampling so a fresh "Restart & Run All" produces the
# same ensemble and OOB score; 43 matches the seed used for the stratified folds.
# NOTE(review): the base estimator is the whole GridSearchCV object, so each bagged
# member presumably repeats the grid search on its own bootstrap sample — confirm
# that this cost is intended.
digit_bag = BaggingClassifier(estimator=GSCV_kNN, oob_score=True, random_state=43)