### Load the dataset

In [2]:
import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()




### Vectorizing images

In [3]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)
X_train_flat.shape

(50000, 3072)

### Preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train_flat)
X_test_standardized = scaler.fit_transform(X_test_flat)


### Using inbuilt KNN algorithm from scikit-learn

In [5]:
from sklearn.neighbors import KNeighborsClassifier
import time 


neigh = KNeighborsClassifier(n_neighbors=1)

start_time = time.time()

neigh.fit(X_train_standardized, y_train)

knn_prediction = neigh.predict(X_test_standardized)

end_time = time.time()

  return self._fit(X, y)


In [6]:
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

print("time taken by inbuilt KNN implementation: ", end_time - start_time, "seconds")

print("The proportion of correctly classified instances out of the total instances.")
acc_score = accuracy_score(y_test,knn_prediction)
print("Accuracy: ", acc_score*100,"%")
print()

print("The ability of the model to avoid false positives.")
precision = precision_score(y_test, knn_prediction, average=None)
print("precision: ", precision*100)
print()

print("The ability of the model to identify all relevant instances.")
recall = recall_score(y_test, knn_prediction, average=None)
print("recall: ", recall*100)

time taken by inbuilt KNN implementation:  30.15088200569153 seconds
The proportion of correctly classified instances out of the total instances.
Accuracy:  35.68 %

The ability of the model to avoid false positives.
precision:  [42.99732382 65.14285714 24.46463335 30.04587156 25.59012876 36.16751269
 33.14659197 56.89981096 39.61290323 59.55414013]

The ability of the model to identify all relevant instances.
recall:  [48.2 22.8 37.7 26.2 47.7 28.5 35.5 30.1 61.4 18.7]


#### Preparing sample data for self-implemented KNN

In [7]:
import pandas as pd

X_combined = np.concatenate((X_train_flat, X_test_flat), axis=0)
y_combined = np.concatenate((y_train, y_test), axis=0)

df_feature = pd.DataFrame(X_combined)
df_label = pd.DataFrame(y_combined)

train_combined = pd.concat([df_feature, df_label], ignore_index=True, axis=1)

overall_sample_size = 5000
stratified_sample = train_combined.groupby(train_combined.iloc[:, -1], group_keys=False).apply(lambda x: x.sample(frac=overall_sample_size/len(train_combined)))
stratified_sample.reset_index(drop=True, inplace=True)

import sklearn
from sklearn.model_selection import train_test_split
train, test=sklearn.model_selection.train_test_split(stratified_sample,train_size=0.8,test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
sample_X_train, sample_y_train = train.iloc[:,0:-1], train.iloc[:,-1]
sample_X_test, sample_y_test = test.iloc[:,0:-1], test.iloc[:,-1]

#### self implemented knn

In [8]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

class KNN:
    def __init__(self, X_train, y_train,k=1):
        self.X_train = X_train
        self.y_train = y_train
        self.k = k
    
    def preprocess(self):
        self.X_train_flat = self.X_train.reshape(self.X_train.shape[0], -1)
        
        self.X_train_standardized = scaler.fit_transform(self.X_train_flat)       
        
    def train(self):
        self.Xtr = self.X_train_standardized
        self.Ytr = self.y_train
        
    def predict(self, X_test):
        y_pred = []
        self.X_test_flat = X_test.reshape(X_test.shape[0], -1)
        self.X_test_standardized = scaler.fit_transform(self.X_test_flat)   
        for i,sample in enumerate(self.X_test_standardized):
            dist = np.sqrt(np.sum((self.Xtr - sample)**2, axis=1))
            predicted_label = self.y_train[np.argmin(dist)]
            y_pred.append(predicted_label)
            self.y_pred = y_pred
        return np.array(y_pred)
    
    def evaluate(self,y_test):
        acc_score = accuracy_score(y_test,self.y_pred)
        print("Accuracy: ", acc_score)
        
        precision = precision_score(y_test, self.y_pred, average=None)
        print("Precision: ", precision*100, "%")
        
        recall = recall_score(y_test, self.y_pred, average=None)
        print("Recall: ", recall*100,"%")     

#### Applying self implemented KNN on CIFAR10 sample data

In [9]:
start_time = time.time()

data = KNN(sample_X_train.values, sample_y_train.values)
data.preprocess()
data.train()
data.predict(sample_X_test.values)

end_time = time.time()

data.evaluate(sample_y_test)
print("time taken by self implemented KNN: ", end_time - start_time, "seconds")

Accuracy:  0.27
Precision:  [36.84210526 36.         21.6        20.48192771 20.2247191  26.5060241
 22.95081967 52.63157895 28.57142857 40.90909091] %
Recall:  [40.         10.11235955 23.07692308 18.88888889 37.5        22.68041237
 26.92307692 19.8019802  60.60606061  8.82352941] %
time taken by inbuilt KNN implementation:  86.95178365707397 seconds


#### Changing euclidean distance to manhattan distance

In [15]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

class KNN_L1:
    def __init__(self, X_train, y_train,k=1):
        self.X_train = X_train
        self.y_train = y_train
        self.k = k
    
    def preprocess(self):
        self.X_train_flat = self.X_train.reshape(self.X_train.shape[0], -1)
        
        self.X_train_standardized = scaler.fit_transform(self.X_train_flat)       
        
    def train(self):
        self.Xtr = self.X_train_standardized
        self.Ytr = self.y_train
        
    def predict(self, X_test):
        y_pred = []
        self.X_test_flat = X_test.reshape(X_test.shape[0], -1)
        self.X_test_standardized = scaler.fit_transform(self.X_test_flat)   
        for i,sample in enumerate(self.X_test_standardized):
            dist = np.sum(np.abs(self.Xtr - sample), axis=1)
            predicted_label = self.y_train[np.argmin(dist)]
            y_pred.append(predicted_label)
            self.y_pred = y_pred
        return np.array(y_pred)
    
    def evaluate(self,y_test):
        acc_score = accuracy_score(y_test,self.y_pred)
        print("Accuracy: ", acc_score)
        
        precision = precision_score(y_test, self.y_pred, average=None)
        print("Precision: ", precision*100, "%")
        
        recall = recall_score(y_test, self.y_pred, average=None)
        print("Recall: ", recall*100,"%")     

#### Checking accuracy for manhattan distance

In [16]:
start_time = time.time()

data_L1 = KNN_L1(sample_X_train.values, sample_y_train.values)
data_L1.preprocess()
data_L1.train()
data_L1.predict(sample_X_test.values)

end_time = time.time()

data.evaluate(sample_y_test)
print("time taken by self implemented KNN with Manhattan distance: ", end_time - start_time, "seconds")

Accuracy:  0.27
Precision:  [36.84210526 36.         21.6        20.48192771 20.2247191  26.5060241
 22.95081967 52.63157895 28.57142857 40.90909091] %
Recall:  [40.         10.11235955 23.07692308 18.88888889 37.5        22.68041237
 26.92307692 19.8019802  60.60606061  8.82352941] %
time taken by self implemented KNN with Manhattan distance:  76.59070467948914 seconds


#### Improved KNN algorithm using hyperparameter tuning and cross-validation 

In [20]:
import numpy as np

class KNN_improved:
    def __init__(self, X_train, y_train, k=1):
        self.X_train = X_train
        self.y_train = y_train
        self.k = k
        self.scaler_mean = None
        self.scaler_std = None
    
    def preprocess(self):
        self.X_train_flat = self.X_train.reshape(self.X_train.shape[0], -1)
        
        self.scaler_mean = np.mean(self.X_train_flat, axis=0)
        self.scaler_std = np.std(self.X_train_flat, axis=0)
        self.X_train_standardized = (self.X_train_flat - self.scaler_mean) / self.scaler_std
        
    def train(self):
        self.Xtr = self.X_train_standardized
        self.Ytr = self.y_train
    
    def predict(self, X_test):
        y_pred = []
        X_test_flat = X_test.reshape(X_test.shape[0], -1)
        X_test_standardized = (X_test_flat - self.scaler_mean) / self.scaler_std

        for i, sample in enumerate(X_test_standardized):
            dist = np.sum(np.abs(self.Xtr - sample), axis=1)
            
            k_nearest_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]

            unique, count = np.unique(k_nearest_labels, return_counts=True)
            predicted_label = unique[np.argmax(count)]
            y_pred.append(predicted_label)

        self.y_pred = np.array(y_pred)
        return self.y_pred

    def evaluate(self, X, y, n_splits=5):
        accuracies = []
        
        data_size = X.shape[0]
        fold_size = data_size // n_splits

        for i in range(n_splits):
            test_start = i * fold_size
            test_end = (i + 1) * fold_size

            X_test = X[test_start:test_end]
            y_test = y[test_start:test_end]

            X_train = np.concatenate((X[:test_start], X[test_end:]), axis=0)
            y_train = np.concatenate((y[:test_start], y[test_end:]), axis=0)

            self.X_train = X_train
            self.y_train = y_train
            self.preprocess()
            self.train()
            self.y_test_pred = self.predict(X_test)

            accuracy = np.sum(y_test == self.y_test_pred) / len(y_test)
            accuracies.append(accuracy)

            print(f"Fold Accuracy: {accuracy}")

        mean_accuracy = np.mean(accuracies)
        print(f"Mean Cross-Validation Accuracy: {mean_accuracy}")
        return mean_accuracy
    
    def find_best_k(self, X, y, k_values, n_splits=5):
        best_k = None
        best_accuracy = 0.0

        for k in k_values:
            self.k = k
            mean_cv_accuracy = self.evaluate(X, y, n_splits)

            print(f"Cross-Validation Accuracy for k={k}: {mean_cv_accuracy}")

            if mean_cv_accuracy > best_accuracy:
                best_accuracy = mean_cv_accuracy
                best_k = k

        print(f"Best k: {best_k}, Best Cross-Validation Accuracy: {best_accuracy}")
        return best_k


In [None]:
start_time = time.time()

data2 = KNN_improved(sample_X_train.values, sample_y_train.values)
data2.find_best_k(sample_X_train.values,sample_y_train.values,[1,3,4,5,7,9])

end_time = time.time()

print("time taken by improved KNN implementation: ", end_time - start_time, "seconds")


Fold Accuracy: 0.265
Fold Accuracy: 0.2975
Fold Accuracy: 0.24875
Fold Accuracy: 0.27375


#### Using best k value obtained from improved algorithm

In [14]:
start_time2 = time.time()

data3 = KNN_improved(sample_X_train.values, sample_y_train.values,9)
data3.preprocess()
data3.train()
data3.predict(sample_X_test.values)

end_time2 = time.time()
print("time taken by improved KNN implementation: ",(( end_time - start_time)+( end_time2 - start_time2)), "seconds")


time taken by improved KNN implementation:  1772.672188282013 seconds


In [18]:
 print(accuracy_score(sample_y_test, data3.y_pred)*100,'%')

27.900000000000002 %
