<a href="https://colab.research.google.com/github/Noor-Z1/Machine-Learning/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np


class Distance:
    """Pairwise distance functions sharing the call shape ``f(x, y, parameter)``.

    Every method takes two vectors plus one auxiliary argument so that they
    can be used interchangeably as the ``similarity_function`` of a KNN model.
    """

    @staticmethod
    def calculateCosineDistance(x, y, redundantparameter=None):
        """Return the angle (in radians) between vectors ``x`` and ``y``.

        :param x: first vector
        :param y: second vector
        :param redundantparameter: ignored; present only to keep the common
            three-argument signature
        :return: ``arccos`` of the cosine similarity, in ``[0, pi]``; NaN if
            either vector has zero norm (the cosine is undefined there)
        """
        dot = np.dot(x, y)
        mag = np.linalg.norm(x) * np.linalg.norm(y)
        # Floating-point rounding can push the ratio marginally outside
        # [-1, 1] for (anti)parallel vectors, which would make arccos yield
        # NaN; clamp it back into the valid domain first.
        cosine = np.clip(dot / mag, -1.0, 1.0)
        # The arc-cosine of the similarity is what represents the distance.
        return np.arccos(cosine)

    @staticmethod
    def calculateMinkowskiDistance(x, y, p=2):
        """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

        :param x: first vector
        :param y: second vector
        :param p: order of the norm (1 = Manhattan, 2 = Euclidean); ``None``
            is treated as the default order 2
        :return: ``(sum_i |x_i - y_i| ** p) ** (1 / p)``
        """
        if p is None:
            p = 2
        abs_diff = np.abs(np.subtract(x, y))
        return np.sum(abs_diff ** p) ** (1 / p)

    @staticmethod
    def calculateMahalanobisDistance(x, y, S_inverse):
        """Return the Mahalanobis distance between ``x`` and ``y``.

        :param x: first vector
        :param y: second vector
        :param S_inverse: inverse of the covariance matrix used to scale the
            difference vector
        :return: ``sqrt((x - y)^T @ S_inverse @ (x - y))``
        """
        delta = np.subtract(x, y)
        return np.sqrt(np.transpose(delta) @ S_inverse @ delta)




In [None]:

import numpy as np
from scipy.stats import mode

class KNN:
    """Brute-force k-nearest-neighbours classifier with a pluggable distance."""

    def __init__(self, dataset, data_label, similarity_function, similarity_function_parameters=None, K=1):
        """
        :param dataset: dataset on which KNN is executed, 2D numpy array
        :param data_label: class labels for each data sample, 1D numpy array
        :param similarity_function: similarity/distance function, Python function
            called as ``f(sample, instance, similarity_function_parameters)``
        :param similarity_function_parameters: auxiliary parameter or parameter array for distance metrics
        :param K: how many neighbors to consider, integer
        """
        self.K = K
        self.dataset = dataset
        self.dataset_label = data_label
        self.similarity_function = similarity_function
        self.similarity_function_parameters = similarity_function_parameters

    def predict(self, instance):
        """Predict the class label of ``instance`` by majority vote.

        The distance from ``instance`` to every stored sample is computed, the
        ``K`` closest samples are selected, and the most frequent label among
        them is returned. When several labels are equally frequent the
        smallest one wins.

        :param instance: feature vector to classify
        :return: predicted class label as a Python ``int``
        :raises ValueError: if the stored dataset is empty
        """
        if len(self.dataset) == 0:
            raise ValueError("cannot predict with an empty dataset")

        # Convert the query once instead of on every loop iteration.
        query = np.array(instance)
        distances = np.array([
            self.similarity_function(np.array(sample), query, self.similarity_function_parameters)
            for sample in self.dataset
        ])

        nearest = np.argsort(distances)[:self.K]
        neighbour_labels = np.asarray(self.dataset_label)[nearest]

        # np.unique returns the labels sorted, and argmax picks the first
        # maximum, so ties resolve to the smallest label. Doing the vote here
        # avoids depending on the version-specific return shape of
        # scipy.stats.mode.
        candidates, votes = np.unique(neighbour_labels, return_counts=True)
        return int(candidates[np.argmax(votes)])






In [None]:

def confidence_interval(x, N):
    """Return the ``(lower, upper)`` bounds ``mean(x) -/+ 1.96 * std(x) / sqrt(N)``.

    :param x: sample values (array-like accepted by ``np.mean`` / ``np.std``)
    :param N: sample count used in the standard-error denominator
    :return: tuple ``(lower, upper)``
    """
    centre = np.mean(x)
    # 1.96 is the multiplier applied to the standard error of the mean.
    half_width = 1.96 * (np.std(x) / np.sqrt(N))
    return centre - half_width, centre + half_width



In [None]:
import pickle
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
from sklearn.metrics import accuracy_score


# Hyperparameter search for KNN: for every (K, distance metric) pair, run
# repeated stratified 10-fold cross-validation several times and report a
# confidence interval over the per-run mean accuracies.

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load data files from a trusted source.
with open("part1_dataset.data", "rb") as data_file:
    dataset, labels = pickle.load(data_file)

n, dim = dataset.shape

# Hyperparameters: K value, distance function
K_value = [5, 10, 15]
distance = {1: "Cosine", 2: "Minkowski", 3: "Mahalanobis"}

# TODO (carried over from the original "shuffle left" note): confirm whether
# an explicit shuffle of the dataset is still required here.

# Number of independent cross-validation runs per configuration; also the
# sample count passed to confidence_interval.
N_RUNS = 5

for k_neighbors in K_value:
    for metric_id, metric_name in distance.items():
        alpha = []  # mean accuracy of each cross-validation run
        for _ in range(N_RUNS):
            kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=None)
            k_foldacc = []

            for train_indices, test_indices in kfold.split(dataset, labels):
                current_train = np.array(dataset[train_indices])
                current_train_label = np.array(labels[train_indices])

                if metric_id == 1:
                    model = KNN(current_train, current_train_label,
                                Distance.calculateCosineDistance, None, k_neighbors)
                elif metric_id == 2:
                    model = KNN(current_train, current_train_label,
                                Distance.calculateMinkowskiDistance, 2, k_neighbors)
                else:
                    # Estimate the covariance from the training fold only, so
                    # no test-fold information leaks into the distance metric.
                    S_inverse = np.linalg.inv(np.cov(current_train, rowvar=False))
                    model = KNN(current_train, current_train_label,
                                Distance.calculateMahalanobisDistance, S_inverse, k_neighbors)

                current_test = dataset[test_indices]
                current_test_labels = labels[test_indices]

                # Predict every test sample rather than a hard-coded first 15,
                # so folds of any size are scored completely.
                predicted = [model.predict(sample) for sample in current_test]

                accuracy = accuracy_score(np.array(current_test_labels), np.array(predicted))
                k_foldacc.append(accuracy)

            alpha.append(np.mean(np.array(k_foldacc)))

        print("Hyperparamter configuration-> K value %d   Distance Metric: " % (k_neighbors))
        print(metric_name)
        print(" Confidence Interval: ")
        print("[ %f  %f  ] \n" % (confidence_interval(np.array(alpha) * 100, N_RUNS)))
        print("\n")








Hyperparamter configuration-> K value 5   Distance Metric: 
Cosine
 Confidence Interval: 
[ 93.400044  93.853290  ] 



Hyperparamter configuration-> K value 5   Distance Metric: 
Minkowski
 Confidence Interval: 
[ 93.980390  94.552943  ] 



Hyperparamter configuration-> K value 5   Distance Metric: 
Mahalanobis
 Confidence Interval: 
[ 89.013308  90.186692  ] 



Hyperparamter configuration-> K value 10   Distance Metric: 
Cosine
 Confidence Interval: 
[ 95.462745  95.577255  ] 



Hyperparamter configuration-> K value 10   Distance Metric: 
Minkowski
 Confidence Interval: 
[ 94.667883  95.305450  ] 



Hyperparamter configuration-> K value 10   Distance Metric: 
Mahalanobis
 Confidence Interval: 
[ 86.771603  87.841730  ] 



Hyperparamter configuration-> K value 15   Distance Metric: 
Cosine
 Confidence Interval: 
[ 94.162133  94.371200  ] 



Hyperparamter configuration-> K value 15   Distance Metric: 
Minkowski
 Confidence Interval: 
[ 95.456764  96.009903  ] 



Hyperparamter co