In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import default_rng

**Load dataset**

(Dataset from Kaggle. Link: <a href='https://www.kaggle.com/datasets/gkalpolukcu/knn-algorithm-dataset'>Here</a>)

In [None]:
# Read the Kaggle CSV (expected in the notebook's working directory) and preview its first 10 rows
data = pd.read_csv(filepath_or_buffer='KNNAlgorithmDataset.csv')
data.head(n=10)

**Train-test split**

In [None]:
# Features: every column except the identifier, the target and the unnamed
# trailing column (presumably an artefact of the CSV export - TODO confirm)
X = data.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])
y = data['diagnosis']

# Train-test split (80% train / 20% test)
# A fixed seed makes the split - and every accuracy below - reproducible on "Restart & Run All"
SEED = 42
rng = default_rng(SEED)
n_samples = data.shape[0]
# Draw the training rows without replacement; the remaining rows (in ascending order) form the test set
train_index = rng.choice(n_samples, size=int(n_samples * 0.8), replace=False)
test_index = np.setdiff1d(np.arange(n_samples), train_index)
# The indices are row positions, so select with iloc (loc would only work on a default RangeIndex)
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# The KNN class below works on plain NumPy arrays
X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values, y_test.values

**Distance function and KNN classifier**

In [None]:
def distance(point_A, point_B, metric=2):
    """Return the Minkowski distance of order `metric` between two points.

    Computes ``(sum(|a_i - b_i| ** metric)) ** (1 / metric)``, i.e. the
    Manhattan distance for ``metric=1`` and the Euclidean distance for
    ``metric=2``.

    Parameters
    ----------
    point_A, point_B : array-like
        Coordinates of the two points; they must broadcast against each other.
    metric : int or float, default 2
        Order p of the Minkowski distance; must be positive.

    Returns
    -------
    float
        The distance between the two points.

    Raises
    ------
    ValueError
        If `metric` is not positive (the 1/p root would be undefined).
    """
    if metric <= 0:
        raise ValueError(f"metric must be positive, got {metric!r}")
    # asarray lets callers pass plain lists as well as NumPy arrays
    abs_diff = np.abs(np.asarray(point_A) - np.asarray(point_B))
    # Raise each coordinate difference to the p-th power, sum, then take the p-th root
    return np.sum(np.power(abs_diff, metric)) ** (1.0 / metric)

In [None]:
class KNN:
    """K-nearest-neighbours classifier with a brute-force neighbour search.

    Parameters
    ----------
    X_train : array
        Training points, one row per point (indexed as ``X_train[i]`` and
        sized through ``X_train.shape[0]``).
    y_train : sequence
        Label of each training point, aligned with the rows of `X_train`.
    weight : {'same', 0, 'distance', 1}, default 'same'
        Voting scheme: 'same'/0 gives every neighbour one vote, 'distance'/1
        weights each neighbour by the inverse of its distance.
    algorithm : {'bruteForce', 1}, default 'bruteForce'
        Neighbour-search strategy; only brute force is implemented.
    metric : int or float, default 2
        Passed to the module-level `distance` function as its `metric` argument.
    """

    def __init__(self, X_train, y_train, weight='same', algorithm='bruteForce', metric=2):
        self.X_train = X_train
        self.y_train = y_train
        self.weight = weight
        self.algorithm = algorithm
        self.metric = metric

    def KNearest(self, data_point, K=3):
        """Find the K training points closest to `data_point`.

        Returns a dict mapping the row index of each neighbour in `X_train`
        to its distance from `data_point`, ordered from nearest to farthest
        (equal distances are ordered by lower row index first). K is clamped
        to the range ``[0, number of training points]`` so an oversized K
        returns every training point instead of failing. Returns None when
        `self.algorithm` is not a supported value.
        """
        if self.algorithm == 'bruteForce' or self.algorithm == 1:
            n_train = self.X_train.shape[0]
            # Brute force: measure the distance to every training point
            distances = [
                distance(self.X_train[index], data_point, self.metric)
                for index in range(n_train)
            ]
            n_neighbours = max(0, min(K, n_train))
            # sorted() is stable, so ties keep ascending index order
            nearest = sorted(range(n_train), key=distances.__getitem__)[:n_neighbours]
            return {index: distances[index] for index in nearest}
        # <Add more algorithm here...>
        return None

    def findLabel(self, result, data_point):
        """Choose a label from the neighbours found by `KNearest`.

        Parameters
        ----------
        result : dict
            Maps a row index of the training set to that point's distance.
        data_point : array
            Unused; kept so existing callers keep working.

        Returns
        -------
        The winning class label, or None when `self.weight` is not a
        supported value.

        Notes
        -----
        * weight 'same'/0: majority vote; when several classes tie, the one
          with the most training points wins.
        * weight 'distance'/1: each neighbour votes with weight 1/distance.
          If one or more neighbours lie at distance 0, only those exact
          matches vote (with weight 1 each), which avoids dividing by zero.
          Ties go to the class that appears first in `y_train`.
        """
        numPerClass, _ = self.getNumPerClass()
        # Classes in order of first appearance in y_train; max() below returns
        # the first maximal element, so this order decides remaining ties
        classes = list(numPerClass)

        if self.weight == 'same' or self.weight == 0:
            votes = dict.fromkeys(classes, 0)
            for index in result:
                votes[self.y_train[index]] += 1
            most_votes = max(votes.values())
            tied = [label for label in classes if votes[label] == most_votes]
            # Break ties in favour of the class with the most training points
            return max(tied, key=lambda label: numPerClass[label])
        elif self.weight == 'distance' or self.weight == 1:
            exact_matches = [index for index, dist in result.items() if dist == 0]
            if exact_matches:
                # An exact match would get infinite weight: let only exact matches vote
                weights = {index: 1.0 for index in exact_matches}
            else:
                weights = {index: 1.0 / dist for index, dist in result.items()}
            votes = dict.fromkeys(classes, 0.0)
            for index, vote in weights.items():
                votes[self.y_train[index]] += vote
            return max(classes, key=lambda label: votes[label])
        return None

    def predict(self, data_point, K=3):
        """Predict the label of `data_point` from its K nearest training points.

        Raises
        ------
        ValueError
            If `self.algorithm` is not supported, so no neighbours can be found.
        """
        # Find K-nearest points (based on algorithm)
        result = self.KNearest(data_point, K)
        if result is None:
            raise ValueError(f"Unsupported algorithm: {self.algorithm!r}")
        # Find label (based on weight)
        return self.findLabel(result, data_point)

    def evaluate(self, X_test, y_test, K=3):
        """Return the accuracy on a test set as a percentage (0-100).

        Each row of `X_test` is predicted with `predict` and compared with the
        label at the same position in `y_test`. Raises ZeroDivisionError when
        `X_test` has no rows.
        """
        n_test = X_test.shape[0]
        correct = sum(
            1 for index in range(n_test)
            if self.predict(X_test[index], K) == y_test[index]
        )
        return correct / n_test * 100

    def getNumPerClass(self):
        """Count the training points of each class.

        Returns a tuple ``(counts, n_classes)`` where `counts` maps each label
        to its number of occurrences in `y_train` (keys in order of first
        appearance) and `n_classes` is the number of distinct labels.
        """
        counts = {}
        for label in self.y_train:
            counts[label] = counts.get(label, 0) + 1
        return counts, len(counts)

**Default model**

In [None]:
# Baseline: all constructor defaults, then the accuracy (in %) on the held-out test set
model = KNN(X_train=X_train, y_train=y_train)
model.evaluate(X_test=X_test, y_test=y_test)

**Hyperparameter tuning**

In [None]:
# Measure how the test accuracy changes with two hyperparameters:
# the number of nearest neighbours (K) and the order of the distance formula (p).
# These accuracies use weight='same'.
# K starts at 1: with K=0 no neighbour votes, so the prediction would ignore the data point.
K = list(range(1, 15))
metrics = [1, 2, 3]

# result_metrics[i][j] is the accuracy (in %) for metrics[i] and K[j]
result_metrics = []
for metric in metrics:
    model = KNN(X_train, y_train, metric=metric)
    result_metrics.append([model.evaluate(X_test, y_test, K=k) for k in K])

In [None]:
# Test accuracy against K, one curve per distance order p (results of the weight='same' sweep)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, result_metrics[0], 'o:r')
ax.plot(K, result_metrics[1], '*-b')
ax.plot(K, result_metrics[2], '.--g')
# Label the figure so it can be read on its own
ax.set_xlabel("Number of nearest neighbours (K)")
ax.set_ylabel("Test accuracy (%)")
ax.set_title("KNN test accuracy vs. K (weight='same')")
ax.legend(["Manhattan (p=1)", "Euclidean (p=2)", "Minkowski (p=3)"])
plt.show()

In [None]:
# Measure how the test accuracy changes with two hyperparameters:
# the number of nearest neighbours (K) and the order of the distance formula (p).
# These accuracies use weight='distance'.
# K starts at 1: with K=0 no neighbour votes, so the prediction would ignore the data point.
K = list(range(1, 15))
metrics = [1, 2, 3]

# result_metrics[i][j] is the accuracy (in %) for metrics[i] and K[j]
result_metrics = []
for metric in metrics:
    model = KNN(X_train, y_train, weight='distance', metric=metric)
    result_metrics.append([model.evaluate(X_test, y_test, K=k) for k in K])

In [None]:
# Test accuracy against K, one curve per distance order p (results of the weight='distance' sweep)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, result_metrics[0], 'o:r')
ax.plot(K, result_metrics[1], '*-b')
ax.plot(K, result_metrics[2], '.--g')
# Label the figure so it can be read on its own
ax.set_xlabel("Number of nearest neighbours (K)")
ax.set_ylabel("Test accuracy (%)")
ax.set_title("KNN test accuracy vs. K (weight='distance')")
ax.legend(["Manhattan (p=1)", "Euclidean (p=2)", "Minkowski (p=3)"])
plt.show()