In [1]:
import numpy as np
import statistics

In [16]:
class KNN_classifier:

    def __init__(self, distance_metric='euclidean'):
        self.distance_metric = distance_metric

    def get_distance_metric(self, training_data_point, test_data_point):
        if self.distance_metric == 'euclidean':
            dist = 0
            for i in range(len(training_data_point) - 1):
                dist += (training_data_point[i] - test_data_point[i]) ** 2
            return np.sqrt(dist)

        elif self.distance_metric == 'manhattan':
            dist = 0
            for i in range(len(training_data_point) - 1):
                dist += abs(training_data_point[i] - test_data_point[i])
            return dist

    def nearest_neighbors(self, X_train, test_data, k):
        distance_list = []

        for training_data in X_train:
            distance = self.get_distance_metric(training_data, test_data)
            distance_list.append((training_data, distance))

        distance_list.sort(key=lambda x: x[1])  # sort by distance
        neighbors_list = [distance_list[j][0] for j in range(k)]
        return neighbors_list

    def predict(self, X_train, test_data, k):
        neighbors = self.nearest_neighbors(X_train, test_data, k)

        labels = []
        for data in neighbors:
            labels.append(data[-1])  # assuming label is last column

        predicted_class = statistics.mode(labels)
        return predicted_class


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
data = pd.read_csv("/content/sample_data/diabetes.csv")

In [5]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
data.shape

(768, 9)

In [7]:
x = data.drop(columns="Outcome", axis=1)
y = data["Outcome"]

In [8]:
print(x)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [9]:
# converting the data to numpy array
X = x.to_numpy()
Y = y.to_numpy()

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2, random_state=42)


In [11]:
print(x_train.shape)

(614, 8)


In [12]:
#Adding target value in x_train according to the function
x_train = np.insert(x_train, 8,y_train, axis=1)

In [13]:
print(x_train.shape)

(614, 9)


X_train --> training data with features and target

X_test  --> test data without target

Model training

In [17]:
classifier = KNN_classifier(distance_metric="manhattan")

NOTE: The KNN CLassifier can predict the label for only one data point at a time

In [19]:
prediction = classifier.predict(x_train, x_test.iloc[2], k=5)


  dist += abs(training_data_point[i] - test_data_point[i])


In [21]:
print(y_test[2])

1
