In [1]:
import pandas as pd
import numpy as np

In [2]:
# utilities
def euclidean_distance(instance1, instance2):
    distance = 0.0
    for i in range(len(instance1)):
        distance += (instance1[i] - instance2[i])**2
    return distance**0.5

def get_neighbors(X, instance, k):
    neighbors = list()
    for j in range(len(X)):
        distance = euclidean_distance(instance, X.iloc[j])
        neighbors.append((j, distance))

    neighbors.sort(key = lambda t:t[1])
    k_neighbors = list()
    for index in range(k):
        k_neighbors.append(neighbors[index][0])
    return k_neighbors

def most_frequent(lst):
    return max(set(lst), key = lst.count)

def KNN(X, target, test, k = 3):
    predictions = []
    for i in range(len(test)):
        instance = test.iloc[i]
        k_neighbors = get_neighbors(X, instance, k)
        outcomes = []
        for index in k_neighbors:
            outcomes.append(target.iloc[index])
        predictions.append(most_frequent(outcomes))
    return predictions

In [3]:
data = pd.read_csv("diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = data.drop(columns = ['Outcome'])
y = data['Outcome']
# normalizing data using min-max scaler
for feature in X.columns:
    X[feature] = (X[feature] - X[feature].min()) / (X[feature].max() - X[feature].min())*9 + 1

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
predictions = KNN(X_train, y_train, X_test)

In [7]:
def accuracy(actual, predicted):
    total_instance = len(actual)
    correctly_predicted = 0.0
    for i in range(len(actual)):
        if(actual[i] == predicted[i]):
            correctly_predicted += 1
    return (correctly_predicted/total_instance) * 100

In [18]:
for k in range(1,11):
    predictions = KNN(X_train, y_train, X_test, k)
    print(f"Accuracy for k = {k}: {accuracy(list(y_test), predictions)}")

Accuracy for k = 1: 66.88311688311688
Accuracy for k = 2: 68.18181818181817
Accuracy for k = 3: 69.48051948051948
Accuracy for k = 4: 66.88311688311688
Accuracy for k = 5: 68.83116883116884
Accuracy for k = 6: 68.18181818181817
Accuracy for k = 7: 67.53246753246754
Accuracy for k = 8: 69.48051948051948
Accuracy for k = 9: 69.48051948051948
Accuracy for k = 10: 71.42857142857143
