In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
class ModelKNN():
    def __init__(self):
        self.distances = []
        self.final_label = []

    def euclidean_distance(self, test_vector, train_feature_vectors, train_labels):
        for i in range(len(train_feature_vectors)):
            train_vector = train_feature_vectors[i]
            distance = np.sqrt(np.sum((test_vector - train_vector)**2))
            self.distances.append([distance, train_labels[i]]) #distance
        self.distances = sorted(self.distances, key=lambda x: (x[0],))

    def get_k_nearest_neighbours(self, k):
        labels = []
        for i in range(k):
            labels.append(self.distances[i][1])  # get only classes
        return labels

    def get_nearest_neighbor(self, k, train_labels):
        labels = self.get_k_nearest_neighbours(k)
        freq = [0] * 2  # holds [number of class 0, number of class 1] in the nearest k neighbours

        weights = [1 / (d[0] + 1e-6) for d in self.distances[:k]]

        for i in range(len(labels)):
            if labels[i] == 0:
                freq[0] += weights[i]
            else:
                freq[1] += weights[i]

        if freq[0] == freq[1]:
            return train_labels[0]
        return max(set(labels), key=labels.count)

    def classifier(self, k, train_features, test_features, train_labels):
        for test in test_features:
            self.distances = []
            self.euclidean_distance(test, train_features, train_labels)
            self.final_label.append(self.get_nearest_neighbor(k, train_labels))
        return self.final_label

    def MinMaxNormlization(self, feature):
        # Min-Max Scaling
        min_val = np.min(feature)
        max_val = np.max(feature)
        scaled = (feature - min_val) / (max_val - min_val)#(x-min)/(max-min)
        return scaled

    def loadD(self, data):
        X = np.delete(data,8,axis=1)#all data but not the last column
        Y = [] #the last column
        for i in range(len(data)):
            Y.append(data[i][8])
        #shuffling data important to ensure training and testing sets are representative of data.
        train_data, test_data, train_labels, test_labels = train_test_split(X, Y, test_size=0.3, shuffle=True)

        for i in range(8):#terates over the range from 0 to 7 the(number of features)
            train_data[:, [i]] = self.MinMaxNormlization(train_data[:, [i]])
            test_data[:, [i]] = self.MinMaxNormlization(test_data[:, [i]])

        return train_data, test_data, train_labels, test_labels


In [None]:
if __name__ == "__main__":

    data = pd.read_csv('diabetes.csv')

    Model = ModelKNN()
    train_data, test_data, train_labels, test_labels = Model.loadD(data.values)

    k_values = [2, 3, 4, 5, 6]

    # Perform multiple iterations of k on the dataset
    num_iterations = 5  # Number of iterations for each k value
    for k in k_values:
        KNN_Predictions=ModelKNN.classifier(Model,k,train_data, test_data, train_labels)

        wrong_classifier =0
        for i in range(len(test_labels)):
            if KNN_Predictions[i] != test_labels[i]:
                wrong_classifier+=1

        print("K = ",k)
        print("Number of correctly classified instances :",len(test_labels) - wrong_classifier)
        print("Total number of instances : ",len(test_labels))
        accuracy = (1-(wrong_classifier/len(test_labels)))*100
        print("Accuracy: ", accuracy , "%")

K =  2
Number of correctly classified instances : 160
Total number of instances :  231
Accuracy:  69.26406926406928 %
K =  3
Number of correctly classified instances : 160
Total number of instances :  231
Accuracy:  69.26406926406928 %
K =  4
Number of correctly classified instances : 160
Total number of instances :  231
Accuracy:  69.26406926406928 %
K =  5
Number of correctly classified instances : 160
Total number of instances :  231
Accuracy:  69.26406926406928 %
