In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import mode
from sklearn.neighbors import KNeighborsClassifier

# K Nearest Neighbors Classification
class K_Nearest_Neighbors_Classifier():
    """Brute-force K-nearest-neighbours classifier using Euclidean distance.

    Each test sample is assigned the most frequent label among its ``K``
    closest training samples. When several labels are equally frequent, the
    smallest label wins.

    Attributes set by ``fit``: ``X_train`` (2-D float array), ``Y_train``
    (1-D label array), ``m`` (number of training samples) and ``n`` (number
    of features). ``predict`` additionally sets ``X_test`` and ``m_test``.
    """

    def __init__(self, K):
        """Store the number of neighbours ``K`` (must be at least 1)."""
        if K < 1:
            raise ValueError("K must be a positive integer, got %r" % (K,))
        self.K = K

    # Function to store training set
    def fit(self, X_train, Y_train):
        """Memorise the training set.

        ``X_train`` is any 2-D array-like of numeric features. ``Y_train``
        may be a 1-D sequence or an ``(m, 1)`` column vector of numeric
        labels; it is flattened to 1-D so voting does not depend on its
        original shape.

        Raises ``ValueError`` if ``X_train`` is not 2-D or the number of
        labels differs from the number of rows.
        """
        features = np.asarray(X_train, dtype=float)
        if features.ndim != 2:
            raise ValueError("X_train must be a 2-D array")
        labels = np.asarray(Y_train).ravel()
        if labels.shape[0] != features.shape[0]:
            raise ValueError(
                "X_train and Y_train must contain the same number of samples")
        self.X_train = features
        self.Y_train = labels
        self.m, self.n = features.shape

    # Function for prediction
    def predict(self, X_test):
        """Return a 1-D float array with one predicted label per test row.

        Raises ``ValueError`` if ``X_test`` is not 2-D or its number of
        columns differs from the training data.
        """
        samples = np.asarray(X_test, dtype=float)
        if samples.ndim != 2 or samples.shape[1] != self.n:
            raise ValueError(
                "X_test must be a 2-D array with %d feature columns" % self.n)
        self.X_test = samples
        self.m_test = samples.shape[0]
        Y_predict = np.zeros(self.m_test)
        for i, x in enumerate(samples):
            Y_predict[i] = self._majority_label(self.find_neighbors(x))
        return Y_predict

    # Function to find the K nearest neighbors to current test example
    def find_neighbors(self, x):
        """Return the labels of the ``K`` training samples closest to ``x``.

        If fewer than ``K`` training samples exist, all of their labels are
        returned.
        """
        distances = self.euclidean(x, self.X_train)
        # A stable sort keeps equidistant samples in training order, so the
        # selected neighbours are deterministic.
        nearest = np.argsort(distances, kind="stable")[:self.K]
        return self.Y_train[nearest]

    # Function to calculate euclidean distance
    def euclidean(self, x, x_train):
        """Return the Euclidean distance between ``x`` and ``x_train``.

        The reduction runs over the last axis: two 1-D vectors give a scalar,
        while a 1-D vector and a 2-D matrix give one distance per matrix row.
        """
        return np.sqrt(np.sum(np.square(x - x_train), axis=-1))

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value in ``labels`` (smallest on ties)."""
        # np.unique returns sorted values and argmax picks the first maximum,
        # so ties resolve to the smallest label.
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]


# Driver code
def main(csv_path="C:/Users/5th-NLP-Batch2/Downloads/diabetes.csv"):
    """Compare the hand-written KNN classifier with scikit-learn's on a CSV.

    The CSV at ``csv_path`` is read with pandas; every column except the last
    is used as a feature and the last column as the binary target. Both
    models are fitted with 3 neighbours on two thirds of the rows, and their
    accuracy and F1 score (as percentages) are printed for the held-out test
    split and then for the training split.
    """
    # Importing dataset
    df = pd.read_csv(csv_path)
    X = df.iloc[:, :-1].values
    Y = df.iloc[:, -1:].values

    # Splitting dataset into train and test set
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=1/3, random_state=0)

    # Model training
    model = K_Nearest_Neighbors_Classifier(K=3)
    model.fit(X_train, Y_train)

    model1 = KNeighborsClassifier(n_neighbors=3)
    # scikit-learn expects a 1-D target and emits a DataConversionWarning
    # when handed a column vector, so flatten it for this model.
    model1.fit(X_train, Y_train.ravel())

    # Prediction and performance on test set
    _print_scores("test", Y_test,
                  model.predict(X_test), model1.predict(X_test))

    # Prediction and performance on train set
    _print_scores("train", Y_train,
                  model.predict(X_train), model1.predict(X_train))


def _print_scores(split, Y_true, Y_pred_ours, Y_pred_sklearn):
    """Print accuracy and binary F1 (in percent) of both models for a split."""
    # The labels carry their trailing padding so the printed columns line up.
    for label, Y_pred in (("our model       ", Y_pred_ours),
                          ("sklearn model   ", Y_pred_sklearn)):
        accuracy = accuracy_score(Y_true, Y_pred)
        f1 = f1_score(Y_true, Y_pred, average='binary')
        print("Accuracy on {} set by {}:  ".format(split, label),
              accuracy * 100)
        print("F1 Score on {} set by {}:  ".format(split, label),
              f1 * 100)


# Run the driver only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Accuracy on test set by our model       :   63.888888888888886
F1 Score on test set by our model       :   43.47826086956522
Accuracy on test set by sklearn model   :   63.888888888888886
F1 Score on test set by sklearn model   :   43.47826086956522
Accuracy on train set by our model       :   79.16666666666666
F1 Score on train set by our model       :   63.41463414634146
Accuracy on train set by sklearn model   :   79.16666666666666
F1 Score on train set by sklearn model   :   63.41463414634146


  return self._fit(X, y)
