In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as skl

In [167]:
# Load the dataset.
# Label convention assumed here: 1 = diabetes, 0 = no diabetes.
df = pd.read_csv('diabetes.csv')

# Keep the column labels (features followed by the target) for building tables later.
column_names = np.array(df.columns)

# Pre-processing: every column except the last is a feature (X);
# the last column is the dependent variable (y).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = skl.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [168]:
### We can't plot a 9D dataset as a single graph (a pairwise 2D scatter-plot matrix over all dimensions would be the alternative),
# so this notebook skips data visualization.
# Let's go straight to coding the KNN.

In [169]:
def calculate_euclidean_distance(point, dataset):
    """Return the Euclidean distance from ``point`` to every row of ``dataset``.

    Parameters
    ----------
    point : indexable of numbers
        The reference sample; it is indexed at every position a row has.
    dataset : iterable of indexable rows
        The samples to measure against.

    Returns
    -------
    list
        One distance per row, in the order the rows were iterated.
    """
    distances = []
    for row in dataset:
        # Sum the squared per-coordinate differences, then take the square root.
        squared_total = sum((row[idx] - point[idx]) ** 2 for idx in range(len(row)))
        distances.append(squared_total ** 0.5)
    return distances

In [170]:
def create_resulting_dataframe(data, columns, sort_by):
    """Build a DataFrame from ``data`` and return it sorted by ``sort_by``.

    Parameters
    ----------
    data : array-like
        Row data passed to ``pd.DataFrame``.
    columns : sequence
        Column labels for the new frame.
    sort_by : label or list of labels
        Column(s) to sort on (ascending, the ``sort_values`` default).

    Returns
    -------
    pd.DataFrame
        The sorted frame; rows keep their original index labels.
    """
    return pd.DataFrame(data=data, columns=columns).sort_values(by=sort_by)

In [171]:
def custom_knn(dataframe, k, outcome):
    """Return the ``outcome`` column of the first ``k`` rows of ``dataframe``.

    The frame is taken as-is, so the caller is responsible for having sorted
    it (e.g. by distance) before asking for the first ``k`` rows.
    """
    nearest_rows = dataframe.head(k)
    return nearest_rows[outcome]

In [172]:
def voting_system(outputs, check_for_0, check_for_1):
    """Majority vote between two labels over ``outputs``.

    Every element equal to ``check_for_0`` is a vote for that label; every
    other element is counted as a vote for ``check_for_1``. Both tallies are
    printed. ``check_for_0`` wins only with a strict majority, so a tie (or
    an empty input) returns ``check_for_1``.
    """
    tally = [0, 0]
    for label in outputs:
        tally[0 if label == check_for_0 else 1] += 1
    votes_for_0, votes_for_1 = tally

    for candidate, count in ((check_for_0, votes_for_0), (check_for_1, votes_for_1)):
        print("Output {0} with votes : {1}".format(candidate, count))

    return check_for_0 if votes_for_0 > votes_for_1 else check_for_1

In [173]:
def cross_validation_result(cross_validation, X_all, Y_all):
    """Collect ``(result, k)`` pairs for k = 1 .. ``cross_validation`` - 1.

    Each result comes from ``main(k, 'train', X_all, Y_all)``.
    NOTE(review): ``main`` is not defined in the visible part of this
    notebook -- confirm it exists before this function is called.
    """
    return [
        (main(neighbours, 'train', X_all, Y_all), neighbours)
        for neighbours in range(1, cross_validation)
    ]

In [228]:
def diabetes(X_data, y_data, i, k=48):
    """Classify sample ``i`` by a k-nearest-neighbour vote over the other samples.

    Distances from sample ``i`` to every row of ``X_data`` are computed and a
    table (distance, features, label) is built and sorted by distance. The
    sample itself is left out of the vote: it is always at distance 0 from
    itself, so letting it vote would leak its own label into the prediction
    that is then compared against that label.

    Parameters
    ----------
    X_data : array-like
        Feature rows; converted with ``np.array``.
    y_data : array-like
        Labels aligned with ``X_data``; 0 is treated as "no diabetes".
    i : int
        Position of the sample to classify.
    k : int, optional
        Number of neighbours that vote (default 48). An even value is bumped
        to the next odd number so a two-class vote cannot tie.

    Returns
    -------
    pd.DataFrame
        The first five rows of the full distance-sorted table (this still
        includes the query row itself).

    Notes
    -----
    Relies on the module-level ``column_names`` array for the column labels,
    and prints the vote counts and the verdict as a side effect.
    """
    features = np.array(X_data)
    labels = np.array(y_data)

    actual_result = labels[i]
    # Normalise a negative position to the row's 0-based label in the table.
    query_row = range(len(features))[i]
    distance = calculate_euclidean_distance(features[i], features)

    # Table layout: distance first, then the features, then the label last.
    table = np.insert(features, 0, values=distance, axis=1)
    table = np.insert(table, table.shape[1], values=labels, axis=1)
    new_column_names = np.insert(column_names, 0, values='Euclidean Distance', axis=0)
    output = create_resulting_dataframe(table, new_column_names, ['Euclidean Distance'])

    # Sorting keeps the original row labels, so the query row can be dropped by label.
    neighbours = output.drop(index=query_row)

    if k % 2 == 0:
        k += 1
    vals = np.array(custom_knn(neighbours, k, 'Outcome'))

    final_answer = voting_system(vals, 0, 1)
    if final_answer == actual_result:
        print("\nResult matches label : ", 1)  # the prediction agrees with the true label
    else:
        print("\nResult DOESN'T match label : ", 0)  # the prediction disagrees with the true label
    if final_answer == 0:
        print("\nCongratulations! You don't have diabetes")
    else:
        print("\nSorry, you have diabetes!")

    return output.head()


In [230]:
# Classify the training sample at position 123 using the training split itself.
diabetes(X_train, y_train, 123)

Output 0 with votes : 33
Output 1 with votes : 16

Result matches label :  1

Congratulations! You don't have diabetes


Unnamed: 0,Euclidean Distance,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
123,0.0,4.0,127.0,88.0,11.0,155.0,34.5,0.598,28.0,0.0
376,12.104664,7.0,133.0,88.0,15.0,155.0,32.4,0.262,37.0,0.0
441,22.848151,1.0,117.0,88.0,24.0,145.0,34.5,0.403,40.0,1.0
478,24.068112,4.0,123.0,80.0,15.0,176.0,32.0,0.443,34.0,0.0
3,25.281453,4.0,131.0,68.0,21.0,166.0,33.1,0.16,28.0,0.0
