# OBJECTIVE
### KNN - Predict whether a person will be diagnosed with diabetes or not

In [1]:
import math

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset from the working directory, report its row count,
# and preview the first five rows.
df = pd.read_csv('diabetes.csv')
print(df.shape[0])
df.head()

768


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Replace zeros
# Zeros in the columns below are treated as missing values (presumably
# placeholders for measurements that were not recorded -- confirm against the
# data source). Each zero is replaced by the column mean of the non-zero
# entries, truncated to an integer.
#
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputed values.
# NOTE(review): 'BMI' is not in this list; confirm whether zero BMI values
# occur in the data and should be imputed as well.
Zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']

for column in Zero_not_accepted:
    # `np.nan` is the supported spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df[column] = df[column].replace(0, np.nan)
    # NaN entries are skipped, so the mean reflects observed values only.
    mean = int(df[column].mean(skipna=True))
    df[column] = df[column].fillna(mean)

In [5]:
# Split the dataset
# Select the eight predictor columns as features and 'Outcome' as the target,
# then hold out 20% of the rows for testing with a fixed seed so the split is
# repeatable.
x = df.loc[:, ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
               'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
y = df.loc[:, 'Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [6]:
# Feature scaling
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [9]:
# Rule-of-thumb starting point for k: the square root of the number of test
# samples, minus one. The value shown below is truncated to 11 so that k is
# odd, which avoids tied votes between the two outcome classes.
#
# The `p` argument given to KNeighborsClassifier is not the number of classes:
# it is the power parameter of the Minkowski distance, and p=2 corresponds to
# the Euclidean distance.
#
# NOTE(review): this heuristic is usually stated in terms of the number of
# training samples; confirm that using the test-set size is intended.
math.sqrt(len(y_test)) - 1

11.409673645990857

In [10]:
# k-nearest-neighbours classifier voting over the 11 closest training points,
# measured with the Euclidean distance (p=2 is the matching Minkowski power).
clf = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)

In [11]:
# Fit on the scaled training data, then predict the outcome for every
# held-out test row (fit returns the estimator, so the calls can be chained).
y_predict = clf.fit(X_train, y_train).predict(X_test)

In [13]:
# Evaluate the model
# Per-class precision, recall and F1 on the held-out test set.
print(metrics.classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       107
           1       0.69      0.66      0.67        47

    accuracy                           0.81       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.80      0.81      0.80       154



In [14]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
metrics.confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[93, 14],
       [16, 31]], dtype=int64)

In [15]:
# F1 score for the positive class (Outcome == 1).
metrics.f1_score(y_true=y_test, y_pred=y_predict)

0.6739130434782609

In [16]:
# Overall test-set accuracy, expressed as a percentage.
100 * metrics.accuracy_score(y_true=y_test, y_pred=y_predict)

80.51948051948052

The model reaches an accuracy of about 80.5% on the held-out test set.