## Importing libraries for our KNN implementation

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Reading our dataset file, i.e. diabetes.csv, using the `pd.read_csv` function

In [0]:
# Location of the dataset CSV (a Colab-style absolute path).
DATA_PATH = '/content/diabetes.csv'
data = pd.read_csv(DATA_PATH)

# Link to the diabetes dataset - https://www.kaggle.com/johndasilva/diabetes

## The `len` function returns the length of the dataset, i.e. the number of rows in our dataset.

In [35]:
# Number of rows in the dataset (first entry of the (rows, columns) shape tuple).
data.shape[0]

768

## The `.head()` method shows the first 5 rows of our dataset with all the columns included. A different number of rows can be passed in the parentheses; the default value is 5.

In [36]:
# Preview the first five rows (five is also the default when no argument is given).
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [0]:
# Columns in which a value of 0 is treated as a missing measurement and imputed.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
    # Mark zeros as missing so they are excluded from the column mean.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    data[column] = data[column].replace(0, np.nan)
    # int() truncates the mean toward zero, so every imputed value is a whole
    # number (this also applies to the float-valued BMI column).
    # NOTE(review): the mean is computed over the full dataset, i.e. before the
    # train/test split performed later, so test rows influence the imputed value.
    mean = int(data[column].mean(skipna=True))
    data[column] = data[column].fillna(mean)

## Selecting the feature and target columns and splitting them into training and test sets

In [0]:
# Positional selection: the first eight columns are the features,
# the column at position eight is the target.
N_FEATURE_COLUMNS = 8
X = data.iloc[:, :N_FEATURE_COLUMNS]
y = data.iloc[:, N_FEATURE_COLUMNS]

# train_test_split keeps 80% of the rows for training and holds out 20% for
# testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [0]:
# Fit the scaler on the training split only, then apply that same fitted
# scaling to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [40]:
import math

# Square root of the number of test samples.
# NOTE(review): presumably a rule-of-thumb guide for choosing n_neighbors below — confirm.
n_test_samples = len(y_test)
math.sqrt(n_test_samples)


12.409673645990857

## Creating a KNN object called clf and training the model using the fit function

In [41]:
# KNN classifier voting over the 11 nearest neighbours, using Euclidean distance.
clf = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)
# fit() is left as the last expression so the cell displays the fitted estimator.
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

## Using the `.predict` method on our testing dataset to predict the output

In [0]:
# Predict a label for every row of the scaled test features using the fitted classifier.
y_pred = clf.predict(X_test)

In [43]:
y_pred # Display the array of predicted labels for the test set

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

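## The confusion matrix compares the actual and predicted labels: rows are the true classes and columns are the predicted classes, so the diagonal holds the correct predictions.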

In [44]:
# Cross-tabulate the actual test labels against the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[94, 13],
       [15, 32]])

## The `accuracy_score` function reports the fraction of test samples that were predicted correctly. The accuracy of our KNN Diabetes Detection Model is 81.82%

In [45]:
# Fraction of test samples whose predicted label matches the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8181818181818182