KNN model to predict whether a person will be diagnosed with diabetes or not.

In [1]:
#importing the libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the diabetes records from the CSV file and preview the first rows
csv_path = 'diabetes.csv'
dataset = pd.read_csv(csv_path)
print(dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# A zero in columns such as 'Glucose' or 'BloodPressure' cannot be accepted as a
# real reading because it would distort the outcome, so zeros are treated as
# missing and replaced with the mean of the respective column.
#
# NOTE(review): the fill value is computed over the whole dataset before the
# train/test split, so test rows influence it — consider imputing after the
# split using training-set statistics only.

# Replacing zeroes
zero_not_accepted = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']
for column in zero_not_accepted:
    # Mark zeros as missing. Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    # Mean of the remaining (non-missing) values, truncated to an integer.
    # NOTE(review): int() drops the fractional part (noticeable for 'BMI');
    # kept as-is so downstream results are unchanged.
    column_mean = int(dataset[column].mean(skipna=True))
    # Fill the gaps left by the zero replacement with the column mean.
    dataset[column] = dataset[column].fillna(column_mean)
print(dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0           72.0           35.0    155.0  33.6   
1            1     85.0           66.0           29.0    155.0  26.6   
2            8    183.0           64.0           29.0    155.0  23.3   
3            1     89.0           66.0           23.0     94.0  28.1   
4            0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Separate predictors from the target, then hold out 20% of the rows for testing

# The first eight columns are the features; the ninth column is the label
X = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]

# Fixed random_state keeps the split reproducible between runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Feature scaling: fit the scaler on the training split only,
# then apply that same fitted scaler to both splits
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [6]:
# Build the k-nearest-neighbours classifier (11 neighbours, Euclidean distance)
knn_settings = {'n_neighbors': 11, 'p': 2, 'metric': 'euclidean'}
classifier = KNeighborsClassifier(**knn_settings)

In [7]:
# Fit the classifier on the scaled training features and their labels
classifier.fit(X=X_train, y=y_train)

In [8]:
# Classify every sample in the test split and show the predicted labels
y_pred = classifier.predict(X=X_test)
print(y_pred)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]


In [9]:
# Compare the true test labels with the predictions in a confusion matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [10]:
# F1 score of the predictions against the true test labels
test_f1 = f1_score(y_test, y_pred)
print('F1 SCORE IS: ', test_f1)

F1 SCORE IS:  0.6956521739130436


In [11]:
# Fraction of test samples whose predicted label matches the true label
test_accuracy = accuracy_score(y_test, y_pred)
print('ACCURACY SCORE IS: ', test_accuracy)

ACCURACY SCORE IS:  0.8181818181818182
