**Predict whether a person will have diabetes or not.**

In [1]:
import math

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the diabetes dataset from its CSV file and preview the first rows.
csv_path = '/content/KNN_Dataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(768, 9)

In [4]:
# Count missing (NaN) entries in every column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

**Replace zeros with the column mean, as zeros are not acceptable values in these columns**

In [5]:
# Columns in which a zero is treated as a missing measurement.
# NOTE(review): 'Pregnancies' is in this list although zero pregnancies is a
# plausible real value — confirm it should be imputed.
zero_not_accp = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in zero_not_accp:
    # Mask zeros as NaN so they are excluded from the mean. np.nan is used
    # because the np.NaN alias was removed in NumPy 2.0.
    masked = data[column].replace(0, np.nan)
    # The mean of the remaining values is truncated to an integer before use.
    fill_value = int(masked.mean(skipna=True))
    # NOTE(review): the mean is computed on the full dataset, before the
    # train/test split, so test rows influence the imputed values.
    data[column] = masked.fillna(fill_value)

In [6]:
# The first eight columns are the features; the ninth column is the target.
feature_count = 8
X = data.iloc[:, :feature_count]
y = data.iloc[:, feature_count]

In [7]:
# Display the feature frame X.
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,155.0,33.6,0.627,50
1,1.0,85.0,66.0,29.0,155.0,26.6,0.351,31
2,8.0,183.0,64.0,29.0,155.0,23.3,0.672,32
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21
4,4.0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2.0,122.0,70.0,27.0,155.0,36.8,0.340,27
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1.0,126.0,60.0,29.0,155.0,30.1,0.349,47


In [8]:
# Display the target series y.
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

# Splitting the Dataset

In [9]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Print the shape of each piece of the train/test split, one per line.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, part.shape)

X_train (576, 8)
X_test (192, 8)
y_train (576,)
y_test (192,)


# Feature Scaling

In [11]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Choosing the value of K

In [12]:
# Rule-of-thumb starting point for K: the square root of the sample count.
# `math` is imported in the notebook's top import cell.
# NOTE(review): this uses the test-set size; the heuristic is more commonly
# applied to the number of training samples — confirm intent.
math.sqrt(len(y_test))

13.856406460551018

# Model Building

In [13]:
# KNN classifier with 13 neighbours and the Euclidean metric.
# p=2 is passed explicitly as well; it is the Minkowski power parameter.
knn_params = dict(n_neighbors=13, metric='euclidean', p=2)
model = KNeighborsClassifier(**knn_params)

In [14]:
# Fit the classifier on the scaled training features and their labels.
model.fit(X=X_train, y=y_train)

In [15]:
# Predict labels for the training rows so training accuracy can be measured.
y_pred_train = model.predict(X=X_train)


In [16]:
# Training accuracy. accuracy_score takes (y_true, y_pred); the value is the
# same in either order, but this order matches the classification_report and
# confusion_matrix calls in this notebook.
metrics.accuracy_score(y_train, y_pred_train)

0.7864583333333334

In [17]:
# Predict labels for the held-out test rows so test accuracy can be measured.
y_pred_test = model.predict(X=X_test)


In [18]:
# Test accuracy. accuracy_score takes (y_true, y_pred); the value is the
# same in either order, but this order matches the classification_report and
# confusion_matrix calls in this notebook.
metrics.accuracy_score(y_test, y_pred_test)

0.8072916666666666

In [19]:
# Per-class precision, recall and F1 for the test predictions.
test_report = classification_report(y_true=y_test, y_pred=y_pred_test)
print(test_report)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       130
           1       0.74      0.63      0.68        62

    accuracy                           0.81       192
   macro avg       0.79      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192



In [20]:
# Confusion matrix for the test predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[116,  14],
       [ 23,  39]])

# Model Building - (Used different parameters but got the same results, because the Minkowski metric with p=2 is the Euclidean distance)

In [21]:
# Second KNN classifier: 13 neighbours, Minkowski metric with power parameter p=2.
knn_params1 = dict(n_neighbors=13, metric='minkowski', p=2)
model1 = KNeighborsClassifier(**knn_params1)

In [22]:
# Fit the second classifier on the scaled training features and their labels.
model1.fit(X=X_train, y=y_train)

In [23]:
# Predict labels for the training rows with the second model.
y_pred_train1 = model1.predict(X=X_train)


In [24]:
# Training accuracy of the second model. accuracy_score takes (y_true, y_pred);
# the value is the same in either order, but this order matches the
# classification_report and confusion_matrix calls in this notebook.
metrics.accuracy_score(y_train, y_pred_train1)

0.7864583333333334

In [25]:
# Predict labels for the held-out test rows with the second model.
y_pred_test1 = model1.predict(X=X_test)


In [26]:
# Test accuracy of the second model. accuracy_score takes (y_true, y_pred);
# the value is the same in either order, but this order matches the
# classification_report and confusion_matrix calls in this notebook.
metrics.accuracy_score(y_test, y_pred_test1)

0.8072916666666666

In [27]:
# Per-class precision, recall and F1 for the second model's test predictions.
test_report1 = classification_report(y_true=y_test, y_pred=y_pred_test1)
print(test_report1)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       130
           1       0.74      0.63      0.68        62

    accuracy                           0.81       192
   macro avg       0.79      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192



In [28]:
# Confusion matrix for the second model's test predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_test1)

array([[116,  14],
       [ 23,  39]])