In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Problem Statement :-

# Data Gathering :-

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
x = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Model Selection :-

In [5]:
# Train Test Split :-

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20, random_state=42, stratify=y)
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
44,159,64,0,0,27.4,0.294,40
672,68,106,23,49,35.5,0.285,47
700,122,76,27,200,35.9,0.483,26
630,114,64,0,0,27.4,0.732,34
81,74,0,0,0,0.0,0.102,22
...,...,...,...,...,...,...,...
32,88,58,11,54,24.8,0.267,22
637,94,76,18,66,31.6,0.649,23
593,82,52,22,115,28.5,1.699,25
425,184,78,39,277,37.0,0.264,31


# Model Training :-

In [6]:
knn_cls = KNeighborsClassifier()
knn_cls.fit(x_train,y_train)

# Model Evaluation :-

In [7]:
y_pred = knn_cls.predict(x_test)
y_pred[30:35]

array([0, 0, 0, 1, 1], dtype=int64)

In [8]:
y_test[30:35]

  y_test[30:35]


158    0
83     0
437    0
153    0
469    0
Name: Outcome, dtype: int64

In [9]:
# Testing Data Evaluation :-

def eval_matrix(a,b):
    cnf_matrix = confusion_matrix(a,b)
    print("Confusion matrix is :-\n", cnf_matrix)
    print("*"*80)
    
    accuracy = accuracy_score(a,b)
    print("Accuracy is :- ", accuracy)
    print("*"*80)
    
    clf_report = classification_report(a,b)
    print("Classifcation report is :- ", clf_report)
    
eval_matrix(y_test,y_pred)

Confusion matrix is :-
 [[82 18]
 [28 26]]
********************************************************************************
Accuracy is :-  0.7012987012987013
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.59      0.48      0.53        54

    accuracy                           0.70       154
   macro avg       0.67      0.65      0.66       154
weighted avg       0.69      0.70      0.69       154



In [10]:
# Training Data Evaluation :-

y_train_pred = knn_cls.predict(x_train)

eval_matrix(y_train,y_train_pred)

Confusion matrix is :-
 [[356  44]
 [ 72 142]]
********************************************************************************
Accuracy is :-  0.8110749185667753
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.83      0.89      0.86       400
           1       0.76      0.66      0.71       214

    accuracy                           0.81       614
   macro avg       0.80      0.78      0.78       614
weighted avg       0.81      0.81      0.81       614



# Hyperparameter Tunning :-

In [11]:
knn_cls = KNeighborsClassifier()
hyperparameter = {"n_neighbors" : np.arange(3,30),
                  "p": [1,2]}

gscv_knn_cls = GridSearchCV(knn_cls, hyperparameter, cv=5)
gscv_knn_cls.fit(x_train,y_train)

In [12]:
gscv_knn_cls.best_estimator_

In [13]:
gscv_knn_cls.best_params_

{'n_neighbors': 14, 'p': 2}

In [14]:
knn_cls_hyper = KNeighborsClassifier(n_neighbors=14,p=2)

knn_cls_hyper.fit(x_train,y_train)

y_pred = knn_cls_hyper.predict(x_test)
y_pred

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [15]:
# Testing Data Evaluation :- 

eval_matrix(y_pred, y_test)

Confusion matrix is :-
 [[86 31]
 [14 23]]
********************************************************************************
Accuracy is :-  0.7077922077922078
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.86      0.74      0.79       117
           1       0.43      0.62      0.51        37

    accuracy                           0.71       154
   macro avg       0.64      0.68      0.65       154
weighted avg       0.76      0.71      0.72       154



In [16]:
# Training Data Evaluation :-

y_pred_train = knn_cls_hyper.predict(x_train)

eval_matrix(y_pred_train, y_train)

Confusion matrix is :-
 [[364 101]
 [ 36 113]]
********************************************************************************
Accuracy is :-  0.7768729641693811
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.91      0.78      0.84       465
           1       0.53      0.76      0.62       149

    accuracy                           0.78       614
   macro avg       0.72      0.77      0.73       614
weighted avg       0.82      0.78      0.79       614



# Normalization :-

In [17]:
x_df = df.drop("Outcome",axis=1)
x_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [18]:
x_df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [19]:
normal_scaler = MinMaxScaler()
array = normal_scaler.fit_transform(x_df)
x_normal_df = pd.DataFrame(array, columns=x_df.columns)
x_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


In [20]:
# Train Test Split :-

x = x_normal_df
y = df["Outcome"]

x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.20, random_state=42, stratify=y)

In [21]:
knn_cls_normal = KNeighborsClassifier()
knn_cls_normal.fit(x_train,y_train)

In [22]:
# Testing Data Evaluation :-

y_pred = knn_cls_normal.predict(x_test)

eval_matrix(y_pred, y_test)


Confusion matrix is :-
 [[82 26]
 [18 28]]
********************************************************************************
Accuracy is :-  0.7142857142857143
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.82      0.76      0.79       108
           1       0.52      0.61      0.56        46

    accuracy                           0.71       154
   macro avg       0.67      0.68      0.67       154
weighted avg       0.73      0.71      0.72       154



In [23]:
# Traing Data Evaluation :-

y_pred_train = knn_cls_normal.predict(x_train)

eval_matrix(y_pred_train, y_train)

Confusion matrix is :-
 [[354  68]
 [ 46 146]]
********************************************************************************
Accuracy is :-  0.8143322475570033
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.89      0.84      0.86       422
           1       0.68      0.76      0.72       192

    accuracy                           0.81       614
   macro avg       0.78      0.80      0.79       614
weighted avg       0.82      0.81      0.82       614



# Normalization with Hyperparameter Tunning :-

In [24]:
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
353,0.452261,0.508197,0.121212,0.050827,0.405365,0.214347,0.050000
711,0.633166,0.639344,0.272727,0.026005,0.441133,0.154142,0.316667
373,0.527638,0.475410,0.404040,0.111111,0.520119,0.062767,0.066667
46,0.733668,0.459016,0.000000,0.000000,0.442623,0.207515,0.133333
682,0.477387,0.524590,0.393939,0.124113,0.664680,0.122972,0.016667
...,...,...,...,...,...,...,...
451,0.673367,0.573770,0.000000,0.000000,0.430700,0.198121,0.033333
113,0.381910,0.508197,0.000000,0.000000,0.506706,0.133646,0.066667
556,0.487437,0.573770,0.404040,0.000000,0.567809,0.059778,0.150000
667,0.557789,0.573770,0.272727,0.000000,0.409836,0.026900,0.316667


In [25]:
knn_cls_normal_hyp = KNeighborsClassifier()
hyperparameter = {"n_neighbors" : np.arange(3,30),
                  "p" : [1,2]}

gscv_knn_cls_normal_hyp = GridSearchCV(knn_cls_normal_hyp, hyperparameter, cv=5)
gscv_knn_cls_normal_hyp.fit(x_train,y_train)

In [26]:
gscv_knn_cls.best_estimator_

In [27]:
knn_cls_normal_hyp = gscv_knn_cls.best_estimator_
knn_cls_normal_hyp.fit(x_train,y_train)

In [28]:
# Testing Data Evaluation :-

y_pred = knn_cls_normal_hyp.predict(x_test)

eval_matrix(y_pred, y_test)

Confusion matrix is :-
 [[91 26]
 [ 9 28]]
********************************************************************************
Accuracy is :-  0.7727272727272727
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.91      0.78      0.84       117
           1       0.52      0.76      0.62        37

    accuracy                           0.77       154
   macro avg       0.71      0.77      0.73       154
weighted avg       0.82      0.77      0.79       154



In [29]:
# Training Data Evaluation :-

y_pred_train = knn_cls_normal_hyp.predict(x_train)

eval_matrix(y_pred_train, y_train)

Confusion matrix is :-
 [[365  95]
 [ 35 119]]
********************************************************************************
Accuracy is :-  0.7882736156351792
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.91      0.79      0.85       460
           1       0.56      0.77      0.65       154

    accuracy                           0.79       614
   macro avg       0.73      0.78      0.75       614
weighted avg       0.82      0.79      0.80       614



# Standardization :-

In [30]:
x_df1 = df.drop("Outcome", axis=1)
y_df1 = df["Outcome"]

In [31]:
std_scaler = StandardScaler()
array = std_scaler.fit_transform(x_df1)
x_std_df = pd.DataFrame(array, columns=x_df1.columns)
x_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715


In [41]:
x = x_std_df
y = df["Outcome"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20, random_state=42, stratify=y)
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
353,-0.979009,-0.365629,-0.535475,-0.319526,-0.608270,0.326546,-0.786645
711,0.153623,0.461023,0.405445,-0.501867,-0.303664,-0.099290,0.574996
373,-0.507079,-0.572292,1.220910,0.123302,0.369008,-0.745596,-0.701542
46,0.782863,-0.675624,-1.288212,-0.692891,-0.290972,0.278225,-0.361132
682,-0.821699,-0.262298,1.158182,0.218813,1.600124,-0.319759,-0.956850
...,...,...,...,...,...,...,...
451,0.405319,0.047697,-1.288212,-0.692891,-0.392508,0.211782,-0.871747
113,-1.419477,-0.365629,-1.288212,-0.692891,0.254780,-0.244256,-0.701542
556,-0.758775,0.047697,1.220910,-0.692891,0.775149,-0.766737,-0.276029
667,-0.318307,0.047697,0.405445,-0.692891,-0.570195,-0.999286,0.574996


In [42]:
knn_cls_std = KNeighborsClassifier()
knn_cls_std.fit(x_train,y_train)

In [43]:
# Testing Data Evaluation :-

y_pred = knn_cls_std.predict(x_test)

eval_matrix(y_pred, y_test)

Confusion matrix is :-
 [[87 25]
 [13 29]]
********************************************************************************
Accuracy is :-  0.7532467532467533
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.87      0.78      0.82       112
           1       0.54      0.69      0.60        42

    accuracy                           0.75       154
   macro avg       0.70      0.73      0.71       154
weighted avg       0.78      0.75      0.76       154



In [45]:
# Training Data Evaluation :-

y_pred_train = knn_cls_std.predict(x_train)

eval_matrix(y_pred_train,y_train)

Confusion matrix is :-
 [[350  69]
 [ 50 145]]
********************************************************************************
Accuracy is :-  0.8061889250814332
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.88      0.84      0.85       419
           1       0.68      0.74      0.71       195

    accuracy                           0.81       614
   macro avg       0.78      0.79      0.78       614
weighted avg       0.81      0.81      0.81       614



# Standardization with Hyperparameter Tunning :-

In [46]:
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
353,-0.979009,-0.365629,-0.535475,-0.319526,-0.608270,0.326546,-0.786645
711,0.153623,0.461023,0.405445,-0.501867,-0.303664,-0.099290,0.574996
373,-0.507079,-0.572292,1.220910,0.123302,0.369008,-0.745596,-0.701542
46,0.782863,-0.675624,-1.288212,-0.692891,-0.290972,0.278225,-0.361132
682,-0.821699,-0.262298,1.158182,0.218813,1.600124,-0.319759,-0.956850
...,...,...,...,...,...,...,...
451,0.405319,0.047697,-1.288212,-0.692891,-0.392508,0.211782,-0.871747
113,-1.419477,-0.365629,-1.288212,-0.692891,0.254780,-0.244256,-0.701542
556,-0.758775,0.047697,1.220910,-0.692891,0.775149,-0.766737,-0.276029
667,-0.318307,0.047697,0.405445,-0.692891,-0.570195,-0.999286,0.574996


In [47]:
knn_cls_std_hyp = KNeighborsClassifier()

hyperparameter = {"n_neighbors" : np.arange(3,30),
                  "p" : [1,2]}

gscv_knn_cls_std_hyp = GridSearchCV(knn_cls_std_hyp, hyperparameter, cv=5)

gscv_knn_cls_std_hyp.fit(x_train, y_train)

In [48]:
gscv_knn_cls_std_hyp.best_estimator_

In [49]:
knn_cls_std_hyp = gscv_knn_cls_std_hyp.best_estimator_
knn_cls_std_hyp.fit(x_train,y_train)

In [50]:
# Testing Data Evaluation :-

y_pred = knn_cls_std_hyp.predict(x_test)

eval_matrix(y_pred, y_test)

Confusion matrix is :-
 [[89 31]
 [11 23]]
********************************************************************************
Accuracy is :-  0.7272727272727273
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.89      0.74      0.81       120
           1       0.43      0.68      0.52        34

    accuracy                           0.73       154
   macro avg       0.66      0.71      0.67       154
weighted avg       0.79      0.73      0.75       154



In [51]:
# Training Data Evaluation :-

y_pred_train = knn_cls_std_hyp.predict(x_train)

eval_matrix(y_pred_train, y_train)

Confusion matrix is :-
 [[372 103]
 [ 28 111]]
********************************************************************************
Accuracy is :-  0.7866449511400652
********************************************************************************
Classifcation report is :-                precision    recall  f1-score   support

           0       0.93      0.78      0.85       475
           1       0.52      0.80      0.63       139

    accuracy                           0.79       614
   macro avg       0.72      0.79      0.74       614
weighted avg       0.84      0.79      0.80       614



In [53]:
# Test array length :-

l = len(x.columns)
l

7

In [57]:
x.head(1).T

Unnamed: 0,0
Glucose,0.845787
BloodPressure,-0.985618
SkinThickness,0.90727
Insulin,-0.692891
BMI,0.204013
DiabetesPedigreeFunction,0.468492
Age,1.426022


In [58]:
# Testing on single row :-

test = np.zeros(l)
test

array([0., 0., 0., 0., 0., 0., 0.])

In [60]:
Glucose = 103
BloodPressure = 30
SkinThickness = 38
Insulin = 83
BMI = 43.3
DiabetesPedigreeFunction = 0.183
Age = 33

In [61]:
test[0] = Glucose
test[1] = BloodPressure
test[2] = SkinThickness
test[3] = Insulin
test[4] = BMI
test[5] = DiabetesPedigreeFunction
test[6] = Age

In [62]:
test

array([103.   ,  30.   ,  38.   ,  83.   ,  43.3  ,   0.183,  33.   ])

In [63]:
# Need to scale the data :-

test = normal_scaler.transform([test])
test



array([[0.51758794, 0.24590164, 0.38383838, 0.09810875, 0.64530551,
        0.04483348, 0.2       ]])

In [65]:
prediction = knn_cls_normal_hyp.predict(test)
prediction[0]



0

In [81]:
import pickle
import json

In [82]:
with open("Diabetes_Classification.pkl", "wb") as f:
    pickle.dump(knn_cls_normal_hyp, f)

In [83]:
with open("Normalization.pkl", "wb") as f:
    pickle.dump(normal_scaler, f)

In [84]:
project_data = {"columns" : list(x.columns)}
project_data

{'columns': ['Glucose',
  'BloodPressure',
  'SkinThickness',
  'Insulin',
  'BMI',
  'DiabetesPedigreeFunction',
  'Age']}

In [86]:
with open("project_data.json", "w") as f:
    json.dump(project_data, f)