### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [4]:
df = pd.read_csv("diabetes.csv")

In [5]:
df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1


### Feature Engineering

In [6]:
x = df.drop("Outcome", axis = 1)

In [7]:
y = df["Outcome"]

In [8]:
x.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0


### Scaling Data

In [9]:
# scaler = MinMaxScaler()
# scaler.fit(x)
# x_transformed = scaler.transform(x)

In [10]:
# scaler = MinMaxScaler()
# x_transformed = scaler.fit_transform(x)

In [11]:
# Generally we split the data first and then scale it

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [13]:
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [14]:
x_train

array([[0.72727273, 0.47540984, 0.33333333, ..., 0.4709389 , 0.14688301,
        0.06666667],
       [0.        , 0.55737705, 0.41414141, ..., 0.58122206, 0.27711358,
        0.33333333],
       [0.6969697 , 0.        , 0.        , ..., 0.54098361, 0.36507259,
        0.06666667],
       ...,
       [0.42929293, 0.60655738, 0.        , ..., 0.4485842 , 0.09479078,
        0.23333333],
       [0.6969697 , 0.67213115, 0.        , ..., 0.5976155 , 0.06746371,
        0.11666667],
       [0.47474747, 0.        , 0.        , ..., 0.        , 0.07600342,
        0.06666667]])

In [15]:
x_train = pd.DataFrame(x_train, columns = x.columns)

In [16]:
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.727273,0.475410,0.333333,0.159574,0.470939,0.146883,0.066667
1,0.000000,0.557377,0.414141,0.000000,0.581222,0.277114,0.333333
2,0.696970,0.000000,0.000000,0.000000,0.540984,0.365073,0.066667
3,0.464646,0.508197,0.323232,0.148936,0.476900,0.002989,0.416667
4,0.621212,0.606557,0.404040,0.091017,0.508197,0.081554,0.116667
...,...,...,...,...,...,...,...
571,0.545455,0.491803,0.464646,0.210402,0.529061,0.143894,0.050000
572,0.646465,0.557377,0.191919,0.212766,0.454545,0.560632,0.066667
573,0.429293,0.606557,0.000000,0.000000,0.448584,0.094791,0.233333
574,0.696970,0.672131,0.000000,0.000000,0.597615,0.067464,0.116667


In [17]:
x_test = scaler.transform(x_test)

In [18]:
x_test

array([[0.53030303, 0.73770492, 0.        , ..., 0.44113264, 0.05081127,
        0.41666667],
       [0.54040404, 0.50819672, 0.13131313, ..., 0.34128167, 0.25619129,
        0.03333333],
       [0.83333333, 0.72131148, 0.        , ..., 0.45305514, 0.09564475,
        0.46666667],
       ...,
       [0.50505051, 0.59016393, 0.12121212, ..., 0.37704918, 0.24765158,
        0.11666667],
       [0.46464646, 0.6557377 , 0.        , ..., 0.62891207, 0.06789069,
        0.13333333],
       [0.91414141, 0.72131148, 0.44444444, ..., 0.64530551, 0.06148591,
        0.08333333]])

In [19]:
x_test = pd.DataFrame(x_test, columns=x.columns)

In [20]:
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.530303,0.737705,0.000000,0.000000,0.441133,0.050811,0.416667
1,0.540404,0.508197,0.131313,0.056738,0.341282,0.256191,0.033333
2,0.833333,0.721311,0.000000,0.000000,0.453055,0.095645,0.466667
3,0.782828,0.426230,0.272727,0.638298,0.576751,0.069172,0.066667
4,0.777778,0.508197,0.313131,0.335697,0.488823,0.067891,0.033333
...,...,...,...,...,...,...,...
187,0.777778,0.639344,0.303030,0.118203,0.460507,0.036721,0.400000
188,0.909091,0.540984,0.393939,0.000000,0.625931,0.774979,0.066667
189,0.505051,0.590164,0.121212,0.082742,0.377049,0.247652,0.116667
190,0.464646,0.655738,0.000000,0.000000,0.628912,0.067891,0.133333


### Model Training

In [21]:
KNN_clf = KNeighborsClassifier()
KNN_clf.fit(x_train,y_train)

### Evaluation

In [22]:
y_pred = KNN_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Cinfusion Matrix: ")
print(cnf_matrix)
print()
print("Classification Report: ")
print(classification_report(y_test,y_pred))


Cinfusion Matrix: 
[[98 21]
 [33 40]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       119
           1       0.66      0.55      0.60        73

    accuracy                           0.72       192
   macro avg       0.70      0.69      0.69       192
weighted avg       0.71      0.72      0.71       192



In [23]:
from sklearn.metrics import accuracy_score

In [24]:
accuracy_score(y_test,y_pred)

0.71875

### Hyperparameter Tuning

In [25]:
knn_clf = KNeighborsClassifier()

parameters = {"n_neighbors":np.arange(3,30),
             "p":[1,2]
             }

# The hyperparameter p decides if Manhatten Distance is to be calculated or Eucledian Distance is to be calculated
# If p = 1, distance calculated is Eucledian
# If p = 2, distance calculated is Manhatten

gscv_knn_clf = GridSearchCV(knn_clf,param_grid=parameters,cv = 5)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

In [26]:
knn_clf = KNeighborsClassifier(n_neighbors=21,p=1)

knn_clf.fit(x_train,y_train)
y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Cinfusion Matrix: ")
print(cnf_matrix)
print()
print("Classification Report: ")
print(classification_report(y_test,y_pred))

Cinfusion Matrix: 
[[110   9]
 [ 36  37]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.75      0.92      0.83       119
           1       0.80      0.51      0.62        73

    accuracy                           0.77       192
   macro avg       0.78      0.72      0.73       192
weighted avg       0.77      0.77      0.75       192



In [31]:
new_test_dp = pd.Series(np.zeros(len(x.columns)), index = x.columns)

In [32]:
new_test_dp

Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
dtype: float64

In [33]:
new_test_dp["Glucose"] = 120
new_test_dp["BloodPressure"] = 80
new_test_dp["SkinThickness"] = 15
new_test_dp["Insulin"]  = 30
new_test_dp["BMI"] = 29
new_test_dp["DiabetesPedigreeFunction"] = 0.36
new_test_dp["Age"] = 45

In [34]:
new_test_dp

Glucose                     120.00
BloodPressure                80.00
SkinThickness                15.00
Insulin                      30.00
BMI                          29.00
DiabetesPedigreeFunction      0.36
Age                          45.00
dtype: float64

In [40]:
new_test_dp_transformed = scaler.transform([new_test_dp])



In [41]:
knn_clf.predict(new_test_dp_transformed)[0]



0

In [42]:
import pickle

In [43]:
with open("data_scaler.pickle","wb") as file:
    pickle.dump(scaler,file)

In [44]:
with open("data_scaler.pickle","rb") as file2:
    imported_scaler = pickle.load(file2)

In [45]:
imported_scaler

In [48]:
new_test_dp = new_test_dp.to_numpy()

In [50]:
new_test_dp_transformed = imported_scaler.transform([new_test_dp])



In [51]:
knn_clf.predict(new_test_dp_transformed)[0]



0

In [52]:
with open("knn_model_diabetic.pickle","wb") as file:
    pickle.dump(knn_clf,file)

In [53]:
import json

In [57]:
column_names = {"columns":list(x.columns)}

In [58]:
column_names

{'columns': ['Glucose',
  'BloodPressure',
  'SkinThickness',
  'Insulin',
  'BMI',
  'DiabetesPedigreeFunction',
  'Age']}

In [59]:
with open("project_data.json","w") as file:
    json.dump(column_names,file)