# K-Nearest Neighbor (KNN)

**Import Libraries**

In [16]:
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Display settings: show all columns and allow wide (500-character) output
# so frames are not wrapped or elided when printed.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 500),
):
    pd.set_option(option_name, option_value)

In [2]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## 1) Exploratory Data Analysis (EDA)

In [4]:
def check_df(dataframe, head=5):
    """Print a quick overview of a DataFrame.

    Five sections are written to stdout, in this order: shape, column
    dtypes, the first ``head`` rows, the last ``head`` rows, and the
    per-column count of missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Number of rows shown in both the head and the tail sections.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # (banner, thunk) pairs: each thunk is evaluated only when its section
    # is about to be printed, so sections are computed one at a time.
    sections = (
        ('######################## Shape ########################', lambda: dataframe.shape),
        ('######################## Types ########################', lambda: dataframe.dtypes),
        ('######################## Head ########################', lambda: dataframe.head(head)),
        ('######################## Tail ########################', lambda: dataframe.tail(head)),
        ('######################## NA ########################', lambda: dataframe.isnull().sum()),
    )
    for banner, compute in sections:
        print(banner)
        # The extra '\n' argument leaves a blank line after each section.
        print(compute(), '\n')

In [5]:
check_df(df)

######################## Shape ########################
(768, 9) 

######################## Types ########################
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object 

######################## Head ########################
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66        

## 2) Data Preprocessing and Feature Engineering

In [6]:
# Target: keep it as a 1-D Series. Selecting with double brackets
# (df[['Outcome']]) yields a single-column DataFrame, which scikit-learn
# treats as a column vector and has to flatten, emitting a
# DataConversionWarning that the notebook's global warnings filter hides.
y = df['Outcome']

# Features: every remaining column.
X = df.drop(columns='Outcome')

In [13]:
# Standardize every feature (zero mean, unit variance) so that columns
# measured on larger scales do not dominate the distance computation KNN
# relies on.
# NOTE(review): the scaler is fitted on all rows, so the cross-validation
# folds used later share scaling statistics with their training folds.
# Wrapping the scaler and classifier in a Pipeline would avoid that leakage.
X_scalled = StandardScaler().fit_transform(X)

# fit_transform returns a bare ndarray: restore the column labels *and* the
# original row index so rows keep lining up with `df` and `y` by label.
X = pd.DataFrame(X_scalled, columns=X.columns, index=X.index)

In [14]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


## 3) Model

In [17]:
# Baseline model: a KNeighborsClassifier with its default hyperparameters,
# fitted on the full scaled feature set.
knn_model = KNeighborsClassifier().fit(X, y)

**Pick a random user and check the model's prediction**

In [19]:
# Draw a single row from the scaled features; the fixed seed makes the
# pick repeatable across runs.
random_user = X.sample(n=1, random_state=45)
random_user

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
195,0.342981,1.161295,0.770014,1.283638,1.130518,0.940144,-0.232176,-0.360847


In [21]:
knn_model.predict(random_user)

array([1], dtype=int64)

In [20]:
# Show the original (unscaled) record of the sampled user, including the
# true Outcome. Looking the row up through random_user.index instead of a
# hard-coded label keeps this cell correct if the sampling seed changes.
df.loc[random_user.index]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
195,5,158,84,41,210,39.4,0.395,29,1


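## 4) Model Evaluation

*Note: the classification report and ROC AUC computed first are in-sample: the model is scored on the same rows it was fitted on, so they overstate performance. The cross-validated scores that follow are the more reliable estimate.*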

In [22]:
# Predicted labels for the same rows the model was fitted on
# (in-sample predictions).
y_predictions = knn_model.predict(X)

In [26]:
# Keep only the second probability column (index 1). With labels 0/1 this is
# presumably P(Outcome == 1); confirm against knn_model.classes_.
y_probability = knn_model.predict_proba(X)[:, 1]

In [28]:
print(classification_report(y, y_predictions))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       500
           1       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768



In [30]:
roc_auc_score(y, y_probability)

0.9017686567164179

**K-Fold Cross Validation**

In [31]:
# 5-fold cross-validation of the baseline model, scoring each held-out fold
# on accuracy, F1 and ROC AUC.
cv_result = cross_validate(
    estimator=knn_model,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'f1', 'roc_auc'],
)

In [32]:
cv_result

{'fit_time': array([0.00296307, 0.00200796, 0.00299335, 0.00199652, 0.00299191]),
 'score_time': array([0.01097131, 0.01395011, 0.01096821, 0.00997162, 0.00897598]),
 'test_accuracy': array([0.72077922, 0.73376623, 0.71428571, 0.77124183, 0.7254902 ]),
 'test_f1': array([0.58252427, 0.60952381, 0.54166667, 0.63917526, 0.58      ]),
 'test_roc_auc': array([0.77555556, 0.78759259, 0.73194444, 0.83226415, 0.77528302])}

In [34]:
cv_result['test_accuracy'].mean()

0.733112638994992

In [35]:
cv_result['test_f1'].mean()

0.5905780011534191

In [36]:
cv_result['test_roc_auc'].mean()

0.7805279524807827

## 5) Hyperparameter Optimization

In [38]:
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

**Search grid for `n_neighbors`**

In [39]:
# Candidate neighbour counts for the grid search: 2 through 49
# (range() excludes its upper bound), i.e. 48 values.
knn_params = dict(n_neighbors=range(2, 50))

In [41]:
# Exhaustive search over knn_params with 5-fold cross-validation.
# n_jobs=-1 runs the fits in parallel on all available cores; verbose=1
# prints the one-line summary of how many fits are performed.
knn_gs_best = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [42]:
knn_gs_best.best_params_

{'n_neighbors': 17}

## 6) Final Model

In [48]:
# GridSearchCV (refit=True by default) has already refitted a *clone* of the
# estimator with the best parameters on all of X, y. Using it avoids calling
# set_params() on knn_model, which would change the baseline model in place
# and leave knn_model and knn_final_model pointing at the same object.
knn_final_model = knn_gs_best.best_estimator_

In [49]:
# Same 5-fold evaluation as for the baseline, now applied to the tuned model.
# Note: this rebinds cv_result, replacing the baseline scores.
cv_result = cross_validate(
    estimator=knn_final_model,
    X=X,
    y=y,
    cv=5,
    scoring=['accuracy', 'f1', 'roc_auc'],
)

In [50]:
cv_result['test_accuracy'].mean()

0.7669892199303965

In [51]:
cv_result['test_f1'].mean()

0.6170909049720137

In [52]:
cv_result['test_roc_auc'].mean()

0.8127938504542278