In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    precision_score,
    recall_score,
    precision_recall_curve
)


### Read & Split Data

In [2]:
data = pd.read_csv('..\..\..\data\diabetes\diabetes.csv')
print(data.info)
y = data.pop('Outcome')

X_train, X_test, y_train, y_test = train_test_split(
    data.to_numpy(), y.to_numpy(), test_size=0.2, random_state=3
)

<bound method DataFrame.info of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   5

### Model

In [3]:
model = KNeighborsClassifier(
    n_neighbors=3,
    metric="minkowski"
)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [4]:
model.score(X_test,y_test)

0.6818181818181818

In [5]:
predictions = model.predict(X_test)

In [6]:
confusion_matrix(y_test,predictions)

array([[75, 17],
       [32, 30]], dtype=int64)

In [7]:
def get_metrics(y, y_hat):
    accuracy = accuracy_score(y, y_hat)
    precision = precision_score(y, y_hat)
    recall = recall_score(y, y_hat)
    f1score = f1_score(y, y_hat)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nf1_score: {f1score}")
get_metrics(y_test,predictions)


Accuracy: 0.6818181818181818
Precision: 0.6382978723404256
Recall: 0.4838709677419355
f1_score: 0.5504587155963303


### Cross-Validation
Evaluation estimator performance on unseen data. 

##### K-Fold Cross Validation  
- The training set is split into k smaller sets.
- A model is trained using k-1 of the folds as training data
- The resulting model is validated on remaining part of the data as a Test set to mesure the performance (Accuracy)

The performance measure reported by k-fold cv is then the average of the values computed in loop.

In [8]:
from sklearn.model_selection import cross_val_score

knn_cv = KNeighborsClassifier(
    n_neighbors=3,
    metric="minkowski"
)
cv_scores = cross_val_score(
    estimator=knn_cv,
    X=X_train,
    y=y_train,
    cv=5
)

print(cv_scores)
print(f'Mean of CV scores: {np.mean(cv_scores)}')

[0.71544715 0.70731707 0.70731707 0.69105691 0.71311475]
Mean of CV scores: 0.7068505930960949


### Grid Search CV

Finding the optimal parameter values from give set of param grid  


GridSearchCV works by training our model multiple times on a range of parameters that we specify. That way, we can test our model with each parameter and figure out the optimal values to get the best accuracy results.

In [9]:
from sklearn.model_selection import GridSearchCV

knn_classifier = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(3, 20)}
knn_gscv = GridSearchCV(
    estimator=knn_classifier,
    param_grid=param_grid,
    cv=5
)

In [10]:
knn_gscv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])})

In [11]:
model = knn_gscv.best_estimator_
model

KNeighborsClassifier(n_neighbors=12)

In [12]:
knn_gscv.best_score_

0.7540317206450753

In [13]:
model.score(X_test, y_test)

0.7077922077922078

In [14]:
print(f"We can notice that {model.n_neighbors} is the optimal value for n_neighbors, getting {knn_gscv.best_score_}")

We can notice that 12 is the optimal value for n_neighbors, getting 0.7540317206450753


#### Complete results



In [15]:
knn_gscv.cv_results_ 

{'mean_fit_time': array([0.00091338, 0.00044127, 0.00019999, 0.00060315, 0.00070224,
        0.00080128, 0.0005034 , 0.00043597, 0.00028062, 0.00090742,
        0.00080442, 0.00010071, 0.00083923, 0.0010426 , 0.00050297,
        0.00050268, 0.00103645]),
 'std_fit_time': array([4.93963389e-04, 5.44385030e-04, 3.99971008e-04, 3.75074465e-04,
        4.00506273e-04, 4.00653421e-04, 4.48798302e-04, 5.36980329e-04,
        5.61237335e-04, 4.95846199e-04, 4.02218319e-04, 2.01416016e-04,
        4.25522331e-04, 7.31824654e-05, 4.48369364e-04, 4.48790095e-04,
        5.63792531e-04]),
 'mean_score_time': array([0.00279713, 0.00261827, 0.0033288 , 0.00243697, 0.00264492,
        0.00251231, 0.00270472, 0.00271692, 0.00317373, 0.00281405,
        0.00304308, 0.00288553, 0.00308347, 0.00249538, 0.00293961,
        0.00382042, 0.00274982]),
 'std_score_time': array([0.00037738, 0.00042527, 0.00033839, 0.00033931, 0.00037732,
        0.00030033, 0.00067629, 0.00068278, 0.00034636, 0.00060816,
    