In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier


### Read & Split Data

In [3]:
# Forward slashes avoid the invalid "\." / "\d" escape sequences of the old
# backslash literal and resolve on Windows, Linux and macOS alike.
DATA_PATH = '../../../data/diabetes/diabetes.csv'

data = pd.read_csv(DATA_PATH)
y = data.pop('Outcome')  # removes the label column, leaving only the features in `data`

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.to_numpy(), y.to_numpy(), test_size=0.2, random_state=3
)

# Sanity check: every row ended up in exactly one of the two splits.
assert len(X_train) + len(X_test) == len(data)

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\data\\diabetes\\diabetes.csv'

### Model

In [3]:
# Baseline classifier: 3 nearest neighbours with the Minkowski metric,
# fitted on the training split only.
knn_params = {"n_neighbors": 3, "metric": "minkowski"}
model = KNeighborsClassifier(**knn_params)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [4]:
# Mean accuracy of the fitted classifier on the held-out test split.
model.score(X_test, y_test)

0.6818181818181818

In [5]:
# Predicted class labels for the test split; reused by the metric cells below.
predictions = model.predict(X_test)

In [6]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[75, 17],
       [32, 30]], dtype=int64)

In [7]:
def get_metrics(y, y_hat):
    """Print accuracy, precision, recall and F1 of predictions against true labels.

    Parameters
    ----------
    y : true labels.
    y_hat : predicted labels, aligned with ``y``.

    Returns
    -------
    None. The four scores are written to stdout, one per line.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("f1_score", f1_score),
    )
    # Build the whole report before printing so nothing is emitted if a scorer fails.
    report = "\n".join(f"{label}: {scorer(y, y_hat)}" for label, scorer in scorers)
    print(report)


get_metrics(y_test, predictions)


Accuracy: 0.6818181818181818
Precision: 0.6382978723404256
Recall: 0.4838709677419355
f1_score: 0.5504587155963303


### Cross-Validation

In [10]:
# 5-fold cross-validation of the same 3-NN configuration, run on the training
# split only so the test split stays untouched for the final evaluation.
knn_cv = KNeighborsClassifier(
    n_neighbors=3,
    metric="minkowski"
)
cv_scores = cross_val_score(estimator=knn_cv, X=X_train, y=y_train, cv=5)

# One score per fold, followed by their mean.
print(cv_scores)
print(f'Mean of CV scores: {np.mean(cv_scores)}')

[0.71544715 0.70731707 0.70731707 0.69105691 0.71311475]
Mean of CV scores: 0.7068505930960949


### Grid Search CV

In [16]:
# Grid search over the number of neighbours k = 3..19 (np.arange excludes the
# upper bound), scoring each candidate with 5-fold cross-validation.
knn_classifier = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(3, 20)}
knn_gscv = GridSearchCV(knn_classifier, param_grid, cv=5)

In [17]:
# Run the search on the training split; every candidate k is cross-validated here.
knn_gscv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])})

In [22]:
# NOTE(review): this rebinds `model`, replacing the 3-NN classifier fitted earlier.
# Any earlier cell that uses `model` will see the tuned estimator if re-run after this one.
model = knn_gscv.best_estimator_
model

KNeighborsClassifier(n_neighbors=12)

In [20]:
# Mean cross-validated score of the best parameter setting found by the search.
knn_gscv.best_score_

0.7540317206450753

In [24]:
# Mean accuracy of the tuned estimator on the held-out test split.
model.score(X_test, y_test)

0.7077922077922078