In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, # 1000 observations
    n_features=3, # 3 total features
    n_redundant=1, 
    n_classes=2, # binary target/label
    random_state=999
)

In [5]:
X # 3 features, 1000 data points

array([[-0.33504974,  0.02852654,  1.16193084],
       [-1.37746253, -0.4058213 ,  0.44359618],
       [-1.04520026, -0.72334759, -3.10470423],
       ...,
       [-0.75602574, -0.51816111, -2.20382324],
       [ 0.56066316, -0.07335845, -2.15660348],
       [-1.87521902, -1.11380394, -4.04620773]])

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
classifier=KNeighborsClassifier(n_neighbors=5,algorithm='auto') # by default p=2, which is eucledian distance 
classifier.fit(X_train,y_train)

In [8]:
y_pred=classifier.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [11]:
print(confusion_matrix(y_pred,y_test))
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

[[158  20]
 [ 11 141]]
0.906060606060606
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       178
           1       0.88      0.93      0.90       152

    accuracy                           0.91       330
   macro avg       0.91      0.91      0.91       330
weighted avg       0.91      0.91      0.91       330



# Task
# Best K Value
GridSearch CV
for i k=1,2,3,4,5,6,7,8,9,10



In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Define the parameter grid for k values

param_grid = {'n_neighbors': range(1, 11)}

In [14]:
knn = KNeighborsClassifier(algorithm='auto')

In [15]:
# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=knn, 
    param_grid=param_grid, 
    scoring='accuracy',
    cv=5,
    n_jobs=1
)

In [16]:
# Fit GridSearchCV on training data
grid_search.fit(X_train,y_train)

In [17]:
grid_search.best_params_

{'n_neighbors': 9}

In [18]:
grid_search.best_score_

0.9029850746268657

In [25]:
best_k = grid_search.best_params_

In [20]:
best_score = grid_search.best_score_

In [27]:
print(f"Best k: {best_k}")
print(f"Best cross-validated accuracy: {best_score:.4f}")

Best k: {'n_neighbors': 9}
Best cross-validated accuracy: 0.9030
