# KNN

#### The k-nearest neighbors (KNN) algorithm is a non-parametric, supervised learning classifier, which uses proximity to make classifications or predictions about the grouping of an individual data point.

## Import Libraries

In [1]:
import optuna
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [2]:
mnist_train = pd.read_csv("./MNIST_training.csv")
mnist_test = pd.read_csv("./MNIST_test.csv")
mnist_train

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
945,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
946,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
947,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split X, y

In [3]:
x_train = mnist_train.drop(columns='label')
y_train = mnist_train['label']
x_test = mnist_test.drop(columns='label')
y_test = mnist_test['label']

## Check the train and test are divided well

In [4]:
print('train data lebel bincount:', np.bincount(y_train))
print('test data label bincount:', np.bincount(y_test))

train data lebel bincount: [95 95 95 95 95 95 94 95 95 95]
test data label bincount: [5 5 5 5 5 5 5 5 5 5]


## Check KNeighborsClassifier parameters

In [5]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## Parameter optimization using optuna

In [6]:
def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 3, 15, step=2)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'cosine'])

    # Train model
    model = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
    model.fit(x_train, y_train)

    # Evaluate model
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [7]:
study = optuna.create_study(direction='maximize')   # maximize accuracy(return value)
study.optimize(objective, n_trials=50)

print('Best parameters: ', study.best_params)
print('Best scores: ', study.best_value)

[I 2025-03-11 11:05:37,051] A new study created in memory with name: no-name-2f8f3f86-f52e-4eca-8b1d-e318e98bfae6
[I 2025-03-11 11:05:37,138] Trial 0 finished with value: 0.8 and parameters: {'n_neighbors': 11, 'metric': 'manhattan'}. Best is trial 0 with value: 0.8.
[I 2025-03-11 11:05:37,182] Trial 1 finished with value: 0.88 and parameters: {'n_neighbors': 11, 'metric': 'cosine'}. Best is trial 1 with value: 0.88.
[I 2025-03-11 11:05:37,217] Trial 2 finished with value: 0.9 and parameters: {'n_neighbors': 7, 'metric': 'euclidean'}. Best is trial 2 with value: 0.9.
[I 2025-03-11 11:05:37,254] Trial 3 finished with value: 0.84 and parameters: {'n_neighbors': 11, 'metric': 'euclidean'}. Best is trial 2 with value: 0.9.
[I 2025-03-11 11:05:37,365] Trial 4 finished with value: 0.78 and parameters: {'n_neighbors': 15, 'metric': 'manhattan'}. Best is trial 2 with value: 0.9.
[I 2025-03-11 11:05:37,432] Trial 5 finished with value: 0.8 and parameters: {'n_neighbors': 11, 'metric': 'manhatta

Best parameters:  {'n_neighbors': 9, 'metric': 'cosine'}
Best scores:  0.92


## Train best model

In [8]:
best_model = KNeighborsClassifier(**study.best_params)
best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

## Evaluate model performance

In [9]:
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²: ', r2_score(y_test, y_pred))

MAE:  0.34
RMSE:  1.3341664064126333
R²:  0.7842424242424242
