# KNN Models

## Objective

The objective of this notebook is to train and evaluate KNN models over a range of hyperparameters, using cross-validated grid search, in order to select the best KNN configuration.

## Loading libraries and data

In [3]:
# model library
from LibrasModel import LibrasModel, weighted_accuracy_score, weighted_accuracy_scorer

# model
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import GridSearchCV

# loading data
import pickle
import joblib

# other modules
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [4]:
# Load the training split; it is indexed by "features" and "labels" below.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
data_path = "TrainTestData/train_data.pickle"
with open(data_path, "rb") as data_file:  # context manager closes the file handle
    train_data = pickle.load(data_file)

## Choosing hyperparameters

In [5]:
# Search space for the first grid search.
# Earlier experiments suggested that high values of p give good training
# results, so the grid deliberately includes some high p values.
param_grid = dict(
    n_neighbors=[3, 5, 10],
    p=[1, 2, 3, 5, 7],
)

## Training with Z

In [6]:
# Wrap a default KNN classifier with has_z=True, then build the labels array
# and the features transformed by the wrapper.
model = LibrasModel(KNeighborsClassifier(n_jobs=-1), has_z=True)
y = np.array(train_data["labels"])
X = model.transform_data(np.array(train_data["features"]))

In [7]:
# Workaround: with X as a NumPy array, the grid search hit bugs for p != 2.
# Passing a plain list avoids them, at the cost of slower training.
X = list(X)
gd = GridSearchCV(
    estimator=model.model,
    param_grid=param_grid,
    scoring=weighted_accuracy_scorer,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gd.fit(X, y)

In [8]:
# Rank every configuration by mean cross-validated score, best first.
# Sort on the score only: comparing (score, params) tuples would fall back to
# comparing the params dicts when two scores tie, which raises TypeError.
cvres = gd.cv_results_
results = sorted(
    zip(cvres["mean_test_score"], cvres["params"]),
    key=lambda scored: scored[0],
    reverse=True,
)
for mean_score, params in results:
    print(mean_score, params)

0.9338306632247972 {'n_neighbors': 3, 'p': 7}
0.9305443887598711 {'n_neighbors': 3, 'p': 5}
0.9290606782802687 {'n_neighbors': 5, 'p': 7}
0.9277149697992764 {'n_neighbors': 3, 'p': 3}
0.9259054043728068 {'n_neighbors': 5, 'p': 5}
0.9227771775968108 {'n_neighbors': 5, 'p': 3}
0.9160117261844836 {'n_neighbors': 10, 'p': 7}
0.9155764213849448 {'n_neighbors': 3, 'p': 2}
0.9112703867233213 {'n_neighbors': 10, 'p': 5}
0.9086503022374846 {'n_neighbors': 5, 'p': 2}
0.9005575899575111 {'n_neighbors': 10, 'p': 3}
0.8962958252790708 {'n_neighbors': 3, 'p': 1}
0.8878004472945351 {'n_neighbors': 10, 'p': 2}
0.8788956450615434 {'n_neighbors': 5, 'p': 1}
0.854605685125672 {'n_neighbors': 10, 'p': 1}


## Training without Z

In [9]:
# Wrap a default KNN classifier with has_z=False, then build the labels array
# and the features transformed by the wrapper.
model = LibrasModel(KNeighborsClassifier(n_jobs=-1), has_z=False)
y = np.array(train_data["labels"])
X = model.transform_data(np.array(train_data["features"]))

In [10]:
# Workaround: with X as a NumPy array, the grid search hit bugs for p != 2.
# Passing a plain list avoids them, at the cost of slower training.
X = list(X)
gd = GridSearchCV(
    estimator=model.model,
    param_grid=param_grid,
    scoring=weighted_accuracy_scorer,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gd.fit(X, y)

In [11]:
# Rank every configuration by mean cross-validated score, best first.
# Sort on the score only: comparing (score, params) tuples would fall back to
# comparing the params dicts when two scores tie, which raises TypeError.
cvres = gd.cv_results_
results = sorted(
    zip(cvres["mean_test_score"], cvres["params"]),
    key=lambda scored: scored[0],
    reverse=True,
)
for mean_score, params in results:
    print(mean_score, params)

0.9357541980492717 {'n_neighbors': 3, 'p': 7}
0.9321117465513563 {'n_neighbors': 3, 'p': 5}
0.9311820133965669 {'n_neighbors': 3, 'p': 3}
0.9309471369079126 {'n_neighbors': 5, 'p': 7}
0.9306726209484755 {'n_neighbors': 5, 'p': 5}
0.9236130231967048 {'n_neighbors': 3, 'p': 2}
0.9225791514531556 {'n_neighbors': 5, 'p': 3}
0.9174779460686651 {'n_neighbors': 10, 'p': 7}
0.9138522237925659 {'n_neighbors': 10, 'p': 5}
0.9130389054007816 {'n_neighbors': 5, 'p': 2}
0.9051467980373777 {'n_neighbors': 10, 'p': 3}
0.9044444821436926 {'n_neighbors': 3, 'p': 1}
0.893116367939645 {'n_neighbors': 5, 'p': 1}
0.891548333623638 {'n_neighbors': 10, 'p': 2}
0.8671805853957819 {'n_neighbors': 10, 'p': 1}


## Fine-tuning the search

The best model so far used n_neighbors = 3, p = 7 and no Z dimension. We will try to improve it by fine-tuning the search: we add another hyperparameter, the neighbor weighting scheme (`weights`), and also test higher values of p.

In [12]:
# Search space for the second (refined) grid search: higher p values plus
# the neighbor weighting scheme.
param_grid = dict(
    n_neighbors=[3, 5],
    p=[5, 7, 9, 11],
    weights=["uniform", "distance"],
)

In [13]:
# Rebuild the wrapper with has_z=False for the refined search, then build the
# labels array and the features transformed by the wrapper.
model = LibrasModel(KNeighborsClassifier(n_jobs=-1), has_z=False)
y = np.array(train_data["labels"])
X = model.transform_data(np.array(train_data["features"]))

In [14]:
# Convert X to a plain list before fitting, as in the earlier grid searches.
X = list(X)
gd = GridSearchCV(
    estimator=model.model,
    param_grid=param_grid,
    scoring=weighted_accuracy_scorer,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gd.fit(X, y)

In [15]:
# Rank every configuration by mean cross-validated score, best first.
# Sort on the score only: comparing (score, params) tuples would fall back to
# comparing the params dicts when two scores tie, which raises TypeError.
cvres = gd.cv_results_
results = sorted(
    zip(cvres["mean_test_score"], cvres["params"]),
    key=lambda scored: scored[0],
    reverse=True,
)
for mean_score, params in results:
    print(mean_score, params)

0.9448899290719744 {'n_neighbors': 3, 'p': 11, 'weights': 'distance'}
0.9429672822245964 {'n_neighbors': 3, 'p': 9, 'weights': 'distance'}
0.942051561643245 {'n_neighbors': 5, 'p': 9, 'weights': 'distance'}
0.9411207623935803 {'n_neighbors': 3, 'p': 7, 'weights': 'distance'}
0.9398324371170439 {'n_neighbors': 5, 'p': 11, 'weights': 'distance'}
0.9386097308014214 {'n_neighbors': 3, 'p': 11, 'weights': 'uniform'}
0.9379286372132439 {'n_neighbors': 5, 'p': 7, 'weights': 'distance'}
0.937292039941202 {'n_neighbors': 3, 'p': 5, 'weights': 'distance'}
0.9372534842980311 {'n_neighbors': 3, 'p': 9, 'weights': 'uniform'}
0.9370275114989148 {'n_neighbors': 5, 'p': 5, 'weights': 'distance'}
0.9357541980492717 {'n_neighbors': 3, 'p': 7, 'weights': 'uniform'}
0.9331724630807257 {'n_neighbors': 5, 'p': 9, 'weights': 'uniform'}
0.9321117465513563 {'n_neighbors': 3, 'p': 5, 'weights': 'uniform'}
0.9313412161877134 {'n_neighbors': 5, 'p': 11, 'weights': 'uniform'}
0.9309471369079126 {'n_neighbors': 5, 'p': 7, 'weights': 'uniform'}
0.9306726209484755 {'n_neighbors': 5, 'p': 5, 'weights': 'uniform'}

Since the best score is still obtained with the highest p value tested, we will run one last, narrower fine-tuning search around it.

In [16]:
# Search space for the third and final grid search: only p varies,
# around the best value found so far.
param_grid = dict(
    n_neighbors=[3],
    p=[11, 12, 13],
    weights=["distance"],
)

In [17]:
# Rebuild the wrapper with has_z=False for the final search, then build the
# labels array and the features transformed by the wrapper.
model = LibrasModel(KNeighborsClassifier(n_jobs=-1), has_z=False)
y = np.array(train_data["labels"])
X = model.transform_data(np.array(train_data["features"]))

In [18]:
# Convert X to a plain list before fitting, as in the earlier grid searches.
X = list(X)
gd = GridSearchCV(
    estimator=model.model,
    param_grid=param_grid,
    scoring=weighted_accuracy_scorer,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gd.fit(X, y)

In [19]:
# Rank every configuration by mean cross-validated score, best first.
# Sort on the score only: comparing (score, params) tuples would fall back to
# comparing the params dicts when two scores tie, which raises TypeError.
cvres = gd.cv_results_
results = sorted(
    zip(cvres["mean_test_score"], cvres["params"]),
    key=lambda scored: scored[0],
    reverse=True,
)
for mean_score, params in results:
    print(mean_score, params)

0.9450450900363376 {'n_neighbors': 3, 'p': 12, 'weights': 'distance'}
0.9448899290719744 {'n_neighbors': 3, 'p': 11, 'weights': 'distance'}
0.9448371426671001 {'n_neighbors': 3, 'p': 13, 'weights': 'distance'}


Since the best model in grid search didn't use the highest p value available, we will stop the search here. However, it is interesting to see that very high values of p managed to get really good results, and doing a deeper dive into this could be useful.

### Analysing Performance in all metrics

In [23]:
from time import time

In [20]:
def print_metrics(base_model, data, has_z=False):
    """Print cross-validated accuracy and prediction latency for a model.

    Wraps ``base_model`` in a ``LibrasModel``, fits it on ``data`` and prints:
    the mean 5-fold weighted accuracy, the mean 5-fold plain accuracy, and the
    average wall-clock time per sample of one ``predict`` call over ``data``.

    Parameters
    ----------
    base_model : estimator
        Estimator handed to ``LibrasModel``.
    data : mapping
        Must provide ``"features"`` and ``"labels"`` entries.
    has_z : bool, default False
        Forwarded to ``LibrasModel``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    model = LibrasModel(base_model, has_z=has_z)

    X = np.array(data["features"])
    X_transformed = model.transform_data(X)
    y = np.array(data["labels"])

    # NOTE(review): fit/predict receive the raw X while cross-validation uses
    # X_transformed -- presumably LibrasModel transforms internally; confirm.
    model.fit(X, y)

    # Cross-validate the wrapped estimator directly on the transformed features.
    acc_w = cross_val_score(model.model, X_transformed, y, scoring=weighted_accuracy_scorer, cv=5)
    acc = cross_val_score(model.model, X_transformed, y, scoring="accuracy", cv=5)

    # Time a single batch prediction over the same data the model was fitted on.
    t = time()
    model.predict(X)
    t = time() - t

    print(f"Weighted Accuracy: {round(100 * np.mean(acc_w), 2)}%")
    print(f"Accuracy: {round(100 * np.mean(acc), 2)}%")
    print(f"Time per prediction: {1000 * t / len(y)} ms")

In [21]:
# The three configurations from the last grid search, in ranking order
# (p = 12, 11, 13); everything except p is shared.
model1, model2, model3 = (
    KNeighborsClassifier(n_neighbors=3, weights="distance", p=p_value, n_jobs=-1)
    for p_value in (12, 11, 13)
)

In [24]:
# Evaluate the p=12 model; has_z is spelled out (same as the default).
print_metrics(base_model=model1, data=train_data, has_z=False)

Weighted Accuracy: 94.5%
Accuracy: 93.94%
Time per prediction: 1.044890284538269 ms


In [25]:
# Evaluate the p=11 model; has_z is spelled out (same as the default).
print_metrics(base_model=model2, data=train_data, has_z=False)

Weighted Accuracy: 94.49%
Accuracy: 93.92%
Time per prediction: 1.0657704082028618 ms


In [26]:
# Evaluate the p=13 model; has_z is spelled out (same as the default).
print_metrics(base_model=model3, data=train_data, has_z=False)

Weighted Accuracy: 94.48%
Accuracy: 93.9%
Time per prediction: 1.0220191602049202 ms


## Conclusion

The KNN models achieve good accuracy, but their prediction time is not very good (about 1 ms per prediction). Therefore, we will only consider the best model for testing.