In [88]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Reading in data

In [5]:
D = pd.read_csv("meta-dataset.csv")
D.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  226 non-null    int64  
 1   beta_0      226 non-null    float64
 2   beta_1      226 non-null    float64
 3   beta_2      226 non-null    float64
 4   beta_3      226 non-null    float64
 5   beta_4      226 non-null    float64
 6   beta_5      226 non-null    float64
 7   beta_6      226 non-null    float64
 8   beta_7      226 non-null    float64
 9   label       226 non-null    int64  
dtypes: float64(8), int64(2)
memory usage: 17.8 KB


In [27]:
X = D.drop(columns=['label', 'Unnamed: 0'])
y = D['label']
X.info()
y.info()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   beta_0  226 non-null    float64
 1   beta_1  226 non-null    float64
 2   beta_2  226 non-null    float64
 3   beta_3  226 non-null    float64
 4   beta_4  226 non-null    float64
 5   beta_5  226 non-null    float64
 6   beta_6  226 non-null    float64
 7   beta_7  226 non-null    float64
dtypes: float64(8)
memory usage: 14.2 KB
<class 'pandas.core.series.Series'>
RangeIndex: 226 entries, 0 to 225
Series name: label
Non-Null Count  Dtype
--------------  -----
226 non-null    int64
dtypes: int64(1)
memory usage: 1.9 KB


# Train data with kNN modle of different parameters

In [94]:
grid_params = { 'n_neighbors' : list(range(3, 70)),
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan', 'haversine', 'cosine']}
gs = GridSearchCV(KNeighborsClassifier(), grid_params, scoring = 'accuracy', cv=5, n_jobs = -1)

In [95]:
g_res = gs.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 234, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/Users/zijin/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 824, in kneighbors
    results = ArgKmin.c

In [103]:
results = pd.DataFrame(g_res.cv_results_)
results.to_excel('KNN test results.xlsx')

Manual evaluation of top few results of GridSearchCV (based on highest mean[first 4] and lowest std[last 2])

In [113]:
classifiers = {KNeighborsClassifier(n_neighbors = 45, weights = 'distance',metric = 'manhattan'),
                KNeighborsClassifier(n_neighbors = 52, weights = 'distance',metric = 'minkowski'),
                KNeighborsClassifier(n_neighbors = 52, weights = 'distance',metric = 'euclidean'),
                KNeighborsClassifier(n_neighbors = 55, weights = 'distance',metric = 'manhattan'),
                KNeighborsClassifier(n_neighbors = 29, weights = 'distance',metric = 'manhattan'),
                KNeighborsClassifier(n_neighbors = 34, weights = 'distance',metric = 'cosine'),
                }
cv = RepeatedStratifiedKFold(n_repeats = 10, n_splits=10, random_state=1)
for knn_classifier in classifiers:
    y_train_scores = cross_val_score(knn_classifier, X_train, y_train, cv=cv, n_jobs=-1)
    knn_classifier.fit(X_train, y_train)
    y_test_predict = knn_classifier.predict(X_test)
    y_test_score = metrics.accuracy_score(y_test, y_test_predict)
    print('For classifier: ', knn_classifier._check_feature_names)
    print('Training set score:', np.mean(y_train_scores))
    print('Testing set score:',y_test_score)

For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(metric='manhattan', n_neighbors=29, weights='distance')>
Training set score: 0.6272058823529413
Testing set score: 0.7719298245614035
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(metric='manhattan', n_neighbors=55, weights='distance')>
Training set score: 0.6454779411764707
Testing set score: 0.8070175438596491
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(metric='manhattan', n_neighbors=45, weights='distance')>
Training set score: 0.6407352941176471
Testing set score: 0.7894736842105263
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(metric='cosine', n_neighbors=34, weights='distance')>
Training set score: 0.6248529411764706
Testing set score: 0.7192982456140351
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(n_neighbors=52, w

More fine-tuned testing with top 4 k values of metric = minkowski

In [114]:
classifiers = {KNeighborsClassifier(n_neighbors = 49, weights = 'distance',metric = 'minkowski'),
                KNeighborsClassifier(n_neighbors = 51, weights = 'distance',metric = 'minkowski'),
                KNeighborsClassifier(n_neighbors = 52, weights = 'distance',metric = 'minkowski'),
                KNeighborsClassifier(n_neighbors = 54, weights = 'distance',metric = 'minkowski'),
                }
cv = RepeatedStratifiedKFold(n_repeats = 10, n_splits=10, random_state=1)
for knn_classifier in classifiers:
    y_train_scores = cross_val_score(knn_classifier, X_train, y_train, cv=cv, n_jobs=-1)
    knn_classifier.fit(X_train, y_train)
    y_test_predict = knn_classifier.predict(X_test)
    y_test_score = metrics.accuracy_score(y_test, y_test_predict)
    print('For classifier: ', knn_classifier._check_feature_names)
    print('Training set score:', np.mean(y_train_scores))
    print('Testing set score:',y_test_score)

For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(n_neighbors=54, weights='distance')>
Training set score: 0.6461029411764706
Testing set score: 0.7894736842105263
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(n_neighbors=51, weights='distance')>
Training set score: 0.6478676470588236
Testing set score: 0.8070175438596491
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(n_neighbors=52, weights='distance')>
Training set score: 0.6466911764705883
Testing set score: 0.8070175438596491
For classifier:  <bound method BaseEstimator._check_feature_names of KNeighborsClassifier(n_neighbors=49, weights='distance')>
Training set score: 0.6461029411764707
Testing set score: 0.8070175438596491


Thus, the parameters choosen would be: n_neighbors = 52, weights = 'distance',metric = 'minkowski'

# Accuracy evaluation

In [107]:
cv = RepeatedStratifiedKFold(n_repeats = 10, n_splits=10, random_state=1)
knn = KNeighborsClassifier(n_neighbors = 52, weights = 'distance',metric = 'minkowski')

In [108]:
y_train_scores = cross_val_score(knn, X_train, y_train, cv=cv, n_jobs=-1)
knn.fit(X_train, y_train)
y_test_predict = knn.predict(X_test)
y_test_score = metrics.accuracy_score(y_test, y_test_predict)
print('Training set score:', np.mean(y_train_scores))
print('Testing set score:',y_test_score)

Training set score: 0.6478676470588236
Testing set score: 0.8070175438596491


## Reference:
##### https://www.kaggle.com/code/arunimsamudra/k-nn-with-hyperparameter-tuning
##### https://medium.datadriveninvestor.com/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f
##### https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
##### https://towardsdatascience.com/its-a-mistake-to-trust-the-best-model-of-a-gridsearchcv-536a73e835ad