In [77]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn import metrics

In [None]:
# Load the Semeion data set straight from the UCI repository.
# The file is space-separated and has no header row, so pandas assigns
# integer column labels (0, 1, 2, ...), which the later cells slice on.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data",
    header=None,
    sep=' ',
)
# Preview the first rows only; the frame is too wide to dump in full.
data.head(10)

In [45]:
# Features: columns 0..255 (reshaped to a 16x16 image further down).
handwritten_digits = data.loc[:, 0:255]

# Labels: columns 256..266 are treated as per-class indicator columns.
# The label of a row is the column holding its maximum, shifted back
# into a small integer by taking the column label modulo 256.
result_digits = data.loc[:, 256:266].idxmax(axis=1) % 256

In [70]:
# Sanity check: draw one random row and show its label next to its image.
# randint is called with size 1, so `i` is a length-1 array and the .loc
# lookups below return one-row selections (hence the bracketed print).
i = np.random.randint(0, len(handwritten_digits), 1)
sample_label = result_digits.loc[i].values
sample_image = handwritten_digits.loc[i].values.reshape((16, 16))

print(sample_label)
plt.imshow(sample_image, cmap="Set1")
plt.show()

[2]


In [84]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps
# the split identical across re-runs.
hw_train, hw_test, res_train, res_test = train_test_split(
    handwritten_digits,
    result_digits,
    test_size=0.3,
    random_state=53,
)

# Baseline model: k-nearest neighbours with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and fitting are chained.
knn_cl = KNeighborsClassifier().fit(hw_train, res_train)
res_test_pred = knn_cl.predict(hw_test)

[0 1 2 3 0 9 6 0 1 2 4 9 1 5 6 8 7 3 3 3 3 6 8 3 1 3 5 9 2 7 5 1 8 8 2 2 2
 8 2 7 3 4 2 0 0 2 3 5 8 2 6 2 6 2 7 0 6 2 9 0 1 8 1 8 0 5 6 4 4 9 3 1 9 1
 9 0 2 4 1 1 8 8 3 9 4 7 1 6 1 7 5 4 4 3 4 5 8 4 0 4 5 0 0 2 7 0 7 2 6 8 1
 6 4 2 4 9 1 3 6 1 3 1 5 3 8 1 8 1 0 3 3 8 2 9 7 2 6 5 5 1 6 3 4 0 0 7 6 2
 2 8 6 1 7 3 3 9 9 6 6 5 5 3 8 4 8 9 0 8 5 3 1 0 5 7 5 9 3 0 6 3 6 0 1 1 3
 1 6 3 2 0 6 2 4 4 4 2 6 3 0 4 0 7 6 3 8 0 3 9 5 1 5 5 7 5 6 5 6 2 1 9 7 1
 8 2 6 4 9 5 1 3 3 6 9 3 1 7 2 8 1 1 8 0 5 3 1 5 4 2 7 4 5 1 2 7 3 2 0 2 4
 3 2 3 9 5 0 3 8 0 5 4 2 0 4 2 4 4 9 1 7 3 1 6 5 3 6 9 2 7 5 3 9 3 2 2 6 1
 3 5 9 6 0 3 9 9 7 7 5 7 6 5 7 1 8 7 2 4 2 6 8 1 5 1 3 2 7 8 7 0 8 2 9 0 8
 5 9 7 9 5 7 5 9 3 8 5 9 6 5 1 3 7 2 8 1 1 5 8 4 8 7 5 7 6 6 7 2 2 6 4 7 4
 4 9 3 6 1 4 5 1 2 5 2 7 8 0 2 8 3 1 9 2 4 2 4 0 9 6 3 0 0 3 8 6 4 0 8 4 3
 3 7 5 4 3 3 0 5 1 2 6 9 9 2 4 7 1 0 3 9 0 2 6 4 6 7 4 9 2 4 7 6 0 0 9 7 2
 0 5 8 0 5 5 3 3 8 5 4 4 1 2 1 2 9 1 1 5 2 4 4 1 0 6 3 2 2 3 2 0 8 8]


In [85]:
# Accuracy of the baseline k-NN predictions on the held-out test split.
metrics.accuracy_score(y_true=res_test, y_pred=res_test_pred)

0.899581589958159

In [88]:
# Per-class precision / recall / F1 for the baseline model. The report is a
# pre-formatted string, so it is printed rather than shown as a repr.
print(metrics.classification_report(y_true=res_test, y_pred=res_test_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        42
           1       0.74      1.00      0.85        39
           2       0.90      0.96      0.93        56
           3       0.83      0.96      0.89        51
           4       1.00      0.92      0.96        50
           5       0.86      0.89      0.88        47
           6       0.93      0.93      0.93        45
           7       0.95      0.86      0.90        44
           8       0.98      0.75      0.85        53
           9       0.97      0.75      0.84        51

   micro avg       0.90      0.90      0.90       478
   macro avg       0.91      0.90      0.90       478
weighted avg       0.91      0.90      0.90       478



In [89]:
# Hyperparameter grid for k-NN:
#   * n_neighbors: 7, 14, ..., 49 (range(7, 56, 7))
#   * weights:     plain majority vote vs. distance-weighted vote
#   * metric:      Euclidean plus two further distance names
#                  (NOTE(review): rogerstanimoto / russellrao are documented
#                  for boolean vectors; confirm the pixel data is strictly 0/1)
knn_params = {
    'n_neighbors': range(7, 56, 7),
    'weights': ["uniform", "distance"],
    'metric': ['euclidean', "rogerstanimoto", "russellrao"],
}

# Exhaustive search over the grid, scored by accuracy with 10-fold
# cross-validation on the training split only.
knn_grid = GridSearchCV(
    estimator=knn_cl,
    param_grid=knn_params,
    scoring='accuracy',
    cv=10,
)
knn_grid.fit(hw_train, res_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': range(7, 56, 7), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'rogerstanimoto', 'russellrao']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [90]:
# Hyperparameter combination that the grid search selected.
knn_grid.best_params_

{'metric': 'rogerstanimoto', 'n_neighbors': 7, 'weights': 'distance'}

In [91]:
# Score achieved by the selected combination; the search above was configured
# with scoring='accuracy' and cv=10, so this is a cross-validated accuracy on
# the training split, not a test-set figure.
knn_grid.best_score_

0.9013452914798207

In [92]:
# Estimator configured with the selected hyperparameters, as exposed by the
# fitted grid search.
knn_grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='rogerstanimoto',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='distance')

In [93]:
# Build the tuned classifier directly from the grid search result instead of
# a hand-copied repr, so this cell stays in sync whenever the grid, the data
# or the CV settings change. Only metric / n_neighbors / weights are searched;
# every other KNeighborsClassifier argument keeps its default value.
knn_best_param = KNeighborsClassifier(**knn_grid.best_params_)
knn_best_param.fit(hw_train, res_train)

# Predictions of the tuned model on the held-out test split.
res_test_best = knn_best_param.predict(hw_test)

In [94]:
# Accuracy of the tuned k-NN on the same held-out test split, for comparison
# with the baseline score.
metrics.accuracy_score(y_true=res_test, y_pred=res_test_best)

0.9100418410041841