In [None]:
import random
import numpy as np
from statistics import mean
import matplotlib.pyplot as plt
import time
import sys
from scipy.spatial import distance
from operator import itemgetter
%matplotlib inline

In [None]:
# Parse the comma-separated MNIST training file into a float array
# (genfromtxt yields NaN for any field it cannot parse as a number).
train = np.genfromtxt("datasets/MNIST_train.csv", delimiter=",")

In [None]:
# Column 0 is used as the class label; every remaining column is a feature.
y_train = train[:, 0]
X_train = train[:, 1:]

### (e) Cross-validation on the complete dataset to tune k for k-nearest neighbors

In [None]:
def cross_val_score_large_dataset(train_X, train_y, cv=5, k=5, p=7, batch_numbers=20):
    """Score a brute-force k-NN classifier with ``cv``-fold cross validation.

    The pairwise distance matrix is never materialised in full: each fold's
    validation and training indices are split into ``batch_numbers`` chunks,
    the ``k`` nearest candidates are kept per (validation chunk, training
    chunk) pair, and the candidates are merged to obtain the overall ``k``
    nearest neighbours of every validation sample.

    Parameters
    ----------
    train_X : ndarray
        Feature matrix, indexed by row (one sample per row).
    train_y : ndarray
        1-D array of labels aligned with the rows of ``train_X``. Labels are
        cast to ``int`` and counted with ``np.bincount``, so they must be
        non-negative integers (possibly stored as floats).
    cv : int
        Number of folds. Each fold holds ``len(train_y) // cv`` samples;
        leftover samples are only ever used for training.
    k : int
        Number of neighbours that vote. Vote ties go to the smallest label.
    p : float
        Order of the Minkowski distance.
    batch_numbers : int
        Number of chunks the validation and training indices are split into.

    Returns
    -------
    list of float
        Validation accuracy (fraction in [0, 1]) of each fold, in fold order.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 2 or larger than the number of samples.
    """
    n_samples = train_y.shape[0]
    if cv < 2 or cv > n_samples:
        raise ValueError(
            f"cv must be between 2 and the number of samples ({n_samples}), got {cv}")

    fold_size = n_samples // cv
    indices = np.arange(n_samples)
    # Fixed seed: every call (and therefore every k) is scored on the same folds.
    np.random.RandomState(123).shuffle(indices)

    results = []
    for fold in range(cv):
        fold_start, fold_stop = fold * fold_size, (fold + 1) * fold_size
        validation_idx = indices[fold_start:fold_stop]
        train_idx = np.concatenate((indices[:fold_start], indices[fold_stop:]))
        train_batches = np.array_split(train_idx, batch_numbers)
        validation_batches = np.array_split(validation_idx, batch_numbers)
        validation_y_pred = []
        print("start:",fold,"th cross validation")
        for u, val_batch_idx in enumerate(validation_batches):
            print("start:",u,"th validation batch")
            if val_batch_idx.size == 0:
                # array_split yields empty chunks when the fold is smaller
                # than batch_numbers; there is nothing to predict for them.
                continue

            candidate_distances, candidate_labels = [], []
            for train_batch_idx in train_batches:
                if train_batch_idx.size == 0:
                    continue
                # `p` must be passed by keyword: cdist accepts nothing
                # positional after `metric`.
                distances = distance.cdist(
                    train_X[val_batch_idx], train_X[train_batch_idx], 'minkowski', p=p)
                # (validation_batch, <=k) column positions of the closest
                # training samples inside this training chunk.
                k_indices = np.argsort(distances)[:, :k]
                candidate_distances.append(
                    np.take_along_axis(distances, k_indices, axis=1))
                candidate_labels.append(
                    train_y[train_batch_idx[k_indices]].astype(int))

            # Merge the per-chunk candidates and keep the k closest overall.
            # A stable sort keeps chunk order among equal distances.
            merged_distances = np.hstack(candidate_distances)
            merged_labels = np.hstack(candidate_labels)
            nearest = np.argsort(merged_distances, axis=1, kind='stable')[:, :k]
            for votes in np.take_along_axis(merged_labels, nearest, axis=1):
                validation_y_pred.append(np.argmax(np.bincount(votes)))

        n_correct = int(np.sum(np.asarray(validation_y_pred) == train_y[validation_idx]))
        accuracy = n_correct / validation_idx.shape[0]
        print(f"{fold}th cross validation accuracy:{accuracy:.3%}\n")
        results.append(accuracy)
    return results

In [None]:
# Candidate neighbour counts to evaluate: the integers 1 through 20.
k_neighbors = np.arange(1, 21, dtype='int')

In [None]:
# Score every candidate k with 10-fold cross validation (Minkowski p=7),
# remembering the first k that reaches the highest mean accuracy.
results = []
bn, bs = 0, 0
for k in k_neighbors:
    print("start validate", k," neighbors")
    acc = mean(cross_val_score_large_dataset(X_train, y_train, cv=10, k=k, p=7))
    results.append(acc)
    print(f"\nfinished, mean accuracy:{acc:.3%}")
    if acc > bs:
        bn, bs = k, acc

# Plot mean accuracy against k and annotate the best point.
fig, ax = plt.subplots(figsize=(15, 4))
ax.set_title("k-NN")
ax.set_xticks(k_neighbors)
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Average Accuracy")
ax.text(bn, bs, f'Neighbor:{bn}, Score:{bs:.3%}')
ax.plot(k_neighbors, results)
plt.show()