In [1]:
import numpy as np
from random import randrange

In [34]:
class KNearestKneighbors():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Typical use::

        knn = KNearestKneighbors(k=7)
        knn.train(X_train, y_train)
        y_pred = knn.predict(X_test)

    Labels must be non-negative integers (or floats holding such values),
    because the majority vote is computed with ``np.bincount``.
    """

    def __init__(self, k):
        """Store the neighbourhood size.

        Args:
            k: Number of nearest training samples that vote on each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1 (an empty vote has no winner).
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def train(self, X, y):
        """Memorise the training set; no fitting is performed.

        Args:
            X: 2-D array-like, one training sample per row.
            y: 1-D array-like with one label per row of ``X``.
        """
        # np.asarray is a no-op for ndarrays and lets plain lists be indexed
        # with index arrays later on.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: 2-D array-like, one sample per row, with the same number
                of columns as the training data.

        Returns:
            1-D float array holding the predicted label of each test row.
        """
        distances = self.compute_distance(X_test)
        return self.predict_labels(distances)

    def compute_distance(self, X_test):
        """Return the Euclidean distance between every test and training row.

        Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
        matrix is produced by a single matrix product.

        Args:
            X_test: 2-D array-like, one sample per row.

        Returns:
            Float array where entry ``[i, j]`` is the distance between test
            row ``i`` and training row ``j``.
        """
        # Work in float64: integer inputs (e.g. uint8 pixels) would otherwise
        # overflow when squared or multiplied.
        X_test = np.asarray(X_test, dtype=np.float64)
        X_train = np.asarray(self.X_train, dtype=np.float64)

        test_sq = np.sum(X_test**2, axis=1, keepdims=True)
        train_sq = np.sum(X_train**2, axis=1, keepdims=True).T
        sq_distances = test_sq + train_sq - 2*np.dot(X_test, X_train.T)

        # Rounding can push the squared distance of (near-)identical points
        # slightly below zero; sqrt would turn that into NaN, and NaN sorts
        # last, i.e. the closest point would be ranked as the farthest.
        np.maximum(sq_distances, 0, out=sq_distances)
        return np.sqrt(sq_distances)

    def predict_labels(self, distances):
        """Majority-vote the labels of the ``k`` closest training samples.

        Args:
            distances: 2-D array with one row per test sample and one column
                per training sample, as returned by ``compute_distance``.

        Returns:
            1-D float array with one predicted label per row of ``distances``.
            When several labels tie, the smallest label wins. If ``k`` exceeds
            the number of training samples, all of them vote.
        """
        n_test = distances.shape[0]
        y_pred = np.zeros(n_test)

        for i in range(n_test):
            y_index = np.argsort(distances[i, :])
            k_closests = self.y_train[y_index[:self.k]].astype(int)
            # bincount counts votes per label; argmax returns the first
            # (smallest) label among those with the highest count.
            y_pred[i] = np.argmax(np.bincount(k_closests))
        return y_pred

In [None]:
def cross_validation(data, folds=5):
    """Randomly split the rows of ``data`` into ``folds`` disjoint folds.

    Rows are drawn without replacement, so no row appears in more than one
    fold. Every fold holds ``len(data) // folds`` rows; when the row count is
    not divisible by ``folds`` the leftover rows are not assigned to any fold.
    ``data`` itself is not modified.

    Args:
        data: 2-D table of samples, one per row. Either a pandas DataFrame or
            anything ``np.asarray`` can turn into a 2-D array.
        folds: Number of folds to create.

    Returns:
        List of ``folds`` NumPy arrays, each containing the rows of one fold.
    """
    # Work on a plain array so DataFrames and ndarrays are handled alike.
    rows = data.to_numpy() if hasattr(data, "to_numpy") else np.asarray(data)
    fold_size = rows.shape[0] // folds

    # Pool of row positions that have not been assigned to a fold yet.
    remaining = list(range(rows.shape[0]))

    k_sets = []
    for _ in range(folds):
        fold = []
        while len(fold) < fold_size:
            # Draw from what is left, then remove the drawn position in O(1)
            # by swapping it with the last element before popping.
            pick = randrange(len(remaining))
            remaining[pick], remaining[-1] = remaining[-1], remaining[pick]
            fold.append(rows[remaining.pop()].tolist())
        k_sets.append(np.asarray(fold))
    return k_sets

In [20]:
# Read the small MNIST train and test splits from comma-separated files
# into 2-D float arrays (paths are relative to the notebook's directory).
train_small = np.genfromtxt('datasets/MNIST_train_small.csv', delimiter=',')
test_small = np.genfromtxt(
    'datasets/MNIST_test_small.csv',
    delimiter=',',
)

In [21]:
# Column 0 is used as the label vector; all remaining columns are the features.
y_train_small, X_train_small = train_small[:, 0], train_small[:, 1:]
y_test_small, X_test_small = test_small[:, 0], test_small[:, 1:]

In [35]:
# Build a classifier that votes among the 7 nearest neighbours and hand it
# the training split (training only stores the data).
KNN = KNearestKneighbors(k=7)
KNN.train(X=X_train_small, y=y_train_small)

In [36]:
# Classify the test split and report the fraction of labels predicted correctly.
y_pred = KNN.predict(X_test_small)
print(f'Accuracy: {np.mean(y_pred == y_test_small)}')

Accuracy: 0.914
