In [9]:
import pandas as pd
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split


def predict(X, y, desired_X, n_neighbors, metric, weights, do_normalize=False):
    """Classify query points by voting among their k nearest training samples.

    Parameters
    ----------
    X : array-like
        Training samples. A 1-D input is treated as a single feature column.
    y : array-like
        Training labels; flattened so that ``y[i]`` is the label of ``X[i]``.
    desired_X : array-like
        Query samples. A 1-D input is treated as a single feature column.
    n_neighbors : int
        Number of nearest neighbours that take part in the vote (>= 1).
    metric : str
        Distance metric name, forwarded to ``compute_distances``.
    weights : {'uniform', 'distance'}
        ``'uniform'`` gives every neighbour one vote; ``'distance'`` weights
        each neighbour's vote by ``1 / (distance + 1e-7)``.
    do_normalize : bool, default False
        If true, standardise both ``X`` and ``desired_X`` using the per-feature
        mean and standard deviation of ``X``.

    Returns
    -------
    numpy.ndarray
        1-D array with one predicted label per query row. When several labels
        share the highest vote, the smallest label wins.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1 or ``weights`` is not recognised.
    """
    if n_neighbors < 1:
        raise ValueError(f'n_neighbors must be >= 1, got {n_neighbors!r}')
    if weights not in ('uniform', 'distance'):
        raise ValueError(
            f"weights must be 'uniform' or 'distance', got {weights!r}")

    # np.asarray accepts Series, DataFrames, lists and arrays alike.
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    desired_X = np.asarray(desired_X)

    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if desired_X.ndim == 1:
        desired_X = desired_X.reshape(-1, 1)

    # Nothing to classify: return an empty prediction vector.
    if desired_X.shape[0] == 0:
        return np.empty(0, dtype=y.dtype)

    if do_normalize:
        X_mean = X.mean(axis=0)
        X_std = X.std(axis=0) + 1e-7  # epsilon guards constant features
        # Both sets must share the same scale, otherwise distances between
        # standardised queries and raw training rows are meaningless.
        X = (X - X_mean) / X_std
        desired_X = (desired_X - X_mean) / X_std

    distances = compute_distances(X, metric, desired_X)

    # Indices of the k closest training rows for every query row.
    nearest = np.argsort(distances, axis=1)[:, :n_neighbors]
    knn_labels = y[nearest]

    if weights == 'uniform':
        vote_weights = np.ones(nearest.shape)
    else:
        nearest_distances = np.take_along_axis(distances, nearest, axis=1)
        # epsilon avoids division by zero for exact matches
        vote_weights = 1.0 / (nearest_distances + 1e-7)

    # Map labels to dense column indices and accumulate the votes per query.
    unique_labels, label_idx = np.unique(knn_labels, return_inverse=True)
    label_idx = label_idx.reshape(knn_labels.shape)
    votes = np.zeros((knn_labels.shape[0], unique_labels.size))
    row_idx = np.arange(knn_labels.shape[0])[:, np.newaxis]
    # np.add.at is unbuffered, so repeated (row, label) pairs all count.
    np.add.at(votes, (row_idx, label_idx), vote_weights)

    return unique_labels[np.argmax(votes, axis=1)]


def compute_distances(X, metric, desired_X):
    """Compute pairwise distances between query rows and reference rows.

    Parameters
    ----------
    X : array-like
        Reference samples; broadcast against ``desired_X[:, np.newaxis, :]``.
    metric : {'euclidean', 'manhattan', 'chebyshev'}
        Name of the distance to compute.
    desired_X : array-like
        2-D array of query samples, one row per query.

    Returns
    -------
    numpy.ndarray
        Distances reduced over the last (feature) axis; for 2-D ``X`` the
        entry ``[i, j]`` is the distance from ``desired_X[i]`` to ``X[j]``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    X = np.asarray(X)
    desired_X = np.asarray(desired_X)

    # Shape (n_queries, n_refs, n_features) for 2-D inputs.
    abs_diffs = np.abs(desired_X[:, np.newaxis, :] - X)

    # Compare by value: identity (`is`) only works for interned literals.
    if metric == 'euclidean':
        return np.sqrt(np.square(abs_diffs).sum(axis=-1))
    if metric == 'manhattan':
        return abs_diffs.sum(axis=-1)
    if metric == 'chebyshev':
        return abs_diffs.max(axis=-1)

    raise ValueError(
        f"unknown metric {metric!r}; expected 'euclidean', 'manhattan' "
        f"or 'chebyshev'")


# Load the scikit-learn digits dataset and cut it, in file order (no
# shuffling), into 60% train / 20% evaluation / 20% test partitions.
digits = datasets.load_digits()
data, target = digits.data, digits.target

sample_count = data.shape[0]
train_end = sample_count * 6 // 10  # end of the first 60% of rows
eval_end = sample_count * 8 // 10   # end of the first 80% of rows

X_train, X_eval, X_test = data[:train_end], data[train_end:eval_end], data[eval_end:]
y_train, y_eval, y_test = target[:train_end], target[train_end:eval_end], target[eval_end:]


def confusion_df(grand_truth, predicts):
    """Build a confusion table of actual versus predicted labels.

    Rows are indexed by the actual labels (``grand_truth``) and columns by the
    predicted labels (``predicts``); each cell holds the number of samples
    with that (actual, predicted) combination.
    """
    frame = pd.DataFrame({'predict': predicts, 'actual': grand_truth, 'cnt': 1})
    actual_col = frame['actual']
    predicted_col = frame['predict']
    return pd.crosstab(index=actual_col, columns=predicted_col)


def compute_score(grand_truth, predicts):
    """Return the classification accuracy as a percentage.

    Parameters
    ----------
    grand_truth : array-like
        True labels.
    predicts : array-like
        Predicted labels, same shape as ``grand_truth``.

    Returns
    -------
    float
        ``100 * (number of matching labels) / (number of labels)``.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or are empty.
    """
    grand_truth = np.asarray(grand_truth)
    predicts = np.asarray(predicts)

    if grand_truth.shape != predicts.shape:
        raise ValueError(
            f'shape mismatch: grand_truth {grand_truth.shape} vs '
            f'predicts {predicts.shape}')
    if grand_truth.size == 0:
        raise ValueError('cannot compute a score for empty input')

    correct = (grand_truth == predicts).sum()
    # Divide by the size of the labels actually passed in, not by a global.
    return correct / grand_truth.size * 100


# Hyper-parameter grid for the k-NN experiments.
n_neighbors = np.arange(1, 10)  # k = 1 .. 9
metrics = ['euclidean', 'manhattan', 'chebyshev']
# Must match the option names accepted by predict(): 'uniform' / 'distance'.
weights = ['uniform', 'distance']
normalize = [False, True]

# One accuracy slot per candidate k, for each metric.
scores_euclidean = np.zeros(len(n_neighbors))
scores_manhattan = np.zeros(len(n_neighbors))
scores_chebyshev = np.zeros(len(n_neighbors))

# Evaluate every (metric, k) combination on the evaluation split using
# distance-weighted voting and no normalisation.
scores_by_metric = {
    'euclidean': scores_euclidean,
    'manhattan': scores_manhattan,
    'chebyshev': scores_chebyshev,
}

for metric in metrics:
    # Lookup by value; identity comparison of strings (`is`) is unreliable.
    metric_scores = scores_by_metric[metric]
    # Store by position so each slot lines up with the same index in n_neighbors.
    for position, n_neighbor in enumerate(n_neighbors):
        predicts = predict(X_train, y_train, X_eval, n_neighbors=n_neighbor, metric=metric,
                           weights='distance', do_normalize=False)
        score = compute_score(y_eval, predicts)
        metric_scores[position] = score


# Report the best accuracy and the k that achieved it for every metric.
# (The euclidean line previously indexed with the undefined name
# `scores_minkowski`, which raised NameError.)
for metric_name, metric_scores in (
    ('euclidean', scores_euclidean),
    ('manhattan', scores_manhattan),
    ('chebyshev', scores_chebyshev),
):
    best_position = np.argmax(metric_scores)
    print(f'maximum score of {metric_name} = {metric_scores[best_position]:.1f}% '
          f'with k={n_neighbors[best_position]}')

# Re-run the 1-nearest-neighbour euclidean configuration on the evaluation
# split and build its confusion table. The training samples must be X_train so
# that neighbour indices refer to the rows that y_train labels.
predicts = predict(X_train, y_train, X_eval, n_neighbors=1, metric='euclidean',
                   weights='distance', do_normalize=False)
score = compute_score(y_eval, predicts)
cdf = confusion_df(y_eval, predicts)

maximum score of euclidean = 98.1% with k=1
maximum score of manhattan = 97.2% with k=2
maximum score of chebyshev = 96.7% with k=1
