Modify the KNN scratch code in our lecture such that:
- If the two most frequent classes among the k nearest neighbors are tied, have the algorithm include the next nearest neighbor(s) until the tie is broken
- Modify the code so it outputs the probability of the decision, where the probability is simply the class probability based on all the nearest neighbors
- Write a function that receives a range of k values and outputs the cross-validation score for each k.  Finally, it should report which k in the predefined range is the best to use
- Put everything into a class <code>KNN(k=3)</code>.  It should have at least one method, <code>predict(X_train, X_test, y_train)</code>

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Synthetic data set: 300 samples generated around 4 cluster centers.
X, y = make_blobs(n_samples=300, centers=4, random_state=0, cluster_std=1.0)

# Standardize the features. The scaler is fit on the full data set, i.e.
# before the train/test split below.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hold out 30% of the samples as a test set.
# NOTE(review): no random_state is passed here (unlike make_blobs above), so
# the split is presumably not reproducible between runs -- confirm if intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [18]:
def find_distance(X_train, X_test):
    """Return the Euclidean distance from every test row to every training row.

    Both inputs are indexed as 2-D arrays (rows are samples, columns are
    features). The result has one row per test sample and one column per
    training sample.
    """
    # Insert singleton axes so the subtraction broadcasts every test row
    # against every training row: (n_test, 1, f) - (1, n_train, f).
    diff = X_test[:, None, :] - X_train[None, :, :]

    # Collapse the feature axis (axis 2), then take the root.
    return np.sqrt(np.sum(diff ** 2, axis=2))

def find_neighbors(X_train, X_test, k=3):
    """Return the indices of the k closest training rows for each test row.

    The result has one row per test sample; each row lists training-set
    indices ordered from nearest to farthest.
    """
    distances = find_distance(X_train, X_test)
    # Rank the training points of every test row by ascending distance,
    # then keep only the leading k columns.
    ranked = np.argsort(distances, axis=1)
    return ranked[:, :k]

def get_most_common(y):
    """Return the most frequent value in an array of non-negative integers.

    When several values share the highest count, the smallest one wins
    because argmax reports the first maximum.
    """
    counts = np.bincount(y)
    return np.argmax(counts)

In [133]:
def _top_two_tied(freq):
    """Return True when the two largest vote counts in *freq* are equal."""
    if freq.size < 2:
        # A single class can never be tied with a runner-up.
        return False
    runner_up, leader = np.sort(freq)[-2:]
    return leader == runner_up


def predict(X_train, X_test, y_train, k=3):
    """Classify test points by majority vote among their nearest neighbors.

    For each test row the k nearest training rows vote with their labels.
    If the two most frequent classes are tied, the next nearest neighbor is
    added (one at a time) until the tie is broken or every training point
    has been used; a tie that still remains is resolved in favour of the
    smallest label.

    Parameters
    ----------
    X_train : 2-D array, training samples (rows) by features (columns).
    X_test : 2-D array, test samples by features.
    y_train : 1-D array of numeric class labels, one per training row.
    k : int, number of neighbors to start the vote with.

    Returns
    -------
    pred : float array with one predicted label per test row.
    prop : array with one row per test row and one column per class (in the
        order of ``np.unique(y_train)``), holding the share of votes each
        class received among the neighbors actually used for the decision.
    """
    y_train = np.asarray(y_train)
    n_train = y_train.shape[0]

    # Encode labels as 0..classes-1 so the bincount columns always line up
    # with np.unique(y_train), even when the labels are not contiguous.
    labels, y_codes = np.unique(y_train, return_inverse=True)
    classes = len(labels)

    # Rank all training points once per test row; breaking a tie then only
    # widens the slice instead of recomputing and re-sorting the distances.
    order = find_neighbors(X_train, X_test, n_train)

    pred = np.zeros(X_test.shape[0])
    prop = np.zeros((X_test.shape[0], classes))
    for ix, ranked in enumerate(order):
        k_inc = k
        freq = np.bincount(y_codes[ranked[:k_inc]], minlength=classes)
        # Stop widening once every training point votes, otherwise a
        # persistent tie would loop forever.
        while k_inc < n_train and _top_two_tied(freq):
            k_inc += 1
            freq = np.bincount(y_codes[ranked[:k_inc]], minlength=classes)
        prop[ix] = freq / freq.sum()
        # Decide from the same (possibly widened) vote that produced prop,
        # so the prediction and the reported probabilities agree.
        pred[ix] = labels[freq.argmax()]
    return pred, prop

In [135]:
# Predict labels (yhat) and per-class vote proportions (yhat_prop) for the
# held-out test set, starting from k=5 neighbors.
yhat, yhat_prop = predict(X_train, X_test, y_train, k=5)

In [57]:
def CV_K(X_train, y_train, K_max, n_folds=5):
    """Score k = 1..K_max by cross-validation and report the best k.

    The training data is cut into ``n_folds`` contiguous folds (no
    shuffling is done here). For every k each fold is held out once, the
    remaining folds act as the neighbor pool for ``predict``, and the
    accuracy on the held-out fold is recorded. The mean accuracy per k is
    printed, followed by the best k.

    Parameters
    ----------
    X_train : 2-D array, training samples (rows) by features (columns).
    y_train : 1-D array of class labels, one per training row.
    K_max : int, largest k to evaluate (inclusive); must be at least 1.
    n_folds : int, number of folds; must be between 2 and the sample count.

    Returns
    -------
    best_k : int, the k with the highest mean accuracy (the smallest such k
        when several are equal).
    scores : dict mapping each k to its mean cross-validation accuracy.

    Raises
    ------
    ValueError
        If ``K_max`` is below 1 or ``n_folds`` is outside the valid range.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_samples = X_train.shape[0]

    if K_max < 1:
        raise ValueError("K_max must be at least 1")
    if not 2 <= n_folds <= n_samples:
        raise ValueError("n_folds must be between 2 and the number of samples")

    folds = np.array_split(np.arange(n_samples), n_folds)

    scores = {}
    for k in range(1, K_max + 1):
        fold_accuracy = []
        for held_out in folds:
            # Everything outside the held-out fold is the neighbor pool, so
            # validation samples never vote for themselves.
            in_pool = np.ones(n_samples, dtype=bool)
            in_pool[held_out] = False
            yhat, _ = predict(
                X_train[in_pool], X_train[held_out], y_train[in_pool], k=k
            )
            fold_accuracy.append(np.mean(yhat == y_train[held_out]))
        scores[k] = float(np.mean(fold_accuracy))
        print("k={}: mean CV accuracy = {:.4f}".format(k, scores[k]))

    # max() keeps the first maximum, i.e. the smallest k among equal scores.
    best_k = max(scores, key=scores.get)
    print("Best k: {}".format(best_k))
    return best_k, scores

[1 2 3 4]
