In [1]:
import numpy as np

class KNearestNeighbor(object):
    """A k-nearest-neighbor classifier using L2 (Euclidean) distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
            consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training
            labels, where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
            of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
            between training points and testing points (0, 1 or 2).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
            test data, where y[i] is the predicted label for the test point X[i].

        Raises:
        - ValueError: if num_loops is not 0, 1 or 2, or if k < 1.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            # %r instead of %d so that a non-numeric argument still produces
            # the intended ValueError rather than a formatting TypeError.
            raise ValueError('Invalid value %r for num_loops' % (num_loops,))

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
            is the Euclidean distance between the ith test point and the jth
            training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                diff = X[i] - self.X_train[j]
                dists[i, j] = np.sqrt(np.sum(diff ** 2))
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # Broadcasting subtracts the ith test point from every training row.
            diff = X[i] - self.X_train
            dists[i] = np.sqrt(np.sum(diff ** 2, axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        # ||p - q||^2 = ||p||^2 + ||q||^2 - 2 * p.q
        # keepdims gives shape (num_test, 1) so it broadcasts against the
        # (num_train,) vector of training norms into (num_test, num_train).
        test_sq = np.sum(X ** 2, axis=1, keepdims=True)
        train_sq = np.sum(self.X_train ** 2, axis=1)
        cross = np.matmul(X, self.X_train.transpose())

        # Floating-point rounding can push near-zero squared distances slightly
        # below zero, which would turn into NaN under sqrt; clamp at zero.
        sq_dists = np.maximum(test_sq + train_sq - 2 * cross, 0)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
            gives the distance between the ith test point and the jth training point.
        - k: The number of nearest neighbors that vote for the predicted label.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
            test data, where y[i] is the predicted label for the test point X[i].
            Ties in the vote are broken in favor of the smallest label.

        Raises:
        - ValueError: if k < 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1, got %r' % (k,))

        y_train = np.asarray(self.y_train)
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test, dtype=y_train.dtype)
        for i in range(num_test):
            # Labels of the k training points closest to the ith test point.
            nearest = np.argsort(dists[i])[:k]
            closest_y = y_train[nearest]

            # np.unique returns labels in sorted order, so argmax over the
            # counts picks the most common label and, on ties, the smallest.
            labels, counts = np.unique(closest_y, return_counts=True)
            y_pred[i] = labels[counts.argmax()]

        return y_pred
            

In [None]:
# Candidate values of k and the number of folds; presumably consumed by a
# cross-validation routine that is not visible in this part of the file.
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
num_folds = 5

# Per-fold containers for the training data and labels, empty at this point.
X_train_folds, y_train_folds = [], []
