# KNN Regression

### Implementation

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from matplotlib import pyplot as plt

In [2]:
# Load the Boston housing data directly as a (features, target) pair.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) only runs on older scikit-learn releases.
X, y = load_boston(return_X_y=True)

In [3]:
class KNearestNeighbourRegressor(BaseEstimator):
    """
    K-nearest-neighbour regressor based on the euclidean distance.

    ``fit`` only memorises the training data. ``predict`` looks up, for every
    query row, the ``neighbours`` closest training rows and averages their
    target values.
    """

    def __init__(self, neighbours=10, weights='uniform'):
        """
        KNNRegression, which searches the closest data points in the training set
        and predicts y from their target values.

        The distance used to find the neighbours is euclidean:
        sqrt((x1-y1)**2 + (x2-y2)**2 ....)

        Params:
        ----------------------------------
        neighbours:          Number of neighbours in a neighbourhood (must be >= 1).
                             Low values tend to overfit the training data,
                             high values tend to underfit.

        weights:             How the neighbours' targets are averaged for the prediction
                             "uniform":  plain mean of the neighbours' targets
                             "distance": mean weighted by 1/distance. Neighbours at
                                         distance 0 take all the weight.

        """
        self.neighbours = neighbours
        self.weights = weights
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        (array, array) --> self

        Stores the training set; it is used later by ``predict``.

        Params:
        ----------------------------------
        X:                   Training features, one row per sample.
        y:                   Training targets, one value per sample
                             (any array-like, stored as a column vector).

        Raises:
        ----------------------------------
        ValueError:          If X and y do not contain the same number of samples.

        returns: the estimator itself, so calls can be chained.

        """
        # np.asarray makes lists / pandas objects work, which a bare y.reshape did not
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1, 1)

        if X.shape[0] != y.shape[0]:
            raise ValueError(
                'X and y must contain the same number of samples, got {} and {}'.format(
                    X.shape[0], y.shape[0]))

        # Save the training data as it will be used later for prediction
        self.X_train = X
        self.y_train = y

        return self

    def predict(self, X):
        """
        (array(nxm)) --> array (n,)

        Predicts the target for every row of X from the stored training set.
        Procedure:
        for each row vector in X:
            compute its euclidean distance to every row of the training set
            sort the training rows by ascending distance
            take the ``neighbours`` closest rows
            average their targets (uniformly or weighted by inverse distance)
            save it

        Raises:
        ----------------------------------
        NotFittedError:      If ``fit`` has not been called yet.
        NotImplementedError: If ``weights`` is neither "uniform" nor "distance".
        ValueError:          If ``neighbours`` is smaller than 1.

        returns: Predictions for given X

        """
        if self.X_train is None or self.y_train is None:
            # Local import: keeps this block self-contained within the file
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                'This KNearestNeighbourRegressor instance is not fitted yet. '
                'Call "fit" before using "predict".')

        # Validate once up front instead of once per row (and also for empty X)
        if self.weights not in ('uniform', 'distance'):
            raise NotImplementedError('This weight is not available. Choose between "uniform" and "distance"')

        # 0 would average an empty neighbourhood, a negative value would silently
        # slice off the *farthest* points instead of selecting the nearest ones
        if self.neighbours < 1:
            raise ValueError('neighbours must be >= 1, got {}'.format(self.neighbours))

        X = np.asarray(X)
        targets = np.asarray(self.y_train).reshape(-1)

        # Pre-allocate the result; np.append in the loop copied the array every iteration
        predictions = np.empty(X.shape[0], dtype=float)

        for i, row_vec in enumerate(X):
            # Distances between the query row and every training row
            distances = self._calculate_distances(row_vec=row_vec, X=self.X_train)

            # Indices of the closest training rows, nearest first
            nearest = np.argsort(distances)[:self.neighbours]
            near_dist = distances[nearest]
            near_targets = targets[nearest]

            if self.weights == 'uniform':
                predictions[i] = np.mean(near_targets)
                continue

            # weights == 'distance'
            exact = near_dist == 0
            if exact.any():
                # 1/0 would give infinite weights and a NaN average. A neighbour that
                # coincides with the query takes all the weight instead (this is also
                # what scikit-learn does).
                predictions[i] = np.mean(near_targets[exact])
            else:
                predictions[i] = np.average(near_targets, weights=1.0 / near_dist)

        return predictions

    @staticmethod
    def _calculate_distances(row_vec, X):
        """
        Calculates the euclidean distance between row_vec and every row of X.

        returns: 1-d array with one distance per row of X
        """
        # Broadcasting subtracts row_vec from each row of X; the norm is then
        # taken over the columns (axis=1), i.e. one distance per row
        distances = np.linalg.norm(X - row_vec, axis=1)

        return distances

### Check with Sklearn

In [4]:
# Reference model: scikit-learn's KNeighborsRegressor with 5 neighbours,
# fitted on the complete dataset (no train/test split is made here).
knn1 = KNeighborsRegressor(n_neighbors=5)
knn1.fit(X, y)

KNeighborsRegressor()

In [5]:
# Predict on the same data the reference model was fitted on
# and display the first 10 predictions.
predictions1 = knn1.predict(X)
predictions1[:10]

array([21.78, 22.9 , 25.36, 26.06, 27.1 , 27.1 , 20.88, 19.1 , 18.4 ,
       19.48])

In [6]:
# Own implementation with the same neighbourhood size (5) and its default
# 'uniform' weighting, fitted on the same data as the reference model.
knn2 = KNearestNeighbourRegressor(neighbours=5)
knn2.fit(X, y)

KNearestNeighbourRegressor(neighbours=5)

In [7]:
# Predict on the fitted data with the own implementation
# and display the first 10 predictions for a visual comparison.
predictions2 = knn2.predict(X)
predictions2[:10]

array([21.78, 22.9 , 25.36, 26.06, 27.1 , 27.1 , 20.88, 19.1 , 18.4 ,
       19.48])

In [25]:
# Check that both implementations agree on every prediction; raises an
# AssertionError if any pair of values differs beyond the default precision.
np.testing.assert_array_almost_equal(predictions1, predictions2)

### Sources



Murphy, K. P. (2012). Machine learning: a probabilistic perspective. MIT press.