In [55]:
import numpy as np # Importing required module

class Knnregression():
    """
    K-NN based regression

    This is a K-NN regression built using the Numpy module that
    only supports numerical data as input and euclidean distance
    for computing neighbors. Inputs may be Numpy arrays or any
    array-like object (lists, pandas DataFrame/Series); they are
    converted to Numpy arrays internally.
    """

    def __init__(self, k):
        '''
        Constructs K attribute of K-NN regression

        Parameters
        ----------
        k: int
          Number of neighbors to take into account in K-NN

        Raises
        ------
        ValueError
            If k is not a positive integer.
        '''

        # A negative k would silently slice "all but the last |k|" neighbours
        # and k == 0 would average an empty set, so reject both up front.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

        self.k = k

    def fit(self, X_train, y_train):
        '''
        Trains the K-NN model, i.e. storing the training dataset

        Parameters
        ----------
        X_train: array-like, shape(n,m)
                Training feature matrix
        y_train: array-like, shape(n,)
                Training target values

        Raises
        ------
        ValueError
            If X_train is not 2-dimensional or the number of samples in
            X_train and y_train differ.
        '''

        # Store plain arrays so that neighbour lookup is purely positional,
        # regardless of whether a pandas object (with an arbitrary index)
        # or a Numpy array was supplied.
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train).ravel()

        if X_train.ndim != 2:
            raise ValueError(
                f"X_train must be 2-dimensional, got {X_train.ndim} dimension(s)"
            )
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                "X_train and y_train must contain the same number of samples, "
                f"got {X_train.shape[0]} and {y_train.shape[0]}"
            )

        self.X_train = X_train
        self.y_train = y_train

    def _calculate_euc_dist_mat(self, X_test):
        '''
        Computes the euclidean distance matrix between the stored training
        samples and the samples of a new feature matrix

        Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the
        whole matrix is obtained with a single matrix product.

        Parameters
        ----------
        X_test: array-like, shape(z,m)
            New feature matrix whose distance to every stored training
            sample is computed.

        Returns
        -------
        euc_mat: Numpy.array, shape(n,z)
            Euclidean distance matrix, here element (1,1) is euclidean distance
            between stored data's 1st sample and new data's 1st sample, (1,2)
            between stored data's 1st sample and new data's 2nd sample and so on.
        '''
        X_stored = np.asarray(self.X_train, dtype=float)
        X_new = np.asarray(X_test, dtype=float)

        train_sq = np.sum(X_stored**2, axis=1).reshape(-1, 1)  # column vector, broadcasts over columns
        test_sq = np.sum(X_new**2, axis=1)                     # row vector, broadcasts over rows
        cross = -2 * np.dot(X_stored, X_new.T)

        sq_dist = train_sq + test_sq + cross
        # Floating point cancellation can leave tiny negative values; clamp
        # them to zero so the square root never yields NaN (or a warning).
        np.maximum(sq_dist, 0.0, out=sq_dist)

        return np.sqrt(sq_dist)

    def predict(self, X_test):
        '''
        Predicts the target values of provided data

        Each prediction is the mean target of the k nearest stored training
        samples. If k exceeds the number of stored samples, all of them are
        used.

        Parameters
        ----------
        X_test: array-like, shape(z,m)
            Feature matrix of the samples to predict.

        Returns
        -------
        predictions: Numpy.array, shape(z,)
                Predictions of the target values. They are also kept as a
                list in the ``predictions`` attribute.
        '''

        dist_mat = self._calculate_euc_dist_mat(X_test)
        targets = np.asarray(self.y_train).ravel()

        # Sort every column (one per test sample) by distance and keep the
        # first k row indices; a stable sort makes tie-breaking deterministic
        # (the sample that appears first in the training data wins).
        near_neigh_index = np.argsort(dist_mat, axis=0, kind="stable")[:self.k].T  # shape (z, k)

        # Aggregation: average the targets of the selected neighbours.
        predictions = targets[near_neigh_index].mean(axis=1)

        self.predictions = list(predictions)

        return predictions

# KNN (k-Nearest Neighbours)

### Import dataset and feature scaling

In [56]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [57]:
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"

# Map the dataset's terse column codes to self-explanatory names.
new_column_names = dict(
    crim='Crime_rate_per_capita',
    zn='Residential_land_zone_pct',
    indus='Non_retail_business_acres_pct',
    chas='Next_to_Charles_River',
    nox='Nitric_Oxide_conc',
    rm='Avg_rooms_per_dwelling',
    age='Pct_houses_built_before_1940',
    dis='Weighted_dist_to_employment',
    rad='Highway_access_index',
    tax='Property_tax_rate_per_10000',
    ptratio='Pupil_teacher_ratio',
    b='Black_residents_index',
    lstat='Low_income_pct',
    medv='Median_home_value_1000s',
)

# Download the CSV and apply the descriptive names in one step.
df = pd.read_csv(url).rename(columns=new_column_names)

# Confirm that the renaming took effect.
print(df.columns)



Index(['Crime_rate_per_capita', 'Residential_land_zone_pct',
       'Non_retail_business_acres_pct', 'Next_to_Charles_River',
       'Nitric_Oxide_conc', 'Avg_rooms_per_dwelling',
       'Pct_houses_built_before_1940', 'Weighted_dist_to_employment',
       'Highway_access_index', 'Property_tax_rate_per_10000',
       'Pupil_teacher_ratio', 'Black_residents_index', 'Low_income_pct',
       'Median_home_value_1000s'],
      dtype='object')


In [58]:
# Separate the regression target from the predictor columns.
y = df['Median_home_value_1000s']
X = df.drop(columns=['Median_home_value_1000s'])

# Min-max scale the predictors so that no single feature dominates the
# euclidean distances used by K-NN.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so test-set minima/maxima influence the training
# features; consider fitting it on the training split only.
X_scaled = MinMaxScaler().fit_transform(X)

In [59]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

### Fitting the KNN regression and prediction

In [60]:
# Build a 4-neighbour regressor and hand it the training split.
knn_reg = Knnregression(k=4)
knn_reg.fit(X_train, y_train)

# Predict on the training split first, then on the held-out split.
knn_y_train_pred, knn_y_test_pred = [
    knn_reg.predict(features) for features in (X_train, X_test)
]

### Performance Comparison with linear regression

In [61]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the K-NN predictions on each split.
knn_train_error = mean_squared_error(y_train, knn_y_train_pred)
knn_test_error = mean_squared_error(y_test, knn_y_test_pred)

# Report both errors rounded to two decimals.
print(f'KNN, Training MSE: {knn_train_error:.2f}')
print(f'KNN, Test MSE: {knn_test_error:.2f}')

KNN, Training MSE: 11.91
KNN, Test MSE: 19.80


Now let's fit a linear regression and compute the MSE on the training and test sets.

In [64]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the same training split used for K-NN
# (fit() returns the estimator itself, so construction and fitting chain).
lin_reg = LinearRegression().fit(X_train, y_train)

# Predictions and mean squared error on the training split.
lin_y_train_pred = lin_reg.predict(X_train)
lin_train_error = mean_squared_error(y_train, lin_y_train_pred)

# Predictions and mean squared error on the held-out split.
lin_y_test_pred = lin_reg.predict(X_test)
lin_test_error = mean_squared_error(y_test, lin_y_test_pred)

# Unlike the K-NN report, these values are printed at full precision.
print(f'Linear Regression, Training MSE: {lin_train_error}')
print(f'Linear Regression, Test MSE: {lin_test_error}')

Linear Regression, Training MSE: 21.641412753226312
Linear Regression, Test MSE: 24.291119474973545
