In [25]:
import pandas as pd
import numpy as np

# 1.2 Filtered Boston housing and kernels

## 1.2.4: Baseline vs full linear regression

In [26]:
def get_train_test_data(path="boston_data.csv", train_frac=2.0/3, random_state=None):
    """Load the housing CSV and split it randomly into train and test arrays.

    Parameters
    ----------
    path : str or path-like, default "boston_data.csv"
        CSV file to read. It must contain a "MEDV" column, which is used as
        the regression target; every other column is treated as a feature.
    train_frac : float, default 2/3
        Fraction of rows sampled (without replacement) into the training set.
    random_state : int, numpy random generator or None, default None
        Forwarded to ``DataFrame.sample``. ``None`` keeps the original
        behaviour of drawing a fresh split on every call; pass an int to make
        the split reproducible.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices and target vectors for the two parts of the split.
    """
    df = pd.read_csv(path)

    df_train = df.sample(frac=train_frac, random_state=random_state)
    # Every row that was not drawn for training goes to the test set.
    df_test = df.drop(df_train.index)

    X_train = df_train.drop(columns="MEDV").to_numpy()
    y_train = df_train["MEDV"].to_numpy()

    X_test = df_test.drop(columns="MEDV").to_numpy()
    y_test = df_test["MEDV"].to_numpy()

    return X_train, y_train, X_test, y_test

### Part (a) - Linear regression with constant function.

In [39]:

def constant_regression(num_runs=20, data_loader=None):
    """Fit a constant predictor by least squares over repeated random splits.

    Parameters
    ----------
    num_runs : int, default 20
        Number of independent train/test splits to average over.
    data_loader : callable or None, default None
        Zero-argument callable returning ``(X_train, y_train, X_test, y_test)``.
        ``None`` uses ``get_train_test_data``.

    Returns
    -------
    tuple
        ``(mean train MSE, std train MSE, mean test MSE, std test MSE)``
        computed across the runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    if data_loader is None:
        data_loader = get_train_test_data

    mse_train_values = []
    mse_test_values = []

    for _ in range(num_runs):
        X_train, y_train, X_test, y_test = data_loader()

        # The feature map is a single column of ones, so the only weight
        # learned is the constant itself.
        phi_train = np.ones((len(X_train), 1))
        phi_test = np.ones((len(X_test), 1))

        # rcond=None is passed explicitly: older NumPy releases emit a
        # FutureWarning when it is omitted.
        a = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]  # best fit constant

        mse_train_values.append(np.mean((phi_train @ a - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ a - y_test) ** 2))

    return (
        np.mean(mse_train_values),
        np.std(mse_train_values),
        np.mean(mse_test_values),
        np.std(mse_test_values),
    )


# Part (a) results: mean and spread of the constant-fit MSE across the random splits.
mse_train, std_train, mse_test, std_test = constant_regression()
print(
    f"MSE train: {mse_train}",
    f"STDEV train: {std_train}",
    f"MSE test:  {mse_test}",
    f"STDEV test: {std_test}",
    sep="\n",
)

MSE train: 81.76217595470595
STDEV train: 4.994606544286939
MSE test:  89.92771137344184
STDEV test: 9.98019379935081


### Part (c) - Linear regression with a single attribute

In [42]:

def single_attribute_regression(index, num_runs=20, data_loader=None):
    """Fit a least-squares line (slope plus bias) using one attribute only.

    Parameters
    ----------
    index : int
        Column of the feature matrix to regress on.
    num_runs : int, default 20
        Number of independent train/test splits to average over.
    data_loader : callable or None, default None
        Zero-argument callable returning ``(X_train, y_train, X_test, y_test)``.
        ``None`` uses ``get_train_test_data``.

    Returns
    -------
    tuple
        ``(mean train MSE, std train MSE, mean test MSE, std test MSE)``
        computed across the runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    if data_loader is None:
        data_loader = get_train_test_data

    mse_train_values = []
    mse_test_values = []

    for _ in range(num_runs):
        X_train, y_train, X_test, y_test = data_loader()

        # Keep the selected column under its own name instead of rebinding
        # the full feature matrices.
        x_train = X_train[:, index]
        x_test = X_test[:, index]

        # Design matrix: [attribute, 1]; the column of ones carries the bias.
        phi_train = np.column_stack((x_train, np.ones_like(x_train)))
        phi_test = np.column_stack((x_test, np.ones_like(x_test)))

        # rcond=None is passed explicitly: older NumPy releases emit a
        # FutureWarning when it is omitted.
        w = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]  # best fit weight vector

        mse_train_values.append(np.mean((phi_train @ w - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ w - y_test) ** 2))

    return (
        np.mean(mse_train_values),
        np.std(mse_train_values),
        np.mean(mse_test_values),
        np.std(mse_test_values),
    )


# Part (c) results: repeat the single-attribute fit for each of the 12 columns.
for i in range(12):
    print(f"Regressing on variable {i}")
    mse_train, std_train, mse_test, std_test = single_attribute_regression(i)
    print(
        f"MSE train: {mse_train}",
        f"STDEV train: {std_train}",
        f"MSE test:  {mse_test}",
        f"STDEV test: {std_test}",
        "\n\n",
        sep="\n",
    )


Regressing on variable 0
MSE train: 70.16958853648867
STDEV train: 3.8804582086632413
MSE test:  76.09580813052492
STDEV test: 8.300932155555811



Regressing on variable 1
MSE train: 74.71191750968053
STDEV train: 5.0432032900833486
MSE test:  71.47342561942729
STDEV test: 9.900873702841436



Regressing on variable 2
MSE train: 64.31011914302483
STDEV train: 4.290270541823158
MSE test:  65.89358426659177
STDEV test: 8.598513983717593



Regressing on variable 3
MSE train: 82.51321853875527
STDEV train: 4.788182862377677
MSE test:  80.80106295274359
STDEV test: 9.671204369554383



Regressing on variable 4
MSE train: 67.64838072001758
STDEV train: 6.069222019562323
MSE test:  72.24252814424088
STDEV test: 12.05331194556931



Regressing on variable 5
MSE train: 42.71144872500424
STDEV train: 4.092043027499033
MSE test:  45.82626806049116
STDEV test: 8.115517852520506



Regressing on variable 6
MSE train: 72.65770425992363
STDEV train: 5.467864849622816
MSE test:  72.4667215438165
STD

### Part (d) - Linear regression with all variables

In [48]:
def all_attributes_regression(num_runs=20, data_loader=None):
    """Fit a least-squares linear model with a bias term on all attributes.

    Parameters
    ----------
    num_runs : int, default 20
        Number of independent train/test splits to average over.
    data_loader : callable or None, default None
        Zero-argument callable returning ``(X_train, y_train, X_test, y_test)``.
        ``None`` uses ``get_train_test_data``.

    Returns
    -------
    tuple
        ``(mean train MSE, std train MSE, mean test MSE, std test MSE)``
        computed across the runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    if data_loader is None:
        data_loader = get_train_test_data

    mse_train_values = []
    mse_test_values = []

    for _ in range(num_runs):
        X_train, y_train, X_test, y_test = data_loader()

        # Append a column of ones so the last weight acts as the bias.
        phi_train = np.column_stack((X_train, np.ones(len(X_train))))
        phi_test = np.column_stack((X_test, np.ones(len(X_test))))

        # rcond=None is passed explicitly: older NumPy releases emit a
        # FutureWarning when it is omitted.
        w = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]

        mse_train_values.append(np.mean((phi_train @ w - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ w - y_test) ** 2))

    return (
        np.mean(mse_train_values),
        np.std(mse_train_values),
        np.mean(mse_test_values),
        np.std(mse_test_values),
    )

# Part (d) results: mean and spread of the all-attribute fit across the random splits.
mse_train, std_train, mse_test, std_test = all_attributes_regression()
print(
    f"MSE train: {mse_train}",
    f"STDEV train: {std_train}",
    f"MSE test:  {mse_test}",
    f"STDEV test: {std_test}",
    sep="\n",
)


MSE train: 23.044043073978294
STDEV train: 2.181288693491689
MSE test:  22.565300261884058
STDEV test: 4.6115414249210005


# 1.3.5 Filtered Boston housing and kernels

In [30]:
class KRR:
    """Kernel ridge regression with a Gaussian kernel.

    The kernel is ``K(x, z) = exp(-(||x - z|| / sigma)**2 / 2)`` and the dual
    weights solve ``(K + gamma * m * I) alpha = y`` where ``m`` is the number
    of training points. Distances between training points are computed once
    at construction so several ``(gamma, sigma)`` pairs can be tried cheaply.
    """

    def __init__(self, X_train, y_train):
        """Store the training data and precompute its pairwise distances.

        X_train is expected to be a 2-D array (one row per sample) and
        y_train a 1-D array of targets of matching length.
        """
        self.pairwise_norm = self._get_pairwise_norm(X_train)
        self.X_train = X_train
        self.y_train = y_train

    @staticmethod
    def _pairwise_distances(A, B):
        """Return D with D[i, j] = ||A[i] - B[j]|| for 2-D arrays A and B."""
        diff = A[:, None, :] - B[None, :, :]  # diff[i][j] = A[i] - B[j]
        return np.linalg.norm(diff, axis=2)

    def _get_pairwise_norm(self, X_train):
        """Return the (m, m) matrix of Euclidean distances between training rows."""
        # Broadcasting replaces the original Python-level double loop.
        X_train = np.asarray(X_train)
        return self._pairwise_distances(X_train, X_train)

    def get_alpha(self, gamma, sigma):
        """Solve the regularised kernel system for the dual weight vector."""
        m = len(self.y_train)
        K = np.exp(-((self.pairwise_norm / sigma) ** 2) / 2.0)
        M = K + gamma * m * np.identity(m)
        return np.linalg.solve(M, self.y_train)

    def predict(self, X_test, gamma, sigma):
        """Predict targets for each row of X_test using the given hyperparameters."""
        alpha = self.get_alpha(gamma, sigma)
        # norms[i][j] = ||X_train[i] - X_test[j]||
        norms = self._pairwise_distances(np.asarray(self.X_train), np.asarray(X_test))

        k = np.exp(-((norms / sigma) ** 2) / 2)

        # Dots alpha with each column of the kernel matrix k: one value per test point.
        return np.dot(alpha, k)

    def get_mse(self, X_test, y_test, gamma, sigma):
        """Return the mean squared error of the predictions on (X_test, y_test)."""
        residual = self.predict(X_test, gamma, sigma) - y_test
        return np.sum(residual ** 2) / len(X_test)

### Part (a) - Hyperparameter optimisation of sigma and gamma

In [None]:
from joblib import Parallel, delayed

def get_cross_validation_error(folds_X, folds_y, gamma, sigma):
    """Return the mean held-out MSE of kernel ridge regression across the folds.

    Each fold in turn is used as the validation set while a KRR model is built
    on the concatenation of the remaining folds; the per-fold errors are
    computed in parallel and then averaged.
    """
    fold_ids = range(len(folds_X))

    def _held_out_mse(held_out):
        # Train on every fold except the one being held out.
        rest = [k for k in fold_ids if k != held_out]
        model = KRR(
            np.concatenate([folds_X[k] for k in rest]),
            np.concatenate([folds_y[k] for k in rest]),
        )
        return model.get_mse(folds_X[held_out], folds_y[held_out], gamma, sigma)

    # One joblib task per fold, spread over all available cores.
    tasks = (delayed(_held_out_mse)(held_out) for held_out in fold_ids)
    fold_errors = Parallel(n_jobs=-1)(tasks)

    return np.mean(fold_errors)


def get_best_params(X_train, y_train, gammas=None, sigmas=None, num_folds=5):
    """Grid-search (gamma, sigma) for kernel ridge regression by cross-validation.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and targets; they are split into ``num_folds``
        contiguous folds in their given order.
    gammas : array-like or None, default None
        Regularisation values to try. ``None`` uses ``2**-40, ..., 2**-26``.
    sigmas : array-like or None, default None
        Kernel widths to try. ``None`` uses ``2**7, 2**7.5, ..., 2**13``.
    num_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    best_gamma, best_sigma : float
        The pair with the lowest mean validation MSE (first one found on ties).
    errors : numpy.ndarray
        ``errors[i][j]`` is the cross-validation error for ``gammas[i]`` and
        ``sigmas[j]``.
    """
    if gammas is None:
        gammas = 2.0 ** np.arange(-40, -25)
    if sigmas is None:
        sigmas = 2.0 ** np.arange(7.0, 13.1, 0.5)

    folds_X = np.array_split(X_train, num_folds)
    folds_y = np.array_split(y_train, num_folds)

    # The KRR constructor precomputes training-set pairwise distances, which
    # depend only on the fold and not on (gamma, sigma). Build one model per
    # fold up front instead of once per grid point.
    fold_models = []
    for held_out in range(num_folds):
        rest = [k for k in range(num_folds) if k != held_out]
        model = KRR(
            np.concatenate([folds_X[k] for k in rest]),
            np.concatenate([folds_y[k] for k in rest]),
        )
        fold_models.append((model, folds_X[held_out], folds_y[held_out]))

    min_cross_error = np.inf
    best_gamma, best_sigma = 0, 0

    # errors[i][j] = cross validation error using gammas[i] and sigmas[j]
    errors = np.zeros(shape=(len(gammas), len(sigmas)))

    for i, gamma in enumerate(gammas):
        for j, sigma in enumerate(sigmas):
            cross_error = np.mean([
                model.get_mse(X_val, y_val, gamma, sigma)
                for model, X_val, y_val in fold_models
            ])
            errors[i][j] = cross_error

            if cross_error < min_cross_error:
                min_cross_error = cross_error
                best_gamma, best_sigma = gamma, sigma

    return best_gamma, best_sigma, errors

def kernelised_ridge_regression(num_runs=20, data_loader=None):
    """Run kernel ridge regression with cross-validated hyperparameters.

    For every run a fresh split is drawn, (gamma, sigma) are tuned on the
    training part only via ``get_best_params``, and a KRR model built on the
    full training part is scored on both the training and the test data.

    Parameters
    ----------
    num_runs : int, default 20
        Number of independent train/test splits to average over.
    data_loader : callable or None, default None
        Zero-argument callable returning ``(X_train, y_train, X_test, y_test)``.
        ``None`` uses ``get_train_test_data``.

    Returns
    -------
    tuple
        ``(mean train MSE, std train MSE, mean test MSE, std test MSE)``
        computed across the runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    if data_loader is None:
        data_loader = get_train_test_data

    mse_train_values = []
    mse_test_values = []

    for _ in range(num_runs):
        X_train, y_train, X_test, y_test = data_loader()

        # The error grid is not needed here; only the winning pair is used.
        gamma, sigma, _cv_errors = get_best_params(X_train, y_train)

        krr = KRR(X_train, y_train)

        mse_train_values.append(krr.get_mse(X_train, y_train, gamma, sigma))
        mse_test_values.append(krr.get_mse(X_test, y_test, gamma, sigma))

    return (
        np.mean(mse_train_values),
        np.std(mse_train_values),
        np.mean(mse_test_values),
        np.std(mse_test_values),
    )


    


7.450580596923828e-09 362.03867196751236
3.725290298461914e-09 256.0
4.656612873077393e-10 1024.0


### Part (b) - Plot of cross-validation error for different values of sigma and gamma

In [None]:
# Draw one train/test split and tune (gamma, sigma) on its training part.
# The split and the tuned values stay in module-level names so the next cell
# can compute the train and test MSE without repeating the search.
X_train, y_train, X_test, y_test = get_train_test_data()
gamma, sigma, errors = get_best_params(X_train, y_train)

print(
    f"Best gamma: {gamma}",
    f"Best sigma: {sigma}",
    sep="\n",
)

# Full grid of cross-validation errors (rows: gammas, columns: sigmas).
print(errors)

Best gamma: 1.8189894035458565e-12
Best sigma: 1024.0
[[ 573.4037687  2929.61507567 3411.2474111   479.44861909   34.25364645
    16.96078408   13.02313226   13.35990231   15.21947747   16.0904
    14.36134125   13.14385719   14.50240617]
 [ 793.81474037 3503.75692617 2188.39675413  159.00795306   25.97578095
    15.2449257    12.95751673   13.83722121   15.75566624   15.36193469
    13.51498102   13.4351248    16.25255086]
 [1256.68179707 3500.90811678 1090.17121072   58.59882173   21.79487693
    14.01061372   13.1262667    14.44715761   15.81251818   14.3693236
    13.149362     14.42767684   18.50445535]
 [1832.02767945 2815.4951893   424.18597475   32.9852154    18.94468414
    13.36212841   13.43959116   15.03927693   15.25519329   13.52742319
    13.4240573    16.13454001   20.72520589]
 [2262.59018935 1776.3181715   142.85166145   25.72655527   16.70190335
    13.22637943   13.87940124   15.30202539   14.34041835   13.16166079
    14.38933498   18.36860868   22.46089038]
 [2271

### Part (c) - Test/train MSE for best sigma and gamma

In [37]:
# Score a model built on the current split with the tuned hyperparameters.
krr = KRR(X_train, y_train)

mse_train = krr.get_mse(X_train, y_train, gamma, sigma)
mse_test = krr.get_mse(X_test, y_test, gamma, sigma)

print(
    f"MSE train: {mse_train}",
    f"MSE test: {mse_test}",
    sep="\n",
)

MSE train: 5.60937539629866
MSE test: 14.151779629854182


### Part (d) - Test/train MSE for all the above methods, averaged over 20 runs

In [None]:
# Summary of every method above, each reported as mean ± standard deviation
# of the MSE across the random train/test splits.
mse_train, std_train, mse_test, std_test = constant_regression()
print("Naive regression")
print(f"MSE train: {mse_train.round(2)} ± {std_train.round(2)}")
print(f"MSE test:  {mse_test.round(2)} ± {std_test.round(2)}")

for i in range(12):
    print("\n")
    print(f"Linear Regression (attribute {i+1})")
    mse_train, std_train, mse_test, std_test = single_attribute_regression(i)
    print(f"MSE train: {mse_train.round(2)} ± {std_train.round(2)}")
    print(f"MSE test:  {mse_test.round(2)} ± {std_test.round(2)}")

# The all-attribute linear fit was missing from the summary even though it is
# one of the methods being compared.
print("\n")
print("Linear Regression (all attributes)")
mse_train, std_train, mse_test, std_test = all_attributes_regression()
print(f"MSE train: {mse_train.round(2)} ± {std_train.round(2)}")
print(f"MSE test:  {mse_test.round(2)} ± {std_test.round(2)}")

print("\n")
print("Kernel Ridge Regression")
mse_train, std_train, mse_test, std_test = kernelised_ridge_regression()
print(f"MSE train: {mse_train.round(2)} ± {std_train.round(2)}")
print(f"MSE test:  {mse_test.round(2)} ± {std_test.round(2)}")
    
    


Naive regression
MSE train: 85.52 ± 3.11
MSE test:  82.44 ± 6.25


Linear Regression (attribute 1)
MSE train: 71.05 ± 4.26
MSE test:  74.34 ± 10.58


Linear Regression (attribute 2)
MSE train: 73.56 ± 5.65
MSE test:  73.75 ± 11.22


Linear Regression (attribute 3)
MSE train: 64.49 ± 5.69
MSE test:  65.35 ± 11.36


Linear Regression (attribute 4)
MSE train: 82.08 ± 3.8
MSE test:  81.91 ± 7.69


Linear Regression (attribute 5)
MSE train: 69.86 ± 4.86
MSE test:  67.58 ± 9.71


Linear Regression (attribute 6)
MSE train: 42.87 ± 4.32
MSE test:  45.62 ± 8.52


Linear Regression (attribute 7)
MSE train: 72.5 ± 4.15
MSE test:  72.86 ± 8.36


Linear Regression (attribute 8)
MSE train: 77.86 ± 6.39
MSE test:  82.17 ± 12.95


Linear Regression (attribute 9)
MSE train: 72.36 ± 3.68
MSE test:  72.04 ± 7.3


Linear Regression (attribute 10)
MSE train: 67.14 ± 5.15
MSE test:  63.7 ± 10.38


Linear Regression (attribute 11)
MSE train: 62.14 ± 3.02
MSE test:  64.18 ± 6.12


Linear Regression (attribute