In [75]:
import pandas as pd
import numpy as np

# 1.2 Filtered Boston housing and kernels

## 1.2.4: Baseline vs full linear regression

In [76]:
def get_train_test_data(path="boston_data.csv", train_frac=2.0 / 3, random_state=None):
    """Load the housing CSV and split its rows at random into train and test arrays.

    Parameters
    ----------
    path : str or path-like, default "boston_data.csv"
        CSV file to read. It must contain a "MEDV" column, which is used as
        the regression target; every other column is treated as a feature.
    train_frac : float, default 2/3
        Fraction of the rows sampled (without replacement) into the training
        set. The remaining rows form the test set.
    random_state : int, numpy random generator or None, default None
        Forwarded unchanged to ``DataFrame.sample``. ``None`` keeps the
        previous behaviour (a fresh, unseeded split on every call); pass a
        value to make the split reproducible.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices (one row per sample) and 1-D target vectors for the
        training and test parts respectively.
    """
    df = pd.read_csv(path)

    # Rows not drawn into the training sample become the test set.
    df_train = df.sample(frac=train_frac, random_state=random_state)
    df_test = df.drop(df_train.index)

    # to_numpy() ignores the index, so no reset_index is needed before it.
    X_train = df_train.drop("MEDV", axis=1).to_numpy()
    y_train = df_train["MEDV"].to_numpy()

    X_test = df_test.drop("MEDV", axis=1).to_numpy()
    y_test = df_test["MEDV"].to_numpy()

    return X_train, y_train, X_test, y_test

### Part (a) - Linear regression with a constant function

In [77]:

def constant_regression():
    """Baseline experiment: predict the target with a single fitted constant.

    Repeats the following 20 times: draw a train/test split with
    get_train_test_data(), least-squares fit one coefficient against an
    all-ones column, and record the mean squared error on both parts.
    Prints the mean and standard deviation of the train and test MSEs
    across the runs; nothing is returned.
    """

    def scaled_squared_error(design, coeffs, targets):
        # (1 / rows) * ||design @ coeffs - targets||^2
        rows = design.shape[0]
        return (1.0 / rows) * np.linalg.norm(np.dot(design, coeffs) - targets) ** 2

    train_errors, test_errors = [], []

    for _run in range(20):
        features_tr, targets_tr, features_te, targets_te = get_train_test_data()

        # The only "feature" is the constant 1, one row per sample.
        ones_tr = np.ones(len(features_tr)).reshape(-1, 1)
        ones_te = np.ones(len(features_te)).reshape(-1, 1)

        constant = np.linalg.lstsq(ones_tr, targets_tr)[0]  # best fit constant

        train_errors.append(scaled_squared_error(ones_tr, constant, targets_tr))
        test_errors.append(scaled_squared_error(ones_te, constant, targets_te))

    train_mean, train_spread = np.mean(train_errors), np.std(train_errors)
    test_mean, test_spread = np.mean(test_errors), np.std(test_errors)

    print(f"MSE train: {train_mean}")
    print(f"STDEV train: {train_spread}")
    print(f"MSE test:  {test_mean}")
    print(f"STDEV test: {test_spread}")


# Run the constant-baseline experiment; it prints its own summary statistics.
constant_regression()

MSE train: 83.81195638774668
STDEV train: 6.084186869841014
MSE test:  85.9538741216207
STDEV test: 12.157748978119765


### Part (c) - Linear regression with a single attribute

In [78]:

def single_attribute_regression(index):
    """Regress the target on one attribute plus a bias term, over 20 random splits.

    `index` selects the feature column used as the only attribute. For every
    run a new split is drawn with get_train_test_data(), a two-parameter
    model (slope, intercept) is fitted by least squares on the training part,
    and the mean squared error is measured on both parts. The mean and
    standard deviation of the train and test MSEs are printed; nothing is
    returned.
    """

    def with_bias(column):
        # Design matrix: the attribute itself followed by a column of ones.
        return np.column_stack((column, np.ones_like(column)))

    def scaled_squared_error(design, weights, targets):
        # (1 / rows) * ||design @ weights - targets||^2
        rows = design.shape[0]
        return (1.0 / rows) * np.linalg.norm(np.dot(design, weights) - targets) ** 2

    train_errors, test_errors = [], []

    for _run in range(20):
        features_tr, targets_tr, features_te, targets_te = get_train_test_data()

        design_tr = with_bias(features_tr[:, index])
        design_te = with_bias(features_te[:, index])

        weights = np.linalg.lstsq(design_tr, targets_tr)[0]  # best fit weight vector

        train_errors.append(scaled_squared_error(design_tr, weights, targets_tr))
        test_errors.append(scaled_squared_error(design_te, weights, targets_te))

    train_mean, train_spread = np.mean(train_errors), np.std(train_errors)
    test_mean, test_spread = np.mean(test_errors), np.std(test_errors)

    print(f"MSE train: {train_mean}")
    print(f"STDEV train: {train_spread}")
    print(f"MSE test:  {test_mean}")
    print(f"STDEV test: {test_spread}")


# Fit a separate one-attribute model for each feature column with index 0..11
# and print its train/test statistics under a heading naming the column.
for i in range(12):
    print(f"Regressing on variable {i}")
    single_attribute_regression(i)
    print("\n\n")


Regressing on variable 0
MSE train: 71.50705954065003
STDEV train: 3.624486209472725
MSE test:  73.24346853862562
STDEV test: 7.3804062860751305



Regressing on variable 1
MSE train: 74.54231818094163
STDEV train: 3.5691668949555044
MSE test:  71.78938204216271
STDEV test: 7.044006278955211



Regressing on variable 2
MSE train: 64.67199551008842
STDEV train: 4.955579447513301
MSE test:  65.22635323511463
STDEV test: 10.111741930257647



Regressing on variable 3
MSE train: 80.68825919081668
STDEV train: 4.664966031707165
MSE test:  84.49402795748553
STDEV test: 9.298097086827497



Regressing on variable 4
MSE train: 67.22321731665544
STDEV train: 5.598311914480576
MSE test:  72.8467516288682
STDEV test: 11.321879759382561



Regressing on variable 5
MSE train: 44.87939134409023
STDEV train: 3.1099013754650238
MSE test:  41.45333358846996
STDEV test: 6.161205741373596



Regressing on variable 6
MSE train: 71.13521318602788
STDEV train: 4.435557959470775
MSE test:  75.36578922097796


### Part (d) - Linear regression with all variables

In [79]:
def all_attributes_regression():
    """Regress the target on every attribute plus a bias term, over 20 random splits.

    For every run a new split is drawn with get_train_test_data(), a linear
    model using all feature columns and an appended column of ones is fitted
    by least squares on the training part, and the mean squared error is
    measured on both parts. The mean and standard deviation of the train and
    test MSEs are printed; nothing is returned.
    """

    def with_bias(features):
        # Design matrix: all feature columns followed by a column of ones.
        return np.column_stack((features, np.ones(len(features))))

    def scaled_squared_error(design, weights, targets):
        # (1 / rows) * ||design @ weights - targets||^2
        rows = design.shape[0]
        return (1.0 / rows) * np.linalg.norm(np.dot(design, weights) - targets) ** 2

    train_errors, test_errors = [], []

    for _run in range(20):
        features_tr, targets_tr, features_te, targets_te = get_train_test_data()

        design_tr = with_bias(features_tr)
        design_te = with_bias(features_te)

        weights = np.linalg.lstsq(design_tr, targets_tr)[0]

        train_errors.append(scaled_squared_error(design_tr, weights, targets_tr))
        test_errors.append(scaled_squared_error(design_te, weights, targets_te))

    train_mean, train_spread = np.mean(train_errors), np.std(train_errors)
    test_mean, test_spread = np.mean(test_errors), np.std(test_errors)

    print(f"MSE train: {train_mean}")
    print(f"STDEV train: {train_spread}")
    print(f"MSE test:  {test_mean}")
    print(f"STDEV test: {test_spread}")

# Run the all-attributes experiment; it prints its own summary statistics.
all_attributes_regression()

MSE train: 21.873843248374364
STDEV train: 1.9157752611266605
MSE test:  25.131907682621122
STDEV test: 4.250722227177687


# 1.3 Kernel ridge regression on filtered Boston housing

In [None]:
class KRR:
    """Kernel ridge regression with a Gaussian kernel on a fixed training set.

    The pairwise Euclidean distances between the training points are computed
    once at construction, so dual coefficients for many (gamma, sigma) pairs
    can be obtained without recomputing them.
    """

    def __init__(self, X_train, y_train):
        """Store the targets and precompute the training distance matrix.

        Parameters
        ----------
        X_train : array-like, one row per training sample
        y_train : array-like of length len(X_train)
        """
        # get_alpha reads self.y_train, so it has to be kept on the instance.
        self.y_train = np.asarray(y_train)
        self.pairwise_norm = self._get_pairwise_norm(X_train)

    @staticmethod
    def _get_pairwise_norm(X_train):
        """Return the (m, m) matrix of Euclidean distances between samples.

        Entry [i, j] is the norm of X_train[i] - X_train[j]. Declared static
        because it uses no instance state; it is still callable as
        ``self._get_pairwise_norm(X)``.
        """
        X = np.asarray(X_train)
        m = len(X)
        M = np.zeros((m, m))
        if m == 0:
            return M
        # One row per sample, so scalar samples (1-D input) work as well.
        X = X.reshape(m, -1)
        for i in range(m):
            # Distances from sample i to every sample in one vectorised call.
            M[i] = np.linalg.norm(X - X[i], axis=1)
        return M

    def get_alpha(self, gamma, sigma):
        """Solve for the dual coefficients of the ridge problem.

        Builds the Gaussian kernel matrix K[i, j] = exp(-d_ij^2 / (2 sigma^2))
        from the stored distances and solves (K + gamma * m * I) alpha = y,
        where m is the number of training targets.

        Parameters
        ----------
        gamma : float
            Ridge regularisation strength.
        sigma : float
            Width of the Gaussian kernel.

        Returns
        -------
        numpy.ndarray
            The solution vector alpha, one coefficient per training sample.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the regularised kernel matrix is singular.
        """
        m = len(self.y_train)
        K = np.exp(-((self.pairwise_norm / sigma) ** 2) / 2.0)
        M = K + gamma * m * np.identity(m)
        return np.linalg.solve(M, self.y_train)

        

In [None]:
def get_pairwise_norm(X_train):
    """Return the matrix of Euclidean distances between all pairs of samples.

    Parameters
    ----------
    X_train : array-like
        One sample per leading index. Samples may be scalars (1-D input) or
        arrays; each sample is flattened before the distance is taken.

    Returns
    -------
    numpy.ndarray of shape (m, m)
        Entry [i, j] is the norm of X_train[i] - X_train[j], with
        m = len(X_train). An empty input yields a (0, 0) array.
    """
    X = np.asarray(X_train)
    m = len(X)
    M = np.zeros((m, m))
    if m == 0:
        return M
    # One row per sample so that the row-wise norm below applies uniformly.
    X = X.reshape(m, -1)
    for i in range(m):
        # One vectorised call per row instead of m separate norm() calls.
        M[i] = np.linalg.norm(X - X[i], axis=1)
    return M

def get_best_params(X_train, y_train, n_folds=5):
    """Pick (gamma, sigma) for Gaussian kernel ridge regression by cross-validation.

    Every pair from the grids gamma in 2^-40 .. 2^-26 and
    sigma in 2^7, 2^7.5, .. 2^13 is scored by n-fold cross-validation on the
    given training data: for each fold the dual coefficients are fitted on
    the remaining folds by solving (K + gamma * l * I) alpha = y, with l the
    number of fitting points, and the mean squared error is measured on the
    held-out fold. The pair with the lowest average error wins.

    Folds are contiguous blocks of rows in the order given, so the rows are
    expected to be shuffled already.

    Parameters
    ----------
    X_train : array-like, one row per training sample
    y_train : array-like of length len(X_train)
    n_folds : int, default 5
        Number of cross-validation folds (at least 2).

    Returns
    -------
    tuple of (float, float) or None
        The best (gamma, sigma), or None if no pair produced a finite
        validation error.

    Raises
    ------
    ValueError
        If n_folds < 2 or there are fewer samples than folds.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")

    y_train = np.asarray(y_train, dtype=float)
    m = len(X_train)
    if m < n_folds:
        raise ValueError(f"need at least {n_folds} training points, got {m}")

    gammas = 2.0 ** np.arange(-40, -25)
    sigmas = 2.0 ** np.arange(7.0, 13.1, 0.5)

    # Distances are computed once; every kernel matrix below is derived from them.
    pairwise_norm = get_pairwise_norm(X_train)

    folds = np.array_split(np.arange(m), n_folds)
    splits = [
        (np.concatenate(folds[:k] + folds[k + 1:]), folds[k])
        for k in range(n_folds)
    ]

    def cv_error(K, gamma):
        # Average held-out MSE over the folds for one (kernel, gamma) pair.
        fold_errors = []
        for fit_idx, val_idx in splits:
            n_fit = len(fit_idx)
            K_fit = K[np.ix_(fit_idx, fit_idx)]
            K_val = K[np.ix_(val_idx, fit_idx)]
            try:
                alpha = np.linalg.solve(
                    K_fit + gamma * n_fit * np.identity(n_fit), y_train[fit_idx]
                )
            except np.linalg.LinAlgError:
                # A singular system makes this pair unusable; rank it last.
                return np.inf
            residual = np.dot(K_val, alpha) - y_train[val_idx]
            fold_errors.append(np.mean(residual ** 2))
        return np.mean(fold_errors)

    best_params, best_error = None, np.inf
    for sigma in sigmas:
        # The kernel matrix depends only on sigma, so build it once per sigma.
        K = np.exp(-((pairwise_norm / sigma) ** 2) / 2.0)
        for gamma in gammas:
            error = cv_error(K, gamma)
            if error < best_error:
                best_params, best_error = (float(gamma), float(sigma)), error

    return best_params
    
    

# Draw one random train/test split and run the (gamma, sigma) search on its training part.
X_train, y_train, X_test, y_test = get_train_test_data()
get_best_params(X_train, y_train)