In [2]:
import pandas as pd
import numpy as np

# 1.2 Filtered Boston housing and kernels

## 1.2.4: Baseline vs full linear regression

In [3]:
def get_train_test_data():
    df = pd.read_csv("boston_data.csv")
    df_train = df.sample(frac=2.0/3)
    df_test = df.drop(df_train.index)

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    X_train = df_train.drop("MEDV", axis=1).to_numpy()
    y_train = df_train["MEDV"].to_numpy()

    X_test = df_test.drop("MEDV", axis=1).to_numpy()
    y_test = df_test["MEDV"].to_numpy()

    return X_train, y_train, X_test, y_test

### Part (a) - Linear regression with constant function.

In [4]:

def constant_regression():
    """Fits a constant to the data set using linear regression"""

    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()

        m_train, m_test = len(X_train), len(X_test)

        phi_train = np.ones(m_train).reshape(-1, 1)

        a = np.linalg.lstsq(phi_train, y_train)[0] # best fit constant
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, a) - y_train)**2
        mse_train_values.append(mse_train)

        phi_test = np.ones(m_test).reshape(-1, 1)

        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, a) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")


constant_regression()

MSE train: 83.48062905370304
STDEV train: 2.8184916703253795
MSE test:  86.63999663765651
STDEV test: 5.632731350589415


### Part (c) - Linear regression with a single attribute

In [5]:

def single_attribute_regression(index):
    """Does linear regression with a bias term with a single attribute given by the index. """
    
    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()
        m_train, m_test = len(X_train), len(X_test)

        X_train = X_train[:, index]
        phi_train = np.column_stack((X_train, np.ones_like(X_train)))
        

        X_test = X_test[:, index]
        phi_test = np.column_stack((X_test, np.ones_like(X_test)))

        w = np.linalg.lstsq(phi_train, y_train)[0] # best fit weight vector
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, w) - y_train)**2
        mse_train_values.append(mse_train)

        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, w) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")


for i in range(12):
    print(f"Regressing on variable {i}")
    single_attribute_regression(i)
    print("\n\n")


Regressing on variable 0
MSE train: 71.48057779674183
STDEV train: 5.686339024704249
MSE test:  73.63667224139493
STDEV test: 11.675148421314773



Regressing on variable 1
MSE train: 73.5551224677202
STDEV train: 3.69135052886399
MSE test:  73.6040418702664
STDEV test: 7.470038693086179



Regressing on variable 2
MSE train: 62.07530804996154
STDEV train: 5.53606922956964
MSE test:  70.34999204278735
STDEV test: 11.421859594385735



Regressing on variable 3
MSE train: 81.13958091916646
STDEV train: 5.193755027862674
MSE test:  84.00732117357443
STDEV test: 10.531479303243646



Regressing on variable 4
MSE train: 69.34527563583677
STDEV train: 3.921113728617015
MSE test:  68.81703898149696
STDEV test: 7.713522951660825



Regressing on variable 5
MSE train: 42.68846742349146
STDEV train: 3.9109605609388707
MSE test:  45.915398029708015
STDEV test: 7.842791635619058



Regressing on variable 6
MSE train: 72.48704664624809
STDEV train: 5.803662911526368
MSE test:  72.758313622293
STDEV

### Part (d) - Linear regression with all variables

In [6]:
def all_attributes_regression():
    """Does linear regression with a bias term with a single attribute given by the index. """
    
    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()
        m_train, m_test = len(X_train), len(X_test)
        phi_train = np.column_stack((X_train, np.ones(m_train)))

        w = np.linalg.lstsq(phi_train, y_train)[0] 
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, w) - y_train)**2
        mse_train_values.append(mse_train)

        phi_test = np.column_stack((X_test, np.ones(m_test)))
        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, w) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")

all_attributes_regression()

MSE train: 21.531171035325674
STDEV train: 1.6508548715627567
MSE test:  25.464046383515814
STDEV test: 3.3586067813011917


# 1.3 Filtered Boston housing and kernels

In [14]:
class KRR:

    def __init__(self, X_train, y_train):
        self.pairwise_norm = self._get_pairwise_norm(X_train)
        self.X_train = X_train
        self.y_train = y_train
        
    def _get_pairwise_norm(self, X_train):
        m = len(X_train)
        M = np.zeros((m, m))
        for i in range(m):
            for j in range(m):
                M[i, j] = np.linalg.norm(X_train[i] - X_train[j])
        return M

    def get_alpha(self, gamma, sigma):
        m = len(self.y_train)
        K = np.exp(-((self.pairwise_norm/sigma)**2)/2.0)
        M = K + gamma * m * np.identity(m)
        alpha = np.linalg.solve(M, self.y_train)
        return alpha
    
    def predict(self, x_test, gamma, sigma):
        alpha = self.get_alpha(gamma, sigma)

        diff = self.X_train - x_test
        k = np.exp(-(np.linalg.norm(self.X_train - x_test)**2)/(2*sigma**2))
        print(diff.shape)

        y_pred = np.sum(alpha * k)

        return y_pred


    def get_mse(self, X_test, y_test, gamma, sigma):
        mse = 0
        
        for i in range(len(X_test)):
            mse += (self.predict(X_test[i], gamma, sigma) - y_test[i])**2
            
        return mse/len(X_test)

    

    
    

        

In [15]:

def get_best_params(X_train, y_train):
    """Gets the best pair of values (gamma, sigma) to use for ridge regression.
    by doing five-fold validation on the given training data."""

    m = len(X_train)
    gammas = 2.0 ** np.arange(-40, -25)
    sigmas = 2.0 ** np.arange(7.0, 13.1, 0.5)
    
    krr = KRR(X_train, y_train)

    for gamma in gammas:
        for sigma in sigmas:
            mse = krr.get_mse(X_train, y_train, gamma, sigma)
    

X_train, y_train, X_test, y_test = get_train_test_data()
get_best_params(X_train, y_train)

(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
(337, 12)
