In [127]:
import pandas as pd
import numpy as np

# 1.2 Filtered Boston housing and kernels

## 1.2.4: Baseline vs full linear regression

In [128]:
def get_train_test_data(path="boston_data.csv", train_frac=2.0/3, random_state=None):
    """Load the housing CSV and split it randomly into training and test arrays.

    Parameters
    ----------
    path : str or path-like, default "boston_data.csv"
        CSV file to read. It must contain a "MEDV" column, which is used as
        the regression target; every other column is treated as a feature.
    train_frac : float, default 2/3
        Fraction of the rows sampled (without replacement) into the training
        set. The remaining rows form the test set.
    random_state : int, numpy.random.Generator, numpy.random.RandomState or None
        Forwarded to ``DataFrame.sample``. ``None`` (the default) draws a
        fresh random split on every call; pass an int for a reproducible split.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices (one row per sample) and target vectors for the
        training and test partitions.
    """
    df = pd.read_csv(path)

    df_train = df.sample(frac=train_frac, random_state=random_state)
    # Every row that was not drawn for training goes to the test set.
    df_test = df.drop(df_train.index)

    # No reset_index needed: the index is discarded by to_numpy() anyway.
    X_train = df_train.drop("MEDV", axis=1).to_numpy()
    y_train = df_train["MEDV"].to_numpy()

    X_test = df_test.drop("MEDV", axis=1).to_numpy()
    y_test = df_test["MEDV"].to_numpy()

    return X_train, y_train, X_test, y_test

### Part (a) - Linear regression with a constant function

In [129]:

def constant_regression(n_runs=20):
    """Fit a constant predictor by least squares and print its average MSE.

    For each of ``n_runs`` random train/test splits, the single best-fit
    constant is found by regressing the training targets on a column of ones.
    The mean squared error of that constant is measured on both partitions,
    and the averages over all runs are printed.

    Parameters
    ----------
    n_runs : int, default 20
        Number of random splits to average over. Must be at least 1.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    mse_train_values = []
    mse_test_values = []

    for _ in range(n_runs):
        X_train, y_train, X_test, y_test = get_train_test_data()

        # The only "feature" is the bias: a single column of ones.
        phi_train = np.ones((len(X_train), 1))
        phi_test = np.ones((len(X_test), 1))

        # rcond=None selects the machine-precision cutoff explicitly, which
        # avoids the FutureWarning NumPy 1.14-1.x emits when rcond is omitted.
        a = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]  # best fit constant

        mse_train_values.append(np.mean((phi_train @ a - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ a - y_test) ** 2))

    print(f"MSE on training set {np.mean(mse_train_values)}")
    print(f"MSE on test set {np.mean(mse_test_values)}")


# Run the constant-fit baseline; prints the train/test MSE averaged over the random splits.
constant_regression()

MSE on training set 83.1481586612544
MSE on test set 87.29860888391444


### Part (c) - Linear regression with a single attribute

In [130]:

def single_attribute_regression(index, n_runs=20):
    """Regress the target on one attribute plus a bias term and print the average MSE.

    For each of ``n_runs`` random train/test splits, a least-squares line
    ``y ~ w[0] * x + w[1]`` is fitted using only the feature column ``index``.
    The mean squared error is measured on both partitions, and the averages
    over all runs are printed.

    Parameters
    ----------
    index : int
        Column of the feature matrix to regress on.
    n_runs : int, default 20
        Number of random splits to average over. Must be at least 1.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    mse_train_values = []
    mse_test_values = []

    for _ in range(n_runs):
        X_train, y_train, X_test, y_test = get_train_test_data()

        # Design matrices: the chosen attribute in column 0, the bias (ones) in column 1.
        x_train = X_train[:, index]
        phi_train = np.column_stack((x_train, np.ones_like(x_train)))

        x_test = X_test[:, index]
        phi_test = np.column_stack((x_test, np.ones_like(x_test)))

        # rcond=None selects the machine-precision cutoff explicitly, which
        # avoids the FutureWarning NumPy 1.14-1.x emits when rcond is omitted.
        w = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]  # best fit weight vector

        mse_train_values.append(np.mean((phi_train @ w - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ w - y_test) ** 2))

    print(f"MSE on training set {np.mean(mse_train_values)}")
    print(f"MSE on test set {np.mean(mse_test_values)}")


# Fit a separate one-attribute model for each attribute index 0-11 and print
# its averaged train/test MSE, followed by blank lines to separate the reports.
# NOTE(review): assumes the data set has 12 attribute columns - confirm against boston_data.csv.
for i in range(12):
    print(f"Regressing on variable {i}")
    single_attribute_regression(i)
    print("\n\n")


Regressing on variable 0
MSE on training set 73.83785919138573
MSE on test set 68.74585102124095



Regressing on variable 1
MSE on training set 74.64291110873162
MSE on test set 71.54707542896176



Regressing on variable 2
MSE on training set 64.23150939381676
MSE on test set 66.10119575683089



Regressing on variable 3
MSE on training set 81.22486309580027
MSE on test set 83.8200791166579



Regressing on variable 4
MSE on training set 68.68647153704593
MSE on test set 69.88747123561652



Regressing on variable 5
MSE on training set 42.90063440177485
MSE on test set 45.31000653072029



Regressing on variable 6
MSE on training set 70.83454840881971
MSE on test set 75.99006552613587



Regressing on variable 7
MSE on training set 79.17068990312013
MSE on test set 79.66610494151176



Regressing on variable 8
MSE on training set 73.13287208043494
MSE on test set 70.4658715495128



Regressing on variable 9
MSE on training set 65.6091870085681
MSE on test set 66.71352952452



Regres

### Part (d) - Linear regression with all variables

In [131]:
def all_attributes_regression(n_runs=20):
    """Regress the target on all attributes plus a bias term and print the average MSE.

    For each of ``n_runs`` random train/test splits, a least-squares linear
    model is fitted on the full feature matrix with a column of ones appended
    for the bias. The mean squared error is measured on both partitions, and
    the averages over all runs are printed.

    Parameters
    ----------
    n_runs : int, default 20
        Number of random splits to average over. Must be at least 1.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    mse_train_values = []
    mse_test_values = []

    for _ in range(n_runs):
        X_train, y_train, X_test, y_test = get_train_test_data()

        # Design matrices: every attribute, with the bias (ones) as the last column.
        phi_train = np.column_stack((X_train, np.ones(len(X_train))))
        phi_test = np.column_stack((X_test, np.ones(len(X_test))))

        # rcond=None selects the machine-precision cutoff explicitly, which
        # avoids the FutureWarning NumPy 1.14-1.x emits when rcond is omitted.
        w = np.linalg.lstsq(phi_train, y_train, rcond=None)[0]  # best fit weight vector

        mse_train_values.append(np.mean((phi_train @ w - y_train) ** 2))
        mse_test_values.append(np.mean((phi_test @ w - y_test) ** 2))

    print(f"MSE on training set {np.mean(mse_train_values)}")
    print(f"MSE on test set {np.mean(mse_test_values)}")

# Run the all-attributes fit; prints the train/test MSE averaged over the random splits.
all_attributes_regression()

MSE on training set 22.355970148688726
MSE on test set 23.746913541216266


# 1.3 Filtered Boston housing and kernels