In [2]:
import pandas as pd
import numpy as np

# 1.2 Filtered Boston housing and kernels

## 1.2.4: Baseline vs full linear regression

In [3]:
def get_train_test_data():
    df = pd.read_csv("boston_data.csv")
    df_train = df.sample(frac=2.0/3)
    df_test = df.drop(df_train.index)

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    X_train = df_train.drop("MEDV", axis=1).to_numpy()
    y_train = df_train["MEDV"].to_numpy()

    X_test = df_test.drop("MEDV", axis=1).to_numpy()
    y_test = df_test["MEDV"].to_numpy()

    return X_train, y_train, X_test, y_test

### Part (a) - Linear regression with constant function.

In [4]:

def constant_regression():
    """Fits a constant to the data set using linear regression"""

    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()

        m_train, m_test = len(X_train), len(X_test)

        phi_train = np.ones(m_train).reshape(-1, 1)

        a = np.linalg.lstsq(phi_train, y_train)[0] # best fit constant
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, a) - y_train)**2
        mse_train_values.append(mse_train)

        phi_test = np.ones(m_test).reshape(-1, 1)

        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, a) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")


constant_regression()

MSE train: 83.09053139501097
STDEV train: 5.341849964301301
MSE test:  87.26859662282831
STDEV test: 10.58976103768394


### Part (c) - Linear regression with a single attribute

In [5]:

def single_attribute_regression(index):
    """Does linear regression with a bias term with a single attribute given by the index. """
    
    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()
        m_train, m_test = len(X_train), len(X_test)

        X_train = X_train[:, index]
        phi_train = np.column_stack((X_train, np.ones_like(X_train)))
        

        X_test = X_test[:, index]
        phi_test = np.column_stack((X_test, np.ones_like(X_test)))

        w = np.linalg.lstsq(phi_train, y_train)[0] # best fit weight vector
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, w) - y_train)**2
        mse_train_values.append(mse_train)

        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, w) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")


for i in range(12):
    print(f"Regressing on variable {i}")
    single_attribute_regression(i)
    print("\n\n")


Regressing on variable 0
MSE train: 71.31073711451387
STDEV train: 4.321244317475169
MSE test:  73.00318014754522
STDEV test: 8.664325342583926



Regressing on variable 1
MSE train: 71.93026048797451
STDEV train: 4.664719549309076
MSE test:  76.94384464743798
STDEV test: 9.316406784960927



Regressing on variable 2
MSE train: 63.52588614682338
STDEV train: 4.113638203333017
MSE test:  67.24649167119055
STDEV test: 8.264101737314547



Regressing on variable 3
MSE train: 81.44579383372039
STDEV train: 4.95134722045434
MSE test:  83.12461280051681
STDEV test: 9.874930707759344



Regressing on variable 4
MSE train: 68.94506634458135
STDEV train: 3.9823797818889917
MSE test:  69.41177884236326
STDEV test: 8.113710379438976



Regressing on variable 5
MSE train: 44.01665217695546
STDEV train: 3.6455235856509858
MSE test:  43.222268808617756
STDEV test: 7.225440416189414



Regressing on variable 6
MSE train: 73.04974255558369
STDEV train: 5.131783651905555
MSE test:  71.63401149938292
ST

### Part (d) - Linear regression with all variables

In [6]:
def all_attributes_regression():
    """Does linear regression with a bias term with a single attribute given by the index. """
    
    mse_train_values = []
    mse_test_values = []

    for _ in range(20):

        X_train, y_train, X_test, y_test = get_train_test_data()
        m_train, m_test = len(X_train), len(X_test)
        phi_train = np.column_stack((X_train, np.ones(m_train)))

        w = np.linalg.lstsq(phi_train, y_train)[0] 
        mse_train = (1.0/m_train) * np.linalg.norm(np.dot(phi_train, w) - y_train)**2
        mse_train_values.append(mse_train)

        phi_test = np.column_stack((X_test, np.ones(m_test)))
        mse_test = (1.0/m_test) * np.linalg.norm(np.dot(phi_test, w) -  y_test)**2
        mse_test_values.append(mse_test)
    
    print(f"MSE train: {np.mean(mse_train_values)}")
    print(f"STDEV train: {np.std(mse_train_values)}")
    print(f"MSE test:  {np.mean(mse_test_values)}")
    print(f"STDEV test: {np.std(mse_test_values)}")

all_attributes_regression()

MSE train: 21.68136305461265
STDEV train: 2.2325003246302426
MSE test:  25.62333988655688
STDEV test: 4.864137282101709


# 1.3 Filtered Boston housing and kernels

In [9]:
class KRR:

    def __init__(self, X_train, y_train):
        self.pairwise_norm = self._get_pairwise_norm(X_train)
        self.X_train = X_train
        self.y_train = y_train
        
    def _get_pairwise_norm(self, X_train):
        m = len(X_train)
        M = np.zeros((m, m))
        for i in range(m):
            for j in range(m):
                M[i, j] = np.linalg.norm(X_train[i] - X_train[j])
        return M

    def get_alpha(self, gamma, sigma):
        m = len(self.y_train)
        K = np.exp(-((self.pairwise_norm/sigma)**2)/2.0)
        M = K + gamma * m * np.identity(m)
        alpha = np.linalg.solve(M, self.y_train)
        return alpha
        
    def predict(self, X_test, gamma, sigma):
        alpha = self.get_alpha(gamma, sigma)
        diff = self.X_train[:, None, :] - X_test[None, :, :] # diff[i][j] = X_train[i] - X_test[j]
        norms = np.linalg.norm(diff, axis=2) # norms[i][j] = ||X_train[i] - X_test[j]||

        k = np.exp(-((norms/sigma)**2)/2)

        y_preds = np.dot(alpha, k) # dots alpha with each column of norms

        return y_preds

    def get_mse(self, X_test, y_test, gamma, sigma):
        y_preds = self.predict(X_test, gamma, sigma)
    
        return (np.linalg.norm(y_preds - y_test)**2)/len(X_test)

    

    
    

        

In [None]:
from joblib import Parallel, delayed

def get_cross_validation_error(folds_X, folds_y, gamma, sigma):

    num_folds = len(folds_X)

    def get_fold_error(i):
        fold_X_test = folds_X[i]
        fold_y_test = folds_y[i]
        fold_X_train = np.concatenate([folds_X[j] for j in range(num_folds) if j != i])
        fold_y_train = np.concatenate([folds_y[j] for j in range(num_folds) if j != i])
        krr = KRR(fold_X_train, fold_y_train)
        return krr.get_mse(fold_X_test, fold_y_test, gamma, sigma)
    
    errors = Parallel(n_jobs=-1)(delayed(get_fold_error)(i) for i in range(num_folds))

    return np.mean(errors)


def get_best_params(X_train, y_train):
    """Gets the best pair of values (gamma, sigma) to use for ridge regression.
    by doing five-fold validation on the given training data."""

    num_folds = 5
    gammas = 2.0 ** np.arange(-40, -25)
    sigmas = 2.0 ** np.arange(7.0, 13.1, 0.5)

    folds_X = np.array_split(X_train, num_folds)
    folds_y = np.array_split(y_train, num_folds)

    min_cross_error = np.inf

    best_gamma, best_sigma = 0, 0

    for gamma in gammas:
        for sigma in sigmas:
            
            cross_error = get_cross_validation_error(folds_X, folds_y, gamma, sigma)
            if cross_error < min_cross_error:
                min_cross_error = cross_error
                best_gamma, best_sigma = gamma, sigma

            print(cross_error)

    return best_gamma, best_sigma
    

X_train, y_train, X_test, y_test = get_train_test_data()
get_best_params(X_train, y_train)

ModuleNotFoundError: No module named 'joblib'