# Assignment 2

Importing Data

In [1]:
import numpy as np
from scipy.optimize import minimize
from scipy.linalg import cho_factor, cho_solve
data = np.load('DATA/ct_data.npz')
# Unpack the feature matrices and the target vectors for the three splits.
X_train, X_val, X_test = (data[name] for name in ('X_train', 'X_val', 'X_test'))
y_train, y_val, y_test = (data[name] for name in ('y_train', 'y_val', 'y_test'))

In [2]:
# Quick sanity check on the loaded training arrays: one shape per line.
shape_report = (
    f'X_train shape: {X_train.shape}',
    f'y_train shape: {y_train.shape}',
)
print(*shape_report, sep='\n')

X_train shape: (40754, 384)
y_train shape: (40754,)


Question 1a

In [3]:
#defining a function to calculate standard error
def cal_se(data, ddof=0):
    """Return the standard error of the mean of ``data``.

    The standard error is the standard deviation of ``data`` divided by the
    square root of ``len(data)``.

    Parameters
    ----------
    data : array_like
        The sample values. ``len(data)`` is used as the sample size.
    ddof : int, optional
        Delta degrees of freedom forwarded to ``np.std``. The default ``0``
        keeps the original behaviour (population standard deviation); pass
        ``1`` to base the standard error on the sample standard deviation.

    Returns
    -------
    float
        The standard error of the mean.
    """
    std = np.std(data, ddof=ddof)
    se = std / (len(data) ** (1/2))
    return se

# Check that the training targets have zero mean, then report the mean and its
# standard error for the validation targets and for the first 5785 training targets.
# Values are rounded to 8 decimal places (author's note: this follows the pattern of the data).
mean_report = [
    f'The mean on y_train is {round(np.mean(y_train), 8)}.',
    f'The mean in y_val is {round(np.mean(y_val),8)} +/- {round(cal_se(y_val), 8)}.',
    f'The mean on the first 5785 entries in y_train is {round(np.mean(y_train[:5785]), 8)} +/- {round(cal_se(y_train[:5785]),8)}.',
]
print(*mean_report, sep='\n')

The mean on y_train is -0.0.
The mean in y_val is -0.21600851 +/- 0.01290338.
The mean on the first 5785 entries in y_train is -0.44247688 +/- 0.01192627.


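The standard errors on the mean of the validation set and on the mean of the first 5785 training entries suggest that, even at the extremes of their error bars, neither subset mean is consistent with the overall training-set mean of zero. The standard error bars are misleading here because they do not reflect how far each subset mean actually is from the population mean. A likely reason is that the standard error formula assumes independent samples; if neighbouring entries in the data are correlated, the effective sample size is smaller than the number of entries and the true uncertainty is larger than the error bars indicate.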

In short, the standard error makes it look like we are more certain about the mean than we really are.

Question 1b

In [4]:
# A column is constant when every entry equals the entry in its first row.
isConstant = list((X_train == X_train[0]).all(axis=0))

# A column is a duplicate when it is identical to some column to its left.
# For each column i, compare it against all later columns in one vectorised
# step and accumulate the matches with a logical OR.
n_columns = X_train.shape[1]
isDuplicates = np.zeros(n_columns, dtype='bool')
for i in range(n_columns):
    matches_i = (X_train[:, i + 1:] == X_train[:, i:i + 1]).all(axis=0)
    isDuplicates[i + 1:] |= matches_i

# Sorted, de-duplicated indices of every column flagged by either test.
duplicate_idx = np.nonzero(isDuplicates)[0]
constant_idx = np.nonzero(isConstant)[0]
columns_to_remove = np.unique(np.hstack((duplicate_idx, constant_idx)))

# Drop the same columns (chosen from the training set only) from all three splits.
X_train = np.delete(X_train, columns_to_remove, axis=1)
X_val = np.delete(X_val, columns_to_remove, axis=1)
X_test = np.delete(X_test, columns_to_remove, axis=1)

# Row counts should be unchanged and the column counts should agree across splits.
print(X_train.shape, X_val.shape, X_test.shape)
# List the flagged columns (indices refer to the columns before removal).
print(f'constant columns: {np.nonzero(isConstant)[0]}')
print(f'duplicate columns: {np.nonzero(isDuplicates)[0]}')

(40754, 373) (5785, 373) (6961, 373)
constant columns: [ 59  69 179 189 351]
duplicate columns: [ 69  78  79 179 188 189 199 287 351 359]


Question 2

We want to set up the following matrix equation to solve for the weights and bias of the linear regression model.
$$ \tilde{\Phi}=\left[\begin{matrix} X_{N\times D} & 1_{N\times 1}\\ \sqrt{\alpha}\mathbb{I}_{D\times D} & 0_{D\times 1} \end{matrix}\right],\quad \underline{\tilde{w}}=\left[\begin{matrix} w_{D\times 1}\\ b\end{matrix}\right],\quad \underline{\tilde{y}}=\left[\begin{matrix} y_{N\times 1}\\ 0_{D\times 1}\end{matrix}\right] $$
$$ E(\underline{w},b)=(\tilde{\Phi}\underline{\tilde{w}}-\underline{\tilde{y}})^T(\tilde{\Phi}\underline{\tilde{w}}-\underline{\tilde{y}}) $$

In [5]:
def fit_linreg(X, yy, alpha):
    """Fit L2-regularised linear regression by augmented least squares.

    The design matrix gets a trailing column of ones for the bias, and D extra
    rows ``sqrt(alpha) * I`` (with a zero in the bias column) whose targets are
    zero, so the weights are penalised and the bias is not.

    Parameters
    ----------
    X : array of shape (N, D)
        Input features.
    yy : array of shape (N,)
        Regression targets.
    alpha : float
        Regularisation strength applied to the weights.

    Returns
    -------
    (ww, bb)
        ``ww`` is the length-D weight vector and ``bb`` the scalar bias.
    """
    N, D = X.shape
    # Top row-block: data plus bias column. Bottom row-block: the regulariser,
    # with zeros in the bias column so the bias is left unpenalised.
    Phi_til = np.block([
        [X, np.ones((N, 1))],
        [np.sqrt(alpha) * np.eye(D), np.zeros((D, 1))],
    ])
    # Targets padded with D zeros to match the regulariser rows.
    yy_til = np.block([
        [yy[:, np.newaxis]],
        [np.zeros((D, 1))],
    ])
    solution = np.linalg.lstsq(Phi_til, yy_til, rcond=None)[0]
    w = solution[:, 0]
    return w[:-1], w[-1]

In [6]:
# Fit the least-squares linear regression (weights and bias) on the training set.
w_linreg, b_linreg = fit_linreg(X_train, y_train, alpha=30)

In [7]:
from ct_support_code import fit_linreg_gradopt
# compute the weights and bias for the linreg_gradopt model
w_grad, b_grad = fit_linreg_gradopt(X_train, y_train, 30)

In [8]:
def compute_rmse(X, yy, ww, bb):
    """Return the root-mean-square error of the linear model ``X @ ww + bb``.

    Parameters
    ----------
    X : array of shape (N, D)
        Input features.
    yy : array of shape (N,)
        True targets.
    ww : array of shape (D,)
        Model weights.
    bb : scalar
        Model bias.

    Returns
    -------
    float
        Square root of the mean squared residual over the N rows.
    """
    # Work with column vectors so the sum of squares is a 1x1 matrix product.
    predictions = X @ ww[:, np.newaxis] + bb
    errors = predictions - yy[:, np.newaxis]
    mean_squared = errors.T @ errors / len(yy)
    return np.sqrt(mean_squared)[0][0]

In [9]:
# Compare the closed-form fit with the gradient-based fit on both splits.
rmse_report = [
    f'RMSE for training linear regression: {compute_rmse(X_train, y_train, w_linreg, b_linreg)}',
    f'RMSE for training gradient descent: {compute_rmse(X_train, y_train, w_grad, b_grad)}',
    f'RMSE for validation linear regression: {compute_rmse(X_val, y_val, w_linreg, b_linreg)}',
    f'RMSE for validation gradient descent: {compute_rmse(X_val, y_val, w_grad, b_grad)}',
]
print(*rmse_report, sep='\n')

RMSE for training linear regression: 0.3567565397204048
RMSE for training gradient descent: 0.3567556103401207
RMSE for validation linear regression: 0.4230521968394693
RMSE for validation gradient descent: 0.4230551058620388


Question 3

In [10]:
from ct_support_code import logreg_cost, minimize_list
#fit logistic regression using gradient-based optimisation
def fit_logreg_gradopt(X, yy, alpha):
    """Fit logistic regression by minimising ``logreg_cost`` with ``minimize_list``.

    Parameters
    ----------
    X : array of shape (N, D)
        Input features.
    yy : array of length N
        Labels passed through to ``logreg_cost``.
    alpha : float
        Passed through to ``logreg_cost`` (presumably the regularisation
        strength; confirm against ct_support_code).

    Returns
    -------
    (ww, bb)
        The two fitted parameters returned by ``minimize_list``, in the same
        order as the initial guess (weights, then bias).
    """
    n_features = X.shape[1]
    # Start the optimiser from all-zero weights and a zero bias.
    start = (np.zeros(n_features), np.array(0))
    weights, bias = minimize_list(logreg_cost, start, (X, yy, alpha))
    return weights, bias

In [11]:
K = 20  # how many thresholded binary classification problems to fit
# Spread K thresholds evenly across the range of the training targets,
# starting one step hh above the minimum and ending one step below the maximum.
mn, mx = np.min(y_train), np.max(y_train)
hh = (mx - mn) / (K + 1)
thresholds = np.linspace(mn + hh, mx - hh, num=K, endpoint=True)
params = []
for kk, threshold in enumerate(thresholds):
    # Binary labels: is each training target above this threshold?
    labels = y_train > threshold
    # Fit one logistic regression per threshold and keep its (weights, bias) pair.
    params.append(fit_logreg_gradopt(X_train, labels, 30))

In [12]:
#logistic sigmoid, applied to the linear output of each fitted classifier
def sigmoid_func(x):
    return 1/(1+np.exp(-x))

#run the training and validation inputs through each of the K fitted
#classifiers, collecting one prediction vector per classifier
pred_list_train = []
pred_list_val = []
for i in range(K):
    ww_i, bb_i = params[i]
    pred_list_train.append(sigmoid_func(X_train @ ww_i + bb_i))
    pred_list_val.append(sigmoid_func(X_val @ ww_i + bb_i))

#stack so that each row is a datapoint and each column is one classifier
X_train_transform = np.vstack(pred_list_train).T
X_val_transform = np.vstack(pred_list_val).T

In [13]:
#fit regularised linear regression on the K classifier predictions
w_train, b_train = fit_linreg(X_train_transform, y_train, alpha=30)

In [14]:
# Report the error of the regression fitted on the classifier predictions.
# The second line evaluates on the validation split, so it is labelled as such.
print(f'RMSE for training linear regression: {compute_rmse(X_train_transform, y_train, w_train, b_train)}')
print(f'RMSE for validation linear regression: {compute_rmse(X_val_transform, y_val, w_train, b_train)}')

RMSE for training linear regression: 0.15441150429956377
RMSE for training linear regression: 0.25424772979325594


# Question 4

In [43]:
#fitting a neural network from a supplied initialisation (random weights, or the Question 3 parameters)
from ct_support_code import nn_cost, minimize_list
def fit_nn(init, X, yy, alpha):
    """Fit the neural network by minimising ``nn_cost`` with ``minimize_list``.

    Parameters
    ----------
    init : sequence of four arrays
        Initial parameters, in the order in which they are returned.
    X, yy, alpha
        Passed through unchanged as the extra arguments for ``nn_cost``
        (presumably inputs, targets and regularisation strength; confirm
        against ct_support_code).

    Returns
    -------
    tuple
        The four fitted parameters ``(ww_bar, bb_bar, V_bar, bk_bar)``.
    """
    fitted = minimize_list(nn_cost, init, (X, yy, alpha))
    # Unpacking enforces that exactly four parameters come back.
    ww_bar, bb_bar, V_bar, bk_bar = fitted
    return ww_bar, bb_bar, V_bar, bk_bar

# Build an initialisation from the Question 3 fits: one row of input weights
# and one bias per fitted logistic-regression classifier.
input_weights = np.vstack([params[k][0] for k in range(K)])
# A list comprehension is needed here: handing np.array a bare generator gives
# a 0-d object array wrapping the generator, not an array of the K biases.
input_biases = np.array([params[k][1] for k in range(K)])

# NOTE(review): the ordering is assumed to match how nn_cost unpacks its
# parameters (output weights, output bias, input weights, input biases) - confirm.
init_q3 = [w_train, b_train, input_weights, input_biases]
type(init_q3)
#new_params = fit_nn(init_q3, X_train, y_train, 30)

list

In [30]:
# Inspect the (weights, bias) pair fitted for the first threshold.
params[0]

(array([ 3.72131666e-02,  3.85191731e-02,  6.09695788e-02,  2.55051985e-02,
        -2.47286359e-01, -8.58959927e-02,  4.81597615e-02,  1.09588642e-01,
         6.12431547e-02,  4.38472667e-03,  3.07444459e-02,  2.48259769e-02,
         4.26447107e-02, -1.50022539e-04, -1.59717658e-01, -2.83752405e-01,
         4.98174444e-02, -4.22221501e-02,  8.75690997e-02,  5.74859554e-02,
         4.33683756e-02,  3.56396848e-02,  4.41376702e-02,  1.71934178e-01,
         2.83862680e-01, -6.89139542e-01, -1.91489693e-01,  1.86460396e-03,
        -1.33449308e-02,  5.13863892e-02,  5.38371598e-02,  5.95809408e-02,
         1.01325817e-01,  3.40404513e-01,  3.16187297e-01, -2.85393874e-01,
        -1.83488809e-01, -9.56039897e-03, -4.12462395e-02, -6.71461851e-02,
         4.29713291e-02,  8.74041524e-02,  1.32973035e-01,  2.91639563e-01,
         3.53415274e-01,  2.87345284e-02, -3.56115600e-01, -2.39844073e-01,
        -8.42215578e-02, -3.85362379e-02,  3.35726409e-02,  7.26722284e-02,
         8.2