In [1]:
# load -> clean -> features -> fit/validate -> predict -> submit
from linear_model_base import RidgeRegression
import numpy as np

In [2]:
from data_cleaner import Data_Cleaner

# Hold-out split of train.csv, used to check predictions locally before uploading:
# the first 200000 rows are the training part, the remaining rows the hold-out part.
# (Described as an 80/20 split; that ratio assumes train.csv has 250000 rows — TODO confirm.)
# NOTE(review): the absolute Windows path only resolves on the author's machine, and the
# same CSV is loaded twice; consider a configurable data directory and a single load.

data_cleaner_train = Data_Cleaner("C:/Users/Tim/Documents/GitHub/MLProject1/project1/data/train.csv")
# Keep only the first 200000 rows (features and labels) for training.
data_cleaner_train.tX = data_cleaner_train.tX[:200000,:]
data_cleaner_train.y = data_cleaner_train.y[:200000]

# Cleaning steps provided by Data_Cleaner; presumably they mark missing entries as NaN
# and then replace them with zero — confirm in data_cleaner.
data_cleaner_train._fill_with_NaN()
data_cleaner_train.replace_with_zero()
# Capture the training minimum/maximum before standardize() runs; they are reused
# further down to scale the hold-out features.
minimum, maximum = data_cleaner_train.getMinMax()
data_cleaner_train.standardize()


data_cleaner_test = Data_Cleaner("C:/Users/Tim/Documents/GitHub/MLProject1/project1/data/train.csv")
# Keep the rows from index 200000 onward as the hold-out part.
data_cleaner_test.tX = data_cleaner_test.tX[200000:,:]
data_cleaner_test.y = data_cleaner_test.y[200000:]

data_cleaner_test._fill_with_NaN()
data_cleaner_test.replace_with_zero()
# Min-max scale the hold-out features with the *training* minimum/maximum.
# NOTE(review): this matches the training features only if standardize() applies the same
# (x - min) / (max - min) transform — confirm against Data_Cleaner.standardize.
data_cleaner_test.tX = (data_cleaner_test.tX-minimum)/(maximum-minimum)

In [3]:
def build_poly(x, degree, add_degree_zero=False):
    """Expand the feature matrix x with element-wise polynomial powers.

    The result stacks, side by side, x itself followed by x**2, ..., x**degree
    (all columns of one power are kept together). When add_degree_zero is
    True, a leading column of ones (the degree-0 term) is placed first.

    For degree <= 1 no extra powers are appended. The returned array is always
    a new array, never x itself.
    """
    # Degree-0 column (optional) followed by the degree-1 features.
    pieces = [np.ones([x.shape[0], 1]), x] if add_degree_zero else [x]

    # Higher powers, lowest first.
    if degree > 0:
        pieces.extend(x ** power for power in range(2, degree + 1))

    if len(pieces) == 1:
        # Nothing to stack: hand back a copy of the input features.
        return np.array(x)
    return np.array(np.hstack(pieces))

In [4]:
def rho_compute(y,X,w,j):
    """Return rho_j: feature j multiplied with the residual that ignores feature j.

    y -- response variable
    X -- predictor matrix, one column per feature
    w -- weight vector, one entry per column of X
    j -- index of the feature (column) being updated

    The prediction is formed from every column except j; the result is the
    sum over samples of X[:, j] * (y - that prediction).
    """
    # Leave column j, and its weight, out of the prediction.
    other_columns = np.delete(X, j, 1)
    other_weights = np.delete(w, j)
    partial_residual = y - predicted_values(other_columns, other_weights)
    return np.sum(X[:, j] * partial_residual)

def z_compute(X):
    """z computation for unnormalised features.

    Returns one value per column of X: the sum of the squared entries of
    that column.
    """
    squared_entries = X * X
    return np.sum(squared_entries, axis=0)

def coordinate_descent(y,X,w,alpha,z,tolerance,max_iter=None):
    """Cyclic coordinate descent with soft-thresholding (lasso-style updates).

    Parameters
    ----------
    y : response vector.
    X : predictor matrix; column 0 is treated as the intercept and is updated
        without regularisation.
    w : initial weight vector. It is UPDATED IN PLACE and the same array is
        returned.
    alpha : regularisation strength; the soft-threshold is alpha * len(y).
    z : per-column normaliser, indexed like w (e.g. the output of z_compute(X)).
    tolerance : sweeps continue while the largest absolute weight change of
        the previous sweep is greater than this value.
    max_iter : optional cap on the number of sweeps; None (the default) keeps
        the original behaviour of looping until the tolerance test fails.

    Returns
    -------
    (w, iteration, max_step) : the weights, the number of sweeps performed and
        the largest absolute weight change of the last sweep (100. if no sweep
        ran).

    Notes
    -----
    A NaN rho_j is propagated into w[j]; max_step then becomes NaN, the
    tolerance test fails and the loop stops.
    """
    # Loop-invariant soft-threshold level.
    threshold = alpha * len(y)

    max_step = 100.
    iteration = 0
    while max_step > tolerance:
        if max_iter is not None and iteration >= max_iter:
            break
        iteration += 1
        print("Iteration (start) : ",iteration)
        old_weights = np.copy(w)
        for j in range(len(w)): #One update per feature, i.e. per column
            rho_j = rho_compute(y,X,w,j)
            if j == 0: #Intercept is not included with the alpha regularisation
                w[j] = rho_j/z[j]
            elif rho_j < -threshold:
                w[j] = (rho_j + threshold)/z[j]
            elif rho_j > threshold:
                w[j] = (rho_j - threshold)/z[j]
            elif -threshold <= rho_j <= threshold:
                # Boundaries included: rho_j exactly at +/-threshold shrinks to
                # zero instead of falling through to the NaN branch.
                w[j] = 0.
            else:
                # Only reachable when rho_j is NaN. np.nan replaces the np.NaN
                # alias, which was removed in NumPy 2.0.
                w[j] = np.nan
        max_step = abs(old_weights - w).max()

    return(w, iteration, max_step)

def predicted_values(X, w):
    """Linear predictions: the matrix product of X and w.

    X is expected to be n x (d+1) and w to hold (d+1) weights, giving n
    predictions.
    """
    return np.matmul(X, w)

In [5]:
# Expand both feature matrices to [1, x, x**2, x**3] (bias column plus powers up to 3).
# NOTE: tX is overwritten in place, so re-running this cell expands the already
# expanded matrices again.
data_cleaner_train.tX = build_poly(data_cleaner_train.tX, 3, add_degree_zero=True)
data_cleaner_test.tX = build_poly(data_cleaner_test.tX, 3, add_degree_zero=True)

#Initialise weight/parameter vector, w, to be a zero vector (one entry per expanded column)
w = np.zeros(data_cleaner_train.tX.shape[1], dtype = float)

#Pre-compute the z_j term: the sum of squares of each training column
z = z_compute(data_cleaner_train.tX)

#Set the alpha (regularisation strength) and tolerance level
#(sweeps stop once the largest weight change in a sweep is no longer above tolerance)
alpha = 0.001
tolerance = 0.1

# coordinate_descent updates w in place and returns it, so w_opt and w are the same array.
w_opt, iterations, max_step = coordinate_descent(data_cleaner_train.y ,data_cleaner_train.tX ,w,alpha,z,tolerance)

Iteration (start) :  1
Iteration (start) :  2
Iteration (start) :  3
Iteration (start) :  4
Iteration (start) :  5
Iteration (start) :  6
Iteration (start) :  7
Iteration (start) :  8
Iteration (start) :  9
Iteration (start) :  10


In [27]:
# NOTE(review): `Model` and `best_lambda` are not defined in any cell visible here, and this
# cell's execution count (27) is out of sequence with its neighbours, so it relies on kernel
# state from cells that are not shown — confirm it still works after Restart & Run All.
weights = Model._run(lambda_ = best_lambda)

In [7]:
# NOTE(review): wildcard import — predict_labels is presumably supplied by proj1_helpers;
# an explicit import would make that visible.
from proj1_helpers import *
# Predicted labels for the hold-out rows, using the coordinate-descent weights w_opt.
y_pred = predict_labels(w_opt, data_cleaner_test.tX)

In [19]:
# NOTE(review): wildcard import — compute_mse is presumably supplied by costs.
from costs import *
# NOTE(review): this scores `weights` (assigned by the Model._run cell), not the
# coordinate-descent `w_opt` used by the other evaluation cells — confirm that is intended.
err = compute_mse(data_cleaner_test.y, data_cleaner_test.tX, weights)
# Square root of the reported error (an RMSE, assuming compute_mse returns a mean squared error — confirm).
print(np.sqrt(err))

0.5592479226305184


In [8]:
def estimate_Leaderboard_score(y_true,weights,data):
    """Helper function estimating the categorical accuracy on the leaderboard.

    Predicts labels with predict_labels(weights, data) and returns the fraction
    of predictions that are equal to y_true.

    Both label arrays are flattened before they are compared, so a column
    vector of shape (n, 1) and a flat vector of shape (n,) are matched element
    by element instead of being broadcast to an n x n comparison (which would
    give a meaningless score).

    Raises ValueError if the number of predictions differs from the number of
    true labels.
    """
    y_pred = np.ravel(predict_labels(weights, data))
    y_true = np.ravel(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            "number of predictions (%d) does not match number of labels (%d)"
            % (y_pred.shape[0], y_true.shape[0])
        )
    N_tot = y_pred.shape[0]
    N_true = len(np.where(y_pred == y_true)[0])
    categorical_accuracy = N_true/N_tot
    return categorical_accuracy

# Accuracy of the coordinate-descent weights (w_opt) on the hold-out rows of train.csv.
estimate_Leaderboard_score(data_cleaner_test.y, w_opt, data_cleaner_test.tX)

0.73012