# Implementations pipeline
##### In this notebook we test our implemented ML methods (regressions) and we test their accuracy
We begin by importing the libraries that we are going to need for this procedure and initialising the constants that are going to be used for the models.

In [1]:
import sys

SCRIPTS_FILEPATH = "../scripts/"
DATA_FILEPATH = "../data/train.csv"

sys.path.append(SCRIPTS_FILEPATH)
from implementations import *
from compute import *
from data_cleaner import Data_Cleaner
from proj1_helpers import predict_labels
from linear_model_base import RidgeRegression
from linear_model_base import LogisiticRegression


max_iters = 1000
search_space = np.logspace(-15, -5, 15)

We will train our models using 3 different versions of the same dataset. We do this to able to compare the impact of feature engineering in our implementations.
1. Raw data : The data is loaded, the missing variables and the outliers are treated. Then the data is normalized
2. Polynomial data : The data is loaded, the missing variables and the outliers are treated. Polynomial feature expansion is applied. The data is normalized.
3. Interactions data : The data is loaded, the missing variables and the outliers are treated. Feature interaction is applied. The data is normalized.
4. Polynomial & Interactions data : The data is loaded, the missing variables and the outliers are treated. Polynomial feature expansion and feature interaction is applied. The data is normalized.
5. Final model: The data is loaded, the missing variables and the outliers are treated. Feature interaction is applied. Polynomial feature expansion, feature interaction and logarithmic scale is applied

In all cases the dataset is split in 2 so we can estimate the performance of the model on the validation set :
- Training dataset (80%)
- Test validation dataset (20%)

## 1. Raw data

In [2]:
data = Data_Cleaner(DATA_FILEPATH)
data._fill_with_NaN()
data.fix_mass_MMC()
data.replace_with_zero()

#split
tX_train, tX_test, y_train, y_test = data.split_data(80)

#generate for minmax scale
data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

#generate for minmax scale
data_test = Data_Cleaner()
data_test.tX = tX_test
data_test.y = y_test

#scale
minimum, maximum = data_train.getMinMax()
data_train.standardize()
data_test.tX = (data_test.tX-minimum)/(maximum-minimum)

y_train = data_train.y
tX_train = data_train.tX

y_test = data_test.y
tX_test = data_test.tX

initial_w = np.zeros(tX_train.shape[1])

In [3]:
w, loss = least_squares_GD(y_train, tX_train, np.copy(initial_w), max_iters, gamma =1e-1)
y_pred =  predict_labels(w,tX_test)

compute_leaderboard_score(y_test,y_pred)

Current iteration :0, loss= 0.5000
Current iteration :250, loss= 0.3889
Current iteration :500, loss= 0.3827
Current iteration :750, loss= 0.3787


0.70546

In [4]:
w, loss = least_squares_SGD(y_train, tX_train, np.copy(initial_w), max_iters, gamma = 1e-3)
y_pred =  predict_labels(w,tX_test)

compute_leaderboard_score(y_test,y_pred)

Current iteration :0, loss= 0.5000
Current iteration :250, loss= 0.3477
Current iteration :500, loss= 0.2527
Current iteration :750, loss= 0.2744


0.65674

In [5]:
w, loss = least_squares(y_train, tX_train)
y_pred =  predict_labels(w,tX_test)

compute_leaderboard_score(y_test,y_pred)

0.74786

In [6]:
rmse_te = []
rmse_tr = []

data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

Model = RidgeRegression(data_train)
for lambda_ in search_space:
        
    tr, te = Model.cross_validation(5, lambda_=lambda_)
    rmse_te.append(te)
    rmse_tr.append(tr) 

best_lambda = search_space[np.where(rmse_te==np.min(rmse_te))]
Model = RidgeRegression(data_train)
weights = Model._run(lambda_ = best_lambda)

y_pred =  predict_labels(weights,tX_test)

compute_leaderboard_score(y_test,y_pred)

0.7478

In [7]:
best_lambda

array([2.6826958e-14])

In [8]:
w, loss = logistic_regression(y_train, tX_train, np.copy(initial_w), 1000, gamma = 1e-3)
y_pred =  predict_labels(w,tX_test)

compute_leaderboard_score(y_test,y_pred)

Current iteration :0, loss= 138629.4361
Current iteration :250, loss= -503868.9187
Current iteration :500, loss= -503868.9187
Current iteration :750, loss= -503868.9187


0.65674

In [9]:
lambda_ = 1e-5
w, loss = reg_logistic_regression(y_train, tX_train,lambda_, np.copy(initial_w), max_iters, gamma=1e-3)
y_pred =  predict_labels(w,tX_test)

compute_leaderboard_score(y_test,y_pred)

Current iteration :0, loss= 138629.4361
Current iteration :250, loss= -495958.0256
Current iteration :500, loss= -472435.8213
Current iteration :750, loss= -433302.1641


0.65674

## 2. Polynomial data

In [10]:
data = Data_Cleaner(DATA_FILEPATH)
data._fill_with_NaN()
data.fix_mass_MMC()
data.replace_with_zero()
data.build_polynomial(2)

tX_train, tX_test, y_train, y_test = data.split_data(80)

#generate for minmax scale
data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

#generate for minmax scale
data_test = Data_Cleaner()
data_test.tX = tX_test
data_test.y = y_test

#scale
minimum, maximum = data_train.getMinMax()
data_train.standardize()
data_test.tX = (data_test.tX-minimum)/(maximum-minimum)

rmse_te = []
rmse_tr = []

Model = RidgeRegression(data_train)
for lambda_ in search_space:
        
    tr, te = Model.cross_validation(5, lambda_=lambda_)
    rmse_te.append(te)
    rmse_tr.append(tr) 

best_lambda = search_space[np.where(rmse_te==np.min(rmse_te))]
Model = RidgeRegression(data_train)
weights = Model._run(lambda_ = best_lambda)

y_pred =  predict_labels(weights,data_test.tX)

compute_leaderboard_score(y_test,y_pred)

0.77456

## 3. Interactions data 

In [11]:
data = Data_Cleaner(DATA_FILEPATH)
data._fill_with_NaN()
data.fix_mass_MMC()
data.replace_with_zero()
data.build_interactions()

tX_train, tX_test, y_train, y_test = data.split_data(80)

#generate for minmax scale
data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

#generate for minmax scale
data_test = Data_Cleaner()
data_test.tX = tX_test
data_test.y = y_test

#scale
minimum, maximum = data_train.getMinMax()
data_train.standardize()
data_test.tX = (data_test.tX-minimum)/(maximum-minimum)

rmse_te = []
rmse_tr = []

Model = RidgeRegression(data_train)
for lambda_ in search_space:
        
    tr, te = Model.cross_validation(5, lambda_=lambda_)
    rmse_te.append(te)
    rmse_tr.append(tr) 

best_lambda = search_space[np.where(rmse_te==np.min(rmse_te))]
Model = RidgeRegression(data_train)
weights = Model._run(lambda_ = best_lambda)

y_pred =  predict_labels(weights,data_test.tX)

compute_leaderboard_score(y_test,y_pred)

0.7957

## 4. Polynomial & Interactions data 

In [12]:
#interaction terms and polynomial features
data = Data_Cleaner(DATA_FILEPATH)
data._fill_with_NaN()
data.fix_mass_MMC()
data.replace_with_zero()
data.build_polynomial(2)
data.build_interactions()

tX_train, tX_test, y_train, y_test = data.split_data(80)

#generate for minmax scale
data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

#generate for minmax scale
data_test = Data_Cleaner()
data_test.tX = tX_test
data_test.y = y_test

#scale
minimum, maximum = data_train.getMinMax()
data_train.standardize()
data_test.tX = (data_test.tX-minimum)/(maximum-minimum)

rmse_te = []
rmse_tr = []

Model = RidgeRegression(data_train)
for lambda_ in search_space:
        
    tr, te = Model.cross_validation(5, lambda_=lambda_)
    rmse_te.append(te)
    rmse_tr.append(tr) 

best_lambda = search_space[np.where(rmse_te==np.min(rmse_te))]
Model = RidgeRegression(data_train)
weights = Model._run(lambda_ = best_lambda)

y_pred =  predict_labels(weights,data_test.tX)

compute_leaderboard_score(y_test,y_pred)

0.81518

## 5. Final model

In [14]:
data = Data_Cleaner(DATA_FILEPATH)
data._fill_with_NaN()
data.fix_mass_MMC()
data.replace_with_one()
#find columns with multiscale input (max greater than 100)
log_columns = np.max(data.tX, axis=0)>100
#log(x+1) of columns with multiscale data, to ensure no x <= 0
data.tX[:,log_columns] = np.log(data.tX[:,log_columns]+1)
data.build_polynomial(2)
data.build_interactions()

tX_train, tX_test, y_train, y_test = data.split_data(80)

#generate for minmax scale
data_train = Data_Cleaner()
data_train.tX = tX_train
data_train.y = y_train

#generate for minmax scale
data_test = Data_Cleaner()
data_test.tX = tX_test
data_test.y = y_test

#remove outliers
data_test.treat_outliers(1.5,92.5)
data_train.treat_outliers(1.5,92.5)

#scale
minimum, maximum = data_train.getMinMax()
data_train.standardize()
data_test.tX = (data_test.tX-minimum)/(maximum-minimum)

rmse_te = []
rmse_tr = []

Model = RidgeRegression(data_train)
for lambda_ in search_space:
        
    tr, te = Model.cross_validation(5, lambda_=lambda_)
    rmse_te.append(te)
    rmse_tr.append(tr) 

best_lambda = search_space[np.where(rmse_te==np.min(rmse_te))]
Model = RidgeRegression(data_train)
weights = Model._run(lambda_ = best_lambda)

y_pred =  predict_labels(weights,data_test.tX)

compute_leaderboard_score(y_test,y_pred)

0.83564