# Regression - Part 1

# Load the data

In [870]:
import numpy as np

# Labelled training inputs/targets and the unlabelled test inputs, read from
# .npy files located in the notebook's working directory (same load order as
# the names on the left-hand side).
X_train_ini, Y_train_ini, X_test = map(
    np.load,
    (
        'Xtrain_Regression_Part1.npy',
        'Ytrain_Regression_Part1.npy',
        'Xtest_Regression_Part1.npy',
    ),
)

# Define cross-validation method

In [871]:
from sklearn.model_selection import RepeatedKFold

# Number of decimals kept when metrics are rounded for reporting.
round_digits = 7

# Seed shared by the CV splitter below and by train_test_split.
# None -> a different shuffle on every execution, so reported numbers vary
# from run to run; set an int (e.g. 1) to make a run reproducible.
random_state = None

# Model evaluation scheme: 10-fold cross-validation repeated 3 times,
# i.e. 30 train/validation folds per evaluated model.
cv = RepeatedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=random_state,
)

# Set train and validation samples

In [872]:
from sklearn.model_selection import train_test_split

# Hold out one third of the labelled data for validation. Rows are shuffled
# before splitting, using the shared `random_state`.
x_train, x_val, y_train, y_val = train_test_split(
    X_train_ini,
    Y_train_ini,
    test_size=1/3,
    shuffle=True,
    random_state=random_state,
)


# Train the linear regression model

In [873]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary-least-squares model on the training split
# (fit() returns the fitted estimator itself, so the call can be chained).
lr = LinearRegression().fit(x_train, y_train)

# Coefficient of determination on the same data the model was fitted on.
R2 = lr.score(x_train, y_train)
print('R²:', round(R2, round_digits))

R²: 0.9994526


# MSE for training data

In [874]:
from numpy import round
from sklearn.metrics import mean_squared_error, r2_score

# Predictions of the fitted OLS model on the training split.
y_hat_lr_train = lr.predict(x_train)

# mean_squared_error returns the MSE by default. The `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so passing `squared=True`
# raises a TypeError on current releases; omitting it yields the same value on
# every version.
OLS_lr_MSE_predicted_train = round(mean_squared_error(y_train, y_hat_lr_train), round_digits)
print('MSE:', OLS_lr_MSE_predicted_train)
print('R²:', round(r2_score(y_train, y_hat_lr_train), round_digits))

MSE: 0.0076001
R²: 0.9994526


# Print the coefficients and the intercept

In [875]:
# Show the fitted OLS weights followed by the bias term.
for label, value in (('Coef.:', lr.coef_), ('Intercept:', lr.intercept_)):
    print(label, value)

Coef.: [[ 3.23083688e-02  1.23513283e-03  1.01038112e-01  3.24436004e-01
  -6.85030791e-01  1.68942423e+00  3.12752780e-02  1.82667421e+00
   3.87705239e-03 -1.82000765e-02 -1.45479220e+00 -7.06796151e-01
   4.40583563e-02 -6.10750534e-01  2.53779992e-02 -3.82729060e-01
  -1.23498181e-01 -1.36088088e+00 -1.25774574e+00  9.61829946e-01]]
Intercept: [-0.02600608]


# Prediction and performance measures

In [876]:
from sklearn.metrics import mean_squared_error, r2_score

# Prediction on validation data
y_pred = lr.predict(x_val)

# Scores on validation data.
# Optional extra metrics (disabled). They are written without the `squared`
# keyword, which was deprecated in scikit-learn 1.4 and removed in 1.6.
#print('RMSE: %.5f' % np.sqrt(mean_squared_error(y_val, y_pred)))
#print('SSE:', round(y_pred.size * mean_squared_error(y_val, y_pred), 5))

# MSE is the default output of mean_squared_error; passing `squared=True`
# would raise a TypeError on scikit-learn >= 1.6.
OLS_lr_MSE_predicted_val = round(mean_squared_error(y_val, y_pred), round_digits)

print('MSE:', OLS_lr_MSE_predicted_val)
print('R²:', round(r2_score(y_val, y_pred), round_digits))


MSE: 0.020024
R²: 0.9988606


# Cross-validation performance measures

In [877]:
from sklearn.model_selection import cross_validate, cross_val_score
from numpy import absolute

# Cross-validate the OLS model on the training split using the splitter `cv`.
# With the 'neg_mean_squared_error' scorer each entry of `score` is a negated
# MSE (one value per fold), which is why the raw scores print as negatives.
score = cross_val_score(estimator=lr, X=x_train, y=y_train,
                        scoring='neg_mean_squared_error', cv=cv)

# Flip the sign of the mean so the reported figure is a conventional MSE.
OLS_lr_MSE_cv_train = round(absolute(score.mean()), round_digits)

print('score MSE:', score)
print("MSE:", OLS_lr_MSE_cv_train, '  std:', round(score.std(), round_digits))

score MSE: [-0.0287981  -0.02234128 -0.01188768 -0.00642502 -0.00715508 -0.01663265
 -0.01298279 -0.01673138 -0.03312045 -0.02608749 -0.02988212 -0.01291274
 -0.0037524  -0.01213862 -0.02913642 -0.01878453 -0.01815248 -0.01192407
 -0.01621763 -0.01085675 -0.04116021 -0.00594729 -0.04342735 -0.00911864
 -0.01043875 -0.04003053 -0.01339716 -0.01954434 -0.01141052 -0.00378261]
MSE: 0.0181392   std: 0.0109323


# Ridge regression

Linear least squares with l2 regularization.

Minimizes the objective function:

$\|y - Xw\|_2^2 + \alpha \|w\|_2^2$

# Train ridge regression model

In [878]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from numpy import arange

ridge = Ridge()

# Candidate regularisation strengths from 0 up to (but excluding) 1 in steps
# of 0.01. alpha=0 is part of the grid, i.e. the unpenalised fit is a candidate.
parameters = {'alpha': arange(0, 1, 0.01)}

# Exhaustive search scored by negated MSE over the folds produced by `cv`.
# refit=True retrains the best configuration on all of x_train afterwards.
ridge_lr = GridSearchCV(estimator=ridge, param_grid=parameters, cv=cv,
                        scoring='neg_mean_squared_error', refit=True, verbose=1)
ridge_lr.fit(x_train, y_train)

# best_score_ is a negated MSE, hence the absolute value.
ridge_MSE_cv_train = round(absolute(ridge_lr.best_score_), round_digits)

best_ridge = ridge_lr.best_estimator_
print('Ridge Coef.:', best_ridge.coef_)
print('Ridge Intercept:', best_ridge.intercept_)
print('Ridge best params:', ridge_lr.best_params_ )
print('Ridge MSE:', ridge_MSE_cv_train)

Fitting 30 folds for each of 100 candidates, totalling 3000 fits
Ridge Coef.: [[ 3.21606184e-02  1.22530170e-03  1.00674665e-01  3.24098660e-01
  -6.84316255e-01  1.68794494e+00  3.13195515e-02  1.82521149e+00
   3.92889707e-03 -1.82000811e-02 -1.45380213e+00 -7.06267507e-01
   4.37820823e-02 -6.10168067e-01  2.59433561e-02 -3.82397486e-01
  -1.22875733e-01 -1.35953712e+00 -1.25729634e+00  9.61659776e-01]]
Ridge Intercept: [-0.02591421]
Ridge best params: {'alpha': 0.04}
Ridge MSE: 0.0173796


best_estimator_:

Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False.

best_score_:

Mean cross-validated score of the `best_estimator_`.

best_params_:

Parameter setting that gave the best results on the hold out data.

Importantly, we can configure the hyperparameter search to refit a final model with the entire training dataset using the best hyperparameters found during the search. This can be achieved by setting the “refit” argument to True, then retrieving the model via the “best_estimator_” attribute on the search result.


In [879]:
# Best (negated-MSE) CV score, the refitted estimator and its parameters,
# one per line.
print(ridge_lr.best_score_, ridge_lr.best_estimator_, ridge_lr.best_params_, sep='\n')

-0.017379592340896373
Ridge(alpha=0.04)
{'alpha': 0.04}


In [880]:
# Validation-set predictions from the refitted best Ridge model.
y_hat_ridge = ridge_lr.best_estimator_.predict(x_val)

# MSE is the default output of mean_squared_error. The `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is omitted here.
ridge_MSE_predicted_val = round(mean_squared_error(y_val, y_hat_ridge), round_digits)
print('MSE ridge:', ridge_MSE_predicted_val)

MSE ridge: 0.0201114


# Train Lasso regression model

https://medium.com/pursuitnotes/day-34-regularization-in-machine-learning-3-c37c336b079c

Linear Model trained with L1 prior as regularizer (aka the Lasso).

The optimization objective for Lasso is:

$\frac{1}{2 n_{\text{samples}}} \|y - Xw\|_2^2 + \alpha \|w\|_1$

In [881]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

lasso = Lasso()

# Alternative, much coarser grid (currently disabled):
#parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
# Fine grid of L1 penalties starting at 0.0009, stepping by 0.0001, stopping
# before 0.002 (subject to floating-point rounding in arange).
parameters = {'alpha': arange(0.0009, 0.002, 0.0001)}

# Exhaustive search scored by negated MSE over the folds produced by `cv`.
lasso_lr = GridSearchCV(estimator=lasso, param_grid=parameters,
                        cv=cv, scoring='neg_mean_squared_error')
lasso_lr.fit(x_train, y_train)

best_lasso = lasso_lr.best_estimator_
print('Lasso Coef.:', best_lasso.coef_)
print('Lasso Intercept:', best_lasso.intercept_)

print('Lasso best params:', lasso_lr.best_params_ )

# best_score_ is a negated MSE, hence the absolute value.
lasso_MSE_cv_train = round(absolute(lasso_lr.best_score_), round_digits)
print('Lasso MSE:', lasso_MSE_cv_train)

Lasso Coef.: [ 2.87275012e-02  0.00000000e+00  9.86073269e-02  3.21229690e-01
 -6.82937403e-01  1.68652240e+00  2.95228783e-02  1.82319248e+00
  1.14608703e-03 -1.70519453e-02 -1.45255477e+00 -7.05544194e-01
  4.10745247e-02 -6.08362196e-01  2.56820468e-02 -3.81030225e-01
 -1.20184086e-01 -1.35711944e+00 -1.25644124e+00  9.60901642e-01]
Lasso Intercept: [-0.0260653]
Lasso best params: {'alpha': 0.0019000000000000004}
Lasso MSE: 0.0166523


In [882]:
# Validation-set predictions from the best Lasso model found by the search.
y_hat_lasso = lasso_lr.best_estimator_.predict(x_val)

# MSE is the default output of mean_squared_error. The `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is omitted here.
lasso_MSE_predicted_val = round(mean_squared_error(y_val, y_hat_lasso), round_digits)
print('MSE lasso:', lasso_MSE_predicted_val)

MSE lasso: 0.0198326


# Print all MSE results and save to CSV file

In [883]:
# Side-by-side recap of every MSE computed above, grouped by model.
# Each entry is (banner, [(label, value), ...]); sections are separated by a
# blank line, with no trailing blank line after the last one.
summary_sections = [
    ('################ OLS Regression ################', [
        ('OLS_lr_MSE_predicted_train:', OLS_lr_MSE_predicted_train),
        ('OLS_lr_MSE_cv_train:', OLS_lr_MSE_cv_train),
        ('OLS_lr_MSE_predicted_val:', OLS_lr_MSE_predicted_val),
    ]),
    ('#################### Ridge ####################', [
        ('ridge_MSE_cv_train:', ridge_MSE_cv_train),
        ('ridge_MSE_predicted_val:', ridge_MSE_predicted_val),
    ]),
    ('#################### Lasso ####################', [
        ('lasso_MSE_cv_train:', lasso_MSE_cv_train),
        ('lasso_MSE_predicted_val:', lasso_MSE_predicted_val),
    ]),
]

for position, (banner, rows) in enumerate(summary_sections):
    if position:
        print('')
    print(banner)
    for label, value in rows:
        print(label, value)


################ OLS Regression ################
OLS_lr_MSE_predicted_train: 0.0076001
OLS_lr_MSE_cv_train: 0.0181392
OLS_lr_MSE_predicted_val: 0.020024

#################### Ridge ####################
ridge_cv_train: 0.0173796
ridge_predicted_val: 0.0201114

#################### Lasso ####################
lasso_MSE_cv_train: 0.0166523
lasso_MSE_predicted_val: 0.0198326


In [884]:
import csv
import pandas as pd
 
# Metrics exported for this run, in column order. The keys only document what
# each column holds; they are not written to the file (header=False below).
run_metrics = {
    'OLS_lr_MSE_cv_train': OLS_lr_MSE_cv_train,
    'OLS_lr_MSE_predicted_val': OLS_lr_MSE_predicted_val,
    'ridge_MSE_cv_train': ridge_MSE_cv_train,
    'ridge_MSE_predicted_val': ridge_MSE_predicted_val,
    'lasso_MSE_cv_train': lasso_MSE_cv_train,
    'lasso_MSE_predicted_val': lasso_MSE_predicted_val,
}
data = [list(run_metrics.values())]

# Single-row frame with default integer row/column labels.
df = pd.DataFrame(data)

# Append (mode='a') one line per execution so results from repeated runs
# accumulate in the same file; the row index is written as the first field.
df.to_csv('AAut_Project_part1_results.csv', mode='a', header=False)
 