In [245]:

# https://medium.com/@shuklapratik22/implementation-of-simple-linear-regression-using-normal-equation-matrices-f9021c3590da
# https://aegis4048.github.io/mutiple_linear_regression_and_visualization_in_python

# Boa explicação teórica.
# https://algotech.netlify.app/blog/ridge-lasso/
# https://www.datacamp.com/community/tutorials/tutorial-ridge-lasso-elastic-net


# Load the data

In [246]:
import numpy as np

X_train_ini = np.load('Xtrain_Regression_Part1.npy')
Y_train_ini = np.load('Ytrain_Regression_Part1.npy')

X_test = np.load('Xtest_Regression_Part1.npy')

#for i in np.arange(len(X_train_i)):
#    print(X_train_i[i])

#print(X_train_i.shape) # (100, 20)
#print(Y_train_i.shape) # (100, 1)
#print(X_test_i.shape)  # (1000, 20)

# Set train and validation samples

In [247]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X_train_ini, Y_train_ini, 
                                                  random_state=42, shuffle=True, test_size=0.3)


# Train the linear regression model

In [248]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)

# Training score
R2 = lr.score(x_train, y_train)
print('R²:', round(R2, 5))

R²: 0.99949


# Print the coeficients and the intercept

In [249]:
print('Coef.:', lr.coef_)
print('Intercept:', lr.intercept_)

Coef.: [[ 0.03665259 -0.0074543   0.0733648   0.32871596 -0.66023264  1.71193719
   0.06092674  1.80734642  0.01970353 -0.02815931 -1.4483526  -0.72090303
   0.02929287 -0.61160516  0.01287571 -0.37050708 -0.13928133 -1.35886414
  -1.26912984  0.95467059]]
Intercept: [-0.01575139]


# Prediction and performance measures

In [250]:
from sklearn.metrics import mean_squared_error, r2_score

# Prediction on validation data
y_pred = lr.predict(x_val)

print(y_pred.shape)

# Scores on validation data
print('RMSE: %.5f' % mean_squared_error(y_val, y_pred, squared=False))
print('SSE:', round(y_pred.size * mean_squared_error(y_val, y_pred, squared=True), 5))
print('R²:', round(r2_score(y_val, y_pred), 5))
print('MSE: %.5f' % mean_squared_error(y_val, y_pred, squared=True))


(30, 1)
RMSE: 0.13767
SSE: 0.56856
R²: 0.99855
MSE: 0.01895


# Cross-validation performance measures
https://stats.stackexchange.com/questions/411290/how-to-use-a-cross-validated-model-for-prediction

https://towardsdatascience.com/complete-guide-to-pythons-cross-validation-with-examples-a9676b5cac12

In [251]:
#from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate, cross_val_score, RepeatedKFold


# define model evaluation method (repeats k-folds n times, with k-folds=n_splits and n=n_repeats)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=10)

#cv_results = cross_validate(lr, X_train_ini, Y_train_ini, cv=10, scoring='rmse')
#print(sorted(cv_results.keys()))
#print('Scores:', cv_results['test_score'])

# run cross-validation on multiple metrics and also to return train scores, fit times and score times.
scores = cross_validate(lr, X_train_ini, Y_train_ini, cv=cv,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)

print('train MSE', scores['train_neg_mean_squared_error'])
print('test MSE', scores['test_neg_mean_squared_error'])
print('train R²:', scores['train_r2'])
print('test R²:', scores['test_r2'])


# Evaluate only one score by cross-validation 
# (is the same as 'test_neg_mean_squared_error' of cross_validate did before)
score = cross_val_score(lr, X_train_ini, Y_train_ini, cv=cv,
                        scoring='neg_mean_squared_error')
print('score MSE:', score)
#print("MSE: %0.4f (+/- %0.4f)" % (-score.mean(), score.std() * 2))
print("MSE: %0.5f (%0.5f)" % (-score.mean(), score.std()))

train MSE [-0.00934127 -0.00952399 -0.0087068  -0.0101702  -0.00937168 -0.00967213
 -0.00910104 -0.00966113 -0.00964104 -0.00965431 -0.00925583 -0.00922878
 -0.00995022 -0.00908418 -0.00920842 -0.01010391 -0.00969974 -0.00945459
 -0.00920963 -0.00923122 -0.00913095 -0.00984356 -0.00990566 -0.00949864
 -0.00909545 -0.00893561 -0.00996329 -0.00957123 -0.00987853 -0.00875648]
test MSE [-0.01591206 -0.01345742 -0.0255339  -0.00717773 -0.01439072 -0.01469674
 -0.01988479 -0.01279024 -0.01359396 -0.01263347 -0.01914141 -0.02035905
 -0.00975983 -0.0201782  -0.01738964 -0.00899257 -0.01335777 -0.01496261
 -0.01673795 -0.01897613 -0.01796032 -0.00987247 -0.01035053 -0.01492009
 -0.02063354 -0.02205157 -0.00988657 -0.01424006 -0.01111747 -0.02471772]
train R²: [0.99938587 0.99938163 0.99939567 0.99933553 0.99937562 0.99936869
 0.99942789 0.99935663 0.99938276 0.9993241  0.99938677 0.99942595
 0.99930274 0.99938559 0.99932432 0.99933875 0.99940658 0.99937568
 0.99939945 0.9994024  0.99942588 0.99

# Ridge regression

Linear least squares with l2 regularization.

Minimizes the objective function:

$ \ ||y - Xw||^2_2 + alpha * ||w||^2_2 $

# Train rigde regression model

In [254]:
# https://machinelearningmastery.com/ridge-regression-with-python/

from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from numpy import arange, absolute

# define model evaluation method (repeats k-folds n times, with k-folds=n_splits and n=n_repeats)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid shearch for alphas
ridge_lr = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv
                ,scoring='neg_mean_squared_error')

ridge_lr.fit(x_train, y_train)

# summarize chosen configuration
print('alpha: %0.01f' % ridge_lr.alpha_)

#best_score _: Mean cross-validated score of the best_estimator
print('Ridge Best Score - MSE: %0.5f' % absolute(ridge_lr.best_score_))


#Y_hat = ridge_lr.predict(X_test)


alpha: 0.1
Ridge Best Score - MSE: 0.01883


# Train Lasso regression model

https://medium.com/pursuitnotes/day-34-regularization-in-machine-learning-3-c37c336b079c

Linear Model trained with L1 prior as regularizer (aka the Lasso).

The optimization objective for Lasso is:

$\ (1 / (2 * n samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 $

In [256]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

lasso = Lasso()
#parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
parameters = {'alpha': arange(0.0009, 0.002, 0.0001)}

# define model evaluation method (repeats k-folds n times, with k-folds=n_splits and n=n_repeats)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = cv)
lasso_regressor.fit(x_train, y_train)


print('Lasso best params:', lasso_regressor.best_params_ )
print('Lasso MSE:', round(absolute(lasso_regressor.best_score_), 5))

Lasso best params: {'alpha': 0.0018000000000000004}
Lasso MSE: 0.0187
