# Ridge and Lasso regression 

We use Ridge and Lasso regression to predict the boston house median value.

We show how to calibrate the hyperparameters using cross-validation in different way:

1. GridSearchCV 
2. Cross-validation coded without using any package.

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Focus on the importance of the hyperparameters

In [2]:
data = pd.read_csv("../Dataset/BostonHouse.csv")
print(data.shape)
data.head()

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [18]:
pd.unique(data['zn'])

array([ 18. ,   0. ,  12.5,  75. ,  21. ,  90. ,  85. , 100. ,  25. ,
        17.5,  80. ,  28. ,  45. ,  60. ,  95. ,  82.5,  30. ,  22. ,
        20. ,  40. ,  55. ,  52.5,  70. ,  34. ,  33. ,  35. ])

In [None]:
""""
Alternative way to import the same dataset
from sklearn.datasets import load_boston
data = load_boston()
data = pd.DataFrame(boston.data,columns=boston.feature_names)
data.head()
# add target 
data['price'] = boston.target
"""

In [19]:
# we get rid og these variables because they are categorical
X = data.drop(['zn','rad','medv'], axis = 1)
y = data['medv']

# No Cross-Validation, we try only one alpha random

In [20]:
# Split the sample 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 10)

In [21]:
# Speicfy the λ parameter
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha = 0.5)
lasso_reg = Lasso(alpha = 0.5)

In [22]:
# Fit the model
lin_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
# Predict
pred_lin = lin_reg.predict(X_test)
pred_ridge = ridge_reg.predict(X_test)
pred_lasso = lasso_reg.predict(X_test)

print('MSE oos Linear Regression: ' + str(np.mean((pred_lin - y_test)**2)))
print('MSE oos Ridge: ' + str(np.mean((pred_ridge - y_test)**2)))
print('MSE oos Lasso: ' + str(np.mean((pred_lasso - y_test)**2)))

# Using the function mean_squared_error is the same
# print('MSE oos Linear Regression: ' + str(mean_squared_error(y_test, pred_lin)))
# print('MSE oos Ridge: ' + str(mean_squared_error(y_test, pred_ridge)))
# print('MSE oos Lasso: ' + str(mean_squared_error(y_test, pred_lasso)))

MSE oos Linear Regression: 33.23413967236286
MSE oos Ridge: 33.52851898955666
MSE oos Lasso: 40.08403870234899


# Cross-Vaildation, using only one alpha

In [24]:
# Models
lin_reg2 = LinearRegression()
ridge_reg2 = Ridge(alpha = 0.5)
lasso_reg2 = Lasso(alpha = 0.5)

In [25]:
# fit and compute MSE on the entire sample
lin_reg2_ = cross_val_score(lin_reg2, X,y, scoring = 'neg_mean_squared_error', cv = 10)
ridge_reg2_ = cross_val_score(ridge_reg2, X,y, scoring = 'neg_mean_squared_error', cv = 10)
lasso_reg2_ = cross_val_score(lasso_reg2, X,y, scoring = 'neg_mean_squared_error', cv = 10)


print('Mean MSE lin reg: ' + str(np.mean(np.abs(lin_reg2_))))
print('Mean MSE ridge reg: ' + str(np.mean(np.abs(ridge_reg2_))))
print('Mean MSE lasso reg: ' + str(np.mean(np.abs(lasso_reg2_))))
print('')
print('Min MSE lin reg: ' + str(np.amin(np.abs(lin_reg2_))))
print('Min MSE ridge reg: ' + str(np.amin(np.abs(ridge_reg2_))))
print('Min MSE lasso reg: ' + str(np.amin(np.abs(lasso_reg2_))))

# with cross validation we can see that the error decreases
# Ridge and Linear regression are very similar.

Mean MSE lin reg: 36.12144451676859
Mean MSE ridge reg: 35.68322843991933
Mean MSE lasso reg: 34.660555264922195

Min MSE lin reg: 8.515384570687695
Min MSE ridge reg: 8.58802057488008
Min MSE lasso reg: 10.531168366887572


In [26]:
# we can use instead the train_test_split function inside a loop
# in order to build i-Kfold cross-validation procedure
mse_lin_reg = []
mse_ridge = []
mse_lasso = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i)
    lin_reg2.fit(X_train, y_train)
    ridge_reg2.fit(X_train, y_train)
    lasso_reg2.fit(X_train, y_train)
    
    # predict 
    pred_lin = lin_reg2.predict(X_test)
    pred_ridge = ridge_reg2.predict(X_test)
    pred_lasso = lasso_reg2.predict(X_test)
    
    # MSE
    mse_lin_reg.append(mean_squared_error(y_test, pred_lin))
    mse_ridge.append(mean_squared_error(y_test, pred_ridge))
    mse_lasso.append(mean_squared_error(y_test, pred_lasso))
    

In [27]:
print('Mean MSE lin reg: ' + str(np.mean(mse_lin_reg)))
print('Mean MSE ridge reg: ' + str(np.mean(mse_ridge)))
print('Mean MSE lasso reg: ' + str(np.mean(mse_lasso)))
print('')
print('Min MSE lin reg: ' + str(np.amin(mse_lin_reg)))
print('Min MSE ridge reg: ' + str(np.amin(mse_ridge)))
print('Min MSE lasso reg: ' + str(np.amin(mse_lasso)))

# results are different cause the teo algorithm takes differnt codes

Mean MSE lin reg: 25.164823717137914
Mean MSE ridge reg: 25.265000520426494
Mean MSE lasso reg: 27.139754667948008

Min MSE lin reg: 15.167805842454264
Min MSE ridge reg: 14.799553147966884
Min MSE lasso reg: 15.061507960483716


We can notice that only with the cross validation the Mean MSE decreses.

# Cross - Validation usinfg a grid of alphas

In [28]:
# Models
lin_reg3 = Ridge()
ridge_reg3 = Ridge()
lasso_reg3 = Lasso()
params = {'alpha': np.linspace(0.001,0.999, 10)}
params_lin_reg = {'alpha': np.linspace(1,1, 10)}

We can use GridSearchCV or LassoCV and RidgeCV. In the following we use GridSearchCV.

input(model, parameters grid, scoring , cv)
This commad already performs the cross validation using all the parameter and performing 10fold cross validation.

In [29]:
# we can apply this only to lasso and ridge
lin_reg_3_ = GridSearchCV(lin_reg3, params_lin_reg, scoring = 'neg_mean_squared_error', cv = 10)
ridge_reg_3_ = GridSearchCV(ridge_reg3, params, scoring = 'neg_mean_squared_error', cv = 10)
lasso_reg_3_ = GridSearchCV(lasso_reg3, params, scoring = 'neg_mean_squared_error', cv = 10)

In [30]:
# Fit the model, we pass the entire dataset because GridSearchCV already does the splitting of the sample.
lin_reg_3_.fit(X, y)
ridge_reg_3_.fit(X, y)
lasso_reg_3_.fit(X, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([0.001  , 0.11189, 0.22278, 0.33367, 0.44456, 0.55544, 0.66633,
       0.77722, 0.88811, 0.999  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [31]:
# Show the best alpha and MSE
print('Lin param in sample:' +str(lin_reg_3_.best_params_))
print('Lin MSE in sample: ' + str(np.abs(lin_reg_3_.best_score_)))
print('Ridge param in sample:' +str(ridge_reg_3_.best_params_))
print('Ridge MSE in sample: ' + str(np.abs(ridge_reg_3_.best_score_)))
print('Lasso param in sample:' +str(lasso_reg_3_.best_params_))
print('Lasso MSE in sample: ' + str(np.abs(lasso_reg_3_.best_score_)))

# best index
print('')
print('Lin param in sample:' +str(lin_reg_3_.best_index_))
print('Ridge param in sample:' +str(ridge_reg_3_.best_index_))
print('Lasso param in sample:' +str(lasso_reg_3_.best_index_))

Lin param in sample:{'alpha': 1.0}
Lin MSE in sample: 35.310394911288526
Ridge param in sample:{'alpha': 0.999}
Ridge MSE in sample: 35.31068410296068
Lasso param in sample:{'alpha': 0.44455555555555554}
Lasso MSE in sample: 34.544430878386564

Lin param in sample:0
Ridge param in sample:9
Lasso param in sample:4


In [38]:
params['alpha'][9]

0.999

In [39]:
# We need to be more fair in the comparison.
# We fit the model on a training sample and fit it on the test sample.
lin_reg_3_.fit(X_train, y_train)
ridge_reg_3_.fit(X_train, y_train)
lasso_reg_3_.fit(X_train, y_train)

pred_lin3 = lin_reg_3_.predict(X_test)
pred_ridge3 = ridge_reg_3_.predict(X_test)
pred_lasso3 =lasso_reg_3_.predict(X_test)


print('OOS MSE lin reg: ' + str(np.mean((pred_lin3 - y_test)**2)))
print('OOS MSE ridge reg: ' + str(np.mean((pred_ridge3 - y_test)**2)))
print('OOS MSE lasso reg: ' + str(np.mean((pred_lasso3 - y_test)**2)))

# Out of sample the best is ridge and not lasso.
print('')
# Show the best alpha computed in the train setand MSE
print('Lin param train set:' +str(lin_reg_3_.best_params_))
print('Ridge param train set:' +str(ridge_reg_3_.best_params_))
print('Lasso param train set:' +str(lasso_reg_3_.best_params_))

OOS MSE lin reg: 33.63239523801085
OOS MSE ridge reg: 33.13810595436818
OOS MSE lasso reg: 32.962074356774714

Lin param train set:{'alpha': 1.0}
Ridge param train set:{'alpha': 0.22277777777777777}
Lasso param train set:{'alpha': 0.001}


This is are the parameters that are used in the out of sample. Let's see

In [41]:
# comparison with output above
ridge_try = Ridge(alpha = 0.22227777)
lasso_try = Lasso(alpha = 0.001)
ridge_try.fit(X_train, y_train)
lasso_try.fit(X_train, y_train)

ridge_pred_try = ridge_try.predict(X_test)
lasso_pred_try = lasso_try.predict(X_test)
print('OOS MSE ridge reg: ' + str(np.mean((ridge_pred_try - y_test)**2)))
print('OOS MSE lasso reg: ' + str(np.mean((lasso_pred_try - y_test)**2)))

OOS MSE ridge reg: 33.13766984447106
OOS MSE lasso reg: 32.962074356774714


You can notice that the results are the same as above.

In [42]:
# Cross Validation coded hard-core
max_int = 10
size_block = np.int(np.floor(X.shape[0]/10))
alphas = np.linspace(0.01, 0.99, 100)
XX_tot =[]
yy_tot = []
for i in range(max_int):
    if i == max_int-1:
        X_tmp = X[i*size_block:]
        y_tmp = y[i*size_block:]
    else:
        X_tmp = X[i*size_block:(i+1)*size_block]
        y_tmp = y[i*size_block:(i+1)*size_block]
    XX_tot.append(X_tmp)
    yy_tot.append(y_tmp)

idx_target_block = np.asarray(np.linspace(0,9,10),int)

mse_lm = np.zeros((len(alphas), max_int))
mse_ridge = np.zeros((len(alphas), max_int))
mse_lasso = np.zeros((len(alphas), max_int))

for i in range(max_int):
    random_idx_train = np.random.choice(np.delete(idx_target_block, i), 9)
    idx_test = i
    
    for j in range(len(random_idx_train)-1):
        XX_train = XX_tot[random_idx_train[0]]
        yy_train = yy_tot[random_idx_train[0]]
        
        XX_train =pd.concat([XX_train, XX_tot[random_idx_train[j+1]]], axis = 0)
        yy_train =pd.concat([yy_train, yy_tot[random_idx_train[j+1]]], axis = 0)
        
        XX_test = XX_tot[i]
        yy_test = yy_tot[i]
        
        # fit the model
    for a in range(len(alphas)):
        lm = LinearRegression()
        ridge_ = Ridge(alpha = alphas[a])
        lasso_ = Lasso(alpha = alphas[a])
            
        lm.fit(XX_train, yy_train)
        ridge_.fit(XX_train, yy_train)
        lasso_.fit(XX_train, yy_train)
            
        # predict
        lm_pred = lm.predict(X_test)
        ridge_pred = ridge_.predict(X_test)
        lasso_pred = lasso_.predict(X_test)

        # MSE
        mse_lm[a,i] = mean_squared_error(y_test, lm_pred)
        mse_ridge[a,i] = mean_squared_error(y_test, ridge_pred)
        mse_lasso[a,i] = mean_squared_error(y_test, lasso_pred)
 

In [53]:
print('Mean MSE lin reg: ' + str(np.mean(mse_lm)))
print('Mean MSE ridge reg: ' + str(np.mean(mse_ridge)))
print('Mean MSE lasso reg: ' + str(np.mean(mse_lasso)))
print('')
print('Min MSE lin reg: ' + str(np.amin(mse_lm)))
print('Min MSE ridge reg: ' + str(np.amin(mse_ridge)))
print('Min MSE lasso reg: ' + str(np.amin(mse_lasso)))
print('')
print('Optimal Ridge λ: ', alphas[np.where(mse_ridge == np.amin(mse_ridge))[0][0]])
print('Optimal Lasso λ: ', alphas[np.where(mse_lasso == np.amin(mse_lasso))[0][0]])

Mean MSE lin reg: 134.10442096935404
Mean MSE ridge reg: 79.88386311954409
Mean MSE lasso reg: 64.4339660256806

Min MSE lin reg: 37.25418248827695
Min MSE ridge reg: 35.025611667885244
Min MSE lasso reg: 35.638724533741694

Optimal Ridge λ:  0.1782828282828283
Optimal Lasso λ:  0.029797979797979796


Ridge and Lasso outperform the linear regression.

Why do we see such differences in the MSE with the different models?
Because the cross-validation at this level still depends a lot on the data used.

To avloid this issues we need to exploit all the possible possibilities in order to perform a more precise out os fample exercise

In [66]:
# We can do the same using a Loop and using train_test_split to split our sample
alphas = np.linspace(0.01, 0.99, 100)

mse_lm_insample = np.zeros((100, 20))
mse_ridge_insample = np.zeros((100, 20))
mse_lasso_insample = np.zeros((100, 20))

mse_lm_oos = np.zeros((100, 20))
mse_ridge_oos = np.zeros((100, 20))
mse_lasso_oos = np.zeros((100, 20))

j = 0
for h in range(0,400,20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = h)
    
    for i in range(len(alphas)):
        lin_reg3 =LinearRegression()
        ridge_reg3 = Ridge(alpha = alphas[i])
        lasso_reg3 = Lasso(alpha = alphas[i])
    
    # fit the model
        lin_reg3.fit(X_train, y_train)
        ridge_reg3.fit(X_train, y_train)
        lasso_reg3.fit(X_train, y_train)
    
    # predict 
        pred_lin_insample = lin_reg3.predict(X_train)
        pred_ridge_insample = ridge_reg3.predict(X_train)
        pred_lasso_insample =lasso_reg3.predict(X_train)
    
        pred_lin = lin_reg3.predict(X_test)
        pred_ridge = ridge_reg3.predict(X_test)
        pred_lasso =lasso_reg3.predict(X_test)

    
    # MSE insample and out of sample
        mse_lm_insample[i, j] = mean_squared_error(y_train, pred_lin_insample)
        mse_ridge_insample[i, j] = mean_squared_error(y_train, pred_ridge_insample)
        mse_lasso_insample[i, j] = mean_squared_error(y_train, pred_lasso_insample)
        
        mse_lm_oos[i, j] = mean_squared_error(y_test, pred_lin)
        mse_ridge_oos[i, j] = mean_squared_error(y_test, pred_ridge)
        mse_lasso_oos[i, j] = mean_squared_error(y_test, pred_lasso)
    j+=1

In [67]:
# Compute minimum MSE
print('Mean MSE in oos Linear Regression: ' , np.mean(mse_lm_oos))
print('Mean MSE in oos Ridge: ' , np.mean(mse_ridge_oos))
print('Mean MSE in oos Lasso: ' , np.mean(mse_lasso_oos))
print('')
print('Mean MSE insample Linear Regression: ' ,np.mean(mse_lm_insample))
print('Mean MSE insample Ridge Regression: ' ,np.mean(mse_ridge_insample))
print('Mean MSE insample Lasso Regression: ' ,np.mean(mse_lasso_insample))

Mean MSE in oos Linear Regression:  25.348403464755933
Mean MSE in oos Ridge:  25.42317722820139
Mean MSE in oos Lasso:  27.55898416563065

Mean MSE insample Linear Regression:  22.953904743233345
Mean MSE insample Ridge Regression:  23.02083257122452
Mean MSE insample Lasso Regression:  25.397499543935716


In [68]:
# Compute Min
print('Min MSE in oos Linear Regression: ' , np.amin(mse_lm_oos))
print('Min MSE in oos Ridge: ' , np.amin(mse_ridge_oos))
print('Min MSE in oos Lasso: ' , np.amin(mse_lasso_oos))
print('')
print('Min MSE insample Linear Regression: ' ,np.amin(mse_lm_insample))
print('Min MSE insample Ridge Regression: ' ,np.amin(mse_ridge_insample))
print('Min MSE insample Lasso Regression: ' ,np.amin(mse_lasso_insample))

Min MSE in oos Linear Regression:  15.436218617060664
Min MSE in oos Ridge:  15.274855052412118
Min MSE in oos Lasso:  15.230224635012702

Min MSE insample Linear Regression:  19.98098727909367
Min MSE insample Ridge Regression:  19.98101186297584
Min MSE insample Lasso Regression:  20.018688164189822


In [69]:
# optimal value of λ
print('Best λ Ridge: ' , alphas[np.where(mse_ridge_oos == np.amin(mse_ridge_oos))[0][0]])
print('Best λ Lasso: ' , alphas[np.where(mse_lasso_oos == np.amin(mse_lasso_oos))[0][0]])

Best λ Ridge:  0.8712121212121212
Best λ Lasso:  0.0198989898989899


This is a fake prediction exercise, because to calibrate the parameter we have used the entire set of data.

In [71]:
# predict
pred_lin3 = lin_reg3.predict(X_test)
pred_ridge3 = ridge_reg3.predict(X_test)
pred_lasso3 =lasso_reg3.predict(X_test)


print('OOS MSE lin reg: ' + str(np.mean((pred_lin3 - y_test)**2)))
print('OOS MSE ridge reg: ' + str(np.mean((pred_ridge3 - y_test)**2)))
print('OOS MSE lasso reg: ' + str(np.mean((pred_lasso3 - y_test)**2)))


OOS MSE lin reg: 21.98734434242419
OOS MSE ridge reg: 21.96003402330093
OOS MSE lasso reg: 20.044856629447818


With the correct oos structure we notice that Lasso is the best and both the model are able to beat the linear regression in predictin the price of the house in Boston

# let's do the same using RidgeCV and LassoCV

In [None]:
linCV_reg = RidgeCV(alphas = np.linspace(1,1,10), cv = 10, scoring='neg_mean_squared_error')
ridgeCV_reg = RidgeCV(alphas = np.linspace(0.001,0.999,10), cv = 10, scoring='neg_mean_squared_error')
lassoCV_reg = LassoCV(alphas = np.linspace(0.001,0.999,10), cv = 10)

In [None]:
# fit full sample
linCV_reg.fit(X, y)
ridgeCV_reg.fit(X, y)
lassoCV_reg.fit(X, y)

In [None]:
# show parameters
print('Ridge α entire sample: ' + str(ridgeCV_reg.alpha_))
print('Lasso α entire sample: ' + str(lassoCV_reg.alpha_))

In [None]:
# ridge does not store the mse so to compute the in-sample MSE we have to do the following:
# 1. select best alpha
# 2. set Ridge_param
# 3. cross_val_score
α_ridge = ridgeCV_reg.alpha_
ridge_tmp = Ridge(alpha = α_ridge)
ridge_reg2_ = cross_val_score(ridge_tmp, X,y, scoring = 'neg_mean_squared_error', cv = 10)

lin_tmp = Ridge(alpha = 1)
lin_reg2_ = cross_val_score(lin_tmp, X,y, scoring = 'neg_mean_squared_error', cv = 10)

# show mse
print('MSE entire sample Lin Reg: ' + str(lin_reg2_.mean()))
print('MSE entire sample Ridge: ' + str(ridge_reg2_.mean()))
print('MSE entire sample Ridge: ' + str(lassoCV_reg.mse_path_.mean()))

Let's perform a proper out of sample exercise

In [None]:
# fit train set
linCV_reg.fit(X_train, y_train)
ridgeCV_reg.fit(X_train, y_train)
lassoCV_reg.fit(X_train, y_train)

In [None]:
# show parameters
print('Ridge α entire sample: ' + str(ridgeCV_reg.alpha_))
print('Lasso α entire sample: ' + str(lassoCV_reg.alpha_))

In [None]:
# predict - it uses the best set or parameters.
pred_lin4 = linCV_reg.predict(X_test)
pred_ridge4 = ridgeCV_reg.predict(X_test)
pred_lasso4 = lassoCV_reg.predict(X_test)

print('OOS MSE lin reg: ' + str(np.mean((pred_lin4 - y_test)**2)))
print('OOS MSE ridge reg: ' + str(np.mean((pred_ridge4 - y_test)**2)))
print('OOS MSE lasso reg: ' + str(np.mean((pred_lasso4 - y_test)**2)))

Same results as before!