This is a simple notebook to show the effect of Ridge and Lasso regularization.

It accompanies Chapter 5 of the book (3 of 5).

Author: Viviana Acquaviva, with contributions by Jake Postiglione and Olga Privman.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Notebook-wide matplotlib defaults: base font size, tick-label sizes,
# automatic tight layout switched off, and high-resolution (300 dpi) figures.
font = {'size': 16}
matplotlib.rcParams.update({
    'font.size': font['size'],
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.autolayout': False,
    'figure.dpi': 300,
})

In [None]:
from sklearn import linear_model

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
from sklearn.model_selection import cross_validate, KFold

In [None]:
# Seed NumPy's global random generator so the synthetic data set is reproducible.
np.random.seed(16)

# Three input features covering very different ranges.
x1 = np.arange(100)                 # integers 0, 1, ..., 99
x2 = np.linspace(0, 1, 100)         # evenly spaced between 0 and 1
x3 = np.logspace(2, 3, num=100)     # logarithmically spaced between 100 and 1000

# Linear combination of the features (constant offset not included yet).
linear_part = 3*x1 + 0.5*x2 + 15*x3

# Scatter: Poisson draws whose expected value is the linear combination,
# shifted by that expected value so they are centered around 0, then scaled by 5.
scatter = 5*(np.random.poisson(linear_part, 100) - linear_part)

# Target = linear model + constant offset of 3 + scatter.
ypb = linear_part + 3 + scatter

In [None]:
# Feature matrix: one row per sample, one column per feature (x1, x2, x3).
xb = np.array([x1, x2, x3]).T

In [None]:
# Display the dimensions of the stacked feature matrix.
xb.shape

### Add correlated features (polynomial transformation)

In [None]:
# Degree-2 polynomial expansion; the constant (bias) column is left out.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Fit the expansion on xb, then apply it to xb to obtain the enlarged feature matrix.
new_xb = poly.fit(xb).transform(xb)

### Learning Check-in
    
How many features will the transformed data set have? You can think of it first, and then use code to find out.

<details>
<summary style="display: list-item;">Click here for the answer!</summary>
<p>
    
```
new_xb.shape

will show that there are 9 features (the 3 original ones, plus the 6 degree-2 monomials built from x1, x2, and x3: x1^2, x1 x2, x1 x3, x2^2, x2 x3, x3^2)
```
</p>
</details>

### Let's start with Ridge regression, and tune alpha using cross-validation.

(note what happens if repeating a few times without fixing the random seed).

In [None]:
# Grid of regularization strengths to scan: 1e-6, 1e-5, ..., 1e6.
alphas = np.logspace(-6, 6, 13)

# One shuffled 10-fold splitter with a fixed seed, so every alpha is scored on the same folds.
n_splits = 10
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented replacement is StandardScaler(with_mean=False) followed by Ridge with
# alpha multiplied by the number of samples the model is trained on. Inside the CV each
# model sees the training part of a fold only; the size below is exact when the sample
# size is divisible by n_splits (as with the 100 points generated above).
n_train = len(ypb) - len(ypb) // n_splits

MSE = []

for alpha in alphas:

    # Scaling the features is what makes a single alpha meaningful for all of them.
    model_reg = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=alpha * n_train))

    scores = cross_validate(model_reg, new_xb, ypb, cv=kfold, scoring='neg_mean_squared_error')

    # The scorer returns the *negative* MSE (higher is better), so flip the sign.
    mean_mse = -np.mean(scores['test_score'])

    print(alpha, np.round(mean_mse))

    MSE.append(mean_mse)

print('Best alpha:', alphas[np.argmin(MSE)])

### There is also a built-in instrument for this!

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
# `RidgeCV(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Documented replacement: scale the features with StandardScaler(with_mean=False) and
# multiply the alphas by the number of samples.
# NOTE: the scaler is fitted once on the full data and the alphas are scaled with the full
# sample size, so the per-fold scores may differ slightly from the legacy behaviour.
# The selected alpha on the original scale is regm[-1].alpha_ / len(ypb).
regm = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeCV(
        alphas=np.logspace(-6, 6, 13) * len(ypb),
        cv=KFold(n_splits=10, shuffle=True, random_state=1),
        scoring='neg_mean_squared_error',
    ),
)

regm.fit(new_xb, ypb) #this calls the "fit" method, which means that I obtain the coefficients.

### We can compare the coefficients of the linear model for different amounts of regularization.

#### Let's pick alpha = 1000.

In [None]:
# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Equivalent formulation: scale each feature with StandardScaler(with_mean=False) and
# multiply alpha by the number of samples.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1000. * len(ypb)))

model.fit(new_xb, ypb)

# The Ridge step (model[-1]) works on scaled features; dividing by the scaler's per-feature
# scale (model[0].scale_) expresses the coefficients in the original feature units.
# The intercept needs no conversion.
coef_alpha_1000 = np.hstack([model[-1].coef_ / model[0].scale_, model[-1].intercept_]) #Note the intercept is the last number

print(coef_alpha_1000)

#### Now let's see for alpha = 1.0.

### Learning Check-in
    
Will the coefficients be larger or smaller, compared to the case with alpha = 1000?

<details>
<summary style="display: list-item;">Click here for the answer!</summary>
<p>
    
```
The coefficients will be larger, because we use a weaker regularization, and the effect of regularization in a linear model is to keep the coefficients small.
```
</p>
</details>

In [None]:
# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Equivalent formulation: scale each feature with StandardScaler(with_mean=False) and
# multiply alpha by the number of samples.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1. * len(ypb)))

model.fit(new_xb, ypb)

# The Ridge step (model[-1]) works on scaled features; dividing by the scaler's per-feature
# scale (model[0].scale_) expresses the coefficients in the original feature units.
# The intercept needs no conversion.
coef_alpha_1 = np.hstack([model[-1].coef_ / model[0].scale_, model[-1].intercept_]) #Note the intercept is the last number

print(coef_alpha_1)

#### Below, we use a trick to get coefficients for "zero" alpha (no regularization); I could also have used `LinearRegression`.

In [None]:
# A tiny alpha makes the penalty negligible, i.e. effectively an unregularized linear fit.
# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Equivalent formulation: scale each feature with StandardScaler(with_mean=False) and
# multiply alpha by the number of samples.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1e-7 * len(ypb)))

model.fit(new_xb, ypb)

# Convert the coefficients of the scaled features back to the original feature units;
# the intercept (last entry) needs no conversion.
coef_no_reg = np.hstack([model[-1].coef_ / model[0].scale_, model[-1].intercept_])

print(coef_no_reg)

### We can compare the coefficients for the three cases.

In [None]:
# Grouped bar chart of |coefficient| for the two Ridge fits and the unregularized fit.
# A log y-axis is used because the coefficients span many orders of magnitude.
fig, ax = plt.subplots(figsize=(12, 6))

positions = np.arange(10)  # 9 features + intercept

# (horizontal offset, coefficients, bar color, legend label)
bar_groups = [
    (-0.2, coef_alpha_1000, 'maroon', 'Ridge, alpha = 1000'),
    (-0.1, coef_alpha_1, 'orangered', 'Ridge, alpha = 1.0'),
    (0.0, coef_no_reg, 'grey', 'Linear (no regularization)'),
]

for offset, coefs, bar_color, bar_label in bar_groups:
    ax.bar(positions + offset, np.abs(coefs), color=bar_color, width=0.05, label=bar_label)

ax.set_yscale('log')

# Features are numbered 1-9; the last slot holds the intercept.
ax.set_xticks(positions)
ax.set_xticklabels([str(k) for k in range(1, 10)] + ['Intercept'])

ax.set_xlabel('Feature', fontsize=14)

ax.set_ylabel('Coefficients (absolute value)', fontsize=14)

ax.legend(fontsize=13);

### Let's take a look at LASSO.

In [None]:
from sklearn.linear_model import Lasso, LassoCV 

In [None]:
#Note: LassoCV re-orders alphas in DESCENDING ORDER! Scores will be messed up unless you use model.alphas_ object

# `LassoCV(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Documented replacement: scale the features with StandardScaler(with_mean=False) and
# multiply the alphas by sqrt(number of samples).
# NOTE: the scaler is fitted once on the full data and the alphas are scaled with the full
# sample size, so the per-fold scores may differ slightly from the legacy behaviour.
alpha_scale = np.sqrt(len(ypb))

model = make_pipeline(
    StandardScaler(with_mean=False),
    LassoCV(
        alphas=np.logspace(-2, 2, 5) * alpha_scale,
        cv=KFold(n_splits=10, shuffle=True, random_state=1),
        max_iter=1000000,
        tol=1e-6,
    ),
)

model.fit(new_xb, ypb)

lasso_cv = model[-1]  # the fitted LassoCV step of the pipeline

# Undo the rescaling so the alphas are reported on the same scale as the requested grid.
alphas_tested = lasso_cv.alphas_ / alpha_scale

print('Alphas', alphas_tested)

print('Best alpha:', lasso_cv.alpha_ / alpha_scale)

for alpha, fold_mse in zip(alphas_tested, lasso_cv.mse_path_):
    print('Score for alpha', alpha, np.mean(fold_mse)) #for each alpha (row), 10 cv estimates of MSE

#### <font color = 'red'> Note: early reproducibility issues were solved by setting tolerance to a small value (thanks to Joel Zinn!). </font>

### We can of course also use the "regular" Lasso and do CV ourselves.

Let's look at the coefficients for alpha = 1000 and alpha = 1. Lasso regularization tends to induce sparse coefficients, so we can check that that's true!

In [None]:
# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Equivalent formulation: scale each feature with StandardScaler(with_mean=False) and
# multiply alpha by sqrt(number of samples).
L1000 = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=1000 * np.sqrt(len(ypb)), max_iter=1000000, tol=0.005),
)

L1000.fit(new_xb, ypb)

# Convert the coefficients of the scaled features back to the original feature units;
# the intercept (last entry) needs no conversion.
coef_L1000 = np.hstack([L1000[-1].coef_ / L1000[0].scale_, L1000[-1].intercept_])

In [None]:
# Display the Lasso (alpha = 1000) coefficients; the last entry is the intercept.
coef_L1000

### Learning Check-in
    
Should we be worried because 

1) The intercept has become very large?

2) All the coefficients have disappeared?

<br>

<details>
<summary style="display: list-item;">Click here for the answer!</summary>
<p>
    
```
1) Not really, because the intercept is excluded from the regularization process;

2) Only if this happens when using the same code, and a much smaller value for alpha :) 
```
</p>
</details>

In [None]:
# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Equivalent formulation: scale each feature with StandardScaler(with_mean=False) and
# multiply alpha by sqrt(number of samples).
L1 = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=1.0 * np.sqrt(len(ypb)), max_iter=1000000, tol=0.005),
)

L1.fit(new_xb, ypb)

# Convert the coefficients of the scaled features back to the original feature units;
# the intercept (last entry) needs no conversion.
coef_L1 = np.hstack([L1[-1].coef_ / L1[0].scale_, L1[-1].intercept_])

In [None]:
# Display the Lasso (alpha = 1.0) coefficients; the last entry is the intercept.
coef_L1

In [None]:
# Display the coefficients of the (effectively) unregularized fit for comparison.
coef_no_reg

### Finally, we can plot all the coefficients together.

In [None]:
# Grouped bar chart of |coefficient| for every model fitted above (Ridge, unregularized, Lasso).
# A log y-axis is used because the coefficients span many orders of magnitude.
fig, ax = plt.subplots(figsize=(12, 6))

positions = np.arange(10)  # 9 features + intercept

# (horizontal offset, coefficients, bar color, legend label)
bar_groups = [
    (-0.2, coef_alpha_1000, 'maroon', 'Ridge, alpha = 1000'),
    (-0.1, coef_alpha_1, 'orangered', 'Ridge, alpha = 1.0'),
    (0.0, coef_no_reg, 'grey', 'Linear (no regularization)'),
    (0.1, coef_L1, 'tab:cyan', 'Lasso, alpha = 1.0'),
    (0.2, coef_L1000, 'tab:blue', 'Lasso, alpha = 1000'),
]

for offset, coefs, bar_color, bar_label in bar_groups:
    ax.bar(positions + offset, np.abs(coefs), color=bar_color, width=0.05, label=bar_label)

ax.set_yscale('log')

# Features are numbered 1-9; the last slot holds the intercept.
ax.set_xticks(positions)
ax.set_xticklabels([str(k) for k in range(1, 10)] + ['Intercept'])

ax.set_xlabel('Feature', fontsize=14)

ax.set_ylabel('Coefficients (absolute value)', fontsize=14)

# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(fontsize=13, bbox_to_anchor=(1.05, 1));
