# Linear Models Regression
Linear models are a good choice when the target value is expected to be a linear combination of the features.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
import math

import warnings
warnings.filterwarnings('ignore')

In [2]:
# setting up default plotting parameters
%matplotlib inline

plt.rcParams['figure.figsize'] = [20.0, 7.0]
plt.rcParams.update({'font.size': 22,})

sns.set_palette('viridis')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

In [3]:
# column headings
columns = ['age', 'sex', 'bmi', 'average_bp', 's1', 's2', 's3', 's4', 's5', 's6']
# load regression dataset
diabetes, target = load_diabetes(return_X_y=True)
diabetes = pd.DataFrame(data=diabetes, columns=columns)
print(diabetes.shape)
diabetes.head()

(442, 10)


Unnamed: 0,age,sex,bmi,average_bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [4]:
# Prepare data for modeling
# Separate input features and target
y = target
X = diabetes

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

In [5]:
# function to get cross validation scores
def get_cv_scores(model):
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

## Linear Regression  (Ordinary Least Squares)
Linear Regression fits a model to minimize the residual sum of squares between observed and predicted targets.

In [6]:
from sklearn.linear_model import LinearRegression

# Train model
lr = LinearRegression().fit(X_train, y_train)

# get cross val scores
get_cv_scores(lr)

CV Mean:  0.4758231204137221
STD:  0.1412116836029729




In [7]:
print('Train Score: ', lr.score(X_train, y_train))
print('Test Score: ', lr.score(X_test, y_test))

Train Score:  0.5415712135071533
Test Score:  0.3986241710470183


In [8]:
# coef_ attribute is numpy array with one entry per input feature
lr.coef_

array([  22.14196252, -263.14598905,  590.23580493,  293.54027057,
       -671.70261398,  371.15000248,  113.81592716,  253.96563411,
        685.71493107,   67.92913193])

In [9]:
# match column names to coefficients
for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {lr.coef_[coef]}')

age:  22.141962524465924
sex:  -263.14598904977373
bmi:  590.2358049346964
average_bp:  293.5402705746201
s1:  -671.7026139849073
s2:  371.15000247801123
s3:  113.81592716356215
s4:  253.9656341060198
s5:  685.7149310690525
s6:  67.9291319280824


In [10]:
# intercept_ always a single floating point number
lr.intercept_

150.3870816213711

In [11]:
# mean squared error
y_ = lr.predict(X_test)
lr_mse = mean_squared_error(y_, y_test)
lr_mse

2853.9640264262994

In [12]:
# find distance from ground truth target value
math.sqrt(lr_mse)

53.42250486851304

In [13]:
print(target.min())
print(target.max())

25.0
346.0


## Ridge Regression (L2 Regularization)
Ridge regression imposes a penalty on the size of the coefficents.  Here we want the magnitude of the coefficients to be minimized so that each feature has as little effect on the outcome as possible.

In [14]:
from sklearn.linear_model import Ridge

# Train model with default alpha=1
ridge = Ridge(alpha=1).fit(X_train, y_train)

# get cross val scores
get_cv_scores(ridge)

CV Mean:  0.3826248703036134
STD:  0.09902564009167607




In [15]:
print('Train Score: ', ridge.score(X_train, y_train))
print('Test Score: ', ridge.score(X_test, y_test))

Train Score:  0.4541351554537545
Test Score:  0.3692726746674706


In [16]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)

grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Best Score:  0.4883436188936269
Best Params:  {'alpha': 0.01}


[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:    1.9s finished


In [17]:
ridge = Ridge(alpha=0.1).fit(X_train, y_train)

get_cv_scores(ridge)

print('Train Score: ', ridge.score(X_train, y_train))
print('Test Score: ', ridge.score(X_test, y_test))

CV Mean:  0.4758943027304393
STD:  0.1330178383094897


Train Score:  0.5360684371966494
Test Score:  0.4023917290230471


In [18]:
ridge.intercept_

150.56598713944817

In [19]:
ridge.coef_

array([  27.91868954, -220.8689592 ,  531.29352531,  273.10426296,
        -44.01525525,  -75.27283265, -174.97337209,  149.224614  ,
        420.31566714,   88.89133225])

In [20]:
# match column names to coefficients
for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {ridge.coef_[coef]}')

age:  27.91868953649017
sex:  -220.86895920135797
bmi:  531.2935253136762
average_bp:  273.1042629572964
s1:  -44.015255246707824
s2:  -75.27283264515303
s3:  -174.97337209270378
s4:  149.22461400287028
s5:  420.3156671437304
s6:  88.89133225274878


## Lasso Regression (L1 Regularization)
Lasso regression uses L1 regularization to force some coefficents to be exactly zero which means they are ignored by the model.  This can be used as a type of feature selection!  Lasso can make the model easier to interpret and reveal the most important features.

In [21]:
from sklearn.linear_model import Lasso

# Train model with default alpha=1
lasso = Lasso(alpha=1).fit(X_train, y_train)

# get cross val scores
get_cv_scores(lasso)

CV Mean:  0.3510033961713952
STD:  0.08727927390128883




In [22]:
print('Train Score: ', lasso.score(X_train, y_train))
print('Test Score: ', lasso.score(X_test, y_test))

Train Score:  0.39941787047902677
Test Score:  0.3093987980705396


In [23]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)

grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
Best Score:  0.48813139496070573
Best Params:  {'alpha': 0.01}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:    0.0s finished


In [24]:
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

get_cv_scores(lasso)

print('Train Score: ', lasso.score(X_train, y_train))
print('Test Score: ', lasso.score(X_test, y_test))

CV Mean:  0.4760522440673414
STD:  0.1377224516695372


Train Score:  0.5327577910549609
Test Score:  0.3927007832332058


In [25]:
lasso.intercept_

150.62229713573453

In [26]:
lasso.coef_

array([ 0.00000000e+00, -1.73477912e+02,  5.80834808e+02,  2.44193361e+02,
       -1.07149376e-01, -0.00000000e+00, -2.15894383e+02,  0.00000000e+00,
        4.77841040e+02,  3.83734728e+01])

In [27]:
# match column names to coefficients
for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {lasso.coef_[coef]}')

age:  0.0
sex:  -173.47791176370126
bmi:  580.8348081262177
average_bp:  244.19336099877884
s1:  -0.10714937611416155
s2:  -0.0
s3:  -215.89438334261982
s4:  0.0
s5:  477.8410404783166
s6:  38.37347280471298


## Elastic-Net
Elastic-net uses both L1 and L2 regularization.

In [28]:
from sklearn.linear_model import ElasticNet

# Train model with default alpha=1 and l1_ratio=0.5
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5).fit(X_train, y_train)

# get cross val scores
get_cv_scores(elastic_net)

CV Mean:  -0.05139208284143739
STD:  0.07297997198698156




In [29]:
# find optimal alpha with grid search
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)

grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 77 candidates, totalling 231 fits
Best Score:  0.48993062619187755
Best Params:  {'alpha': 0.001, 'l1_ratio': 0.8}


[Parallel(n_jobs=-1)]: Done 231 out of 231 | elapsed:    0.2s finished


In [30]:
elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.8).fit(X_train, y_train)

get_cv_scores(elastic_net)

print('Train Score: ', elastic_net.score(X_train, y_train))
print('Test Score: ', elastic_net.score(X_test, y_test))

CV Mean:  0.4772292278126552
STD:  0.13731052817086237


Train Score:  0.5378670660730037
Test Score:  0.40051069363763503


In [31]:
elastic_net.intercept_

150.52726690987788

In [32]:
elastic_net.coef_

array([  26.4993636 , -232.96058937,  549.91494018,  279.35967828,
        -64.10927996,  -70.79664538, -168.19232479,  151.62656673,
        440.57774862,   82.66456533])

In [33]:
# match column names to coefficients
for coef, col in enumerate(X_train.columns):
    print(f'{col}:  {elastic_net.coef_[coef]}')

age:  26.499363595599906
sex:  -232.96058936616024
bmi:  549.9149401774166
average_bp:  279.35967827622153
s1:  -64.10927996304933
s2:  -70.79664537880826
s3:  -168.1923247948288
s4:  151.62656673245243
s5:  440.57774862291586
s6:  82.6645653330565
