In [47]:
#We will load the Boston housing dataset first
from sklearn.datasets import load_boston
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the installed version still provides it before running.
boston = load_boston()
X, y = boston.data, boston.target  # features, and the target to be predicted





In [5]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=18
)


In [7]:
#Sanity check: (rows, columns) of the train and test feature matrices
x_train.shape, x_test.shape

((404, 13), (102, 13))

In [6]:
# Scale the data with a MinMaxScaler.
# VERY IMPORTANT: the scaler is fitted on the training rows only; the test rows
# are merely transformed with that fitted scaler - never fit on x_test.
# NOTE: x_train / x_test are overwritten, so re-running this cell on its own
# would rescale data that is already scaled.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)



In [8]:
# Make the data set more complex by adding degree-2 polynomial terms
# (no constant bias column). The expansion is fitted on the training rows
# and then reused unchanged for the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_train = poly.fit_transform(x_train)
x_test = poly.transform(x_test)

In [9]:
#Check the shapes again after the polynomial expansion (rows, columns)
x_train.shape, x_test.shape

((404, 104), (102, 104))

Note that the number of columns has increased from 13 to 104. Also, notice that we have only 404 observations. What do you think will happen? Will it overfit or underfit?

In [10]:
from sklearn.linear_model import LinearRegression
# Baseline: plain linear regression on the expanded features.
model = LinearRegression().fit(x_train, y_train)

# model.score returns R-Square for the given features/targets.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.9429748697596014
Test score:  0.8009416011659013


In [12]:
#let us try a different metric
from sklearn.metrics import mean_squared_error
# mean_squared_error compares true targets with predictions, so predict first.
y_train_pred = model.predict(x_train)  # predicted values for x_train
y_test_pred = model.predict(x_test)  # predicted values for x_test

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# The labels say "score", but the values printed here are mean squared errors
# (not R-Square), so lower is better.
print("Training score: ", train_mse)
print("Test score: ", test_mse)

Training score:  4.741817667724283
Test score:  17.763307218643984


In [13]:
#Let us look at the intercept of the most recently fitted model
#(the LinearRegression above, when the cells are run in order)
model.intercept_

5.10632131057077

In [14]:
#Coefficients of the 104 features (one weight per column of x_train)
#Look at their magnitudes: very large weights are a symptom of overfitting
model.coef_

array([-3.33062161e+02, -3.44804853e+01, -6.81835513e+01,  1.53103328e+01,
       -5.82710355e+00,  6.98435737e+01,  3.60183457e+01, -4.66929235e+01,
        3.59806054e+01,  7.73700748e+00, -6.75111830e+00, -2.70725170e-01,
        4.67382078e-01,  2.05025548e+01,  3.17210888e+03, -2.13072756e+02,
        1.79508545e+02, -6.27952911e+01,  5.22765000e+01, -3.20429653e+01,
       -1.30327832e+02, -8.74903024e+02,  1.34199473e+03,  1.18343252e+02,
       -1.30337750e+01,  9.93858965e+01, -8.12292973e+00, -1.75420544e+01,
       -9.57165256e+00, -4.94939290e+01,  1.33964814e+01,  1.52040470e+00,
        1.46980654e+00, -1.32016858e+01,  3.32926562e+01, -7.99046265e+00,
        4.02724308e+01, -1.77855728e+01,  3.19720529e+01,  9.60268471e-01,
        1.02388173e+01,  3.59852885e+01,  5.70793791e+00,  1.55837573e+01,
       -2.47396640e+01,  4.08256214e+00, -8.15271150e+00,  3.45719219e+01,
       -3.33791491e+01,  1.53103328e+01, -2.99168461e+01, -3.54867572e+01,
        5.71360064e+00,  

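It is clear that it is overfitting: the training R-Square (0.94) is well above the test R-Square (0.80), and several of the coefficients are very large. Let us try Ridge and Lasso. Note that Ridge uses the L2 norm and Lasso uses the L1 norm.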


In [15]:
from sklearn.linear_model import Ridge

# Ridge uses a parameter called alpha to control the complexity of the model:
# increase alpha to simplify the model, decrease it to make it more complex.
# No alpha is passed here, so the default (alpha = 1) applies.
model = Ridge().fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)


Training score:  0.8691865636837661
Test score:  0.8058389758289868


Notice that alpha = 1 has barely changed the test score (0.806 versus 0.801 for plain linear regression), so it is not simplifying the model enough. Let us try alpha = 10.

In [16]:
# alpha = 10 is ten times the default of 1: a stronger penalty, hence a
# simpler model.
model = Ridge(alpha=10).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.7708606147366355
Test score:  0.710682515912249


You can see that it has simplified the model and made it more biased. Hence the low scores for both train and test. We need to lower alpha. Let us try 2.

In [22]:
# alpha = 2: a penalty between the default (1) and the previous attempt (10).
model = Ridge(alpha=2).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.8456632174524317
Test score:  0.7836552986909678


In [33]:
# What happens if we try a low value of alpha?
# alpha = 0.008 is far below the default of 1, so the model is allowed to be
# more complex.
model = Ridge(alpha=0.008).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.9365499481731728
Test score:  0.8145748082012485


In [35]:
#let us try Lasso. Note that Lasso may reduce some of the beta values to zero, thereby eliminating those features
from sklearn.linear_model import Lasso
# No alpha is passed, so Lasso runs with its default alpha of 1.
model = Lasso().fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.22698789837112943
Test score:  0.1965668087360748


OOPS! What do we have here? It is highly biased. Why is it highly biased? Check the coefficients to see how many weights were reduced to 0. 

In [36]:
#Weights of the most recently fitted model (the default Lasso above, when the
#cells are run in order); count how many of them are exactly 0
model.coef_

array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.        ,
       -0.        ,  0.        , -3.70947419, -0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.        ,  0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        ,  0.        ,  0.  

In [37]:
# Let us reduce alpha and see if we can get better results:
# alpha = 0.1 instead of the default of 1.
model = Lasso(alpha=0.1).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.745948246051797
Test score:  0.7182557308778088


In [42]:
# Much better, but it is still biased - lower alpha further to 0.001.
# max_iter is raised to 100000, presumably so the solver can converge with
# such a weak penalty (TODO confirm).
model = Lasso(alpha=0.001, max_iter=100000).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.9306948441227001
Test score:  0.8151804832854133


In [43]:
#Let us try a combination of Lasso and Ridge called ElasticNet
from sklearn.linear_model import ElasticNet
# Defaults are used: alpha of 1 and l1_ratio of 0.5.
# l1_ratio determines the proportion of the L1 and L2 penalties.
model = ElasticNet().fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.2820282314170368
Test score:  0.24903751018507392


In [44]:
# alpha is lowered from the default of 1 to 0.001; l1_ratio is not passed, so
# it stays at its default of 0.5 (the proportion of the L1 and L2 penalties).
# max_iter is raised to 100000, presumably so the solver can converge with
# such a weak penalty (TODO confirm).
model = ElasticNet(alpha=0.001, max_iter=100000).fit(x_train, y_train)

# model.score returns R-Square.
train_r2 = model.score(x_train, y_train)
test_r2 = model.score(x_test, y_test)
print("Training score: ", train_r2)
print("Test score: ", test_r2)

Training score:  0.9047746864659986
Test score:  0.8279581121649763


In [46]:
#Weights of the most recently fitted model (the ElasticNet with alpha = 0.001
#above, when the cells are run in order)
model.coef_

array([ -2.56706464,  -3.83552541,   0.        ,   4.71941363,
         4.41659265,  16.52913246,   1.61085278,  -8.71727653,
        10.09208248,   0.51834593,  -1.60832654,   4.65419655,
        -2.05516037,   5.4569818 ,   0.        ,  -1.13249715,
         4.19819925,  -1.53661998,  -0.        ,  -2.38461017,
        -0.        ,  -2.94994094,  -2.39632384,  -2.12472694,
        -3.14715153,   0.        ,   1.70961824,  -2.70752971,
         3.18110828,   0.0406853 ,   7.68269154,  -1.69158271,
        -1.29365194,  -0.        ,   2.85220474,   2.00972651,
        -1.23727911,  -2.71142707,   4.08919216,   6.95325618,
         4.04686611,  -2.26183693,   4.40467748,  -5.36095723,
         1.84353853,   1.90031478,  -4.42446212,   0.29445724,
        -9.297921  ,   4.72002753, -16.01419756, -14.69914637,
         6.97823828,  -2.28220412,   3.45418296,   6.12663608,
        -1.17467283,   0.        , -10.0347709 ,  -3.12619296,
        -3.54511021,  -4.8359633 ,  -0.        ,  -4.16

Play around with the values and see what you get. Remove the higher-order polynomial terms and see if you can build a model with better R-Square values.