In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Apply seaborn's default styling to the matplotlib figures drawn in this notebook.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
from sklearn.datasets import load_boston

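The `load_boston` function returns the dataset as a dictionary-like object in [this](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html) format. Note that `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so this notebook needs an older scikit-learn release to run.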

In [14]:
# Separate the feature matrix from the regression target.
# NOTE(review): `df` is not defined in any cell shown here -- presumably it is the
# object returned by load_boston(); confirm it is created before this cell runs.
X, y = df.data, df.target

Let's split the dataset into separate training and test sets, because we're interested in seeing how well the models we train generalize to unseen data

In [15]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Now we train a simple linear regression model on the training data and see how well it does on the test data

In [17]:
# Baseline model: ordinary least squares fitted on the training split only.
lr_model = LinearRegression().fit(X_train, y_train)

# Test-set error: RMSE is the square root of the mean squared error.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

# Report R^2 on both splits so the train/test gap is visible, then the test RMSE.
print('Training R^2: {}'.format(lr_model.score(X_train, y_train)))
print('Test R^2: {}'.format(lr_model.score(X_test, y_test)))
print('RMSE: {}'.format(rmse))

Training R^2: 0.7434997532004697
Test R^2: 0.711226005748496
RMSE: 4.638689926172797


We see that the model does about as well on the test data as on the training data, yielding a test $R^2$ of about 0.71. Can we do better than this using a more complicated model? 

In [18]:
# Build the pipeline stage by stage: standardize the features, expand them into
# degree-2 polynomial terms, then fit an unregularized linear model on the result.
steps = [('scalar', StandardScaler())]
steps.append(('poly', PolynomialFeatures(degree=2)))
steps.append(('model', LinearRegression()))

# Fit on the training split only; the test split is used purely for evaluation.
pipeline = Pipeline(steps).fit(X_train, y_train)

# A large gap between these two scores indicates overfitting.
print('Training R^2: {}'.format(pipeline.score(X_train, y_train)))
print('Test R^2: {}'.format(pipeline.score(X_test, y_test)))

Training R^2: 0.9445782482739907
Test R^2: 0.6266141927476768


This time, the test $R^2$ is much lower than the training $R^2$, showing that the model is significantly overfit to the training data. We want to use regularization to reduce this overfitting.

In [19]:
# Same preprocessing as the polynomial model above, but the final estimator is a
# ridge regression (L2-penalized linear model) with penalty strength alpha=10.
steps = [('scalar', StandardScaler())]
steps.append(('poly', PolynomialFeatures(degree=2)))
steps.append(('model', Ridge(alpha=10, fit_intercept=True)))

# Fit on the training split only; the test split is used purely for evaluation.
ridge_pipe = Pipeline(steps).fit(X_train, y_train)

# Compare the two scores to see how much the penalty narrows the train/test gap.
print('Training R^2: {}'.format(ridge_pipe.score(X_train, y_train)))
print('Test R^2: {}'.format(ridge_pipe.score(X_test, y_test)))

Training R^2: 0.9322063334864211
Test R^2: 0.8038169683868268


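Our test set $R^2$ improved to about 0.80 using regularization, in the form of ridge regression. That is up from about 0.63 for the unregularized polynomial model, and also better than the 0.71 of the plain linear model.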