In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the dataset from the notebook's working directory into a DataFrame.
d = pd.read_csv(filepath_or_buffer='auto-mpg.csv')

# Model selection

We are going to use leave-one-out cross-validation (LOOCV) at two levels:
* LOOCV for "outer" model selection
* LOOCV for "inner" hyperparameter tuning

*Warning*: two nested LOOCV loops take a **long** time!

In [6]:
# Target column to predict; every other column of `d` is used as a predictor.
label = 'mpg'
features = [col for col in d.columns if col != label]

# Design matrix and target vector used by the cells below.
X = d[features]
y = d[label]

## Models

1. Linear Regression
2. Quadratic Regression
3. Quadratic Regression with LASSO (hyperparameter: alpha)

In [28]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, LeaveOneOut, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

In [8]:
# Every model standardises the inputs first. make_pipeline auto-names each step,
# which is what allows the Lasso step to be addressed as 'lasso__...' later on.
# NOTE(review): PolynomialFeatures presumably emits a constant column by default,
# duplicating the intercept the estimators already fit; consider include_bias=False.

# 1. Plain linear regression.
linreg = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

# 2. Linear regression on a degree-2 polynomial expansion of the scaled features.
quadreg = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    LinearRegression(),
)

# 3. Same expansion with a Lasso estimator; alpha is left at its default here
#    ("free") so that it can be tuned by a grid search.
quadlasso_free = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(max_iter=10000),
)

## Evaluating the models' MSE

In [26]:
def get_mse(model):
    """Return the leave-one-out cross-validated mean squared error of `model`.

    The estimator is scored on the notebook-level `X` and `y` with
    `LeaveOneOut()` splitting. scikit-learn reports this metric negated
    ('neg_mean_squared_error'), so the average is sign-flipped before being
    returned.

    Parameters
    ----------
    model : estimator
        Object passed to `cross_val_score` as the estimator.

    Returns
    -------
    float
        Mean of the per-split squared errors.
    """
    neg_squared_errors = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=LeaveOneOut(),
    )
    return -neg_squared_errors.mean()

In [29]:
# LOOCV mean squared error of the plain linear-regression pipeline.
get_mse(model=linreg)

11.371126332686615

In [30]:
# LOOCV mean squared error of the quadratic-regression pipeline.
get_mse(model=quadreg)

7.9728313137292846

In [32]:
# Candidate Lasso penalties: 25 values log-spaced between 1e-3 and 1e0.
# The 'lasso__' prefix addresses the Lasso step of the pipeline
# (check quadlasso_free.get_params() for the available parameter names).
pg = dict(lasso__alpha=np.logspace(-3, 0, 25))

# Inner LOOCV: pick the alpha with the best (least negative) neg-MSE.
# n_jobs=-1 uses every available core rather than a worker count hard-coded
# for one particular machine; the selected model and scores are unaffected.
quadlasso = GridSearchCV(
    estimator=quadlasso_free,
    param_grid=pg,
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    n_jobs=-1,
)

# Outer LOOCV (inside get_mse) scores the tuned model. The grid search is
# re-run for every outer split, so this cell is slow.
get_mse(quadlasso)

7.830719447807191