# Lab - Cross Validation and Bootstrap

In [48]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize ,
poly)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
from functools import partial
from sklearn.model_selection import \
(cross_validate ,
KFold ,
ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

## Validation Set Approach

We explore the use of the validation set approach. 

We use train_test_split() to split the data into training and validation sets. Then we can fit a model using only the observations from the training set.

In [4]:
Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(Auto, test_size=196, random_state=0)

In [5]:
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
y_train = Auto_train['mpg']
model = sm.OLS(y_train, X_train)
results = model.fit()

We now use the predict() method of results on the validation data. We also calculate the validation MSE for our model. It turns out our Test MSE is 23.6

In [6]:
X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
valid_pred = results.predict(X_valid)
np.mean((y_valid - valid_pred) ** 2)

23.61661706966988

Let's generalize calculating the Test MSE with a function and let's estimate it for linear, quadratic and cubic fits. 

In [11]:
def evalMSE(terms, response, train, test): 
    mm = MS(terms)
    X_train = mm.fit_transform(train)
    y_train = train[response]

    X_test = mm.transform(test)
    y_test = test[response]

    results = sm.OLS(y_train, X_train).fit()
    test_pred = results.predict(X_test)

    return np.mean((test_pred - y_test ) ** 2)

In [13]:
MSE = np.zeros(3)

for idx, degree in enumerate(range(1,4)): 
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid)

MSE

array([23.61661707, 18.76303135, 18.79694163])

## Cross-Validation

The simplest way to cross validate in python is using sklearn, which has a different interface than statsmodels. 

cross_validate takes an optional parameter cv that indicates the K-fold for K-fold cross validation

In [40]:
model = LinearRegression()

X, Y = Auto[['horsepower']], Auto['mpg']
cv_results = cross_validate(model, X, Y, cv=10) # specifying cv does a k-fold cross validation
# the fact that we provided the shape of the dataset means that we will perform Leave-One-Out CV (LOOCV)
cv_err = np.mean(cv_results['test_score'])
cv_err

0.19549935010445704

We repeat this for various degrees of the polynomial

In [57]:
cv_error = np.zeros(5)

Y = Auto['mpg']
model = LinearRegression()

for idx, degree in enumerate(range(1,6)): 
    col = f'degree_{degree}'
    Auto[col] = np.power(Auto['horsepower'], degree)
    X = Auto[[col]]
    print(Auto[['horsepower', col]])

    cv_results = cross_validate(model, 
                                X, 
                                Y, 
                                cv=10)
    cv_error[idx] = np.mean(cv_results['test_score'])

cv_error


     horsepower  degree_1
0           130       130
1           165       165
2           150       150
3           150       150
4           140       140
..          ...       ...
387          86        86
388          52        52
389          84        84
390          79        79
391          82        82

[392 rows x 2 columns]
     horsepower  degree_2
0           130     16900
1           165     27225
2           150     22500
3           150     22500
4           140     19600
..          ...       ...
387          86      7396
388          52      2704
389          84      7056
390          79      6241
391          82      6724

[392 rows x 2 columns]
     horsepower  degree_3
0           130   2197000
1           165   4492125
2           150   3375000
3           150   3375000
4           140   2744000
..          ...       ...
387          86    636056
388          52    140608
389          84    592704
390          79    493039
391          82    551368

[392 rows x 2 c

array([ 0.19549935, -0.02532989, -0.24670234, -0.41961709, -0.53502275])