In [1]:
# import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate
from ISLP.models import sklearn_sm

In [2]:
# load data and split into training and test sets
data = load_data('Boston')
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)

In [3]:
# Fit polynomial models of degrees 1-4, calculate validation MSE for each, and report as a numpy array (rounded to 2 decimals)
y_train = train_data['medv']
X_train = train_data[['lstat']] 
y_test = test_data['medv']
X_test = test_data[['lstat']]  
mse_list = []

for degree in range(1, 5):
    # 1. Create polynomial features object
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    
    # 2. Create the polynomial design matrix for the training data
    X_poly_train = poly_features.fit_transform(X_train)
    
    # 3. Add a constant (intercept) for the statsmodels OLS model
    X_poly_train = sm.add_constant(X_poly_train)
    
    # 4. Transform the test data using the *same* polynomial features
    X_poly_test = poly_features.transform(X_test)
    X_poly_test = sm.add_constant(X_poly_test)
    
    # Fit model
    model = sm.OLS(y_train, X_poly_train).fit()
    
    # Predict on test set
    y_pred = model.predict(X_poly_test)
    
    # Calculate MSE
    mse = np.mean((y_test - y_pred) ** 2)
    mse_list.append(mse)

mse_array = np.round(np.array(mse_list), 2)
print("MSE for degrees 1 to 4:")
print(mse_array)

MSE for degrees 1 to 4:
[38.51 30.84 29.22 27.75]


In [2]:
# load College data
data2 = load_data('College')

In [11]:
#
cv_error = np.zeros(5) #array to store CV errors
H = np.array(data2['Room.Board']) #extract horsepower values
M = sklearn_sm(sm.OLS)  #create sklearn-compatible model
X, Y = data2.drop(columns=['Outstate']), data2['Outstate']
for i, d in enumerate(range(1,6)): #loop over polynomial degrees 1 to 5
    X = np.power.outer(H, np.arange(d+1)) #create polynomial features
    M_CV = cross_validate(M, 
                          X,
                          Y,
                          cv=data2.shape[0]) #LOOCV
    cv_error[i] = np.mean(M_CV['test_score']) #average test MSE
cv_error #display LOOCV errors for polynomial degrees 1 to 5
cv_errors_array = np.round(np.array(cv_error), 2)
cv_errors_array

array([9291471.1 , 9255509.68, 9263314.52, 9269448.22, 9300815.64])