In [None]:
#Q1

!pip install ISLP
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Boston = load_data('Boston')      
X = Boston[['lstat']].values       
y = Boston['medv'].values 

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    train_size=253,     
    test_size=253,      
    random_state=42      
)

mse_results = []
for degree in range(1, 5):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    
    y_val_pred = model.predict(X_val_poly)
    mse = mean_squared_error(y_val, y_val_pred)
    mse_results.append((degree, mse))

results = np.array([mse for _, mse in mse_results])
results = np.round(results, 2)
print(results)

[38.51 30.84 29.22 27.75]


# Q1
The validation MSE values (rounded to 2 decimals) are:

[38.51 30.84 29.22 27.75]



In [None]:
# Q2
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import sklearn_sm, ModelSpec   
from sklearn.model_selection import cross_validate

College = load_data('College')
y = College['Outstate'].values
n = College.shape[0]

def make_poly_X(df, d):
    X = pd.DataFrame(index=df.index)
    X['RB'] = df['Room.Board']
    for k in range(2, d+1):
        X[f'RB{k}'] = df['Room.Board'] ** k
    return X

cv_mse = []
for d in range(1, 6):
    Xd = make_poly_X(College, d)
    features = list(Xd.columns)
    est = sklearn_sm(sm.OLS, ModelSpec(features))   

    cv_res = cross_validate(
        est, Xd, y,
        cv=n,
        scoring='neg_mean_squared_error',
        return_train_score=False
    )
    mean_mse = -np.mean(cv_res['test_score'])
    cv_mse.append(mean_mse)

cv_mse_array = np.round(np.array(cv_mse, dtype=float), 2)
cv_mse_array


array([9291471.1 , 9255509.68, 9263314.52, 9269359.81, 9300815.83])

# Q2

The results (rounded to 2 decimals) are:

array([9291471.10, 9255509.68, 9263314.52, 9269359.81, 9300815.83])

The smallest error occurs at degree 2.

In [None]:
# Q3
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

College = load_data('College')
X = College[['Room.Board']].values
y = College['Outstate'].values

kf5 = KFold(n_splits=5, shuffle=True, random_state=123)
kf10 = KFold(n_splits=10, shuffle=True, random_state=123)

mse_5 = []
mse_10 = []

for d in [1, 2, 3]:
    est = Pipeline([
        ('poly', PolynomialFeatures(degree=d, include_bias=False)),
        ('lin', LinearRegression())
    ])
    
    # 5-fold CV
    cv_res5 = cross_validate(
        est, X, y,
        cv=kf5,
        scoring='neg_mean_squared_error',
        return_train_score=False
    )
    mean_mse5 = -np.mean(cv_res5['test_score'])
    mse_5.append(mean_mse5)
    
    # 10-fold CV
    cv_res10 = cross_validate(
        est, X, y,
        cv=kf10,
        scoring='neg_mean_squared_error',
        return_train_score=False
    )
    mean_mse10 = -np.mean(cv_res10['test_score'])
    mse_10.append(mean_mse10)

mse_diff = np.array(mse_5) - np.array(mse_10)

mse_diff_rounded = np.round(mse_diff, 3)
mse_diff_rounded


array([62394.573, 64506.702, 91919.397])

# Q3
The reported values correspond to (5-fold error – 10-fold error) for each degree, rounded to 3 decimals:

array([62394.573, 64506.702, 91919.397])

These results indicate that 10-fold CV consistently yields lower validation errors compared to 5-fold CV.

In [11]:
# Q4
import numpy as np
import pandas as pd
from ISLP import load_data

Default = load_data('Default')

def corr_balance_income(data, idx=None):
    if idx is None: 
        sample = data
    else: 
        sample = data.iloc[idx]
    return np.corrcoef(sample['balance'], sample['income'])[0, 1]

corr_true = corr_balance_income(Default)

B = 2000
rng = np.random.default_rng(456)  
corr_boot = []

n = len(Default)
for _ in range(B):
    idx = rng.integers(low=0, high=n, size=n) 
    corr_boot.append(corr_balance_income(Default, idx))

se_boot = np.std(corr_boot, ddof=1)

corr_true_rounded = np.round(corr_true, 4)
se_boot_rounded = np.round(se_boot, 4)

corr_true_rounded, se_boot_rounded

(np.float64(-0.1522), np.float64(0.0098))

# Q4

The correlation coefficient is -0.1522, and the bootstrap standard error is 0.0098.

In [12]:
# Q5
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP import load_data

Wage = load_data('Wage')

X_quad = pd.DataFrame({
    'age': Wage['age'],
    'age2': Wage['age']**2
})
X_quad = sm.add_constant(X_quad) 
y = Wage['wage']

ols_model = sm.OLS(y, X_quad).fit()
ols_se = ols_model.bse.values   

B = 1500
rng = np.random.default_rng(789)
coef_boot = []

n = len(Wage)
for _ in range(B):
    idx = rng.integers(low=0, high=n, size=n)
    Xb = X_quad.iloc[idx]
    yb = y.iloc[idx]
    model_b = sm.OLS(yb, Xb).fit()
    coef_boot.append(model_b.params.values)

coef_boot = np.array(coef_boot)
boot_se = coef_boot.std(axis=0, ddof=1)   

ratios = boot_se / ols_se
ratios_rounded = np.round(ratios, 3)

ratios_rounded


array([0.75 , 0.808, 0.841])

# Q5

The ratios of bootstrap SE to OLS SE for the three coefficients (intercept, age, age_square) are:

array([0.750, 0.808, 0.841])

This indicates that the bootstrap estimates of standard errors are consistently smaller than the traditional OLS standard errors.