In [2]:
# import necessary libraries
import numpy as np, pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (summarize,
                         poly,
                         ModelSpec as MS)
from statsmodels.stats.anova import anova_lm
from pygam import (s as s_gam,
                   l as l_gam,
                   f as f_gam,
                   LinearGAM,
                   LogisticGAM) 

from ISLP.transforms import (BSpline,
                             NaturalSpline)
from ISLP.models import bs, ns
from ISLP.pygam import (approx_lam,
                        degrees_of_freedom,
                        plot as plot_gam,
                        anova as anova_gam)

Question 1

In [4]:
# load the Auto dataset
Auto = load_data('Auto').dropna()
mpg = Auto['mpg']
hp = Auto['horsepower']

In [8]:
models = [MS([poly('horsepower', degree=d)]) 
          for d in range(1, 6)]
Xs = [model.fit_transform(Auto) for model in models] #fit the models and get the design matrices
anova_lm(*[sm.OLS(mpg, X_).fit()
           for X_ in Xs]) #perform ANOVA to compare the models


Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,390.0,9385.915872,0.0,,,
1,389.0,7442.029412,1.0,1943.88646,103.876722,8.878263e-22
2,388.0,7426.436007,1.0,15.593405,0.833275,0.3618941
3,387.0,7399.522632,1.0,26.913375,1.438187,0.2311666
4,386.0,7223.371686,1.0,176.150946,9.413092,0.002306428


In [10]:
# Let's interpret the ANOVA results
print("ANOVA Results Interpretation:")
print("============================")

print("Specific comparison requested:")
print("Degree 4 (quartic) vs Degree 3 (cubic):")
print(f"F-statistic: {1.44:.6f}")
print(f"p-value: {0.231:.6f}")
print()
print("Statistical Justification (α = 0.05):")
print("- Degree 2 vs Degree 1: Significant (p < 0.001)")
print("- Degree 3 vs Degree 2: Not significant (p = 0.362)")
print("- Degree 4 vs Degree 3: Not significant (p = 0.231)")  
print("- Degree 5 vs Degree 4: Significant (p = 0.002)")


ANOVA Results Interpretation:
Specific comparison requested:
Degree 4 (quartic) vs Degree 3 (cubic):
F-statistic: 1.440000
p-value: 0.231000

Statistical Justification (α = 0.05):
- Degree 2 vs Degree 1: Significant (p < 0.001)
- Degree 3 vs Degree 2: Not significant (p = 0.362)
- Degree 4 vs Degree 3: Not significant (p = 0.231)
- Degree 5 vs Degree 4: Significant (p = 0.002)


The degree 2 (quadratic) model is the statistically justified polynomial degree based on the ANOVA results. The comparison between degree 2 vs degree 1 shows a highly significant improvement (F = 103.88, p < 0.001)
However, the comparison between degree 3 vs degree 2 is not significant (F = 0.83, p = 0.362) and the comparison between degree 4 vs degree 3 is not significant (F = 1.44, p = 0.231). Adding higher-order terms (cubic or quartic) does not significantly improve the model fit and would constitute overfitting.

Question 2

In [11]:
#load Boston dataset
Boston = load_data('Boston').dropna()
medv = Boston['medv']
lstat = Boston['lstat']

In [12]:
bs_ = BSpline(internal_knots=[10,20,30], intercept=True).fit(lstat)
bs_lstat = bs_.transform(lstat)

In [14]:
M = sm.OLS(medv, bs_lstat).fit()
summarize(M) 

Unnamed: 0,coef,std err,t,P>|t|
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[0]",51.7319,1.819,28.444,0.0
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[1]",27.0811,1.287,21.034,0.0
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[2]",22.855,1.371,16.673,0.0
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[3]",13.5147,1.532,8.82,0.0
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[4]",10.4277,2.491,4.185,0.0
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[5]",12.346,4.024,3.068,0.002
"BSpline(intercept=True, internal_knots=[10, 20, 30], lower_bound=np.float64(1.73), upper_bound=np.float64(37.97))[6]",11.7912,4.411,2.673,0.008


In [17]:
# Get R-squared and number of basis functions
print("B-spline Model Results:")
print("======================")
print(f"R-squared: {M.rsquared:.6f}")
print(f"Adjusted R-squared: {M.rsquared_adj:.6f}")
print()
print("B-spline Transformation Details:")
print(f"Shape of B-spline matrix: {bs_lstat.shape}")
print(f"Number of observations: {bs_lstat.shape[0]}")
print(f"Number of basis functions (columns): {bs_lstat.shape[1]}")

B-spline Model Results:
R-squared: 0.683488
Adjusted R-squared: 0.679682

B-spline Transformation Details:
Shape of B-spline matrix: (506, 7)
Number of observations: 506
Number of basis functions (columns): 7
