In [11]:
import numpy as np
import pandas as pd
from ISLP.models import (Stepwise, sklearn_selected, sklearn_selection_path)
from statsmodels.api import OLS
from functools import partial
from ISLP.models import ModelSpec as MS
from ISLP.models import poly

In [12]:
rng = np.random
X = rng.normal(size=100)
eps = rng.normal(size=100, scale=1)

In [13]:
Y = 1 + X + 2 * X**2 + 3 * X**3 + eps
df = pd.DataFrame({'X':X,'Y':Y})
df

Unnamed: 0,X,Y
0,1.260228,12.989950
1,-0.654786,-0.802354
2,0.089551,0.627585
3,0.876281,5.267089
4,0.733842,3.516478
...,...,...
95,-0.103460,2.195194
96,-0.775750,1.503018
97,-2.449521,-33.483311
98,-2.584028,-40.677468


In [14]:
def nCp(sigma2, estimator, X, Y):
    n, p = X.shape
    Yhat = estimator.predict(X)
    RSS = np.sum((Y - Yhat)**2)
    # Formula for C_p
    return -(RSS + 2 * p * sigma2) / n

In [15]:
# We need to estimate sigma^2
# Fit the biggest model with all p and use MSE as sigma^2
design = MS([poly('X',degree=10)]).fit(df)
X = design.transform(df)
df = pd.DataFrame(X)
df['Y'] = Y
sigma2 = OLS(Y,X).fit().scale
df

Unnamed: 0,intercept,"poly(X, degree=10)[0]","poly(X, degree=10)[1]","poly(X, degree=10)[2]","poly(X, degree=10)[3]","poly(X, degree=10)[4]","poly(X, degree=10)[5]","poly(X, degree=10)[6]","poly(X, degree=10)[7]","poly(X, degree=10)[8]","poly(X, degree=10)[9]",Y
0,1.0,0.113092,0.047276,-0.068865,-0.102102,-0.038965,0.066590,0.123409,0.062668,-0.078014,-0.153675,12.989950
1,1.0,-0.049828,-0.071404,0.076678,0.029437,-0.082013,-0.009870,0.088598,-0.024586,-0.070573,0.054061,-0.802354
2,1.0,0.013497,-0.076000,-0.023299,0.079929,0.041260,-0.075672,-0.067876,0.059520,0.058241,-0.053583,0.627585
3,1.0,0.080428,-0.010739,-0.087332,-0.032641,0.059116,0.086963,0.019647,-0.066480,-0.047152,0.034467,5.267089
4,1.0,0.068310,-0.027897,-0.083971,-0.004754,0.077833,0.064489,-0.028279,-0.077681,0.003223,0.086564,3.516478
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,-0.002924,-0.081003,0.003087,0.083250,0.005628,-0.090167,-0.021597,0.083396,-0.002222,-0.090637,2.195194
96,1.0,-0.060119,-0.064563,0.089677,0.007160,-0.089083,0.024409,0.084915,-0.062503,-0.035941,0.085172,1.503018
97,1.0,-0.202515,0.204954,-0.126541,0.021593,0.089835,-0.168012,0.184376,-0.154342,0.120708,-0.047269,-33.483311
98,1.0,-0.213958,0.240771,-0.194619,0.128747,-0.016466,-0.112528,0.215732,-0.279792,0.362309,-0.378887,-40.677468


In [16]:
# The function sklearn_selected() expects a function with three args
# So we freeze sigma2
neg_Cp = partial(nCp, sigma2)

In [17]:
# Creating strategy for model selection

strategy = Stepwise.first_peak(design,
                               direction='forward',
                               max_terms=len(design.terms))

In [19]:
cp = sklearn_selected(OLS,
                      strategy,
                      scoring=neg_Cp)
cp.fit(df,Y)

KeyError: "None of [Index(['X'], dtype='object')] are in the [columns]"