In [156]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from statsmodels.api import OLS
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.decomposition  import PCA 
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import (Stepwise ,sklearn_selected ,sklearn_selection_path)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [83]:
hitters = pd.read_csv('Hitters.csv')
hitters.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [84]:
hitters["Salary"].isna().sum()

59

In [85]:
hitters = hitters.dropna();
hitters.shape

(263, 20)

In [86]:
def nCp(sigma2 , estimator , X, Y):
    n, p = X.shape
    Yhat = estimator.predict(X)
    RSS = np.sum((Y - Yhat)**2)
    return -(RSS + 2 * p * sigma2) / n
hitters["NewLeague"].replace("A",1,inplace=True)
hitters["NewLeague"].replace("N",0,inplace=True)
hitters["League"].replace("A",1,inplace=True)
hitters["League"].replace("N",0,inplace=True)
hitters["Division"].replace("E",1,inplace=True)
hitters["Division"].replace("W",0,inplace=True)
hitters.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    int64  
 14  Division   263 non-null    int64  
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    int64  
dtypes: float64(1), 

In [87]:
x=hitters.drop('Salary',axis=1)
y=hitters['Salary']
sigma2=OLS(y,x).fit().scale
sigma2

99571.6261849196

In [88]:
neg_Cp = partial(nCp, sigma2)


In [157]:
def calc_cp(y_true, y_pred, sigma_squared, p, n):
    rss = np.sum((y_true - y_pred) ** 2)
    cp = rss / sigma_squared + 2 * p - n
    return cp

def forward_stepwise_with_cp(x, y):
    selected_features = []
    remaining_features = list(x.columns)
    best_cp = float('inf')
    n = len(y)

    # Estimate the full model to get sigma_squared
    full_model = skl.LinearRegression().fit(x, y)
    sigma_squared = mean_squared_error(y, full_model.predict(x))

    while remaining_features:
        best_feature = None
        for feature in remaining_features:
            features_to_try = selected_features + [feature]
            model = skl.LinearRegression().fit(x[features_to_try], y)
            y_pred = model.predict(x[features_to_try])
            cp = calc_cp(y, y_pred, sigma_squared, len(features_to_try), n)
            if cp < best_cp:
                best_cp = cp
                best_feature = feature

        if best_feature is not None:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break

    return selected_features

In [158]:
HC = forward_stepwise_with_cp(x,y)
HC

['CRBI',
 'Hits',
 'PutOuts',
 'Division',
 'AtBat',
 'Walks',
 'CWalks',
 'CRuns',
 'CAtBat',
 'Assists']