In [5]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

In [2]:
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """ Perform a forward-backward (stepwise) feature selection
    based on p-value from statsmodels.api.OLS

    Each pass first tries to add the not-yet-selected column with the
    smallest p-value (forward step), then tries to drop the selected
    column with the largest p-value (backward step). The loop stops at
    the first pass in which nothing is added or dropped.

    Arguments:
        X - pandas.DataFrame of numeric features
        y - vector, series of the target
        initial_list - list of features to start with (column names of X);
                       None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions

    Returns: list of selected features, in the order they were added

    Note: keep threshold_in below threshold_out. A feature whose p-value
    lies between the two (p < threshold_in and p > threshold_out) can be
    added and dropped again on every pass, so the loop may never end.

    Example Call: stepwise_selection(X, y)
    """
    # Copy so the caller's list is never modified.
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed = False

        # forward step
        # Keep the candidates in X's column order (de-duplicated) so that
        # ties between equal p-values are resolved the same way on every run.
        excluded = list(dict.fromkeys(
            col for col in X.columns if col not in included))
        # Explicit float dtype: without it the empty Series is created with
        # object dtype on current pandas and idxmin() below can fail.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when there are no candidates left
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add  {best_feature} with p-value {best_pval:.4f}')

        # backward step
        # With nothing selected there is nothing to drop, so skip the fit.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # use all coefs except intercept
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print(f'Drop {worst_feature} with p-value {worst_pval:.4f}')

        if not changed:
            break
    return included


In [3]:
# Load the baseball salary data set into `df`.
# NOTE: the location below is an absolute path on the original author's
# machine; adjust it when running the notebook elsewhere.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\trush\OneDrive\Documents\WFU Grad School Info\BAN 6025 Machine Learning\baseball_salary.csv"
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,name,team,no_atbat,no_hits,no_home,no_runs,no_rbi,no_bb,yr_major,cr_atbat,...,cr_runs,cr_rbi,cr_bb,no_outs,no_assts,no_error,salary,league,division,position
0,"Robidoux, Billy Jo",Milwaukee,181,41,1,15,21,33,2,232,...,20,29,45,326,29,5,67.5,American,East,1B
1,"Kingery, Mike",KansasCity,209,54,3,25,14,12,1,209,...,25,14,12,102,6,3,68.0,American,West,OF
2,"Braggs, Glenn",Milwaukee,215,51,4,19,18,11,1,215,...,19,18,11,116,5,12,70.0,American,East,LF
3,"Ford, Curt",StLouis,214,53,2,30,29,23,2,226,...,32,32,27,109,7,3,70.0,National,East,OF
4,"Newman, Al",Montreal,185,37,1,23,8,21,2,214,...,30,9,24,76,127,7,70.0,National,East,2B


In [11]:
# Text columns shown by df.head() (player name, team, league, division,
# position). stepwise_selection expects numeric features only, so they are
# left out of the candidate set.
NON_FEATURE_COLUMNS = ['name', 'team', 'league', 'division', 'position']

# Build X as a new frame instead of overwriting `df`, so this cell gives the
# same result on a fresh kernel and when it is re-run. errors='ignore' keeps
# it working even if those columns were already removed from `df`.
X = df.drop(columns=['salary']).drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y = df['salary']

features = stepwise_selection(X, y)

print(features)

Add  cr_rbi with p-value 0.0000
Add  no_hits with p-value 0.0000
Add  no_bb with p-value 0.0008
Add  no_atbat with p-value 0.0015
Add  no_outs with p-value 0.0012
['cr_rbi', 'no_hits', 'no_bb', 'no_atbat', 'no_outs']
