This notebook contains an algorithm for best subset selection for linear regression using several selction criteria.

In [3]:
import numpy as np
from sklearn.linear_model import LinearRegression
from itertools import combinations
import pandas as pd

Here are the formulas for various selection criteria for the best subset in a linear regression model.

Some of the notation used:
$$
Y_i = \text{i-th observation}
$$
$$
\bar{Y} = \text{mean of Y}
$$
$$
\hat{Y_i} = \text{i-th prediction}
$$

Total sum of squares, `SSTO`:
$$
SSTO = \sum (Y_i - \bar{Y})^2
$$

Error sum of squares, `SSE`:
$$
SSE = \sum (Y_i - \hat{Y_i})^2
$$

Regression sum of squares, `SSR` (not used in calculations, but included for reference):  
$$
SSR = \sum (\hat{Y_i} - \bar{Y})^2
$$

Relationship between `SSTO`, `SSE`, and `SSR`:
$$
SSTO = SSE + SSR
$$

Mean squared error, `MSE`:
$$
MSE = \frac{SSE}{n-2}
$$

-----

-----

The `p` in the following formulas referes to the subset of `p` variables from the original set of independent variables. For example, if the original `X` has variables `x1`, `x2`, and `x3`, for `p=2`, `Xp` would be `{x1, x2}`, `{x1, x3}`, and `{x2, x3}`, and the criterions would be based on those subsets.

Coefficient of multiple determination, `R2`:
$$
R^{2}_p = 1 - \frac{SSE_p}{SSTO}
$$

Adjusted coefficient of multiple determination, `adj_R2`:  

$$
R^2_{a,p} = 1 - \left(\frac{n-1}{n-p}\right)\frac{SSE_p}{SSTO} = 1 - \frac{MSE_p}{\frac{SSTO}{n-1}}
$$

Mallows's `Cp`:
$$
C_p = \frac{SSE_p}{MSE(X_{1},...,X_{p-1})} - (n-2p)
$$

The following functions will calculate different statistical values.

In [2]:
def SSTO(y):
    
    '''Calculates sum of squares from the mean.'''
    
    y_mean = np.mean(y)
    squared_errors = (y - y_mean)**2
    
    return np.sum(squared_errors)

In [3]:
def SSE(y, predictions):
    
    '''Calculates sum of squared errors between predictions and actual values.'''
    
    squared_errors = (y - predictions)**2
    
    return np.sum(squared_errors)

In [4]:
def adj_R2(_sse, _ssto, n, p):
    
    '''Calculates the adjusted R^2.'''
    
    return 1 - (n-1)/(n-p) * _sse/_ssto

In [5]:
def Cp(sse_p, sse_P, n, p, P):
    
    '''Calculates Mallows's Cp value. Needs sse_p and sse_P to be pre-calculated.'''
    
    return sse_p / (sse_P/(n-P)) - (n - 2*p)

In [6]:
def AIC(_sse, n, p):
    
    '''Calculates the Akaike information criterion'''
    
    return n * np.log(_sse) - n * np.log(n) + 2*p

In [7]:
def SBC(_sse, n, p):
    
    '''Calculates Schwarz Bayesian criterion'''
    
    return n * np.log(_sse) - n * np.log(n) + np.log(n) * p

In [8]:
def PRESS(X, y):
    
    '''Calculates PRESS criterion.'''
    
    lr = LinearRegression()
    pred = np.zeros(y.shape)
    
    for i in range(X.shape[0]):
        y_mod = np.delete(y, i, 0)
        X_mod = np.delete(X, i, 0)
        lr.fit(X_mod, y_mod)
        pred[i] = lr.predict(X[i].reshape(1, -1))
        
    return SSE(y, pred)

Define some objects that will be needed in the main function.

In [None]:
def setDFP(df, P_range):
    df['p'] = P_range
    df.set_index('p', inplace = True)
    
    return df

In [None]:
def assertTypes(X,y):
    if (type(X) != np.ndarray) or (type(y) != np.ndarray):
        raise TypeError('X and y must be numpy arrays')

In [None]:
def asserSizes(X,y):
    if X.shape[0] != y.shape[0]:
        raise ValueError('X and y must have the same number of rows')
    
    if y.shape[1] != 1:
        raise ValueError('y must have a single column')

In [None]:
def getSubsets(X,P):
    subsets = []
    for i in range(1, P):
        combs = combinations(range(X.shape[1]), i)
        for item in combs:
            subsets.append(item)
            
    return subsets

In [None]:
def getIntermediateDFs(X,y,P):
    
    # clean up the comments and code in this function
    # comments for reference are in the original function
    
    SSE_df = pd.DataFrame(columns=['X_var', 'p', 'SSEp'])
    SSE_df.set_index('X_var')
    R2_df = pd.DataFrame(columns=['X_var', 'p', 'R2p'])
    adj_R2_df = pd.DataFrame(columns=['X_var', 'p', 'adj_R2p'])
    C_df = pd.DataFrame(columns=['X_var', 'p', 'Cp', 'Abs_Cp'])
    AIC_df = pd.DataFrame(columns=['X_var', 'p', 'AICp'])
    SBC_df = pd.DataFrame(columns=['X_var', 'p', 'SBCp'])
    PRESS_df = pd.DataFrame(columns=['X_var', 'p', 'PRESSp'])
    
    X_subsets = getSubsets(X,P)
    
    P_range = range(1, P+1)
    
    _ssto = SSTO(y)
    
    lin_reg = LinearRegression()
    
    for i in P_range:
        if i == 1:
            _sse = _ssto
            SSE_df.loc['None'] = ['None', i, _sse]
            R2_df.loc['None'] = ['None', i, 0]
            adj_R2_df.loc['None'] = ['None', i, 0]
            AIC_df.loc['None'] = ['None', i, AIC(_sse, n, i)]
            SBC_df.loc['None'] = ['None', i, SBC(_sse, n, i)]
        else:
            current_subset = [item for item in X_subsets if len(item) == i-1]
            for item in current_subset:
                # fit linear regression to the current subset
                # and use it for predict values when needed
                lin_reg.fit(X[:, item], y)
                y_hat = lin_reg.predict(X[:, item])
                # create var numbers (X1, X2, etc) for dataframe display
                var_numbers = [f'X{num + 1}' for num in item]
                # populate SSE_df
                _sse = SSE(y, y_hat)
                # convert the list of var_numbers into a single string to use for indexing
                # e.g. a list of ['X1', 'X2'] turns into a string "['X1', 'X2']"
                # which can be used a single label
                SSE_df.loc[str(var_numbers)] = [var_numbers, i, _sse]
                # populate R2_df using scikit-learn's 'score' method
                R2_df.loc[str(var_numbers)] = [var_numbers, i, lin_reg.score(X[:, item], y)]
                R2_df.R2p = R2_df.R2p.astype('float')
                # populate the adj_R2, AIC, and SBC dataframes
                adj_R2_df.loc[str(var_numbers)] = [var_numbers, i, adj_R2(_sse, _ssto, n, i)]
                adj_R2_df.adj_R2p = adj_R2_df.adj_R2p.astype('float')
                AIC_df.loc[str(var_numbers)] = [var_numbers, i, AIC(_sse, n, i)]
                SBC_df.loc[str(var_numbers)] = [var_numbers, i, SBC(_sse, n, i)]
                
    for i in P_range:
        if i == 1:
            sse_P = SSE_df.loc[str(var_numbers)].SSEp
                
            Cp_val = Cp(_ssto, sse_P, n, i, P)
                
            #TODO: why is it 'abs(Cp_val - i)'?
            C_df.loc['None'] = ['None', i, Cp_val, abs(Cp_val - i)]
        else:
            current_subset = [item for item in X_subsets if len(item) == i - 1]
            for x in current_subset:
                # create variable names such as 'X1', 'X2', etc
                c_var_numbers = [f'X{num+1}' for num in x]
                    
                # get the _sse value for the current var number
                _sse = SSE_df.loc[str(c_var_numbers)].SSEp
                    
                # calculate Cp value
                Cp_val = Cp(_sse, sse_P, n, i, P)
                    
                # populate C_df; #TODO: why 'abs(Cp_val - i)'?
                C_df.loc[str(c_var_numbers)] = [c_var_numbers, i, Cp_val, abs(Cp_val - i)]
                
    # calculate PRESSp and populate PRESS_df
    PRESS_predictions = np.zeros(y.shape)
    for i in P_range:
        if i == 1:
            for j in range(X.shape[0]):
                # delete a set of y values to be replaced by predictions
                y_mod = np.delete(y, j, 0)
                # in case of no X variables (P=1), use the mean as the prediction
                PRESS_predictions[j] = np.mean(y_mod)
            PRESS_df.loc['None'] = ['None', i, SSE(y, PRESS_predictions)]
        else:
            current_subset = [item for item in X_subsets if len(item) == i-1]
            for x in current_subset:
                PRESS_var_numbers = [f'X{num+1}' for num in x]
                PRESS_df.loc[str(PRESS_var_numbers)] = [PRESS_var_numbers, i,
                                                        PRESS(X[:,x], y)]
                
    return SSE_df, R2_df, adj_R2_df, C_df, AIC_df, SBC_df, PRESS_df

In [9]:
def getBestDFs(P_range, columnList = None):
    if columnList == None:
        columnList = ['p', 'SSEp', 'R^2_p', 'Adj. R^2_p','Cp', 'AICp', 'SBCp', 'PRESSp']
    # DataFrames that will store the best subset related information
    best_values_df = setDFP(pd.DataFrame(columns = columnList), P_range)
    best_subsets_df = setDFP(pd.DataFrame(columns = columnList), P_range)
    
    # merge getting best values and subsets from intermediate dataframes into here
    
    return best_values_df, best_subsets_df

Refactorization ideas:  

- [x] Create functions to check for errors at the beginning of the main function (they could probably go into the `init` function of class)
- [x] `best_values_df` and `best_subsets_df` `p` column setter function. (Combine this with setting other functions? Need more clarification. Not sure what I originally meant.)   
- [x] Can probably take linear regression initialization out of the function. (Maybe not. It's only needed for value calculation, so probably better left inside the main function)
- [x] Subset creation  
- [ ] Merge `getBestDFs` with code that gets best values and subsets from the intermediate dataframes
- [ ] Clean up comments and code in `getIntermediateDFs`  
- [ ] Look into modifying `P_range` from `range(1, P+1)` to `range(P)`  
- [ ] Comment in the main function on which lines have been refactored  
- [ ] Create a separate function to calculate and populate each intermediate dataframe (separate function per criteria/dataframe?)(That would require several iterations over ```P_range``` with one action per step, as opposed to multiple actions for each step.)(I could also just take that whole chunk of code that calculates the dataframes, and keep it as one function, at least for now, and maybe refactor it later.)  
- [x] Probably merge the `setDFP()` function with the code block that populates `best_values` and `best_subsets` DFs.  
- Other things:  
- [ ] Let the user decide which criteria to calculate (take a list of strings for criteria)(start with an output for all of them, and code the option to choose later)  
- [ ] Make it possible to display the intermediate dataframes?
- [ ] Figure out if it's possible to factor out getting the best values  
- [ ] Clean up the display for best subsets (remove the list notation)

Instead of replacing code blocks in the main function, create a second main function that is made up of refactored code.  
If the user will have a choice of which criteria to use, the DF column name setting (done above) will need to be modify to set only columns that are used. Alternatively, I could live that as is and only populate columns that are used.  
Probably complete refactoring before turning this into a class.  
~~Perhaps look into adding other selections criteria. (Might have trouble finding verification data.)~~  
Eventually turn this whole thing into a class where an object can be created by passing a dataset to the function.  
For class objects, make them "lazy", that is don't calculate anything until explicitly told to do so.  
Use code from "Distribution" class for reference.

In [10]:
# the main function that will use the criterion calculations
# to determine the best subsets for regression
def get_subsets(X, y, P):
    
    # (refactored this section)
    # (added)
    # make sure that both X and y are numpy arrays
    if (type(X) != np.ndarray) or (type(y) != np.ndarray):
        raise TypeError('X and y must be numpy arrays')
    
    # (refactored this section)
    # (added)
    # check to makes sure we have the same number of rows in X and y
    if X.shape[0] != y.shape[0]:
        raise ValueError('X and y must have the same number of rows')
        
    # (added)
    # set n as the number of observations
    n = X.shape[0]
    
    # (added)
    # create a range of values 1 through P for the numbers of variables in the subsets
    P_range = range(1, P+1)
    
    # (refactored this section)
    # (added)
    # create subsets of X consisting of 1 through P variables
    # first, create an empty list to hold the tuples of subsets
    X_subsets = []
    # create combinations of subsets using the 'combinations' function
    # and iterating over values in range equal to the number of X variables
    for i in range(1, P):
        combs = combinations(range(X.shape[1]), i)
        for item in combs:
            X_subsets.append(item)
    
    # (copied to refactored function)
    # create intermediate dataframes to hold criterion values
    SSE_df = pd.DataFrame(columns=['X_var', 'p', 'SSEp'])
    SSE_df.set_index('X_var')
    R2_df = pd.DataFrame(columns=['X_var', 'p', 'R2p'])
    adj_R2_df = pd.DataFrame(columns=['X_var', 'p', 'adj_R2p'])
    C_df = pd.DataFrame(columns=['X_var', 'p', 'Cp', 'Abs_Cp'])
    AIC_df = pd.DataFrame(columns=['X_var', 'p', 'AICp'])
    SBC_df = pd.DataFrame(columns=['X_var', 'p', 'SBCp'])
    PRESS_df = pd.DataFrame(columns=['X_var', 'p', 'PRESSp'])
    
    # (copied to refactored function)
    # calculate SSTO for y
    # SSTO will be the same for any subset of X (look at the formula above for details)
    _ssto = SSTO(y)
    
    # (copied to refactored function)
    # Scikit-learn linear regression used in calculations
    lin_reg = LinearRegression()
    
    # (copied to refactored function)
    # (create a function for this)
    # populate SSE, R2, adj_R2, AIC, and SBC values in the respective dataframes
    for i in P_range:
        # p = 1 means using just the constant with no subset of X.
        # Hence, all entries for X_var are 'None'.
        if i == 1:
            _sse = _ssto
            SSE_df.loc['None'] = ['None', i, _sse]
            R2_df.loc['None'] = ['None', i, 0]
            adj_R2_df.loc['None'] = ['None', i, 0]
            AIC_df.loc['None'] = ['None', i, AIC(_sse, n, i)]
            SBC_df.loc['None'] = ['None', i, SBC(_sse, n, i)]
        else:
            # get only subsets that consist only of i-1 variables
            current_subset = [item for item in X_subsets if len(item) == i-1]
            # calculate criterions for the current_subset
            for item in current_subset:
                # fit linear regression to the current subset
                # and use it for predict values when needed
                lin_reg.fit(X[:, item], y)
                y_hat = lin_reg.predict(X[:, item])
                # create var numbers (X1, X2, etc) for dataframe display
                var_numbers = [f'X{num + 1}' for num in item]
                # populate SSE_df
                _sse = SSE(y, y_hat)
                # convert the list of var_numbers into a single string to use for indexing
                # e.g. a list of ['X1', 'X2'] turns into a string "['X1', 'X2']"
                # which can be used a single label
                SSE_df.loc[str(var_numbers)] = [var_numbers, i, _sse]
                # populate R2_df using scikit-learn's 'score' method
                R2_df.loc[str(var_numbers)] = [var_numbers, i, lin_reg.score(X[:, item], y)]
                R2_df.R2p = R2_df.R2p.astype('float')
                # populate the adj_R2, AIC, and SBC dataframes
                adj_R2_df.loc[str(var_numbers)] = [var_numbers, i, adj_R2(_sse, _ssto, n, i)]
                adj_R2_df.adj_R2p = adj_R2_df.adj_R2p.astype('float')
                AIC_df.loc[str(var_numbers)] = [var_numbers, i, AIC(_sse, n, i)]
                SBC_df.loc[str(var_numbers)] = [var_numbers, i, SBC(_sse, n, i)]
                
    # (copied to refactored function)
    # (create a function for this)
    # Calculate Cp values and populate C_df
    for i in P_range:
        if i == 1:
            # get SSEp value for the the whole set of variables
            # which can be extracted from SSE_df using the last var_numbers values
            sse_P = SSE_df.loc[str(var_numbers)].SSEp
                
            # calculate Cp values using sse_P and _ssto
            # (_ssto is used in place of sse_p since 
            # we're not using a subset of X for this specific calculation)
            Cp_val = Cp(_ssto, sse_P, n, i, P)
                
            # enter the value into C_df
            #TODO: why is it 'abs(Cp_val - i)'?
            C_df.loc['None'] = ['None', i, Cp_val, abs(Cp_val - i)]
        else:
            current_subset = [item for item in X_subsets if len(item) == i - 1]
            for x in current_subset:
                # create variable names such as 'X1', 'X2', etc
                c_var_numbers = [f'X{num+1}' for num in x]
                    
                # get the _sse value for the current var number
                _sse = SSE_df.loc[str(c_var_numbers)].SSEp
                    
                # calculate Cp value
                Cp_val = Cp(_sse, sse_P, n, i, P)
                    
                # populate C_df; #TODO: why 'abs(Cp_val - i)'?
                C_df.loc[str(c_var_numbers)] = [c_var_numbers, i, Cp_val, abs(Cp_val - i)]
                    
    # (copied to refactored function)
    # (create a function for this)
    # calculate PRESSp and populate PRESS_df
    PRESS_predictions = np.zeros(y.shape)
    for i in P_range:
        if i == 1:
            for j in range(X.shape[0]):
                # delete a set of y values to be replaced by predictions
                y_mod = np.delete(y, j, 0)
                # in case of no X variables (P=1), use the mean as the prediction
                PRESS_predictions[j] = np.mean(y_mod)
            PRESS_df.loc['None'] = ['None', i, SSE(y, PRESS_predictions)]
        else:
            current_subset = [item for item in X_subsets if len(item) == i-1]
            for x in current_subset:
                PRESS_var_numbers = [f'X{num+1}' for num in x]
                PRESS_df.loc[str(PRESS_var_numbers)] = [PRESS_var_numbers, i,
                                                        PRESS(X[:,x], y)]
    
    # (refactored this section)
    # (added)
    # initialize best_values and best_subsets DFs
    columnList = ['p', 'SSEp', 'R^2_p', 'Adj. R^2_p','Cp', 'AICp', 'SBCp', 'PRESSp']
    best_values_df = pd.DataFrame(columns = columnList)
    best_subsets_df = pd.DataFrame(columns = columnList)
    
    # (refactored this section)
    # (added)
    # for both dataframes best_values_df and best_subsets_df,
    # set values in the 'p' column to P_range values, and set that column as the index
    # turn this into its own function
    # then it can be applied to any dataframe
    best_values_df['p'] = P_range
    best_values_df.set_index('p', inplace = True)
    best_subsets_df['p'] = P_range
    best_subsets_df.set_index('p', inplace = True)
    
    # (probably create a function for this)
    # (since best_values and best_subsets DFs are not used until this point,)
    # (it would make sense to move the earlier code down here and combine the chunks)
    # (into one function)
    for i in P_range:
        best_values_df.loc[i, 'SSEp'] = SSE_df[SSE_df.p == i].min().SSEp
        best_subsets_df.loc[i, 'SSEp'] = SSE_df[SSE_df.p == i].SSEp.idxmin()
        best_values_df.loc[i, 'R^2_p'] = R2_df[R2_df.p == i].max().R2p
        best_subsets_df.loc[i, 'R^2_p'] = R2_df[R2_df.p == i].R2p.idxmax()
        best_values_df.loc[i, 'Adj. R^2_p'] = adj_R2_df[adj_R2_df.p == i].max().adj_R2p
        best_subsets_df.loc[i, 'Adj. R^2_p'] = adj_R2_df[adj_R2_df.p == i].adj_R2p.idxmax()
        best_values_df.loc[i, 'Cp'] = C_df[C_df.p == i].min().Cp
        best_subsets_df.loc[i, 'Cp'] = C_df[C_df.p == i].Abs_Cp.idxmin()
        best_values_df.loc[i, 'AICp'] = AIC_df[AIC_df.p == i].min().AICp
        best_subsets_df.loc[i, 'AICp'] = AIC_df[AIC_df.p == i].AICp.idxmin()
        best_values_df.loc[i, 'SBCp'] = SBC_df[SBC_df.p == i].min().SBCp
        best_subsets_df.loc[i, 'SBCp'] = SBC_df[SBC_df.p == i].SBCp.idxmin()
        best_values_df.loc[i, 'PRESSp'] = PRESS_df[PRESS_df.p == i].min().PRESSp
        best_subsets_df.loc[i, 'PRESSp'] = PRESS_df[PRESS_df.p == i].PRESSp.idxmin()
        
    # (probably set the function to return best_val and best_subset DFs,)
    # (and use another function to display them)
    display('Best Values for Criteria')
    display(best_values_df)
    print('\n')
    display('Best Subsets for Criteria')
    display(best_subsets_df)

In [None]:
def get_subsets_refactored(X, y, P):
    
    assertTypes(X,y)
    
    asserSizes(X,y)
    
    # might not need this line
    n = X.shape[0]
    
    P_range = range(1, P+1)
    
    # getBestDFs returns blank DFs with p set as the index
    # will probably modify this function again so that it returns populated DFs
    best_values_df, best_subsets_df = getBestDFs(P_range)
    
    

In [4]:
np.set_printoptions(suppress=True)
data = np.loadtxt('Surgical Data.txt')
print(data)

[[   6.7     62.      81.       2.59    50.       0.       1.       0.
   695.       6.544]
 [   5.1     59.      66.       1.7     39.       0.       0.       0.
   403.       5.999]
 [   7.4     57.      83.       2.16    55.       0.       0.       0.
   710.       6.565]
 [   6.5     73.      41.       2.01    48.       0.       0.       0.
   349.       5.854]
 [   7.8     65.     115.       4.3     45.       0.       0.       1.
  2343.       7.759]
 [   5.8     38.      72.       1.42    65.       1.       1.       0.
   348.       5.852]
 [   5.7     46.      63.       1.91    49.       1.       0.       1.
   518.       6.25 ]
 [   3.7     68.      81.       2.57    69.       1.       1.       0.
   749.       6.619]
 [   6.      67.      93.       2.5     58.       0.       1.       0.
  1056.       6.962]
 [   3.7     76.      94.       2.4     48.       0.       1.       0.
   968.       6.875]
 [   6.3     84.      83.       4.13    37.       0.       1.       0.
   745.  

In [5]:
data_x = data[:,:-2]
data_y = data[:, -1]
print(data_x)
print(data_y)

[[  6.7   62.    81.     2.59  50.     0.     1.     0.  ]
 [  5.1   59.    66.     1.7   39.     0.     0.     0.  ]
 [  7.4   57.    83.     2.16  55.     0.     0.     0.  ]
 [  6.5   73.    41.     2.01  48.     0.     0.     0.  ]
 [  7.8   65.   115.     4.3   45.     0.     0.     1.  ]
 [  5.8   38.    72.     1.42  65.     1.     1.     0.  ]
 [  5.7   46.    63.     1.91  49.     1.     0.     1.  ]
 [  3.7   68.    81.     2.57  69.     1.     1.     0.  ]
 [  6.    67.    93.     2.5   58.     0.     1.     0.  ]
 [  3.7   76.    94.     2.4   48.     0.     1.     0.  ]
 [  6.3   84.    83.     4.13  37.     0.     1.     0.  ]
 [  6.7   51.    43.     1.86  57.     0.     1.     0.  ]
 [  5.8   96.   114.     3.95  63.     1.     0.     0.  ]
 [  5.8   83.    88.     3.95  52.     1.     0.     0.  ]
 [  7.7   62.    67.     3.4   58.     0.     0.     1.  ]
 [  7.4   74.    68.     2.4   64.     1.     1.     0.  ]
 [  6.    85.    28.     2.98  36.     1.     1.     0. 

In [14]:
get_subsets(data_x, data_y, 9)

'Best Values for Criteria'

Unnamed: 0_level_0,SSEp,R^2_p,Adj. R^2_p,Cp,AICp,SBCp,PRESSp
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,12.8077,0.0,0.0,240.452,-75.7025,-73.7135,13.2956
2,7.33157,0.427566,0.416558,117.409,-103.827,-99.8489,8.02496
3,4.31249,0.66329,0.650086,50.4716,-130.483,-124.516,5.06534
4,2.84288,0.778034,0.764716,18.9145,-150.985,-143.029,3.4694
5,2.1788,0.829884,0.815997,5.75077,-163.351,-153.406,2.73777
6,2.08201,0.837441,0.820508,5.54064,-163.805,-151.871,2.73893
7,2.00523,0.843436,0.823449,5.78739,-163.834,-149.911,2.77233
8,1.97203,0.846028,0.822597,7.02946,-162.736,-146.824,2.80871
9,1.97074,0.846129,0.818774,9.0,-160.771,-142.87,2.93123






'Best Subsets for Criteria'

Unnamed: 0_level_0,SSEp,R^2_p,Adj. R^2_p,Cp,AICp,SBCp,PRESSp
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,,,,
2,['X3'],['X3'],['X3'],['X3'],['X3'],['X3'],['X4']
3,"['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']"
4,"['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']"
5,"['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']"
6,"['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X8']"
7,"['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']"
8,"['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']"
9,"['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']"


In [1]:
class SubsetSelection:
    
    def __init__(self, data_x, data_y):
        '''
        Class for best subset selection for linear regression
        
        inputs:
        data_x: the x vector
        data_y: the y vector
        '''
        self.data_x = data_x
        self.data_y = data_y

In [6]:
s = SubsetSelection(data_x, data_y)
print(type(s))

<class '__main__.SubsetSelection'>
