In [386]:
'''
Function needs:
y - a vector of n observations for response variable
x - a matrix with values for predictors
enter_alpha -
exit_alpha -
p - number of parameters for the model

Outputs:
(a) a matrix having P rows (see Table 9.3) whose p-th row
consists of the best values of different "model selection criteria"
among all subsets of predictors having size p ( p=1, 2, ..., P). 

(b) the best subset (among all subsets of size p) for each
p=1, 2, ..., P and for each of the criteria.

(c) the subset of predictors obtained from the forward
stepwise procedure described in class (see page 364 of the textbook).
'''

'\nFunction needs:\ny - a vector of n observations for response variable\nx - a matrix with values for predictors\nenter_alpha -\nexit_alpha -\np - number of parameters for the model\n\nOutputs:\n(a) a matrix having P rows (see Table 9.3) whose p-th row\nconsists of the best values of different "model selection criteria"\namong all subsets of predictors having size p ( p=1, 2, ..., P). \n\n(b) the best subset (among all subsets of size p) for each\np=1, 2, ..., P and for each of the criteria.\n\n(c) the subset of predictors obtained from the forward\nstepwise procedure described in class (see page 364 if the text book).\n'

In [396]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression

In [387]:
# Test data used to verify the criteria calculations.
# Response values as a column vector: 54 observations, shape (54, 1).
# NOTE(review): the "ln" in the name suggests these are log-transformed
# responses -- confirm against the data source.

surgical_data_ln_y_vector = np.array([[6.544],[5.999],[6.565],[5.854],[7.759],[5.852],[6.25],[6.619],
                                      [6.962],[6.875],[6.613],[5.549],[7.361],[6.754],[6.554],[6.695],
                                      [6.526],[5.321],[6.309],[6.731],[5.883],[5.866],[6.395],[6.332],
                                      [6.478],[6.621],[6.302],[7.583],[6.167],[6.396],[6.094],[5.198],
                                      [6.019],[6.944],[6.179],[6.453],[6.519],[5.893],[6.457],[6.558],
                                      [6.283],[6.366],[7.147],[6.288],[6.178],[6.416],[6.867],[7.17],
                                      [6.365],[6.983],[6.005],[6.361],[6.31],[6.478]])

# Predictor matrix for the test data: 54 rows (one per observation, in the
# same order as surgical_data_ln_y_vector) by 8 columns. project_function
# labels the columns X1..X8 from left to right.
X_matrix_surgical_data = np.array([[6.7,62,81,2.59,50,0,1,0],
[5.1,59,66,1.7,39,0,0,0],
[7.4,57,83,2.16,55,0,0,0],
[6.5,73,41,2.01,48,0,0,0],
[7.8,65,115,4.3,45,0,0,1],
[5.8,38,72,1.42,65,1,1,0],
[5.7,46,63,1.91,49,1,0,1],
[3.7,68,81,2.57,69,1,1,0],
[6,67,93,2.5,58,0,1,0],
[3.7,76,94,2.4,48,0,1,0],
[6.3,84,83,4.13,37,0,1,0],
[6.7,51,43,1.86,57,0,1,0],
[5.8,96,114,3.95,63,1,0,0],
[5.8,83,88,3.95,52,1,0,0],
[7.7,62,67,3.4,58,0,0,1],
[7.4,74,68,2.4,64,1,1,0],
[6,85,28,2.98,36,1,1,0],
[3.7,51,41,1.55,39,0,0,0],
[7.3,68,74,3.56,59,1,0,0],
[5.6,57,87,3.02,63,0,0,1],
[5.2,52,76,2.85,39,0,0,0],
[3.4,83,53,1.12,67,1,1,0],
[6.7,26,68,2.1,30,0,0,1],
[5.8,67,86,3.4,49,1,1,0],
[6.3,59,100,2.95,36,1,1,0],
[5.8,61,73,3.5,62,1,1,0],
[5.2,52,86,2.45,70,0,1,0],
[11.2,76,90,5.59,58,1,0,1],
[5.2,54,56,2.71,44,1,0,0],
[5.8,76,59,2.58,61,1,1,0],
[3.2,64,65,0.74,53,0,1,0],
[8.7,45,23,2.52,68,0,1,0],
[5,59,73,3.5,57,0,1,0],
[5.8,72,93,3.3,39,1,0,1],
[5.4,58,70,2.64,31,1,1,0],
[5.3,51,99,2.6,48,0,1,0],
[2.6,74,86,2.05,45,0,0,0],
[4.3,8,119,2.85,65,1,0,0],
[4.8,61,76,2.45,51,1,1,0],
[5.4,52,88,1.81,40,1,0,0],
[5.2,49,72,1.84,46,0,0,0],
[3.6,28,99,1.3,55,0,0,1],
[8.8,86,88,6.4,30,1,1,0],
[6.5,56,77,2.85,41,0,1,0],
[3.4,77,93,1.48,69,0,1,0],
[6.5,40,84,3,54,1,1,0],
[4.5,73,106,3.05,47,1,1,0],
[4.8,86,101,4.1,35,1,0,1],
[5.1,67,77,2.86,66,1,0,0],
[3.9,82,103,4.55,50,0,1,0],
[6.6,77,46,1.95,50,0,1,0],
[6.4,85,40,1.21,58,0,0,1],
[6.4,59,85,2.33,63,0,1,0],
[8.8,78,72,3.2,56,0,0,0]])

# Independent copy of the design matrix restricted to its first four columns
# (columns at index 4 and beyond are dropped); the original matrix is untouched.
X_modded = X_matrix_surgical_data[:, :4].copy()

In [389]:
def SSTO(y):
    """Return the total sum of squares of y.

    This is the sum of squared deviations of every element of ``y`` from
    the overall mean of ``y``.
    """
    deviations = y - np.mean(y)
    return np.sum(np.square(deviations))

In [397]:
def SSE(y, predictions):
    """Return the error sum of squares between observations and predictions.

    Computes the sum over all elements of ``(y - predictions) ** 2``.
    """
    residuals = y - predictions
    return np.sum(np.square(residuals))

In [391]:
#use Linear_regression.score for R^2

def adj_R_2(sse_, ssto_, n, p):
    """Return the adjusted coefficient of determination.

    Parameters:
        sse_:  error sum of squares of the fitted model.
        ssto_: total sum of squares of the response.
        n:     number of observations.
        p:     number of model parameters.

    The result is ``1 - ((n - 1) / (n - p)) * sse_ / ssto_``.
    """
    df_ratio = (n - 1) / (n - p)
    return 1 - df_ratio * sse_ / ssto_

In [392]:
#store all SSEs into a dataframe
#use that dataframe to calculate Cp values

def Cp(sse_p, sse_P, n, p, P):
    """Return the Cp criterion for a candidate model.

    Parameters:
        sse_p: error sum of squares of the candidate model.
        sse_P: error sum of squares of the reference model.
        n:     number of observations.
        p:     number of parameters in the candidate model.
        P:     number of parameters in the reference model.

    The reference mean squared error is ``sse_P / (n - P)`` and the result
    is ``sse_p / mse - (n - 2 * p)``.
    """
    reference_mse = sse_P / (n - P)
    return sse_p / reference_mse - (n - 2 * p)

In [393]:
def AIC(sse_, n, p):
    """Return the AIC criterion ``n*ln(sse_) - n*ln(n) + 2*p``.

    Parameters:
        sse_: error sum of squares of the fitted model.
        n:    number of observations.
        p:    number of model parameters.
    """
    fit_term = n * np.log(sse_)
    size_term = n * np.log(n)
    return fit_term - size_term + 2 * p

def SBC(sse_, n, p):
    """Return the SBC criterion ``n*ln(sse_) - n*ln(n) + ln(n)*p``.

    Parameters:
        sse_: error sum of squares of the fitted model.
        n:    number of observations.
        p:    number of model parameters.
    """
    fit_term = n * np.log(sse_)
    size_term = n * np.log(n)
    return fit_term - size_term + np.log(n) * p

In [394]:
#X has to be subsetted accordingly before putting it into the function 

def PRESS(y, X):
    """Return the PRESS statistic (leave-one-out prediction sum of squares).

    For every observation i, a linear regression is fitted on all rows
    except i and used to predict row i; the result is the sum of squared
    differences between ``y`` and those held-out predictions.

    Parameters:
        y: array of responses with one entry (or one row) per observation.
           Both 1-D ``(n,)`` and column ``(n, 1)`` layouts are accepted.
        X: 2-D array of predictor values, already restricted to the columns
           of the model being evaluated; one row per observation.

    Returns:
        The value of ``SSE(y, held_out_predictions)``.
    """
    lr = LinearRegression()
    pred = np.zeros(y.shape)
    for i in range(X.shape[0]):
        # Refit without observation i.
        y_mod = np.delete(y, i, 0)
        X_mod = np.delete(X, i, 0)
        lr.fit(X_mod, y_mod)
        # predict() returns one row; take it out explicitly so that a 1-D y
        # stores a scalar instead of relying on NumPy's deprecated implicit
        # conversion of a length-1 array to a scalar.
        pred[i] = lr.predict(X[i].reshape(1, -1))[0]
    return SSE(y, pred)

In [398]:
def project_function(y, X, P):
    """Best-subsets search: tabulate model selection criteria by model size.

    For every parameter count p = 1..P (p includes the intercept, so a model
    with p parameters uses p - 1 columns of X), every subset of p - 1 columns
    of X is fitted by least squares and scored with SSE, R^2, adjusted R^2,
    Cp, AIC, SBC and PRESS. Two tables indexed by p are then displayed:

    * 'Best Values for Criteria'  - the best value of each criterion at size p
      (minimum for SSE, Cp, AIC, SBC, PRESS; maximum for R^2 and adjusted R^2).
    * 'Best Subsets for Criteria' - the label of the selected subset, e.g.
      "['X2', 'X3']" (columns are numbered from 1), or 'None' for the
      intercept-only model.

    Parameters:
        y: numpy array of responses with one row per observation.
        X: 2-D numpy array of predictors, one row per observation.
        P: largest number of parameters to consider, 1 <= P <= X.shape[1] + 1.

    Returns:
        None. Results are shown with display(). Invalid arguments print an
        error message and return None without computing anything.

    Notes:
        Cp always uses the model containing ALL columns of X as its reference
        (error variance estimate), independent of P, so the function also
        works for P == 1 and gives meaningful Cp values when P is smaller
        than X.shape[1] + 1.
    """
    #error checks
    if y.shape[0] != X.shape[0]:
        return print("Error: X and y dimension do not match - ROW NUMBERS ARE NOT EQUAL")

    if P < 1:
        return print("Error: P must be at least 1")

    if P > X.shape[1]+1:
        return print("Error: P is too large. Max is", X.shape[1]+1, "for given X")

    n = X.shape[0]
    n_predictors = X.shape[1]
    P_range = range(1, P+1)
    lin_reg = LinearRegression()

    ssto_ = SSTO(y)

    #reference (full) model for Cp: every available predictor plus intercept
    P_full = n_predictors + 1
    if n_predictors > 0:
        lin_reg.fit(X, y)
        sse_full = SSE(y, lin_reg.predict(X))
    else:
        #no predictors at all: the intercept-only model is the full model
        sse_full = ssto_

    #intercept-only model (p = 1): its fitted value is the mean of y, so
    #SSE equals SSTO and the leave-one-out prediction for observation t is
    #the mean of the remaining observations
    loo_mean_predictions = np.zeros(y.shape)
    for t in range(n):
        loo_mean_predictions[t] = np.mean(np.delete(y, t, 0))
    Cp_val = Cp(ssto_, sse_full, n, 1, P_full)
    records = [{
        'label': 'None',
        'p': 1,
        'SSEp': ssto_,
        'R2p': 0.0,
        'adjR2p': 0.0,
        'Cp': Cp_val,
        'Abs_Cp_p': abs(Cp_val - 1),
        'AICp': AIC(ssto_, n, 1),
        'SBCp': SBC(ssto_, n, 1),
        'PRESSp': SSE(y, loo_mean_predictions),
    }]

    #every subset of p - 1 predictors for p = 2..P, in combinations() order
    for p in range(2, P+1):
        for cols in combinations(range(n_predictors), p - 1):
            X_sub = X[:, list(cols)]
            lin_reg.fit(X_sub, y)
            sse_ = SSE(y, lin_reg.predict(X_sub))
            Cp_val = Cp(sse_, sse_full, n, p, P_full)
            records.append({
                #1-based variable names, e.g. "['X1', 'X3']"
                'label': str(['X'+str(num+1) for num in cols]),
                'p': p,
                'SSEp': sse_,
                'R2p': lin_reg.score(X_sub, y),
                'adjR2p': adj_R_2(sse_, ssto_, n, p),
                'Cp': Cp_val,
                'Abs_Cp_p': abs(Cp_val - p),
                'AICp': AIC(sse_, n, p),
                'SBCp': SBC(sse_, n, p),
                'PRESSp': PRESS(y, X_sub),
            })

    #one row per fitted model, indexed by its subset label
    results = pd.DataFrame(records).set_index('label')

    #creating output dataframes
    criteria = ['SSEp', 'R^2_p', 'Adj. R^2_p', 'Cp', 'AICp', 'SBCp', 'PRESSp']
    best_values = pd.DataFrame(index=pd.Index(P_range, name='p'), columns=criteria)
    best_subsets = pd.DataFrame(index=pd.Index(P_range, name='p'), columns=criteria)

    #(output column, column whose best value is reported,
    # column used to pick the subset, reduction)
    #NOTE(review): for Cp the reported value is the smallest Cp of size p,
    #while the reported subset is the one whose Cp is closest to p; these can
    #be different subsets. Kept as in the original -- confirm which is wanted.
    selection_rules = [
        ('SSEp', 'SSEp', 'SSEp', 'min'),
        ('R^2_p', 'R2p', 'R2p', 'max'),
        ('Adj. R^2_p', 'adjR2p', 'adjR2p', 'max'),
        ('Cp', 'Cp', 'Abs_Cp_p', 'min'),
        ('AICp', 'AICp', 'AICp', 'min'),
        ('SBCp', 'SBCp', 'SBCp', 'min'),
        ('PRESSp', 'PRESSp', 'PRESSp', 'min'),
    ]
    for p in P_range:
        of_size_p = results[results['p'] == p]
        for out_col, value_col, pick_col, how in selection_rules:
            if how == 'min':
                best_values.loc[p, out_col] = of_size_p[value_col].min()
                best_subsets.loc[p, out_col] = of_size_p[pick_col].idxmin()
            else:
                best_values.loc[p, out_col] = of_size_p[value_col].max()
                best_subsets.loc[p, out_col] = of_size_p[pick_col].idxmax()

    display('Best Values for Criteria')
    display(best_values)
    print('\n')
    display('Best Subsets for Criteria')
    display(best_subsets)

    
# Run the search on the surgical test data with P = 9, i.e. the intercept
# plus all 8 predictor columns of X_matrix_surgical_data.
project_function(surgical_data_ln_y_vector, X_matrix_surgical_data, 9)

'Best Values for Criteria'

Unnamed: 0_level_0,SSEp,R^2_p,Adj. R^2_p,Cp,AICp,SBCp,PRESSp
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,12.8077,0.0,0.0,240.452,-75.7025,-73.7135,13.2956
2,7.33157,0.427566,0.416558,117.409,-103.827,-99.8489,8.02496
3,4.31249,0.66329,0.650086,50.4716,-130.483,-124.516,5.06534
4,2.84288,0.778034,0.764716,18.9145,-150.985,-143.029,3.4694
5,2.1788,0.829884,0.815997,5.75077,-163.351,-153.406,2.73777
6,2.08201,0.837441,0.820508,5.54064,-163.805,-151.871,2.73893
7,2.00523,0.843436,0.823449,5.78739,-163.834,-149.911,2.77233
8,1.97203,0.846028,0.822597,7.02946,-162.736,-146.824,2.80871
9,1.97074,0.846129,0.818774,9.0,-160.771,-142.87,2.93123






'Best Subsets for Criteria'

Unnamed: 0_level_0,SSEp,R^2_p,Adj. R^2_p,Cp,AICp,SBCp,PRESSp
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,,,,
2,['X3'],['X3'],['X3'],['X3'],['X3'],['X3'],['X4']
3,"['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']","['X2', 'X3']"
4,"['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']","['X2', 'X3', 'X8']"
5,"['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']","['X1', 'X2', 'X3', 'X8']"
6,"['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X8']"
7,"['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X8']"
8,"['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X5', 'X6', 'X7', 'X8']"
9,"['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']","['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']"
