In [None]:
import statsmodels.api as sm
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import os 
# NOTE(review): hard-coded, machine-specific working directory. os.chdir raises
# if this folder does not exist, so the notebook only runs on a machine that has
# this exact path; consider a configurable, relative data directory instead.
os.chdir('C:/Users/Test/data_re/odds_vac_BRM')

def calculate_auc(X_train, y_train, X_test, y_test, selected_variables, cur_var):
    """Fit a binomial GLM on the chosen columns and score it with ROC AUC.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Predictors for fitting and for evaluation. Only the columns listed in
        ``selected_variables`` are used.
    y_train, y_test : array-like
        Outcomes passed to the GLM and to ``roc_auc_score``.
    selected_variables : list
        Column labels that enter the model.
    cur_var : hashable
        Label echoed back as the first element of the result so the caller can
        group scores by the candidate variable being evaluated.

    Returns
    -------
    tuple
        ``(cur_var, train_auc, test_auc)``.
    """
    # has_constant='add' always prepends the intercept. The default ('skip')
    # silently omits it when a selected column happens to be a non-zero constant
    # in that subset, which would leave the train and test designs with
    # different columns and break (or distort) model.predict on the test fold.
    X_train = sm.add_constant(X_train[selected_variables], has_constant='add')
    X_test = sm.add_constant(X_test[selected_variables], has_constant='add')
    model = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit(disp=0)
    tr_auc = roc_auc_score(y_train, model.predict(X_train))
    te_auc = roc_auc_score(y_test, model.predict(X_test))
    return (cur_var, tr_auc, te_auc)

def calculate_pvalue(X_train, y_train, selected_variables):
    """Fit a binomial GLM on the chosen columns and return their p-values.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Predictors; only the columns in ``selected_variables`` are used.
    y_train : array-like
        Outcome passed to the GLM.
    selected_variables : list
        Column labels that enter the model.

    Returns
    -------
    tuple
        ``(selected_pvals, max_pval)``: the p-values of the selected variables
        (intercept excluded) and the largest of them.
    """
    # The previous version also sliced an ``X_test`` that is not a parameter of
    # this function, so every call failed with UnboundLocalError.
    # has_constant='add' guarantees the intercept is the first column, so that
    # dropping positional row 0 below always removes the intercept and never a
    # real predictor.
    design = sm.add_constant(X_train[selected_variables], has_constant='add')
    model = sm.GLM(y_train, design, family=sm.families.Binomial()).fit(disp=0)
    selected_pvals = model.pvalues.iloc[1:]
    max_pval = selected_pvals.max()
    return selected_pvals, max_pval


def stepwise_feature_selection(X, y, criteria='auc', seed=0, fold=5, max_iter=100):
    """Bidirectional stepwise variable selection on cross-validated AUC.

    Every iteration runs a forward step (add the remaining variable whose
    inclusion gives the highest mean validation AUC, if that beats the current
    score) followed by a backward step (drop the selected variable whose removal
    gives the highest mean validation AUC, if that beats the current score).
    Scores come from ``calculate_auc`` evaluated on ``StratifiedKFold`` splits.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; every column is a candidate.
    y : pandas.Series or pandas.DataFrame
        Outcome, row-aligned with ``X`` (indexed positionally with ``.iloc``).
    criteria : str
        Only ``'auc'`` is implemented; any other value selects nothing and an
        empty list is returned.
    seed : int
        ``random_state`` of the shuffled ``StratifiedKFold``.
    fold : int
        Number of cross-validation splits.
    max_iter : int
        Upper bound on the number of forward/backward iterations.

    Returns
    -------
    list
        Selected column labels, in the order they were added.
    """
    variables = X.columns.tolist()
    # Score to beat before the first variable is accepted.
    standard_auc = 0.5

    in_threshold = 0.0
    ex_threshold = 0.0

    selected_variables = []  # variables chosen so far
    remainder = list(variables)  # variables not chosen yet
    iter_cnt = 0

    if criteria != 'auc' or not variables:
        return selected_variables

    skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)
    # Materialise the splits once so every candidate model is compared on the
    # very same folds instead of re-running the splitter per candidate.
    splits = list(skf.split(X, y))

    def _best_by_cv_auc(candidates, make_model_vars):
        """Return (label, mean validation AUC) of the best-scoring candidate."""
        perf_list = []
        for var in candidates:
            tmp_vars = make_model_vars(var)
            for train_index, valid_index in splits:
                X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
                perf_list.append(
                    calculate_auc(X_train, y_train, X_val, y_val, tmp_vars, var)
                )
        perf_df = pd.DataFrame(perf_list, columns=['var', 'tr_auc', 'te_auc'])
        # Mean score per candidate. groupby sorts the labels, so on ties idxmax
        # picks the first label in sorted order.
        mean_te = perf_df.groupby('var')['te_auc'].mean()
        best_var = mean_te.idxmax()
        return best_var, mean_te.loc[best_var]

    def _without(var):
        reduced = selected_variables.copy()
        reduced.remove(var)
        return reduced

    while len(remainder) > 0 and iter_cnt < max_iter:
        changed = False

        ### FORWARD SELECTION BASED ON AUC
        best_var, best_auc = _best_by_cv_auc(
            remainder, lambda var: selected_variables + [var]
        )
        if best_auc > standard_auc + in_threshold:
            selected_variables.append(best_var)
            remainder = [v for v in variables if v not in selected_variables]
            standard_auc = best_auc
            changed = True
        else:
            # Report the score of the current selection, not the score of the
            # candidate that was just rejected.
            print("iter count: {}, selected variables: {}, AUC: {}".format(iter_cnt, selected_variables, standard_auc))

        ### BACKWARD SELECTION BASED ON AUC
        # With fewer than two selected variables there is nothing sensible to
        # drop (and with none, there are no candidates to score at all).
        if len(selected_variables) > 1:
            drop_var, reduced_auc = _best_by_cv_auc(selected_variables, _without)
            if reduced_auc > standard_auc - ex_threshold:
                selected_variables.remove(drop_var)
                remainder = [v for v in variables if v not in selected_variables]
                standard_auc = reduced_auc
                changed = True
                print("iter count: {}, selected variables: {}, AUC: {}".format(iter_cnt, selected_variables, reduced_auc))

        if not changed:
            # Nothing was added or removed: the state is unchanged, so another
            # pass would score exactly the same candidates on the same folds.
            break
        iter_cnt += 1

    return selected_variables

In [None]:
def stepwise_feature_selection_fold(X, y, criteria='auc', max_iter=100,
                                    fold_col='fold_id', fold_ids=(1, 2, 3, 4, 5)):
    """Bidirectional stepwise variable selection using pre-assigned CV folds.

    Same search as a forward/backward stepwise selection on mean validation
    AUC (scored through ``calculate_auc``), but the folds are read from a column
    of ``X`` instead of being generated: for each id in ``fold_ids`` the rows
    with that id form the validation set and all other rows the training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors plus the fold-assignment column ``fold_col``.
        Every other column is a candidate.
    y : pandas.Series or pandas.DataFrame
        Outcome sharing the index of ``X`` (it is filtered with boolean masks
        built from ``X[fold_col]``).
    criteria : str
        Only ``'auc'`` is implemented; any other value selects nothing and an
        empty list is returned.
    max_iter : int
        Upper bound on the number of forward/backward iterations.
    fold_col : str
        Name of the fold-assignment column. It is excluded by name, so it no
        longer has to be the last column of ``X``.
    fold_ids : iterable
        Fold labels to use as validation sets; defaults to 1..5.

    Returns
    -------
    list
        Selected column labels, in the order they were added.
    """
    variables = [col for col in X.columns.tolist() if col != fold_col]
    # Score to beat before the first variable is accepted.
    standard_auc = 0.5

    in_threshold = 0.0
    ex_threshold = 0.0

    selected_variables = []  # variables chosen so far
    remainder = list(variables)  # variables not chosen yet
    iter_cnt = 0

    if criteria != 'auc' or not variables:
        return selected_variables

    # The train/validation partitions do not depend on the candidate variable,
    # so build them once instead of re-filtering X for every candidate.
    fold_labels = X[fold_col]
    features = X.drop(columns=fold_col)
    folds = []
    for fold_num in fold_ids:
        in_train = fold_labels != fold_num
        in_val = fold_labels == fold_num
        folds.append(
            (features.loc[in_train], y.loc[in_train], features.loc[in_val], y.loc[in_val])
        )

    def _best_by_cv_auc(candidates, make_model_vars):
        """Return (label, mean train AUC, mean validation AUC) of the best candidate."""
        perf_list = []
        for var in candidates:
            tmp_vars = make_model_vars(var)
            for X_train, y_train, X_val, y_val in folds:
                perf_list.append(
                    calculate_auc(X_train, y_train, X_val, y_val, tmp_vars, var)
                )
        perf_df = pd.DataFrame(perf_list, columns=['var', 'tr_auc', 'te_auc'])
        # Mean score per candidate. groupby sorts the labels, so on ties idxmax
        # picks the first label in sorted order.
        mean_perf_df = perf_df.groupby('var').mean()
        best_var = mean_perf_df['te_auc'].idxmax()
        return (
            best_var,
            mean_perf_df.loc[best_var, 'tr_auc'],
            mean_perf_df.loc[best_var, 'te_auc'],
        )

    def _without(var):
        reduced = selected_variables.copy()
        reduced.remove(var)
        return reduced

    while len(remainder) > 0 and iter_cnt < max_iter:
        changed = False

        ### FORWARD SELECTION BASED ON AUC
        best_var, best_trauc, best_auc = _best_by_cv_auc(
            remainder, lambda var: selected_variables + [var]
        )
        if best_auc > standard_auc + in_threshold:
            selected_variables.append(best_var)
            remainder = [v for v in variables if v not in selected_variables]
            standard_auc = best_auc
            changed = True
            # Four values need four placeholders; the train AUC shown is that
            # of the variable actually added.
            print("iter count: {}, selected variables: {}, tr_AUC: {}, val_AUC: {}".format(iter_cnt, selected_variables, best_trauc, best_auc))

        ### BACKWARD SELECTION BASED ON AUC
        # With fewer than two selected variables there is nothing sensible to
        # drop (and with none, there are no candidates to score at all).
        if len(selected_variables) > 1:
            drop_var, reduced_trauc, reduced_auc = _best_by_cv_auc(
                selected_variables, _without
            )
            if reduced_auc > standard_auc - ex_threshold:
                selected_variables.remove(drop_var)
                remainder = [v for v in variables if v not in selected_variables]
                standard_auc = reduced_auc
                changed = True
                print("iter count: {}, selected variables: {}, tr_AUC: {}, val_AUC: {}".format(iter_cnt, selected_variables, reduced_trauc, reduced_auc))

        if not changed:
            # Nothing was added or removed: the state is unchanged, so another
            # pass would score exactly the same candidates on the same folds.
            break
        iter_cnt += 1

    return selected_variables

In [None]:
# n_list = [3, 1000]

# for i in n_list: 
#     data = pd.read_csv('../data/Train_' + str(i) + '_days.csv')
#     X = data.iloc[:, :-1]
#     y = data.iloc[:, -1]
#     selected_vars = stepwise_feature_selection_fold(X, y)
#     # print 3 and 1000 days data
#     print("selected data for {} days aggravation:\n {}".format(i, selected_vars))

In [None]:
# Show the current working directory. os.curdir is only the constant string
# '.', so it cannot confirm where the earlier os.chdir call actually landed.
os.getcwd()

In [None]:
# Load the training set; the first CSV column is used as the row index.
data = pd.read_csv('C:/Users/Test/data/7&3_trainset.csv', index_col=0)

# Predictors, with the fold-assignment column 'CVidx' kept last and exposed
# under the name 'fold_id'.
X = data[[
    'BMI',
    'SBP',
    'age',
    'altered_consciousness_confusion',
    'asthma',
    'chest_x_ray_infiltration',
    'chronic_cardiac_disease',
    'chronic_hematologic_disease',
    'chronic_kidney_disease',
    'chronic_liver_disease',
    'chronic_neurological_disorder',
    'chronic_obstructive_pulmonary_diseases',
    'cough',
    'diabetes',
    'diarrhea',
    'dimentia',
    'dyspnea',
    'fatigue_malaise',
    'headache',
    'heart_failure',
    'heart_rate',
    'hypertension',
    'malignant',
    'myalgia',
    'respiration_rate',
    'rhinorrhea',
    'sex',
    'smoking',
    'sore_throat',
    'sputum',
    'temperature',
    'vomiting_nausea',
    'Vac_no1',
    'Vac_no2',
    'CVidx',
]].rename(columns={'CVidx': 'fold_id'})

# Outcome, kept as a single-column DataFrame.
y = data[['new_severity']]

In [None]:
#X = data.loc[:,['BMI', 'SBP', 'age', 'altered_consciousness_confusion', 'asthma',
#       'chest_x_ray_infiltration', 'chronic_cardiac_disease',
#       'chronic_hematologic_disease', 'chronic_kidney_disease',
#       'chronic_liver_disease', 'chronic_neurological_disorder',
#       'chronic_obstructive_pulmonary_diseases', 'cough', 'diabetes',
#       'diarrhea', 'dimentia', 'dyspnea', 'fatigue_malaise', 'headache',
#       'heart_failure', 'heart_rate', 'hypertension', 'malignant', 'myalgia',
#       'respiration_rate', 'rhinorrhea', 'sex', 'smoking', 'sore_throat',
#       'sputum', 'temperature', 'vomiting_nausea','Vac_no','lab_WBC','lab_Hb', 'lab_plt',
#       'lab_GOT', 'lab_GPT', 'lab_Cr','lab_CRP','lab_CK','lab_LDHL','lab_ddimer', 'lab_BUN',
#       'lab_glu', 'lab_Na', 'lab_TroponinI_quan', 'CVidx']]

In [None]:
# Run the stepwise search with the default settings; the folds come from the
# 'fold_id' column of X rather than from a freshly generated split.
selected_vars = stepwise_feature_selection_fold(X, y)

In [None]:
#selected_variables = stepwise_feature_selection(X, y, criteria='auc', seed=0, fold=5)
# selected_variables

In [None]:
# Show the list returned by the stepwise search as the cell output.
selected_vars