In [17]:
import yaml
from yaml.loader import SafeLoader
from socket import gethostname
import numpy as np
import pandas as pd
from sklearn.base import clone
from dev_interaction_util import generate_synthetic_dev_outcomes, generate_synthetic_dev_data, set_up_interactions
from dev_interaction_util import do_scoring_loop, get_best_model, summarize_overall_df_results, do_final_fit, present_model_results, present_results_vs_ground_truth_cors
from ml_util import *
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn import linear_model
from ml_util import get_data_for_imputation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import numpy as np
from IPython.display import display, HTML
from sklearn.base import clone
from sklearn.inspection import permutation_importance
#import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [18]:


print(gethostname())
# Open the file and load the file
with open('config.yml') as f:
    all_yaml = yaml.load(f, Loader=SafeLoader)
    if gethostname() in all_yaml.keys():
        config = all_yaml[gethostname()]
    else:
        config = all_yaml['default']
        
print(config)



Benjamins-MacBook-Pro-2.local
{'dropbox_data_dir': '/Users/benjaminsmith/Dropbox (University of Oregon)/UO-SAN Lab/Berkman Lab/Devaluation/analysis_files/data/'}


This notebook is derived from pre_registered_preview.ipynb.

The aim is to look at how the model pipeline does with different sets of ground truths. If we plug in five actual effects, or ten, or twenty, how many are actually identified and how many irrelevant effects are identified?

This can't be too black and white, because of course in real life, teh features are correlated iwth one another. But at least, the features we select to be correlated should _actually be_ the most correlated.

In [19]:
dropbox_data_dir = config['dropbox_data_dir']


# Introduction
This is a pre-registered analysis for measuring moderations of the intervention.

We'll cross-validate the intervention moderations.

For this analysis, we'll try to make predictions based on some synthetic data. we'll take wave 1 data and randomly mix in changes based on our predictors, then try to model how we would predict those things. Finally, we'll make the predictions.

# Load data

In [20]:
data_by_ppt_path = dropbox_data_dir + '/data_by_ppt.csv'
data_codebook_path = dropbox_data_dir + 'data_codebook.csv'




In [21]:
data_by_ppt = pd.read_csv(data_by_ppt_path)
data_codebook = pd.read_csv(data_codebook_path)

In [22]:
#find out which columns in data_by_ppt are missing from the codebook
data_by_ppt.columns.difference(data_codebook['VarName'])


#copy our outcome measures, bf_1 and FFQ_1, into a new dataframe
data_by_ppt['bf_2'] = data_by_ppt.bf_1
#need to decide what sort of FFQ we want to use
data_by_ppt['cancer_promoting_minus_preventing_FFQ_1'] = data_by_ppt.cancer_promoting_minus_preventing_FFQ
data_by_ppt['cancer_promoting_minus_preventing_FFQ_2'] = data_by_ppt.cancer_promoting_minus_preventing_FFQ

# do a report on missing data
analysis_data  = data_by_ppt.loc[:,data_codebook.loc[data_codebook.IsSelectedPredictor,"VarName"]].copy()
outcome_measures = data_by_ppt.loc[:,data_codebook.loc[data_codebook.IsSelectedOutcomeMeasure,"VarName"]].copy()

na_values = pd.DataFrame(data_by_ppt.isna().sum())
na_values.columns = ['NA_Count']
na_values['prop_NA'] = na_values.NA_Count / data_by_ppt.shape[0]
data_codebook = data_codebook.merge(na_values, left_on='VarName', right_index=True)

data_codebook.to_csv(dropbox_data_dir + 'data_metadata.csv', index=False)

Need to count the number of valid and missing entries in each of our data predictors

## Converting data to numeric format

In [23]:
one_hot_vals = pd.get_dummies(analysis_data.birthsex_factor)
#there's only two variables here so we can convert this into a dummy variable
analysis_data.drop(columns=['birthsex_factor'], inplace=True)
one_hot_vals.columns = ['birthsex_factor_' + str(col) for col in one_hot_vals.columns]
analysis_data = analysis_data.join(one_hot_vals.iloc[:,1:])

In [24]:
analysis_data

Unnamed: 0,BSCS,EDM,BIS_11,PCS,RS,TRSQ,ACES_neglectful_parenting,ACES_abuse,ACES_sum,ACES_divorced_separated,...,zipcode_median_income_acs,household_income_per_person,SST_prop_successful_stops,SST_GRTmean,SST_SSD,SST_PostErrorSlowW1_mean,SST_mean_ssrt_0,ROC_Crave_Regulate_Minus_Look,WTP_unhealthy_minus_healthy,birthsex_factor_Male
0,2.538462,3.250,72,7.0,20.0,63.0,,,,,...,,,,,,,,-0.5125,-0.312500,1
1,2.384615,1.750,89,9.0,22.0,63.0,,,,,...,,,,,,,,,0.440524,0
2,3.384615,2.500,63,9.0,18.0,57.0,,,,,...,,,0.500000,533.315052,284.375,0.058297,0.247061,-0.8000,-0.190476,0
3,3.076923,2.800,75,,,64.0,,,,,...,,,0.312500,498.167248,103.125,0.027730,0.446583,-0.8000,0.170363,0
4,3.307692,2.750,64,12.0,21.0,55.0,,,,,...,,,0.562500,626.507764,250.000,0.105660,0.369308,-1.5500,-0.494624,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,3.461538,4.000,58,18.0,17.0,54.0,0.0,1.0,3.0,1.0,...,-0.690347,1.768485,0.523438,,,,0.357362,-0.0125,-1.008152,1
271,3.692308,3.875,54,17.0,13.0,55.0,2.0,2.0,5.0,0.0,...,-0.511475,-0.234851,0.492188,,,,0.335849,-0.1500,-1.889247,1
272,3.461538,3.125,69,11.0,13.0,53.0,1.0,1.0,6.0,1.0,...,1.335248,0.099038,0.507812,,,,0.273736,,0.516129,1
273,2.846154,3.000,62,15.0,22.0,84.0,0.0,1.0,4.0,1.0,...,0.855379,-0.234851,0.479167,,,,0.401098,-0.9875,-0.151210,0


# Missing data 

Apply missing data imputation to columns including cSES, ACES_sum, ses_aggregate, zipcode_median_income_acs, IMI, mcarthur social standing, based on demographic and self-report predictors.

Based on this experiment, I'm going for Ridge regression with 10 nearest features. The values it imputes are a compromise between simply using the nearest mean, which is conservative when using these values for prediction because it doesn't introduce erroneous variance, but isn't very informative, and then using all available information, which Ridge regression with an unlimited number of features would do. It's a tough choice between this and KNN, which doesn't assume normality. Overall I'm going with KNN, because it picks up on relationships between the two variables while not generating extreme values like KNN seems to do.

In [25]:
imputer = IterativeImputer(estimator=linear_model.Ridge(),n_nearest_features=10,max_iter=100,random_state=0)
analysis_data_imputed = get_data_for_imputation(analysis_data)

#this dataset is already filtered for columns so we don't need to filter those further.
analysis_data_imputed = pd.DataFrame(imputer.fit_transform(analysis_data_imputed), columns=analysis_data_imputed.columns)
imputed_datapoint = analysis_data.isna()
# do_aces_cses_imputation_diagnostic(analysis_data_imputed, imputed_datapoint,'ridge_10')




# With feature selection, KBest vs. none

In [26]:

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])
    
    all_cv_results = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        'Lasso':{
            'estimator':linear_model.Lasso,
            'parameters':{'alpha':alpha_range}
        },
        # 'DecisionTreeRegressor':{
        #     'estimator':DecisionTreeRegressor,
        #     'parameters':{
        #         'max_depth':[2, 3,5,10],
        #         'min_samples_split':[5,20,50],
        #         'min_samples_leaf':[5,20,50]
        #     }
        # }             
    }

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature seelction
        #this is here because we need to know the estimator to pass to the feature selector
        feature_selectors_to_run = {
            'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : [20,50]
                    }
            }#,
            # 'RFE':{
            #     'selector':RFE(linear_model.LinearRegression()),
            #     'parameters':{
            #         'n_features_to_select' : [10,25],
            #         #'verbose':[1],
            #         'step':[5]
            #     }
            # }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring='neg_mean_absolute_error',verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [27]:

def present_model_results(X,y, final_fit):
    final_estimator = final_fit.named_steps['estimator']
    
    if hasattr(final_estimator,'coef_'):
        coef = final_estimator.coef_
    else:
        coef = None

    #now check to see if there was a feature selection step,
    #if so, get the feature names from the feature selection step
    if 'feature_selection' in final_fit.named_steps:
        feature_bool = final_fit.named_steps['feature_selection'].get_support(indices=True)
    else:
        feature_bool = [True]*len(X.columns)
    
    feature_names = X.columns[feature_bool]

    #now do a permutation test to do feature importance
    #view the coefficients
    print("doing permutation test on importance; this may take time.")
    permutation_res= [im for im in permutation_importance(final_fit, X, y, n_repeats=10).importances_mean]
    # print(len(feature_names))
    # print(len(permutation_res))
    # print(len(coef))
    
    
    final_results = pd.DataFrame({
        'predictor': feature_names,
        'coef': coef,
        'feature_importance':pd.Series(permutation_res)[feature_bool]
        #'std_err': np.sqrt(np.diag(model_fit.coef_cov_)),
        #'pval': 2*(1-stats.t.cdf(np.abs(model_fit.coef_/np.sqrt(np.diag(model_fit.coef_cov_))),df=predictor_data_nona.shape[0]-predictor_data_nona.shape[1]))
    })

    final_results['fa_abs'] = np.abs(final_results.feature_importance)
    final_results = final_results.sort_values('fa_abs',ascending=False)

    if coef is not None:
        selected_features_count = np.sum(final_estimator.coef_!=0)
        print(f"Number of selected features: {selected_features_count}")

    display(HTML(final_results[0:20].to_html()))
    return(final_results)

In [28]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.15
custom_interaction_effects_g1[1] = 0.15
custom_interaction_effects_g1[2] = -0.15
custom_interaction_effects_g1[3] = -0.15

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.15
custom_interaction_effects_g2[5] = 0.15
custom_interaction_effects_g2[6] = -0.15
custom_interaction_effects_g2[7] = -0.15

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)




['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

In [29]:
#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_cancer_promoting_minus_preventing_FFQ'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]

### Try out CV with simple gridsearch

scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection_loop,
                outer_folds=5)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)



best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

outer split0
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Ridge())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 7 candidates, totalling 28 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Ridge())])
{'feature_selection__score_func': [<function f_regression at 0x1816d7920>], 'feature_selection__k': [20, 50], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 7 candidates, totalling 28 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x1816d7920>], 'feature_selection__k': [20, 50], 

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-0.289938,0.002134,0.021273,0.012721
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.290794,0.005968,0.022867,0.00443
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.291894,0.006714,0.023126,0.004239
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.293261,0.007252,0.023502,0.003641
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__k': 50, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.294541,0.00392,0.021524,0.00917
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.294999,0.008027,0.024075,0.002824
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 50, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.295234,0.002534,0.020316,0.011688
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.295299,0.002636,0.020548,0.011895
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.2, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.297494,0.009146,0.024906,0.001832
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__k': 50, 'feature_selection__score_func': <function f_regression at 0x1816d7920>}",-0.297517,0.004486,0.021779,0.010156


doing permutation test on importance; this may take time.
Number of selected features: 5


Unnamed: 0,predictor,coef,feature_importance,fa_abs
78,BSCS*ni,0.360661,0.857528,0.857528
90,BFI_conscientiousness*ni,0.02617,0.016946,0.016946
98,IMI_perceived_choice*ni,0.01755,0.010778,0.010778
126,RTFS_factor_2*ni,0.012885,0.007558,0.007558
159,TRSQ*san,0.009197,0.005296,0.005296
0,BSCS,-0.0,0.0,0.0
157,PCS*san,0.0,0.0,0.0
148,SST_SSD*ni,0.0,0.0,0.0
149,SST_PostErrorSlowW1_mean*ni,0.0,0.0,0.0
150,SST_mean_ssrt_0*ni,0.0,0.0,0.0


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,-0.0,0.361,0.0,0.0,0.858,0.0,-0.137,0.448,-0.08,0.858
BFI_conscientiousness,0.0,0.026,0.0,0.0,0.017,0.0,,,,0.017
IMI_perceived_choice,-0.0,0.018,0.0,0.0,0.011,0.0,,,,0.011
RTFS_factor_2,0.0,0.013,0.0,0.0,0.008,0.0,,,,0.008
TRSQ,0.0,0.0,0.009,0.0,0.0,0.005,0.091,-0.239,0.427,0.005


{'results_vs_cors':                                (coef, base)  (coef, ni)  (coef, san)  \
 BSCS                                   -0.0    0.360661     0.000000   
 BFI_conscientiousness                   0.0    0.026170     0.000000   
 IMI_perceived_choice                   -0.0    0.017550     0.000000   
 RTFS_factor_2                           0.0    0.012885     0.000000   
 TRSQ                                    0.0    0.000000     0.009197   
 ...                                     ...         ...          ...   
 NCS_prefer_little_thought               0.0    0.000000     0.000000   
 NCS_prefer_complex                      0.0    0.000000     0.000000   
 NCS_new_solutions_to_problems           0.0    0.000000     0.000000   
 NCS_like_responsibility                 0.0    0.000000     0.000000   
 zipcode_median_income_acs               0.0    0.000000     0.000000   
 
                                (feature_importance, base)  \
 BSCS                                    

In [30]:
base_regressors

0                         BSCS
7                   ACES_abuse
4                           RS
5                         TRSQ
6    ACES_neglectful_parenting
1                          EDM
2                       BIS_11
3                          PCS
Name: predictor, dtype: object

In [31]:
#import sns

#present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)


for group_name in ['ichi','ni','san']:

    #print(group_name)
    group_data = predictor_data_nona.loc[group_assignments_nona==group_name,:]
    group_outcomes = outcome_measures_nona.loc[group_assignments_nona==group_name,'d_cancer_promoting_minus_preventing_FFQ']

    
    
    #get the two-way correlation between data and the outcome column
    #these are what was actually modeled into the data.
    group_correlations = pd.DataFrame({group_name + '_cor':group_data[base_regressors].corrwith(group_outcomes)})
    print(group_correlations)


                           ichi_cor
BSCS                       0.032330
ACES_abuse                 0.083333
RS                        -0.187175
TRSQ                      -0.020164
ACES_neglectful_parenting  0.105595
EDM                       -0.112815
BIS_11                    -0.152706
PCS                       -0.055152
                             ni_cor
BSCS                       0.464079
ACES_abuse                 0.045392
RS                        -0.321486
TRSQ                      -0.221584
ACES_neglectful_parenting -0.016256
EDM                        0.142513
BIS_11                    -0.586145
PCS                       -0.296484
                            san_cor
BSCS                      -0.173121
ACES_abuse                -0.563012
RS                         0.392869
TRSQ                       0.553858
ACES_neglectful_parenting -0.502054
EDM                       -0.184659
BIS_11                     0.033254
PCS                       -0.051662


Ground Truth:
```
0                              BSCS                0.15
3                               PCS               -0.15
1                               EDM                0.15
2                            BIS_11               -0.15
cancer_promoting_minus_preventing_FFQ_w2
FFQ_v2_Mean_Energy_w2
san
                       feature_name  interaction_effect
4                                RS                0.15
5                              TRSQ                0.15
6         ACES_neglectful_parenting               -0.15
7                        ACES_abuse               -0.15
```

# KBest vs RFE

## Testing a simple feature selection pipelien

In [33]:
pipeline_estimator_name = 'estimator'
feature_selection_name = 'feature_selection'

In [34]:
my_test_pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=linear_model.LinearRegression())),
                ('estimator', Ridge())])

estimator_params = {'alpha':[0.1,0.5,0.9]}
feature_selection_params = {'n_features_to_select':[10,25]}


estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_params.items()}
selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in feature_selection_params.items()}
#combine the two param grid dictionaries
full_param_grid = {**selector_param_grid, **estimator_param_grid}
#full_param_grid = {**estimator_param_grid}


In [35]:
full_param_grid

{'feature_selection__n_features_to_select': [10, 25],
 'estimator__alpha': [0.1, 0.5, 0.9]}

In [36]:
predictor_data_nona.shape

(275, 230)

In [37]:
#let's do StandardScaler, RFE, and Ridge outside of the GridSearchCV
#I want to see how it works from one to the next
#no pipeline, just separate steps
#I want to see how it works from one to the next
#no pipeline, just separate steps
scaler = StandardScaler()
scaled_predictor_data = scaler.fit_transform(predictor_data_nona)
rfe = RFE(estimator=linear_model.LinearRegression(),verbose=1,n_features_to_select=10,step=4)
rfe.fit(scaled_predictor_data,outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'])
rfe_results = pd.DataFrame({'predictor':predictor_data_nona.columns,'rfe_support':rfe.support_,'rfe_ranking':rfe.ranking_})

Fitting estimator with 230 features.
Fitting estimator with 226 features.
Fitting estimator with 222 features.
Fitting estimator with 218 features.
Fitting estimator with 214 features.
Fitting estimator with 210 features.
Fitting estimator with 206 features.
Fitting estimator with 202 features.
Fitting estimator with 198 features.
Fitting estimator with 194 features.
Fitting estimator with 190 features.
Fitting estimator with 186 features.
Fitting estimator with 182 features.
Fitting estimator with 178 features.
Fitting estimator with 174 features.
Fitting estimator with 170 features.
Fitting estimator with 166 features.
Fitting estimator with 162 features.
Fitting estimator with 158 features.
Fitting estimator with 154 features.
Fitting estimator with 150 features.
Fitting estimator with 146 features.
Fitting estimator with 142 features.
Fitting estimator with 138 features.
Fitting estimator with 134 features.
Fitting estimator with 130 features.
Fitting estimator with 126 features.
F

In [38]:
rfe_results

Unnamed: 0,predictor,rfe_support,rfe_ranking
0,BSCS,False,48
1,EDM,False,16
2,BIS_11,False,12
3,PCS,False,23
4,RS,False,29
...,...,...,...
225,SST_PostErrorSlowW1_mean*san,False,55
226,SST_mean_ssrt_0*san,False,10
227,ROC_Crave_Regulate_Minus_Look*san,False,18
228,WTP_unhealthy_minus_healthy*san,False,47


## Full model with RFE

In [39]:

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])
    
    all_cv_results = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        'Lasso':{
            'estimator':linear_model.Lasso,
            'parameters':{'alpha':alpha_range}
        },
        'DecisionTreeRegressor':{
            'estimator':DecisionTreeRegressor,
            'parameters':{
                'max_depth':[2, 4],
                'min_samples_split':[20,50],
                'min_samples_leaf':[20,50]
            }
        }             
    }

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature seelction
        #this is here because we need to know the estimator to pass to the feature selector
        feature_selectors_to_run = {
            'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : [20,50]
                    }
            },
            'RFE':{
                'selector':RFE(linear_model.LinearRegression()),
                'parameters':{
                    'n_features_to_select' : [10,25],
                    #'verbose':[1],
                    'step':[5]
                }
            }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring='neg_mean_absolute_error',verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [40]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.15
custom_interaction_effects_g1[1] = 0.15
custom_interaction_effects_g1[2] = -0.15
custom_interaction_effects_g1[3] = -0.15

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.15
custom_interaction_effects_g2[5] = 0.15
custom_interaction_effects_g2[6] = -0.15
custom_interaction_effects_g2[7] = -0.15

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_cancer_promoting_minus_preventing_FFQ'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]

### Try out CV with simple gridsearch

scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection_loop,
                outer_folds=5)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)



best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.242582,0.011935,0.015178,0.011331
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.242612,0.011707,0.015433,0.010934
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.242612,0.012019,0.015016,0.011438
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.242683,0.01145,0.015649,0.010525
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.24275,0.011186,0.015851,0.010169
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.242822,0.010281,0.016033,0.009289
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 50}",-0.244519,0.007247,0.027355,0.011346
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 50}",-0.244999,0.009106,0.022486,0.008245
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 20}",-0.244999,0.009106,0.022486,0.008245
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 20}",-0.245135,0.007674,0.029332,0.012889


doing permutation test on importance; this may take time.
Number of selected features: 10


Unnamed: 0,predictor,coef,feature_importance,fa_abs
78,BSCS*ni,0.606557,1.969307,1.969307
159,TRSQ*san,0.459476,1.060477,1.060477
205,SRHI_sum*san,-0.205621,0.216856,0.216856
161,ACES_abuse*san,-0.090189,0.046144,0.046144
160,ACES_neglectful_parenting*san,-0.053777,0.01832,0.01832
162,ACES_sum*san,-0.056054,0.017809,0.017809
203,SRHI_healthy*san,0.046549,0.01045,0.01045
164,ACES_household_dysfunction*san,0.037219,0.007228,0.007228
163,ACES_divorced_separated*san,-0.004476,0.000138,0.000138
213,SRHI_healthy_minus_unhealthy*san,0.000224,3e-06,3e-06


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, ni)","(coef, san)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,0.607,,1.969,,-0.137,0.448,-0.08,1.969
TRSQ,,0.459,,1.06,0.091,-0.239,0.427,1.06
SRHI_sum,,-0.206,,0.217,,,,0.217
ACES_abuse,,-0.09,,0.046,0.147,-0.125,-0.414,0.046
ACES_neglectful_parenting,,-0.054,,0.018,-0.046,-0.046,-0.476,0.018
ACES_sum,,-0.056,,0.018,,,,0.018
SRHI_healthy,,0.047,,0.01,,,,0.01
ACES_household_dysfunction,,0.037,,0.007,,,,0.007


{'results_vs_cors':                               (coef, ni)  (coef, san)  \
 BSCS                            0.606557          NaN   
 TRSQ                                 NaN     0.459476   
 SRHI_sum                             NaN    -0.205621   
 ACES_abuse                           NaN    -0.090189   
 ACES_neglectful_parenting            NaN    -0.053777   
 ACES_sum                             NaN    -0.056054   
 SRHI_healthy                         NaN     0.046549   
 ACES_household_dysfunction           NaN     0.037219   
 ACES_divorced_separated              NaN    -0.004476   
 SRHI_healthy_minus_unhealthy         NaN     0.000224   
 BIS_11                               NaN          NaN   
 EDM                                  NaN          NaN   
 PCS                                  NaN          NaN   
 RS                                   NaN          NaN   
 
                               (feature_importance, ni)  \
 BSCS                                          1.9

Well, that actually worked. It seems to be left with an $R^2$ of 0.07, which is reasonably good performance, I think.

I now want to re-run with a more ecologically valid design to see if that works.

We probably realistically will only pick up on around 3 features per item (given the size of the dataset), and we would expect their correlations to be not much higher than r=0.3. So let's design a simulated dataset that looks like that, and re-run.

# Repeat with 4 features with r around 0.3

In [41]:

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop_simple(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])
    
    all_cv_results = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        # 'Lasso':{
        #     'estimator':linear_model.Lasso,
        #     'parameters':{'alpha':alpha_range}
        # },
        'DecisionTreeRegressor':{
            'estimator':DecisionTreeRegressor,
            'parameters':{
                'max_depth':[2, 4],
                'min_samples_split':[20,50],
                'min_samples_leaf':[20,50]
            }
        }             
    }

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature seelction
        #this is here because we need to know the estimator to pass to the feature selector
        feature_selectors_to_run = {
            # 'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : [20,50]
                    }
            }#,
            # 'RFE':{
            #     'selector':RFE(linear_model.LinearRegression()),
            #     'parameters':{
            #         'n_features_to_select' : [10,25],
            #         #'verbose':[1],
            #         'step':[5]
            #     }
            # }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring='neg_mean_absolute_error',verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [42]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
#0.08 will give us correlations around 0.3 between the interaction effects and the outcome
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.08
custom_interaction_effects_g1[1] = 0.08
custom_interaction_effects_g1[2] = -0.08

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.08
custom_interaction_effects_g2[5] = 0.08
custom_interaction_effects_g2[6] = -0.08

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_cancer_promoting_minus_preventing_FFQ'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]

### Try out CV with simple gridsearch

scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection_loop,
                outer_folds=5)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)



best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_cancer_promoting_minus_preventing_FFQ'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 20}",-0.20875,0.01184,0.015733,0.006939
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 20}",-0.210212,0.005771,0.015904,0.003648
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 50}",-0.210212,0.005771,0.015904,0.003648
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 50}",-0.210888,0.010552,0.015489,0.007159
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 20}",-0.21188,0.010607,0.016322,0.007236
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 50}",-0.21188,0.010607,0.016322,0.007236
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 50}",-0.213641,0.006726,0.016977,0.007967
"dict_values([StandardScaler(), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 20}",-0.216589,0.007338,0.018234,0.006911
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 50, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.222224,0.025497,0.037027,0.055896
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), DecisionTreeRegressor()])","{'estimator__max_depth': 4, 'estimator__min_samples_leaf': 50, 'estimator__min_samples_split': 20, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-0.222224,0.025497,0.037027,0.055896


doing permutation test on importance; this may take time.


Unnamed: 0,predictor,coef,feature_importance,fa_abs
98,IMI_perceived_choice*ni,,1.818891,1.818891
159,TRSQ*san,,0.166071,0.166071
0,BSCS,,0.0,0.0
157,PCS*san,,0.0,0.0
146,SST_prop_successful_stops*ni,,0.0,0.0
147,SST_GRTmean*ni,,0.0,0.0
148,SST_SSD*ni,,0.0,0.0
149,SST_PostErrorSlowW1_mean*ni,,0.0,0.0
150,SST_mean_ssrt_0*ni,,0.0,0.0
151,ROC_Crave_Regulate_Minus_Look*ni,,0.0,0.0


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
ACES_abuse,0.0,0.0,0.0,,,,0.0
RTFS_factor_2,0.0,0.0,0.0,,,,0.0
SST_SSD,0.0,0.0,0.0,,,,0.0
SST_PostErrorSlowW1_mean,0.0,0.0,0.0,,,,0.0
SST_GRTmean,0.0,0.0,0.0,,,,0.0
SRHI_unhealthy,0.0,0.0,0.0,,,,0.0
SRHI_sum,0.0,0.0,0.0,,,,0.0
SRHI_healthy_minus_unhealthy,0.0,0.0,0.0,,,,0.0
SRHI_healthy,0.0,0.0,0.0,,,,0.0
RTFS_factor_1,0.0,0.0,0.0,,,,0.0


{'results_vs_cors':                                (feature_importance, base)  \
 ACES_abuse                                            0.0   
 RTFS_factor_2                                         0.0   
 SST_SSD                                               0.0   
 SST_PostErrorSlowW1_mean                              0.0   
 SST_GRTmean                                           0.0   
 ...                                                   ...   
 NCS_new_solutions_to_problems                         0.0   
 NCS_like_responsibility                               0.0   
 NCS_intellectual_task                                 0.0   
 NCS_get_job_done                                      0.0   
 zipcode_median_income_acs                             0.0   
 
                                (feature_importance, ni)  \
 ACES_abuse                                          0.0   
 RTFS_factor_2                                       0.0   
 SST_SSD                                             0.

In [43]:
print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)


scores:
[0.7627218640787623, 0.7826353643654684, 0.8032456225910188, 0.8309837058677088, 0.8250458722553565]
overall_score:
0.8009264858316628


In [44]:



# # Great--now print a scatterplot of the series group_outcomes and each column of the dataframe group_data
# # do this separately for each group
# for group_name in ['ichi','ni','san']:

#     #print(group_name)
#     group_data = predictor_data_nona.loc[group_assignments_nona==group_name,base_regressors]
#     group_outcomes = outcome_measures_nona.loc[group_assignments_nona==group_name,'d_cancer_promoting_minus_preventing_FFQ']


graph_data = predictor_data_nona.loc[:,base_regressors].copy()
graph_data['outcome'] = outcome_measures_nona.loc[:,'d_cancer_promoting_minus_preventing_FFQ']
graph_data['group'] = group_assignments_nona

df_melted = pd.melt(graph_data, id_vars=['outcome', 'group'], var_name='columns')


# Create a FacetGrid with scatter plots
#wrap the facetgrid so that the rows and columns are equal, as much as is possible
#allow the axes for each facet to vary freely to best display data in each facet
#be sure to allow enough room for the title
g = sns.FacetGrid(df_melted, col='columns', hue='group', col_wrap=3, height=3, aspect=1, sharex=False, sharey=False,margin_titles=True)
#do the scatterplot; include a trendline for each group
g.map(sns.regplot,  'value','outcome', ci=None, scatter_kws={'alpha':0.5}, line_kws={'alpha':0.5})
# Add a title for the whole plot
g.fig.suptitle('Outcome vs. each predictor variable, by group')

# Add a legend
g.add_legend()

plt.subplots_adjust(top=0.9)
# Show the plot
plt.show()


NameError: name 'sns' is not defined

# Conclusion so far

The feature selection applied here hasn't helped very much. That surprises me because in test_limited_predictors, I got clear evidence that cutting down irrelevant predictors improved model performance.

One reason might be that we've actually cut down on useful predictors--unlike in `test_limited_predictors.ipynb`, we can't cheat by removing predictors we know to be irrelevant. That means we're left with less information in the model itself.

We've only really tried SelectKBest(); there might be other feature selection mechanisms that could do the job. But I don't know yet.