In [27]:
import yaml
from yaml.loader import SafeLoader
from socket import gethostname
import numpy as np
import pandas as pd
from sklearn.base import clone
from dev_interaction_util import generate_synthetic_dev_outcomes, generate_synthetic_dev_data, set_up_interactions
from dev_interaction_util import do_scoring_loop, get_best_model, summarize_overall_df_results, do_final_fit, present_model_results, present_results_vs_ground_truth_cors
from ml_util import *
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn import linear_model
from ml_util import get_data_for_imputation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from IPython.display import display, HTML
from sklearn.base import clone
from sklearn.inspection import permutation_importance



In [2]:


print(gethostname())
# Open the file and load the file
with open('config.yml') as f:
    all_yaml = yaml.load(f, Loader=SafeLoader)
    if gethostname() in all_yaml.keys():
        config = all_yaml[gethostname()]
    else:
        config = all_yaml['default']
        
print(config)



Benjamins-MacBook-Pro-2.local
{'dropbox_data_dir': '/Users/benjaminsmith/Dropbox (University of Oregon)/UO-SAN Lab/Berkman Lab/Devaluation/analysis_files/data/'}


This notebook is derived from pre_registered_preview.ipynb.

The aim is to look at how the model pipeline does with different sets of ground truths. If we plug in five actual effects, or ten, or twenty, how many are actually identified and how many irrelevant effects are identified?

This can't be too black and white, because of course in real life, teh features are correlated iwth one another. But at least, the features we select to be correlated should _actually be_ the most correlated.

In [3]:
dropbox_data_dir = config['dropbox_data_dir']


# Introduction
This is a pre-registered analysis for measuring moderations of the intervention.

We'll cross-validate the intervention moderations.

For this analysis, we'll try to make predictions based on some synthetic data. we'll take wave 1 data and randomly mix in changes based on our predictors, then try to model how we would predict those things. Finally, we'll make the predictions.

# Load data

In [4]:
data_by_ppt_path = dropbox_data_dir + '/data_by_ppt.csv'
data_codebook_path = dropbox_data_dir + 'data_codebook.csv'




In [5]:
data_by_ppt = pd.read_csv(data_by_ppt_path)
data_codebook = pd.read_csv(data_codebook_path)

In [6]:
#find out which columns in data_by_ppt are missing from the codebook
data_by_ppt.columns.difference(data_codebook['VarName'])


#copy our outcome measures, bf_1 and FFQ_1, into a new dataframe
data_by_ppt['bf_2'] = data_by_ppt.bf_1
#need to decide what sort of FFQ we want to use
data_by_ppt['cancer_promoting_minus_preventing_FFQ_1'] = data_by_ppt.cancer_promoting_minus_preventing_FFQ
data_by_ppt['cancer_promoting_minus_preventing_FFQ_2'] = data_by_ppt.cancer_promoting_minus_preventing_FFQ

# do a report on missing data
analysis_data  = data_by_ppt.loc[:,data_codebook.loc[data_codebook.IsSelectedPredictor,"VarName"]].copy()
outcome_measures = data_by_ppt.loc[:,data_codebook.loc[data_codebook.IsSelectedOutcomeMeasure,"VarName"]].copy()

na_values = pd.DataFrame(data_by_ppt.isna().sum())
na_values.columns = ['NA_Count']
na_values['prop_NA'] = na_values.NA_Count / data_by_ppt.shape[0]
data_codebook = data_codebook.merge(na_values, left_on='VarName', right_index=True)

data_codebook.to_csv(dropbox_data_dir + 'data_metadata.csv', index=False)

Need to count the number of valid and missing entries in each of our data predictors

## Converting data to numeric format

In [7]:
one_hot_vals = pd.get_dummies(analysis_data.birthsex_factor)
#there's only two variables here so we can convert this into a dummy variable
analysis_data.drop(columns=['birthsex_factor'], inplace=True)
one_hot_vals.columns = ['birthsex_factor_' + str(col) for col in one_hot_vals.columns]
analysis_data = analysis_data.join(one_hot_vals.iloc[:,1:])

In [8]:
analysis_data

Unnamed: 0,BSCS,EDM,BIS_11,PCS,RS,TRSQ,ACES_neglectful_parenting,ACES_abuse,ACES_sum,ACES_divorced_separated,...,zipcode_median_income_acs,household_income_per_person,SST_prop_successful_stops,SST_GRTmean,SST_SSD,SST_PostErrorSlowW1_mean,SST_mean_ssrt_0,ROC_Crave_Regulate_Minus_Look,WTP_unhealthy_minus_healthy,birthsex_factor_Male
0,2.538462,3.250,72,7.0,20.0,63.0,,,,,...,,,,,,,,-0.5125,-0.312500,1
1,2.384615,1.750,89,9.0,22.0,63.0,,,,,...,,,,,,,,,0.440524,0
2,3.384615,2.500,63,9.0,18.0,57.0,,,,,...,,,0.500000,533.315052,284.375,0.058297,0.247061,-0.8000,-0.190476,0
3,3.076923,2.800,75,,,64.0,,,,,...,,,0.312500,498.167248,103.125,0.027730,0.446583,-0.8000,0.170363,0
4,3.307692,2.750,64,12.0,21.0,55.0,,,,,...,,,0.562500,626.507764,250.000,0.105660,0.369308,-1.5500,-0.494624,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,3.461538,4.000,58,18.0,17.0,54.0,0.0,1.0,3.0,1.0,...,-0.690347,1.768485,0.523438,,,,0.357362,-0.0125,-1.008152,1
271,3.692308,3.875,54,17.0,13.0,55.0,2.0,2.0,5.0,0.0,...,-0.511475,-0.234851,0.492188,,,,0.335849,-0.1500,-1.889247,1
272,3.461538,3.125,69,11.0,13.0,53.0,1.0,1.0,6.0,1.0,...,1.335248,0.099038,0.507812,,,,0.273736,,0.516129,1
273,2.846154,3.000,62,15.0,22.0,84.0,0.0,1.0,4.0,1.0,...,0.855379,-0.234851,0.479167,,,,0.401098,-0.9875,-0.151210,0


# Missing data 

Apply missing data imputation to columns including cSES, ACES_sum, ses_aggregate, zipcode_median_income_acs, IMI, mcarthur social standing, based on demographic and self-report predictors.

In [9]:
# import importlib
# importlib.reload(ml_util)


Based on this experiment, I'm going for Ridge regression with 10 nearest features. The values it imputes are a compromise between simply using the nearest mean, which is conservative when using these values for prediction because it doesn't introduce erroneous variance, but isn't very informative, and then using all available information, which Ridge regression with an unlimited number of features would do. It's a tough choice between this and KNN, which doesn't assume normality. Overall I'm going with KNN, because it picks up on relationships between the two variables while not generating extreme values like KNN seems to do.

In [10]:
imputer = IterativeImputer(estimator=linear_model.Ridge(),n_nearest_features=10,max_iter=100,random_state=0)
analysis_data_imputed = get_data_for_imputation(analysis_data)

#this dataset is already filtered for columns so we don't need to filter those further.
analysis_data_imputed = pd.DataFrame(imputer.fit_transform(analysis_data_imputed), columns=analysis_data_imputed.columns)
imputed_datapoint = analysis_data.isna()
# do_aces_cses_imputation_diagnostic(analysis_data_imputed, imputed_datapoint,'ridge_10')




# Run 1 - base with Lasso and Ridge

In [11]:
def do_hyperparameter_selection_on_fold_grid_and_lasso(X, y,cv,alpha_range):
    if alpha_range is None:
        alpha_10pow_lower = 6
        alpha_10pow_upper = -1
        alpha_increments=1
        alpha_range = np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1))

    ############
    #RIDGE
    ridge_parameters = {'alpha':alpha_range}
    ridge_model = linear_model.Ridge()
    print(ridge_parameters)
    #do a gridsearch, using the same folds as the outer loop
    ridge_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(ridge_model), param_grid = get_param_grid_with_preprocessing(ridge_parameters), cv=cv,scoring='neg_mean_absolute_error')
    ridge_grid_search_cv.fit(X,y)

    ############
    #LASSO
    lasso_parameters = {'alpha':alpha_range}
    lasso_model = linear_model.Lasso()
    print(lasso_parameters)
    lasso_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(lasso_model), param_grid = get_param_grid_with_preprocessing(lasso_parameters), cv=cv,scoring='neg_mean_absolute_error')
    lasso_grid_search_cv.fit(X,y)



    all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv]

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [12]:
def do_hyperparameter_selection_on_fold_grid_and_lasso_a2(X,y,cv):
    alpha_10pow_lower = 2
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])
    return(do_hyperparameter_selection_on_fold_grid_and_lasso(X,y,cv,alpha_range))

In [15]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.15
custom_interaction_effects_g1[1] = 0.15
custom_interaction_effects_g1[2] = -0.15
custom_interaction_effects_g1[3] = -0.15

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.15
custom_interaction_effects_g2[5] = 0.15
custom_interaction_effects_g2[6] = -0.15
custom_interaction_effects_g2[7] = -0.15

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_bf'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]




### Try out CV with simple gridsearch


scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection_on_fold_grid_and_lasso_a2,
                outer_folds=10)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)



best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_bf'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split1
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split2
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split3
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split4
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split5
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split6
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split7
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split8
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


outer split9
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


scores:
[0.1274999616527538, -0.06874259686177808, -0.12140026760493905, 0.1346752210401354, 0.14266708486256807, -0.34680626449621577, 0.1639186985513973, -0.011859369652341467, -0.32995614971410037, 0.07945506380367717]
overall_score:
-0.0230548618418843


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.570679,0.048386,0.473254,0.10198
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.4},-3.603683,0.032778,0.457453,0.149324
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.6},-3.656658,0.045859,0.461504,0.156509
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.690697,0.131174,0.481553,0.118551
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.8},-3.694966,0.04317,0.467687,0.150113
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 1.0},-3.719681,0.047648,0.474915,0.134263
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.01},-5.258356,0.147121,0.716611,0.167308
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 1.0},-5.674324,0.203889,0.747942,0.184292
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.8},-5.85523,0.227252,0.765743,0.188165
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.6},-6.096521,0.252987,0.793524,0.186907


doing permutation test on importance; this may take time.
Number of selected features: 37


Unnamed: 0,predictor,coef,feature_importance,fa_abs
160,ACES_neglectful_parenting*san,-0.69599,0.054959,0.054959
158,RS*san,0.701057,0.053638,0.053638
79,EDM*ni,0.700994,0.051069,0.051069
3,PCS,-0.549016,0.043063,0.043063
161,ACES_abuse*san,-0.536262,0.033398,0.033398
206,TESQ_E_controlling_temptations*san,0.515337,0.031561,0.031561
60,RTFS_f1_minus_f2,-0.435506,0.026611,0.026611
145,household_income_per_person*ni,0.336127,0.019956,0.019956
46,RMQ_locomotion,0.337818,0.016616,0.016616
25,NCS_like_responsibility,-0.285976,0.013954,0.013954


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
ACES_neglectful_parenting,-0.169,0.0,-0.696,0.004,0.0,0.055,-0.046,-0.046,-0.476,0.059
RS,0.0,-0.0,0.701,0.0,0.0,0.054,0.039,-0.185,0.394,0.054
EDM,0.0,0.701,-0.0,0.0,0.051,0.0,0.053,0.282,-0.13,0.051
PCS,-0.549,-0.0,0.0,0.043,0.0,0.0,-0.058,-0.066,-0.062,0.043
ACES_abuse,-0.0,-0.061,-0.536,0.0,0.002,0.033,0.147,-0.125,-0.414,0.036
TESQ_E_controlling_temptations,0.0,0.0,0.515,0.0,0.0,0.032,,,,0.032
RTFS_f1_minus_f2,-0.436,-0.0,-0.0,0.027,0.0,0.0,,,,0.027
household_income_per_person,0.055,0.336,0.035,0.002,0.02,0.001,,,,0.022
RMQ_locomotion,0.338,0.0,0.0,0.017,0.0,0.0,,,,0.017
NCS_like_responsibility,-0.286,-0.0,0.0,0.014,0.0,0.0,,,,0.014


# Run 2: Add KNN, Regression Tree

In [16]:


def do_hyperparameter_selection_on_2(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 2
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])

    ############
    #RIDGE
    ridge_parameters = {'alpha':alpha_range}
    ridge_model = linear_model.Ridge()
    print(ridge_parameters)
    #do a gridsearch, using the same folds as the outer loop
    ridge_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(ridge_model), param_grid = get_param_grid_with_preprocessing(ridge_parameters), cv=cv,scoring='neg_mean_absolute_error')
    ridge_grid_search_cv.fit(X,y)

    ############
    #LASSO
    lasso_parameters = {'alpha':alpha_range}
    lasso_model = linear_model.Lasso()
    print(lasso_parameters)
    lasso_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(lasso_model), param_grid = get_param_grid_with_preprocessing(lasso_parameters), cv=cv,scoring='neg_mean_absolute_error')
    lasso_grid_search_cv.fit(X,y)

    #######
    #KNN
    knn_parameters = {'n_neighbors':np.unique(np.round(np.power(10,np.linspace(0,2,2*5+1)))).astype(int)}
    knn_model = KNeighborsRegressor()
    print(knn_parameters)
    knn_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(knn_model), param_grid = get_param_grid_with_preprocessing(knn_parameters), cv=cv,scoring='neg_mean_absolute_error')
    knn_grid_search_cv.fit(X,y)


    ###########
    #Decision tree regressor
    dt_regressor_parameters = {
        'max_depth':[2, 3,5,10],
        'min_samples_split':[5,20,50],
        'min_samples_leaf':[5,20,50]
    }             
    print(dt_regressor_parameters)
    dt_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(DecisionTreeRegressor()), param_grid = get_param_grid_with_preprocessing(dt_regressor_parameters), cv=cv,scoring='neg_mean_absolute_error')
    dt_grid_search_cv.fit(X,y)



    #all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv, knn_grid_search_cv, dt_grid_search_cv, rf_grid_search_cv, gb_grid_search_cv]
    all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv, knn_grid_search_cv, dt_grid_search_cv]
    #all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv]

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [17]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.15
custom_interaction_effects_g1[1] = 0.15
custom_interaction_effects_g1[2] = -0.15
custom_interaction_effects_g1[3] = -0.15

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.15
custom_interaction_effects_g2[5] = 0.15
custom_interaction_effects_g2[6] = -0.15
custom_interaction_effects_g2[7] = -0.15

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_bf'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]




### Try out CV with simple gridsearch


scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection_on_2,
                outer_folds=10)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)





['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split1
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split2
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split3
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split4
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split5
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split6
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split7
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split8
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split9
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
scores:
[0.18887905438132735, 0.022742377445344797, -0.14188282450331902, 0.15195767246541614, -0.002334168817955451, -0.34680626449621577, -0.1066757539084151, 0.11551380039900305, -0.04622236716585104, 0.051474484811547816]
overall_score:
-0.011335398938911723


In [18]:
best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit,y=outcome_measures_nona['d_bf'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)



Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 25},-3.52202,0.046123,0.476817,0.135261
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.570679,0.048386,0.473254,0.10198
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 40},-3.572833,0.041259,0.458208,0.135367
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 16},-3.593369,0.053888,0.457388,0.112382
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.4},-3.603683,0.032778,0.457453,0.149324
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 10},-3.655157,0.036885,0.433918,0.146004
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.6},-3.656658,0.045859,0.461504,0.156509
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 63},-3.65967,0.050463,0.496013,0.142917
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.690697,0.131174,0.481553,0.118551
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.8},-3.694966,0.04317,0.467687,0.150113


doing permutation test on importance; this may take time.


Unnamed: 0,predictor,coef,feature_importance,fa_abs
161,ACES_abuse*san,,0.011046,0.011046
4,RS,,0.009779,0.009779
160,ACES_neglectful_parenting*san,,0.009589,0.009589
0,BSCS,,0.00882,0.00882
2,BIS_11,,0.007906,0.007906
84,ACES_neglectful_parenting*ni,,-0.0071,0.0071
138,RTFS_f1_minus_f2*ni,,0.006893,0.006893
43,PLAN_temporal_orientation,,0.006702,0.006702
63,cSES,,0.00659,0.00659
210,TESQ_E_goal_deliberation*san,,0.006364,0.006364


In [19]:

present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
ACES_neglectful_parenting,0.002,-0.007,0.01,-0.046,-0.046,-0.476,0.019
ACES_abuse,0.006,-0.001,0.011,0.147,-0.125,-0.414,0.018
RTFS_f1_minus_f2,0.004,0.007,0.005,,,,0.016
TESQ_E_goal_deliberation,0.006,0.002,0.006,,,,0.014
NCS_thinking_not_fun,0.003,0.006,-0.004,,,,0.013
RS,0.01,0.0,0.002,0.039,-0.185,0.394,0.012
household_income_per_person,0.005,0.004,0.003,,,,0.011
NCS_prefer_little_thought,-0.005,-0.001,-0.005,,,,0.011
ACES_sum,0.003,-0.003,0.005,,,,0.011
BSCS,0.009,0.001,-0.0,-0.137,0.448,-0.08,0.01


# Run 3: Add ensemble methods

In [24]:


def do_hyperparameter_selection3(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 2
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])

    ############
    #RIDGE
    ridge_parameters = {'alpha':alpha_range}
    ridge_model = linear_model.Ridge()
    print(ridge_parameters)
    #do a gridsearch, using the same folds as the outer loop
    ridge_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(ridge_model), param_grid = get_param_grid_with_preprocessing(ridge_parameters), cv=cv,scoring='neg_mean_absolute_error')
    ridge_grid_search_cv.fit(X,y)

    ############
    #LASSO
    lasso_parameters = {'alpha':alpha_range}
    lasso_model = linear_model.Lasso()
    print(lasso_parameters)
    lasso_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(lasso_model), param_grid = get_param_grid_with_preprocessing(lasso_parameters), cv=cv,scoring='neg_mean_absolute_error')
    lasso_grid_search_cv.fit(X,y)

    #######
    #KNN
    knn_parameters = {'n_neighbors':np.unique(np.round(np.power(10,np.linspace(0,2,2*5+1)))).astype(int)}
    knn_model = KNeighborsRegressor()
    print(knn_parameters)
    knn_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(knn_model), param_grid = get_param_grid_with_preprocessing(knn_parameters), cv=cv,scoring='neg_mean_absolute_error')
    knn_grid_search_cv.fit(X,y)


    ###########
    #Decision tree regressor
    dt_regressor_parameters = {
        'max_depth':[2, 3,5,10],
        'min_samples_split':[5,20,50],
        'min_samples_leaf':[5,20,50]
    }             
    print(dt_regressor_parameters)
    dt_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(DecisionTreeRegressor()), param_grid = get_param_grid_with_preprocessing(dt_regressor_parameters), cv=cv,scoring='neg_mean_absolute_error')
    dt_grid_search_cv.fit(X,y)

    ###########
    #Random forest regressor
    rf_regressor_parameters = {
        'n_estimators':[10,25,50],
        'max_depth':[2, 3,5,10],
        'min_samples_split':[5,20,50],
        'min_samples_leaf':[5,20,50]
    }
    print(rf_regressor_parameters)
    rf_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(RandomForestRegressor()), param_grid = get_param_grid_with_preprocessing(rf_regressor_parameters), cv=cv,scoring='neg_mean_absolute_error')
    rf_grid_search_cv.fit(X,y)

    ###########
    #Gradient boosting regressor
    gb_regressor_parameters = {
        'n_estimators':[10,20,50],
        'max_depth':[2, 3,5,10],
        'min_samples_split':[5,20,50],
        'min_samples_leaf':[5,20,50]
    }
    print(gb_regressor_parameters)
    gb_grid_search_cv = GridSearchCV(estimator=get_estimator_with_preprocessing(GradientBoostingRegressor()), param_grid = get_param_grid_with_preprocessing(gb_regressor_parameters), cv=cv,scoring='neg_mean_absolute_error')
    gb_grid_search_cv.fit(X,y)


    all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv, knn_grid_search_cv, dt_grid_search_cv, rf_grid_search_cv, gb_grid_search_cv]
    #all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv, knn_grid_search_cv, dt_grid_search_cv]
    #all_cv_results = [ridge_grid_search_cv, lasso_grid_search_cv]

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





In [25]:

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])

#synthetic outcomes
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

# add synthetic primary and interaction effects


#set up the interaction effects
custom_interaction_effects_g1 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g1[0] = 0.15
custom_interaction_effects_g1[1] = 0.15
custom_interaction_effects_g1[2] = -0.15
custom_interaction_effects_g1[3] = -0.15

custom_interaction_effects_g2 = [0]*analysis_data_imputed.shape[1]
custom_interaction_effects_g2[4] = 0.15
custom_interaction_effects_g2[5] = 0.15
custom_interaction_effects_g2[6] = -0.15
custom_interaction_effects_g2[7] = -0.15

custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



synthetic_data = generate_synthetic_dev_data(analysis_data_imputed, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
interaction_effect_df = synthetic_data['X_weights']
outcome_measures = synthetic_data['y']

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)
group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

predictor_data = set_up_interactions(analysis_data_imputed, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_bf'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
predictor_data_nona = predictor_data.loc[~outcome_nas,:]
group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
group_assignments_nona = group_assignments[~outcome_nas]




### Try out CV with simple gridsearch


scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], 
                groups = group_assignments_nona, 
                hyperparameter_selection_on_fold=do_hyperparameter_selection3,
                outer_folds=10)

scores = scoring_data['scores']
best_models = scoring_data['best_models']
best_params_df_list = scoring_data['best_params_df_list']
raw_cv_results_list = scoring_data['raw_cv_results_list']

print("scores:")
print(scores)
overall_score = np.mean(scores)
print("overall_score:")
print(overall_score)





['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split1
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split2
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split3
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split4
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split5
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split6
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split7
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split8
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
outer split9
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}
{'alpha': array([0.01, 0.1 , 1.  , 0.2 , 0.4 , 0.6 , 0.8 , 1.  ])}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'n_neighbors': array([  1,   2,   3,   4,   6,  10,  16,  25,  40,  63, 100])}
{'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 25, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
{'n_estimators': [10, 20, 50], 'max_depth': [2, 3, 5, 10], 'min_samples_split': [5, 20, 50], 'min_samples_leaf': [5, 20, 50]}
scores:
[0.14570889633271344, 0.022742377445344797, -0.11088661094519092, 0.15195767246541614, -0.002334168817955451, -0.17501960910903525, 0.024359206831576663, 0.003100690678360052, -0.21485603642571127, 0.039961914483410466]
overall_score:
-0.011526566706107133


In [26]:
best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], final_model=best_model)
final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit,y=outcome_measures_nona['d_bf'])

#print rows of final_results where feature_name is the list of features to check
base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)


present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), KNeighborsRegressor()])",{'estimator__n_neighbors': 25},-3.52202,0.046123,0.476817,0.135261
"dict_values([StandardScaler(), RandomForestRegressor()])","{'estimator__max_depth': 5, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 50}",-3.536705,0.046388,0.46803,0.154991
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 50}",-3.536963,0.094484,0.445854,0.168284
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 3, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 20}",-3.54776,0.05872,0.459199,0.163647
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 3, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 20}",-3.548086,0.075975,0.450669,0.144703
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 3, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 50, 'estimator__n_estimators': 20}",-3.552207,0.06326,0.450955,0.168243
"dict_values([StandardScaler(), RandomForestRegressor()])","{'estimator__max_depth': 3, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 50}",-3.555798,0.053225,0.468279,0.150676
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 2, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 50, 'estimator__n_estimators': 50}",-3.557039,0.053247,0.435255,0.149365
"dict_values([StandardScaler(), RandomForestRegressor()])","{'estimator__max_depth': 5, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 50}",-3.557052,0.050776,0.47536,0.153762
"dict_values([StandardScaler(), GradientBoostingRegressor()])","{'estimator__max_depth': 3, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 50}",-3.558018,0.07788,0.475142,0.168075


doing permutation test on importance; this may take time.


Unnamed: 0,predictor,coef,feature_importance,fa_abs
160,ACES_neglectful_parenting*san,,0.010747,0.010747
0,BSCS,,0.010636,0.010636
161,ACES_abuse*san,,0.010211,0.010211
2,BIS_11,,0.009116,0.009116
4,RS,,0.007943,0.007943
84,ACES_neglectful_parenting*ni,,-0.00778,0.00778
43,PLAN_temporal_orientation,,0.007111,0.007111
7,ACES_abuse,,0.006684,0.006684
42,PLAN_mental_forecasting,,0.006547,0.006547
50,SRHI_unhealthy,,0.006543,0.006543


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
ACES_neglectful_parenting,0.0,-0.008,0.011,-0.046,-0.046,-0.476,0.019
ACES_abuse,0.007,-0.0,0.01,0.147,-0.125,-0.414,0.017
RTFS_f1_minus_f2,0.004,0.005,0.005,,,,0.014
TESQ_E_goal_deliberation,0.005,0.001,0.006,,,,0.012
BSCS,0.011,0.001,0.0,-0.137,0.448,-0.08,0.012
NCS_thinking_not_fun,0.003,0.005,-0.003,,,,0.011
BIS_11,0.009,0.002,-0.001,0.047,-0.528,0.02,0.011
NCS_relief_not_satisfaction,0.002,0.005,-0.005,,,,0.011
RS,0.008,0.001,0.002,0.039,-0.185,0.394,0.011
SST_PostErrorSlowW1_mean,0.004,0.004,0.002,,,,0.011


to do:
 - figure out and document why we're using different forms of error measurement
 - try adding feature selection to the pipeline