In [2]:
import yaml
from yaml.loader import SafeLoader
from socket import gethostname
import numpy as np
import pandas as pd
from sklearn.base import clone
from dev_interaction_util import generate_synthetic_dev_outcomes, generate_synthetic_dev_data, set_up_interactions
from dev_interaction_util import do_scoring_loop, get_best_model, summarize_overall_df_results, do_final_fit, present_model_results, present_results_vs_ground_truth_cors
from dev_interaction_util import load_and_preprocess_data, impute_data
from ml_util import *
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn import linear_model
from ml_util import get_data_for_imputation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import numpy as np
from IPython.display import display, HTML
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [3]:


print(gethostname())
# Open the file and load the file
with open('config.yml') as f:
    all_yaml = yaml.load(f, Loader=SafeLoader)
    if gethostname() in all_yaml.keys():
        config = all_yaml[gethostname()]
    else:
        config = all_yaml['default']
        
print(config)



Benjamins-MacBook-Pro-2.local
{'dropbox_data_dir': '/Users/benjaminsmith/Dropbox (University of Oregon)/UO-SAN Lab/Berkman Lab/Devaluation/analysis_files/data/'}


This notebook is derived from `test_feature_selection.ipynb`.

In [4]:
dropbox_data_dir = config['dropbox_data_dir']


In [5]:
analysis_data, outcome_measures = load_and_preprocess_data(dropbox_data_dir)

In [6]:
analysis_data_imputed = impute_data(analysis_data)



In [7]:

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop(X, y,cv):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.4,0.6,0.8,1.0]])
    
    all_cv_results = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        'Lasso':{
            'estimator':linear_model.Lasso,
            'parameters':{'alpha':alpha_range}
        },
        'DecisionTreeRegressor':{
            'estimator':DecisionTreeRegressor,
            'parameters':{
                'max_depth':[2, 4],
                'min_samples_split':[20,50],
                'min_samples_leaf':[20,50]
            }
        }             
    }

    k_max_val = np.min([50,X.shape[1]])

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature seelction
        #this is here because we need to know the estimator to pass to the feature selector
        feature_selectors_to_run = {
            'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : [20,k_max_val]
                    }
            },
            'RFE':{
                'selector':RFE(linear_model.LinearRegression()),
                'parameters':{
                    'n_features_to_select' : [10,25],
                    #'verbose':[1],
                    'step':[5]
                }
            }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring='neg_mean_absolute_error',verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }





# Improving fit with manual theory-driven feature

My past analysis showed that by manually removing some features before the analysis starts, we can improve performance beyond the chance performance otherwise seen.

So, it might be useful to understand how much we can improve our performance by manual feature selection before the automatic feature selection applies.

This was previously done in `test_limited_predictors.ipynb`. We tested as few as 2 distractor features. In that test, predictor features generally had correlations in the range of |r|=0.06 to 0.53, with most around 0.4 (we should confirm that because it seems fishy that PCS was detegted as an effect, but didn't model as a large predictor). With most `|r|=0.4`, this seems unrealistically high to expect, and we should aim to build a pipeline capable of detecting more subtle effects than that. An approximate `|r|=0.3` can be achieved by mixing in a predictor scaled to 8% of normal scale.

I can imagine it is plausible to cut down to as few as two self-report, one behavioral, and one neural measure per intervention, plus sex and age. That would yield 10 different variables. At the other end, we might want 10 self-report, two behavioral, and five neural measures per intervention tested, plus 6 different demographic variables--a total of 40 variables. Let's see how these would perform, as well as mid-range of 20 predictor variables. In each case we'll restrict to three valid predictors per intervention.

In [8]:
#create an empty data frame with numeric two columns, n_features and overall_score
overall_scores = pd.DataFrame(columns=['n_features','overall_score'])

In [9]:
analysis_data_imputed.columns

Index(['BSCS', 'EDM', 'BIS_11', 'PCS', 'RS', 'TRSQ',
       'ACES_neglectful_parenting', 'ACES_abuse', 'ACES_sum',
       'ACES_divorced_separated', 'ACES_household_dysfunction',
       'BFI_agreeableness', 'BFI_conscientiousness', 'BFI_extraversion',
       'BFI_neuroticism', 'BFI_openness', 'DEMO_mcarthur_social_standing',
       'IMI_effort_importance', 'IMI_value_usefulness',
       'IMI_interest_enjoyment', 'IMI_perceived_choice',
       'IMI_perceived_competence', 'NCS_get_job_done', 'NCS_intellectual_task',
       'NCS_deliberating_issues', 'NCS_like_responsibility',
       'NCS_thinking_not_exciting', 'NCS_avoid_depth', 'NCS_thinking_not_fun',
       'NCS_thought_appealing', 'NCS_think_minimally', 'NCS_prefer_complex',
       'NCS_prefer_little_thought', 'NCS_relief_not_satisfaction',
       'NCS_tasks_little_thought', 'NCS_new_solutions_to_problems',
       'NCS_abstract_thinking', 'NCS_total', 'NCS_small_daily_projects',
       'NCS_solve_puzzles', 'NCS_satisfaction_in_delibe

## 10 variables

In [30]:


def generate_synthetic_dev_data(analysis_data_imputed, group_assignments, outcome_measures, group_interaction_effects = None):
    """
        This function generates synthetic data for the intervention moderation analysis.
        It takes in the imputed data, the group assignments, and the outcome measures.
        It also takes in a dictionary of predictor interaction effects.
        The dictionary should have the following structure:
        group_interaction_effects = {'group_name': list_of_effect_sizes}
        where list_of_effect_sizes is a list of effect sizes for each predictor and should have a length equal to the number of predictors.
    """

    #make a copy of the outcome measures
    #don't want to modify the original
    outcome_measures=outcome_measures.copy()
    np.random.seed(3201203)

    #create a normed version of the predictor array
    #normalize each column to have mean 0 and std 1
    predictors_normed = analysis_data_imputed.copy()
    for col in predictors_normed.columns:
        predictors_normed[col] = (predictors_normed[col] - np.mean(predictors_normed[col]))/np.std(predictors_normed[col])




    #sample effect size from a normal distribution for each predictor
    #sample from a normal distribution with mean 0 and std 0.1
    #then add to the predictor value
    #for each group, calculate main effect
    active_groups = np.unique(group_assignments)[1:]
    print(active_groups)
    group_main_effects = np.random.normal(0,1,active_groups.shape[0])
    print(group_main_effects)


    #apply the main effect. note that the first group will not have a main effect
    for i,group in enumerate(active_groups):
        for om in ['bf_2','cancer_promoting_minus_preventing_FFQ_w2','FFQ_v2_Mean_Energy_w2']:
            om_mean = np.nanmean(outcome_measures[om])
            om_sd = np.nanstd(outcome_measures[om])
            outcome_measures[om] = outcome_measures[om] + (group_assignments==group)*group_main_effects[i]

    interaction_effects_list = []
    print(group_assignments)
    groups = np.unique(group_assignments)
    print(groups)

    #apply the interaction effect
    for i,group in enumerate(groups): #all three groups
        print(group)
        #generate interaction effect for group
        #check to see if there are pre-defined interaction effects for this group
        if group_interaction_effects is None:
            predictor_interaction_effects = np.random.normal(0,0.5,predictors_normed.shape[1])
        elif group in group_interaction_effects.keys():
            predictor_interaction_effects = group_interaction_effects[group]
        else:
            print("no interaction effects for group: " + str(group) + ". No effects will be included.")

        #print some of the fake effects we're generating
        effect_summary = pd.DataFrame(
            {'feature_name':analysis_data_imputed.columns,
            'interaction_effect':predictor_interaction_effects})
        effect_summary['interaction_effect_abs'] = np.abs(effect_summary.interaction_effect)
        effect_summary = effect_summary.sort_values('interaction_effect_abs',ascending=False)
        last_nonzero_effect = np.max(np.where(effect_summary.interaction_effect_abs>0.001))
        print(effect_summary.iloc[0:min(20,last_nonzero_effect+2),0:2])
        #just add an effect of 1 to the first item only.
        # predictor_interaction_effects = [0]*(predictors_normed.shape[1])
        # predictor_interaction_effects[i] = 0.5
        # print(predictor_interaction_effects[0:10])
        #multiply the predictor interaction effect by the predictor values
        predictor_interaction_values = predictors_normed * predictor_interaction_effects
        # print(predictor_interaction_values.iloc[0:10,0:5])
        outcome_zscore_change = (group_assignments==group)*np.sum(predictor_interaction_values,axis=1)
        # print("zscore:")
        # print(outcome_zscore_change.head())
        # #add that to the outcome measures
        for om in ['bf_2','cancer_promoting_minus_preventing_FFQ_w2','FFQ_v2_Mean_Energy_w2']:
            #take mean and sd of non-nan values
            print(om)
            om_mean = np.nanmean(outcome_measures[om])
            om_sd = np.nanstd(outcome_measures[om])
            # print(om_mean)
            # print(om_sd)
            
            outcome_measures.loc[group_assignments==group,om] = outcome_measures.loc[group_assignments==group,om] + outcome_zscore_change[group_assignments==group]*om_sd

        
        interaction_effects_list.append(
            pd.DataFrame(
            {'group':[group]*predictors_normed.shape[1],
            'predictor':predictors_normed.columns,
            'interaction_effect':predictor_interaction_effects})
        )


        interaction_effect_df = pd.concat(interaction_effects_list)

        interaction_effect_abs = np.abs(interaction_effect_df.interaction_effect)
        #sort by absolute value of interaction effect
        interaction_effect_df['interaction_effect_abs'] = interaction_effect_abs
        interaction_effect_df = interaction_effect_df.sort_values('interaction_effect_abs',ascending=False)
        interaction_effect_df = interaction_effect_df.drop('interaction_effect_abs',axis=1)

    return({'X_weights':interaction_effect_df,'y':outcome_measures})    



In [31]:
def run_full_limited_predictor_analysis(total_predictor_count, outcome_measures, analysis_data_imputed, effect_size, custom_interaction_effects=None):

    #set np random seed
    np.random.seed(3161527)

    group_names = ['ichi','ni','san']
    #assign each row randomly to a group
    group_assignments = np.random.choice(group_names,analysis_data_imputed.shape[0])
    

    #synthetic outcomes
    outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

    #create a limited set of predictors
    analysis_data_smol = analysis_data_imputed.iloc[:,0:total_predictor_count]

    # add synthetic primary and interaction effects

    if custom_interaction_effects is None:
        #set up the interaction effects
        #0.08 will give us correlations around 0.3 between the interaction effects and the outcome
        custom_interaction_effects_g1 = [0]*analysis_data_smol.shape[1]
        custom_interaction_effects_g1[0] = effect_size
        custom_interaction_effects_g1[1] = effect_size
        custom_interaction_effects_g1[2] = -effect_size

        custom_interaction_effects_g2 = [0]*analysis_data_smol.shape[1]
        custom_interaction_effects_g2[4] = effect_size
        custom_interaction_effects_g2[5] = effect_size
        custom_interaction_effects_g2[6] = -effect_size

        custom_interaction_effects = {'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}



    synthetic_data = generate_synthetic_dev_data(analysis_data_smol, group_assignments,outcome_measures, group_interaction_effects = custom_interaction_effects)
    interaction_effect_df = synthetic_data['X_weights']
    outcome_measures = synthetic_data['y']

    # Set up outcome measures and group assignment one-hot

    outcome_measures = calculate_outcome_changes(outcome_measures)
    group_assignment_onehots = pd.get_dummies(group_assignments).loc[:,['ni','san']]

    predictor_data = set_up_interactions(analysis_data_smol, group_assignment_onehots)


    #remove any NA values for this outcome measure in both the predictor data and the outcome data
    outcome_nas = outcome_measures['d_bf'].isna()

    outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
    predictor_data_nona = predictor_data.loc[~outcome_nas,:]
    group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
    group_assignments_nona = group_assignments[~outcome_nas]

    ### Try out CV with simple gridsearch

    scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], 
                    groups = group_assignments_nona, 
                    hyperparameter_selection_on_fold=do_hyperparameter_selection_loop,
                    outer_folds=5)

    scores = scoring_data['scores']
    best_models = scoring_data['best_models']
    best_params_df_list = scoring_data['best_params_df_list']
    raw_cv_results_list = scoring_data['raw_cv_results_list']

    print("scores:")
    print(scores)
    overall_score = np.mean(scores)
    print("overall_score:")
    print(overall_score)



    best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
    final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], final_model=best_model)
    final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_bf'])

    #print rows of final_results where feature_name is the list of features to check
    base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
    regressors_to_check = [x+y for y in ['','*ni','*san'] for x in base_regressors]
    final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

    present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

    return(overall_score)




In [33]:
overall_scores = pd.DataFrame(columns=['n_features','effect_size','overall_score'])
predictors = 15


#run the analysis with a limited number of predictors

for effect_size in [0.08,0.1,0.15,0.2]:
    print(effect_size)

    custom_interaction_effects_g0 = [0]*predictors
    custom_interaction_effects_g0[0] = effect_size

    custom_interaction_effects_g1 = [0]*predictors
    custom_interaction_effects_g1[0] = effect_size #all three groups, i.e., main effect
    custom_interaction_effects_g1[1] = effect_size #group 1 and 2, i.e., interaction effect of intervention
    custom_interaction_effects_g1[2] = effect_size #group 1 only, i.e., interaction effect of group1
    custom_interaction_effects_g1[3] = -effect_size

    custom_interaction_effects_g2 = [0]*predictors
    custom_interaction_effects_g2[0] = effect_size
    custom_interaction_effects_g2[1] = effect_size
    custom_interaction_effects_g2[4] = effect_size
    custom_interaction_effects_g2[5] = -effect_size
    custom_interaction_effects = {'ichi':custom_interaction_effects_g0, 'ni':custom_interaction_effects_g1,'san':custom_interaction_effects_g2}

    overall_score = run_full_limited_predictor_analysis(predictors, outcome_measures, analysis_data_imputed, effect_size= es, custom_interaction_effects = custom_interaction_effects)
    overall_scores = overall_scores.append({'n_features':predictors, 'effect_size':effect_size,'overall_score':overall_score},ignore_index=True)

0.08
['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ic

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.20888,0.052412,0.261376,0.034357
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.215247,0.071968,0.250701,0.040395
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.216379,0.08137,0.237897,0.073249
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.221587,0.046897,0.275199,0.038982
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.225424,0.050577,0.265746,0.032294
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.226303,0.0499,0.265704,0.03243
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.4},-3.226303,0.0499,0.265704,0.03243
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.228568,0.047891,0.279804,0.039583
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.22879,0.046488,0.281644,0.037526
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.22879,0.046488,0.281644,0.037526


doing permutation test on importance; this may take time.
Number of selected features: 1


Unnamed: 0,predictor,coef,feature_importance,fa_abs
17,BSCS*ni,0.276951,0.023104,0.023104
6,ACES_neglectful_parenting,-0.0,0.0,0.0
29,BFI_conscientiousness*ni,0.0,0.0,0.0
44,BFI_conscientiousness*san,0.0,0.0,0.0
43,BFI_agreeableness*san,0.0,0.0,0.0
42,ACES_household_dysfunction*san,0.0,0.0,0.0
41,ACES_divorced_separated*san,0.0,0.0,0.0
40,ACES_sum*san,0.0,0.0,0.0
39,ACES_abuse*san,0.0,0.0,0.0
38,ACES_neglectful_parenting*san,0.0,0.0,0.0


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,,0.277,0.0,,0.023,0.0,0.04,0.082,0.334,0.023
BSCS,,0.277,0.0,,0.023,0.0,0.04,0.082,0.334,0.023
BSCS,,0.277,0.0,,0.023,0.0,0.04,0.082,0.334,0.023


0.1
['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ich

  overall_scores = overall_scores.append({'n_features':predictors, 'effect_size':effect_size,'overall_score':overall_score},ignore_index=True)


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 7 candidates, totalling 28 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180f689d0>], 'feature_selection__k': [20, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.250887,0.060835,0.254496,0.035885
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.259553,0.103321,0.235015,0.073479
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.26013,0.083882,0.239376,0.048836
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.263369,0.055294,0.27063,0.040663
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.265992,0.048114,0.263109,0.027892
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.266495,0.052342,0.248225,0.019135
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.266495,0.052342,0.248225,0.019135
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.4},-3.26668,0.047348,0.263436,0.028153
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.26668,0.047348,0.263436,0.028153
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.267818,0.060202,0.236184,0.050459


doing permutation test on importance; this may take time.
Number of selected features: 1


Unnamed: 0,predictor,coef,feature_importance,fa_abs
17,BSCS*ni,0.283603,0.023402,0.023402
6,ACES_neglectful_parenting,0.0,0.0,0.0
31,BFI_neuroticism*ni,0.0,0.0,0.0
44,BFI_conscientiousness*san,0.0,0.0,0.0
43,BFI_agreeableness*san,0.0,0.0,0.0
42,ACES_household_dysfunction*san,0.0,0.0,0.0
41,ACES_divorced_separated*san,0.0,0.0,0.0
40,ACES_sum*san,0.0,0.0,0.0
39,ACES_abuse*san,0.0,0.0,0.0
38,ACES_neglectful_parenting*san,0.0,0.0,0.0


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,,0.284,0.0,,0.023,0.0,0.085,0.096,0.385,0.023
BSCS,,0.284,0.0,,0.023,0.0,0.085,0.096,0.385,0.023
BSCS,,0.284,0.0,,0.023,0.0,0.085,0.096,0.385,0.023


0.15
['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ic

  overall_scores = overall_scores.append({'n_features':predictors, 'effect_size':effect_size,'overall_score':overall_score},ignore_index=True)


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 7 candidates, totalling 28 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180f689d0>], 'feature_selection__k': [20, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.345412,0.068854,0.236519,0.063104
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.345412,0.068854,0.236519,0.063104
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.35118,0.130044,0.219064,0.085932
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.354476,0.140896,0.219418,0.089194
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.35684,0.062051,0.243133,0.040507
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.35684,0.062051,0.243133,0.040507
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.359817,0.14346,0.221979,0.08788
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.367859,0.145902,0.227217,0.08852
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 20, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.376565,0.049554,0.260961,0.037149
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.4, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.377987,0.04908,0.259323,0.032232


doing permutation test on importance; this may take time.
Number of selected features: 14


Unnamed: 0,predictor,coef,feature_importance,fa_abs
19,BIS_11*ni,1.074144,0.12702,0.12702
0,BSCS,0.898951,0.106526,0.106526
1,EDM,0.817686,0.085481,0.085481
3,PCS,-0.601043,0.040948,0.040948
13,BFI_extraversion,0.515245,0.034378,0.034378
27,ACES_household_dysfunction*ni,-0.306943,0.016263,0.016263
45,BFI_extraversion*san,0.270325,0.01339,0.01339
11,BFI_agreeableness,-0.30217,0.012006,0.012006
39,ACES_abuse*san,0.152236,0.004659,0.004659
4,RS,0.16387,0.004137,0.004137


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BIS_11,-0.088,1.074,-0.0,0.003,0.127,0.0,-0.16,0.02,-0.3,0.13
BSCS,0.899,0.0,0.0,0.107,0.0,0.0,0.193,0.129,0.49,0.107
BSCS,0.899,0.0,0.0,0.107,0.0,0.0,0.193,0.129,0.49,0.107
BSCS,0.899,0.0,0.0,0.107,0.0,0.0,0.193,0.129,0.49,0.107
EDM,0.818,0.0,0.0,0.085,0.0,0.0,0.18,0.186,0.341,0.085
EDM,0.818,0.0,0.0,0.085,0.0,0.0,0.18,0.186,0.341,0.085
BFI_extraversion,0.515,0.0,0.27,0.034,0.0,0.013,,,,0.048
PCS,-0.601,-0.0,0.0,0.041,0.0,0.0,0.088,-0.154,0.146,0.041
ACES_household_dysfunction,0.111,-0.307,0.0,0.002,0.016,0.0,,,,0.018
BFI_agreeableness,-0.302,0.0,-0.0,0.012,0.0,0.0,,,,0.012


0.2
['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ich

  overall_scores = overall_scores.append({'n_features':predictors, 'effect_size':effect_size,'overall_score':overall_score},ignore_index=True)


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 7 candidates, totalling 28 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180f689d0>], 'feature_selection__k': [20, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 14 candidates, totalling 56 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.402754,0.081064,0.208122,0.037775
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 1.0},-3.402754,0.081064,0.208122,0.037775
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.410296,0.087644,0.208426,0.042423
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.8},-3.410296,0.087644,0.208426,0.042423
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.414104,0.074568,0.238553,0.074037
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.414104,0.074568,0.238553,0.074037
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.421485,0.089166,0.211033,0.045205
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.6},-3.421485,0.089166,0.211033,0.045205
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.4},-3.436894,0.090597,0.21692,0.048782
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180f689d0>}",-3.436894,0.090597,0.21692,0.048782


doing permutation test on importance; this may take time.
Number of selected features: 47


Unnamed: 0,predictor,coef,feature_importance,fa_abs
37,TRSQ*san,-4.344148,1.793549,1.793549
19,BIS_11*ni,3.515426,1.187481,1.187481
36,RS*san,2.868618,0.792952,0.792952
45,BFI_extraversion*san,2.710318,0.707538,0.707538
32,BSCS*san,2.64867,0.668558,0.668558
29,BFI_conscientiousness*ni,-2.459893,0.607383,0.607383
20,PCS*ni,-2.349045,0.557377,0.557377
44,BFI_conscientiousness*san,-1.879941,0.336626,0.336626
17,BSCS*ni,1.617924,0.248639,0.248639
33,EDM*san,1.613077,0.244241,0.244241


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
TRSQ,0.105,-0.854,-4.344,0.001,0.075,1.794,0.039,-0.117,-0.318,1.87
BIS_11,-0.174,3.515,-1.38,0.002,1.187,0.183,-0.223,0.052,-0.339,1.373
BSCS,1.299,1.618,2.649,0.155,0.249,0.669,0.295,0.157,0.567,1.072
BSCS,1.299,1.618,2.649,0.155,0.249,0.669,0.295,0.157,0.567,1.072
BSCS,1.299,1.618,2.649,0.155,0.249,0.669,0.295,0.157,0.567,1.072
BFI_conscientiousness,0.021,-2.46,-1.88,-0.0,0.607,0.337,,,,0.944
BFI_extraversion,-0.028,1.115,2.71,-0.0,0.116,0.708,,,,0.823
RS,0.186,-0.392,2.869,0.004,0.016,0.793,-0.052,-0.05,0.005,0.813
PCS,-0.363,-2.349,-0.136,0.015,0.557,0.002,0.133,-0.182,0.176,0.574
EDM,0.628,1.486,1.613,0.031,0.205,0.244,0.217,0.245,0.413,0.48


  overall_scores = overall_scores.append({'n_features':predictors, 'effect_size':effect_size,'overall_score':overall_score},ignore_index=True)


In [34]:
overall_scores

Unnamed: 0,n_features,effect_size,overall_score
0,15.0,0.08,-0.127407
1,15.0,0.1,-0.104549
2,15.0,0.15,-0.088353
3,15.0,0.2,0.039684
