In [1]:
import yaml
from yaml.loader import SafeLoader
from socket import gethostname
import numpy as np
import pandas as pd
from sklearn.base import clone
from dev_interaction_util import generate_synthetic_dev_outcomes, generate_synthetic_dev_data, set_up_interactions
from dev_interaction_util import do_scoring_loop, get_best_model, summarize_overall_df_results, do_final_fit, present_model_results, present_results_vs_ground_truth_cors
from dev_interaction_util import load_and_preprocess_data, impute_data, run_full_limited_predictor_analysis
from ml_util import *
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn import linear_model
from ml_util import get_data_for_imputation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import numpy as np
from IPython.display import display, HTML
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [2]:


print(gethostname())
# Open the file and load the file
with open('config.yml') as f:
    all_yaml = yaml.load(f, Loader=SafeLoader)
    if gethostname() in all_yaml.keys():
        config = all_yaml[gethostname()]
    else:
        config = all_yaml['default']
        
print(config)



Benjamins-MacBook-Pro-2.local
{'dropbox_data_dir': '/Users/benjaminsmith/Dropbox (University of Oregon)/UO-SAN Lab/Berkman Lab/Devaluation/analysis_files/data/'}


This notebook is derived from `test_feature_selection.ipynb`.

In [3]:
dropbox_data_dir = config['dropbox_data_dir']


In [4]:
analysis_data, outcome_measures = load_and_preprocess_data(dropbox_data_dir)

In [5]:
analysis_data_imputed = impute_data(analysis_data)



In [6]:


def do_hyperparameter_selection_loop_r2(X,y,cv):
    return(do_hyperparameter_selection_loop_w_metric(X,y,cv,'r2'))

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop(X, y,cv):
    return(do_hyperparameter_selection_loop_w_metric(X,y,cv,'neg_mean_absolute_error'))

#loops through the different estimators and feature selection methods and does a grid search over all to find the best hyperparameters
def do_hyperparameter_selection_loop_w_metric(X, y,cv,metric):
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    alpha_range = np.concatenate([np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,(alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1)),
        [0.2,0.3,0.4,0.6,0.8,1.0]])
    
    all_cv_results = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        'Lasso':{
            'estimator':linear_model.Lasso,
            'parameters':{'alpha':alpha_range}
        },
        'DecisionTreeRegressor':{
            'estimator':DecisionTreeRegressor,
            'parameters':{
                'max_depth':[2, 4],
                'min_samples_split':[20,50],
                'min_samples_leaf':[20,50]
            }
        }             
    }

    k_max_val = np.min([50,X.shape[1]])

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature seelction
        #this is here because we need to know the estimator to pass to the feature selector
        feature_selectors_to_run = {
            'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : [10,25,k_max_val]
                    }
            },
            'RFE':{
                'selector':RFE(linear_model.LinearRegression()),
                'parameters':{
                    'n_features_to_select' : [10,25],
                    #'verbose':[1],
                    'step':[5]
                }
            }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring=metric,verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }


# Improving fit with manual theory-driven feature

My past analysis showed that by manually removing some features before the analysis starts, we can improve performance beyond the chance performance otherwise seen.

So, it might be useful to understand how much we can improve our performance by manual feature selection before the automatic feature selection applies.

This was previously done in `test_limited_predictors.ipynb`. We tested as few as 2 distractor features. In that test, predictor features generally had correlations in the range of |r|=0.06 to 0.53, with most around 0.4 (we should confirm that because it seems fishy that PCS was detegted as an effect, but didn't model as a large predictor). With most `|r|=0.4`, this seems unrealistically high to expect, and we should aim to build a pipeline capable of detecting more subtle effects than that. An approximate `|r|=0.3` can be achieved by mixing in a predictor scaled to 8% of normal scale.

I can imagine it is plausible to cut down to as few as two self-report, one behavioral, and one neural measure per intervention, plus sex and age. That would yield 10 different variables. At the other end, we might want 10 self-report, two behavioral, and five neural measures per intervention tested, plus 6 different demographic variables--a total of 40 variables. Let's see how these would perform, as well as mid-range of 20 predictor variables. In each case we'll restrict to three valid predictors per intervention.

In [7]:


def run_2_group_predictor_analysis(total_predictor_count, outcome_measures, analysis_data_imputed, effect_size, hyperparameter_optimizer,
                                        custom_interaction_effects=None
                                        ):

    #set np random seed
    np.random.seed(3161527)

    group_names = ['ichi','ni','san']
    #assign each row randomly to a group
    group_assignments_3 = np.random.choice(group_names,analysis_data_imputed.shape[0])
    
    #two-group analysis, comparing ichi vs ni and san
    group_assignments_2 = group_assignments_3.copy()
    group_assignments_2[group_assignments_2=='ni'] = 'nisan'
    group_assignments_2[group_assignments_2=='san'] = 'nisan'


    #synthetic outcomes
    outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

    #create a limited set of predictors
    analysis_data_smol = analysis_data_imputed.iloc[:,0:total_predictor_count]

    # add synthetic primary and interaction effects

    if custom_interaction_effects is None:
        #set up the interaction effects
        #0.08 will give us correlations around 0.3 between the interaction effects and the outcome
        custom_interaction_effects_g1 = [0]*analysis_data_smol.shape[1]
        custom_interaction_effects_g1[0] = effect_size
        custom_interaction_effects_g1[1] = effect_size
        custom_interaction_effects_g1[2] = -effect_size


        custom_interaction_effects = {'nisan':custom_interaction_effects_g1}

    


    synthetic_data = generate_synthetic_dev_data(analysis_data_smol, group_assignments_2,outcome_measures, group_interaction_effects = custom_interaction_effects)
    interaction_effect_df = synthetic_data['X_weights']
    outcome_measures = synthetic_data['y']

    # Set up outcome measures and group assignment one-hot

    outcome_measures = calculate_outcome_changes(outcome_measures)
    

    group_assignment_onehots = pd.get_dummies(group_assignments_2).loc[:,['nisan']]

    predictor_data = set_up_interactions(analysis_data_smol, group_assignment_onehots)


    #remove any NA values for this outcome measure in both the predictor data and the outcome data
    outcome_nas = outcome_measures['d_bf'].isna()

    outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
    predictor_data_nona = predictor_data.loc[~outcome_nas,:]
    group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
    group_assignments_nona = group_assignments_2[~outcome_nas]

    ### Try out CV with simple gridsearch

    scoring_data = do_scoring_loop(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], 
                    groups = group_assignments_nona, 
                    hyperparameter_selection_on_fold=hyperparameter_optimizer,
                    outer_folds=5)

    scores = scoring_data['scores']
    best_models = scoring_data['best_models']
    best_params_df_list = scoring_data['best_params_df_list']
    raw_cv_results_list = scoring_data['raw_cv_results_list']

    print("scores:")
    print(scores)
    overall_score = np.mean(scores)
    print("overall_score:")
    print(overall_score)



    best_model = get_best_model(summarize_overall_df_results(raw_cv_results_list))
    final_fit = do_final_fit(X=predictor_data_nona, y= outcome_measures_nona['d_bf'], final_model=best_model)
    final_results = present_model_results(X=predictor_data_nona, final_fit=final_fit, y=outcome_measures_nona['d_bf'])

    #print rows of final_results where feature_name is the list of features to check
    base_regressors = interaction_effect_df.predictor[interaction_effect_df.interaction_effect!=0]
    regressors_to_check = [x+y for y in ['','*nisan'] for x in base_regressors]
    final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

    present_results_vs_ground_truth_cors(predictor_data_nona,outcome_measures_nona,group_assignments_nona,final_results,base_regressors)

    return(overall_score)




In [8]:
overall_scores = pd.DataFrame(columns=['n_features','effect_size', 'overall_score'])
        
        
# hypers = {
#     'r2':do_hyperparameter_selection_loop_r2,
#     'mae':do_hyperparameter_selection_loop
# }


for pcount in [10,15]:
    for effect_size in [0.08,0.10,0.15]:
        custom_interaction_effects_g1    =  [0]*pcount
        custom_interaction_effects_g1[0] =  effect_size
        custom_interaction_effects_g1[1] =  effect_size
        custom_interaction_effects_g1[2] = -effect_size

        custom_interaction_effects_g2 = custom_interaction_effects_g1

        custom_interaction_effects = {'nisan':custom_interaction_effects_g1}


        overall_score = run_full_limited_predictor_analysis(
            pcount,
            outcome_measures,
            analysis_data_imputed,
            effect_size= effect_size,
            hyperparameter_optimizer = do_hyperparameter_selection_loop
            )

        #run the analysis with a limited number of predictors
        overall_scores = overall_scores.append(
            {'n_features':pcount,
            'effect_size':effect_size,
            'overall_score':overall_score
            #,'hyper_target':'mae'
            },
            ignore_index=True)

['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.255178,0.036963,0.298413,0.065261
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.262595,0.055916,0.325744,0.063801
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.263923,0.036615,0.278,0.0506
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.263923,0.036615,0.278,0.0506
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.267383,0.061964,0.316593,0.046503
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.267383,0.061964,0.316593,0.046503
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.271499,0.080776,0.335304,0.084335
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 25, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.275463,0.061394,0.319901,0.048468
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 25, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.276075,0.030953,0.300211,0.069192
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.277201,0.077547,0.292307,0.081428


doing permutation test on importance; this may take time.
Number of selected features: 11


Unnamed: 0,predictor,coef,feature_importance,fa_abs
12,BSCS*ni,0.972764,0.126574,0.126574
26,RS*san,0.581897,0.051583,0.051583
2,BIS_11,-0.392702,0.021206,0.021206
7,ACES_abuse,0.1599,0.008119,0.008119
6,ACES_neglectful_parenting,-0.275182,0.007577,0.007577
1,EDM,0.119865,0.005431,0.005431
5,TRSQ,0.107321,0.003098,0.003098
30,ACES_sum*san,-0.156225,0.001753,0.001753
19,ACES_abuse*ni,-0.109025,0.000736,0.000736
18,ACES_neglectful_parenting*ni,0.0218,0.000542,0.000542


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,-0.0,0.973,-0.0,0.0,0.127,0.0,-0.137,0.35,0.008,0.127
RS,,,0.582,,,0.052,0.039,-0.154,0.26,0.052
BIS_11,-0.393,-0.0,0.0,0.021,0.0,0.0,0.047,-0.383,-0.043,0.021
ACES_abuse,0.16,-0.109,-0.0,0.008,0.001,0.0,,,,0.009
ACES_neglectful_parenting,-0.275,0.022,,0.008,0.001,,-0.046,-0.017,-0.218,0.008
EDM,0.12,,-0.0,0.005,,0.0,0.053,0.233,-0.056,0.005
TRSQ,0.107,-0.0,0.0,0.003,0.0,0.0,0.091,-0.222,0.264,0.003


['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  overall_scores = overall_scores.append(


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180487eb0>], 'feature_selection__k': [10, 25, 32], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Pipeline(steps=[('scaler', StandardScaler()),
       

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.280536,0.070711,0.303797,0.066271
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.28274,0.075581,0.303263,0.06862
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.285976,0.075457,0.30323,0.066774
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.291359,0.075054,0.302846,0.064125
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.29316,0.055817,0.257528,0.035962
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.29427,0.042053,0.313554,0.06333
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.3, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.295468,0.074691,0.302547,0.062963
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.297475,0.033627,0.284506,0.053181
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.297475,0.033627,0.284506,0.053181
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 10, 'feature_selection__step': 5}",-3.300421,0.074639,0.302377,0.062179


doing permutation test on importance; this may take time.
Number of selected features: 10


Unnamed: 0,predictor,coef,feature_importance,fa_abs
11,san,-3.813027,1.65701,1.65701
12,BSCS*ni,3.054898,1.097093,1.097093
14,BIS_11*ni,-3.033321,1.044694,1.044694
27,TRSQ*san,2.389706,0.678421,0.678421
26,RS*san,1.935767,0.461544,0.461544
20,ACES_sum*ni,-1.514676,0.2569,0.2569
13,EDM*ni,1.42574,0.241843,0.241843
18,ACES_neglectful_parenting*ni,1.158152,0.164594,0.164594
6,ACES_neglectful_parenting,-1.102637,0.135514,0.135514
8,ACES_sum,0.773653,0.080227,0.080227


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
san,-3.813,,,1.657,,,,,,1.657
BSCS,,3.055,,,1.097,,-0.137,0.415,-0.011,1.097
BIS_11,,-3.033,,,1.045,,0.047,-0.44,-0.033,1.045
TRSQ,,,2.39,,,0.678,0.091,-0.246,0.319,0.678
RS,,,1.936,,,0.462,0.039,-0.177,0.304,0.462
ACES_sum,0.774,-1.515,,0.08,0.257,,,,,0.337
ACES_neglectful_parenting,-1.103,1.158,,0.136,0.165,,-0.046,-0.014,-0.262,0.3
EDM,,1.426,,,0.242,,0.053,0.287,-0.065,0.242


['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  overall_scores = overall_scores.append(


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180487eb0>], 'feature_selection__k': [10, 25, 32], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Pipeline(steps=[('scaler', StandardScaler()),
       

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__k': 25, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.309146,0.07657,0.25392,0.07256
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.313291,0.05871,0.271523,0.038085
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__k': 25, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.313851,0.082828,0.250985,0.078523
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.314713,0.065306,0.255188,0.042587
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 1.0},-3.314713,0.065306,0.255188,0.042587
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.318083,0.062447,0.270421,0.040601
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.318897,0.069451,0.254652,0.043786
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 0.8},-3.318897,0.069451,0.254652,0.043786
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__k': 25, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.320567,0.084618,0.248095,0.081016
"dict_values([StandardScaler(), SelectKBest(), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__k': 32, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.324787,0.069078,0.254367,0.041618


doing permutation test on importance; this may take time.
Number of selected features: 25


Unnamed: 0,predictor,coef,feature_importance,fa_abs
12,BSCS*ni,4.668619,2.178039,2.178039
14,BIS_11*ni,-3.064834,0.892193,0.892193
17,TRSQ*ni,-3.028787,0.874044,0.874044
26,RS*san,2.816278,0.765277,0.765277
20,ACES_sum*ni,-1.98766,0.376462,0.376462
10,ni,1.538929,0.241385,0.241385
13,EDM*ni,1.388399,0.199411,0.199411
23,EDM*san,-1.295907,0.177492,0.177492
8,ACES_sum,1.279268,0.162508,0.162508
18,ACES_neglectful_parenting*ni,1.044533,0.114822,0.114822


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,-0.355,4.669,,0.013,2.178,,-0.137,0.544,-0.051,2.191
TRSQ,0.702,-3.029,,0.053,0.874,,0.091,-0.291,0.432,0.927
BIS_11,-0.504,-3.065,,0.025,0.892,,0.047,-0.55,-0.009,0.918
RS,,-0.122,2.816,,0.001,0.765,0.039,-0.223,0.397,0.766
ACES_sum,1.279,-1.988,-0.997,0.163,0.376,0.103,,,,0.642
EDM,0.409,1.388,-1.296,0.016,0.199,0.177,0.053,0.394,-0.084,0.393
ni,1.539,,,0.241,,,,,,0.241
ACES_neglectful_parenting,-0.896,1.045,-0.558,0.081,0.115,0.036,-0.046,-0.006,-0.355,0.232


['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  overall_scores = overall_scores.append(


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180487eb0>], 'feature_selection__k': [10, 25, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Pipeline(steps=[('scaler', StandardScaler()),
       

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.250385,0.053035,0.298879,0.081511
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.253106,0.047541,0.289953,0.035348
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.253106,0.047541,0.289953,0.035348
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.264384,0.061781,0.327677,0.069726
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.267684,0.049697,0.329519,0.044714
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.267684,0.049697,0.329519,0.044714
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.3, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.279107,0.06821,0.339821,0.050708
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.3},-3.279376,0.066624,0.342303,0.043856
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.3, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.279376,0.066624,0.342303,0.043856
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.281872,0.081962,0.25787,0.092071


doing permutation test on importance; this may take time.
Number of selected features: 9


Unnamed: 0,predictor,coef,feature_importance,fa_abs
17,BSCS*ni,1.724228,0.382029,0.382029
45,BFI_extraversion*san,0.792246,0.096467,0.096467
19,BIS_11*ni,-0.611822,0.05063,0.05063
38,ACES_neglectful_parenting*san,-0.425717,0.028504,0.028504
27,ACES_household_dysfunction*ni,-0.234528,0.009089,0.009089
41,ACES_divorced_separated*san,-0.16172,0.00441,0.00441
7,ACES_abuse,0.133042,0.00352,0.00352
24,ACES_abuse*ni,-0.127256,0.002904,0.002904
10,ACES_household_dysfunction,0.027042,4.9e-05,4.9e-05
37,TRSQ*san,0.0,0.0,0.0


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,,1.724,-0.0,,0.382,0.0,-0.137,0.35,0.008,0.382
BFI_extraversion,,,0.792,,,0.096,,,,0.096
BIS_11,,-0.612,,,0.051,,0.047,-0.383,-0.043,0.051
ACES_neglectful_parenting,-0.0,0.0,-0.426,0.0,0.0,0.029,-0.046,-0.017,-0.218,0.029
ACES_household_dysfunction,0.027,-0.235,-0.0,0.0,0.009,0.0,,,,0.009
ACES_abuse,0.133,-0.127,0.0,0.004,0.003,0.0,,,,0.006
ACES_divorced_separated,-0.0,-0.0,-0.162,0.0,0.0,0.004,,,,0.004


['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  overall_scores = overall_scores.append(


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Ridge())])
{'feature_selection__score_func': [<function f_regression at 0x180487eb0>], 'feature_selection__k': [10, 25, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Pipeline(steps=[('scaler', StandardScaler()),
       

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.285414,0.048881,0.298713,0.035208
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.285414,0.048881,0.298713,0.035208
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.300585,0.056362,0.319323,0.067024
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.304495,0.120749,0.276454,0.067585
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.311944,0.131912,0.276239,0.07076
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.321352,0.136164,0.276253,0.070526
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.2},-3.32423,0.046617,0.339216,0.047183
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.32423,0.046617,0.339216,0.047183
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Lasso()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.327913,0.067239,0.341078,0.061043
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.334452,0.140091,0.276935,0.070247


doing permutation test on importance; this may take time.
Number of selected features: 19


Unnamed: 0,predictor,coef,feature_importance,fa_abs
17,BSCS*ni,1.520835,0.280856,0.280856
36,RS*san,0.60134,0.053674,0.053674
18,EDM*ni,0.521283,0.037669,0.037669
38,ACES_neglectful_parenting*san,-0.454181,0.030541,0.030541
2,BIS_11,-0.443326,0.030219,0.030219
19,BIS_11*ni,-0.452073,0.028686,0.028686
31,BFI_neuroticism*ni,-0.448212,0.028388,0.028388
13,BFI_extraversion,0.331615,0.022058,0.022058
45,BFI_extraversion*san,0.301765,0.017198,0.017198
11,BFI_agreeableness,-0.325563,0.014493,0.014493


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,-0.0,1.521,-0.0,0.0,0.281,0.0,-0.137,0.415,-0.011,0.281
BIS_11,-0.443,-0.452,-0.0,0.03,0.029,0.0,0.047,-0.44,-0.033,0.059
RS,0.108,-0.0,0.601,0.004,0.0,0.054,0.039,-0.177,0.304,0.058
EDM,0.119,0.521,-0.0,0.003,0.038,0.0,0.053,0.287,-0.065,0.04
BFI_extraversion,0.332,0.0,0.302,0.022,0.0,0.017,,,,0.039
ACES_neglectful_parenting,-0.072,0.0,-0.454,0.002,0.0,0.031,-0.046,-0.014,-0.262,0.032
BFI_neuroticism,-0.0,-0.448,0.0,0.0,0.028,0.0,,,,0.028
BFI_agreeableness,-0.326,0.0,-0.0,0.014,0.0,0.0,,,,0.014
ACES_divorced_separated,-0.0,-0.0,-0.263,0.0,0.0,0.01,,,,0.01
ACES_household_dysfunction,0.037,-0.19,-0.0,0.0,0.006,0.0,,,,0.006


['ni' 'san']
[1.28335298 0.42953651]
['san' 'san' 'ni' 'ichi' 'san' 'san' 'ichi' 'san' 'san' 'san' 'ni' 'ichi'
 'ichi' 'ichi' 'ichi' 'san' 'san' 'san' 'ichi' 'ichi' 'san' 'san' 'ni'
 'ni' 'ni' 'ni' 'ni' 'ni' 'ni' 'san' 'ni' 'san' 'ni' 'ichi' 'ni' 'san'
 'ni' 'ichi' 'san' 'ni' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ni' 'ni'
 'san' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ichi' 'ni' 'ni' 'ni' 'ichi' 'san'
 'ni' 'ni' 'ichi' 'ni' 'ichi' 'san' 'ni' 'ni' 'ni' 'san' 'ichi' 'ni' 'san'
 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'ichi' 'san' 'ichi' 'san' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'san' 'ni' 'san' 'ni' 'ichi' 'san' 'san' 'san' 'ichi'
 'ni' 'san' 'ichi' 'ichi' 'san' 'ni' 'ichi' 'san' 'ni' 'ni' 'san' 'ni'
 'ichi' 'ni' 'ichi' 'ichi' 'ni' 'ichi' 'ichi' 'ichi' 'san' 'san' 'ichi'
 'ni' 'ni' 'ichi' 'ni' 'ni' 'ichi' 'ichi' 'san' 'san' 'ni' 'ichi' 'ni'
 'ichi' 'ichi' 'san' 'ichi' 'ni' 'san' 'san' 'ni' 'ni' 'san' 'san' 'san'
 'ichi' 'san' 'ni' 'san' 'ichi' 'ichi' 'ichi' 'ni' 'san' 'ni' 'ni' 'ni'
 'ichi' 'ni' 'ichi' '

  overall_scores = overall_scores.append(


Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', RFE(estimator=LinearRegression())),
                ('estimator', Ridge())])
{'feature_selection__n_features_to_select': [10, 25], 'feature_selection__step': [5], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 16 candidates, totalling 64 fits
Pipeline(steps=[('scaler', StandardScaler()), ('estimator', Lasso())])
{'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest()), ('estimator', Lasso())])
{'feature_selection__score_func': [<function f_regression at 0x180487eb0>], 'feature_selection__k': [10, 25, 47], 'estimator__alpha': array([0.1, 1. , 0.2, 0.3, 0.4, 0.6, 0.8, 1. ])}
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Pipeline(steps=[('scaler', StandardScaler()),
       

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,mean_test_score,std_test_score,std_test_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
model_description,params_str,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 1.0, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.315547,0.117999,0.264293,0.103017
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.8, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.31975,0.129352,0.262778,0.108908
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.6, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.325414,0.133698,0.262301,0.110055
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.4, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.333861,0.137856,0.262968,0.112172
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.3, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.339882,0.139326,0.263513,0.113842
"dict_values([StandardScaler(), Lasso()])",{'estimator__alpha': 0.1},-3.347436,0.053581,0.310509,0.039265
"dict_values([StandardScaler(), SelectKBest(), Lasso()])","{'estimator__alpha': 0.1, 'feature_selection__k': 47, 'feature_selection__score_func': <function f_regression at 0x180487eb0>}",-3.347436,0.053581,0.310509,0.039265
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.2, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.347742,0.140151,0.264546,0.116099
"dict_values([StandardScaler(), RFE(estimator=LinearRegression()), Ridge()])","{'estimator__alpha': 0.1, 'feature_selection__n_features_to_select': 25, 'feature_selection__step': 5}",-3.358515,0.140169,0.267755,0.118292
"dict_values([StandardScaler(), Ridge()])",{'estimator__alpha': 1.0},-3.372101,0.080915,0.234127,0.04791


doing permutation test on importance; this may take time.
Number of selected features: 25


Unnamed: 0,predictor,coef,feature_importance,fa_abs
17,BSCS*ni,5.096231,2.541397,2.541397
19,BIS_11*ni,-4.81943,2.295673,2.295673
16,san,-4.441786,1.903963,1.903963
37,TRSQ*san,3.266038,1.053144,1.053144
36,RS*san,2.745558,0.742469,0.742469
45,BFI_extraversion*san,2.183413,0.471304,0.471304
43,BFI_agreeableness*san,-1.864833,0.334916,0.334916
15,ni,1.278988,0.164381,0.164381
27,ACES_household_dysfunction*ni,-1.091946,0.112083,0.112083
23,ACES_neglectful_parenting*ni,0.964651,0.093608,0.093608


  results_vs_cors = final_results_wide.merge(group_correlations, left_index=True, right_index=True, how='outer')


Unnamed: 0,"(coef, base)","(coef, ni)","(coef, san)","(feature_importance, base)","(feature_importance, ni)","(feature_importance, san)",ichi_cor,ni_cor,san_cor,abs_effect_sum
BSCS,,5.096,0.662,,2.541,0.044,-0.137,0.544,-0.051,2.585
BIS_11,,-4.819,,,2.296,,0.047,-0.55,-0.009,2.296
san,-4.442,,,1.904,,,,,,1.904
TRSQ,,,3.266,,,1.053,0.091,-0.291,0.432,1.053
RS,,,2.746,,,0.742,0.039,-0.223,0.397,0.742
BFI_extraversion,,,2.183,,,0.471,,,,0.471
BFI_agreeableness,,,-1.865,,,0.335,,,,0.335
ACES_household_dysfunction,0.862,-1.092,-0.412,0.075,0.112,0.017,,,,0.204
ni,1.279,,,0.164,,,,,,0.164
ACES_neglectful_parenting,-0.689,0.965,-0.426,0.049,0.094,0.017,-0.046,-0.006,-0.355,0.16


  overall_scores = overall_scores.append(


In [9]:
overall_scores

Unnamed: 0,n_features,effect_size,overall_score
0,10.0,0.08,-0.08398
1,10.0,0.1,-0.048563
2,10.0,0.15,0.079415
3,15.0,0.08,-0.078301
4,15.0,0.1,-0.121002
5,15.0,0.15,0.074678
