In [1]:
import yaml
from yaml.loader import SafeLoader
from socket import gethostname
import numpy as np
import pandas as pd
from sklearn.base import clone
from dev_interaction_util import generate_synthetic_dev_outcomes, generate_synthetic_dev_data, set_up_interactions
from dev_interaction_util import do_scoring_loop, get_best_model, summarize_overall_df_results, do_final_fit, present_model_results, present_results_vs_ground_truth_cors
from dev_interaction_util import load_and_preprocess_data, impute_data, run_full_limited_predictor_analysis
from ml_util import *
# Imputing with MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn import linear_model
from ml_util import get_data_for_imputation
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import numpy as np
from IPython.display import display, HTML
from sklearn.base import clone
from sklearn.inspection import permutation_importance
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [2]:


print(gethostname())
# config.yml is keyed by machine hostname; use this machine's section when it
# exists and fall back to the 'default' section otherwise.
with open('config.yml') as f:
    all_yaml = yaml.load(f, Loader=SafeLoader)
    config = all_yaml[gethostname()] if gethostname() in all_yaml.keys() else all_yaml['default']

print(config)



Benjamins-MacBook-Pro-2.local
{'dropbox_data_dir': '/Users/benjaminsmith/Dropbox (University of Oregon)/UO-SAN Lab/Berkman Lab/Devaluation/analysis_files/data/'}


This notebook is derived from `test_feature_selection.ipynb`.

In [3]:
# Data directory for this machine, taken from the config section selected above.
dropbox_data_dir = config['dropbox_data_dir']


In [4]:
# load_and_preprocess_data returns two objects for the configured directory:
# analysis_data (later imputed and used as predictors) and outcome_measures
# (the outcome columns inspected in the following cells).
analysis_data, outcome_measures = load_and_preprocess_data(dropbox_data_dir)

In [6]:
# Inspect the raw bf_1 outcome column (it contains NaN entries).
outcome_measures['bf_1']

0      33.0
1       NaN
2      39.8
3      40.8
4      47.6
       ... 
270    29.9
271    33.1
272    27.4
273    42.1
274    33.3
Name: bf_1, Length: 275, dtype: float64

In [9]:
# Inspect the cancer_promoting_minus_preventing_FFQ outcome column.
outcome_measures['cancer_promoting_minus_preventing_FFQ']

0     -0.113462
1     -0.682692
2     -0.698077
3      0.321154
4     -0.603846
         ...   
270   -0.400000
271   -0.619231
272    0.157692
273   -0.709615
274   -1.011538
Name: cancer_promoting_minus_preventing_FFQ, Length: 275, dtype: float64

In [5]:
# Run the project's impute_data helper on the predictor table. The result is
# stored under a new name so the un-imputed analysis_data stays available.
analysis_data_imputed = impute_data(analysis_data)



In [6]:


def do_hyperparameter_selection_loop_r2(X,y,cv):
    """Run the estimator / feature-selection grid search scored by R^2.

    Thin wrapper: forwards X, y and cv unchanged to
    do_hyperparameter_selection_loop_w_metric with the scoring string 'r2'
    and returns that call's result as-is.
    """
    scoring_name = 'r2'
    return do_hyperparameter_selection_loop_w_metric(X, y, cv, scoring_name)

def do_hyperparameter_selection_loop(X, y,cv):
    """Run the estimator / feature-selection grid search scored by negative MAE.

    Thin wrapper: forwards X, y and cv unchanged to
    do_hyperparameter_selection_loop_w_metric with the scoring string
    'neg_mean_absolute_error' and returns that call's result as-is.
    """
    scoring_name = 'neg_mean_absolute_error'
    return do_hyperparameter_selection_loop_w_metric(X, y, cv, scoring_name)

def do_hyperparameter_selection_loop_w_metric(X, y,cv,metric):
    """Grid-search every estimator x feature-selector pipeline and rank the winners.

    Each of Ridge, Lasso and DecisionTreeRegressor is combined with no feature
    selection, SelectKBest(f_regression) or RFE(LinearRegression). Every
    combination becomes a ``StandardScaler -> [selector] -> estimator``
    Pipeline that is tuned by its own GridSearchCV.

    Parameters
    ----------
    X : 2-D predictor data. Only ``X.shape[1]`` is read here; the object itself
        is passed to ``GridSearchCV.fit``.
    y : target passed to ``GridSearchCV.fit``.
    cv : forwarded to GridSearchCV as ``cv``.
    metric : forwarded to GridSearchCV as ``scoring`` (e.g. 'r2').

    Returns
    -------
    dict with keys
        'best_model'     : unfitted clone of the top-scoring pipeline with its
                           best parameters applied via ``set_params``.
        'best_params_df' : one row per grid search, sorted by 'best_score'
                           descending.
        'raw_cv_results' : the fitted GridSearchCV objects, in run order.
    """
    #alpha parameters for Ridge and Lasso
    alpha_10pow_lower = 1
    alpha_10pow_upper = 0
    alpha_increments=1
    n_log_steps = (alpha_10pow_lower+alpha_10pow_upper)*alpha_increments+1
    alpha_candidates = np.concatenate([
        np.power(10,np.linspace(-alpha_10pow_lower,alpha_10pow_upper,n_log_steps)),
        [0.2,0.3,0.4,0.6,0.8,1.0]])
    # The log-spaced part already ends at 10**0 == 1.0, which the literal list
    # repeats. Drop the repeat so no candidate is cross-validated twice.
    # pd.unique keeps first-occurrence order, so tie-breaking between equal
    # scores is unchanged.
    alpha_range = pd.unique(alpha_candidates)
    
    all_cv_results = []
    # (estimator_name, selector_name) for each entry of all_cv_results
    search_labels = []

    pipeline_estimator_name = 'estimator'
    feature_selection_name = 'feature_selection'


    #define the param_grid for the estimators
    estimators_to_run = {
        'Ridge':{
            'estimator':linear_model.Ridge,
            'parameters':{'alpha':alpha_range}
        },
        'Lasso':{
            'estimator':linear_model.Lasso,
            'parameters':{'alpha':alpha_range}
        },
        'DecisionTreeRegressor':{
            'estimator':DecisionTreeRegressor,
            'parameters':{
                'max_depth':[2, 4],
                'min_samples_split':[20,50],
                'min_samples_leaf':[20,50]
            }
        }             
    }

    # Never ask a selector for more features than X has, and never list the
    # same count twice. With >= 50 columns this is exactly [10, 25, 50]; with
    # 25 < n < 50 it is [10, 25, n], as before.
    n_features = X.shape[1]
    k_grid = sorted({min(k, n_features) for k in (10, 25, 50)})
    rfe_grid = sorted({min(n_keep, n_features) for n_keep in (10, 25)})

    for estimator_name,estimator_dict in estimators_to_run.items():
        #param grid for the feature selection
        #rebuilt for every estimator so each pipeline gets fresh selector instances;
        #note that RFE ranks features with LinearRegression regardless of the downstream estimator
        feature_selectors_to_run = {
            'None':None,
            'KBest':{
                'selector':SelectKBest(),
                'parameters':{
                    'score_func' : [f_regression], 
                    'k' : k_grid
                    }
            },
            'RFE':{
                'selector':RFE(linear_model.LinearRegression()),
                'parameters':{
                    'n_features_to_select' : rfe_grid,
                    #'verbose':[1],
                    'step':[5]
                }
            }
        }
        for selector_name, selector_dict in feature_selectors_to_run.items():
        #create the estimator
            if selector_name == 'None':
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = {}
            else:
                pipeline = Pipeline([('scaler',StandardScaler()),
                                     (feature_selection_name,selector_dict['selector']), 
                                     (pipeline_estimator_name,estimator_dict['estimator']())])
                selector_params = selector_dict['parameters']

            # GridSearchCV addresses pipeline-step parameters as '<step name>__<parameter>'
            estimator_param_grid = {(pipeline_estimator_name + '__'+k):v for k,v in estimator_dict['parameters'].items()}
            selector_param_grid = {(feature_selection_name + '__'+k):v for k,v in selector_params.items()}
            #combine the two param grid dictionaries
            full_param_grid = {**selector_param_grid, **estimator_param_grid}
            print(pipeline)
            print(full_param_grid)

            
        
            gs_1 = GridSearchCV(estimator=pipeline, 
                                param_grid = full_param_grid, 
                                cv=cv,scoring=metric,verbose=1)
            gs_1.fit(X,y)
            all_cv_results.append(gs_1)
            search_labels.append((estimator_name, selector_name))

    #create a dataframe with the best parameters, best mean_test_score, and name of the model

    best_params_df = pd.DataFrame({
        'model': [cv_result.estimator for cv_result in all_cv_results],
        # always 'Pipeline' because every search wraps a Pipeline; kept for
        # compatibility, see estimator_name / selector_name for the real identity
        'model_name': [cv_result.estimator.__class__.__name__ for cv_result in all_cv_results],
        'best_params': [extract_estimator_params_from_gridsearch(cv_result.best_params_) for cv_result in all_cv_results],
        'best_score': [cv_result.best_score_ for cv_result in all_cv_results],
        'best_raw_params' : [cv_result.best_params_ for cv_result in all_cv_results],
        'estimator_name': [est_label for est_label, _ in search_labels],
        'selector_name': [sel_label for _, sel_label in search_labels]
        })
    
    best_params_df = best_params_df.sort_values('best_score',ascending=False).reset_index(drop=True)

    # Row 0 is the top scorer after the sort. Clone the unfitted pipeline and
    # apply its winning parameters so the caller gets a fresh estimator.
    best_model = clone(best_params_df['model'][0])
    best_model_params = best_params_df['best_raw_params'][0]
    best_model.set_params(**best_model_params)

    return {
        'best_model': best_model,
        'best_params_df':best_params_df,
        'raw_cv_results':all_cv_results
    }


# Improving fit with manual theory-driven feature selection

My past analysis showed that by manually removing some features before the analysis starts, we can improve performance beyond the chance performance otherwise seen.

So, it might be useful to understand how much we can improve our performance by manual feature selection before the automatic feature selection applies.

This was previously done in `test_limited_predictors.ipynb`. We tested as few as 2 distractor features. In that test, predictor features generally had correlations in the range of `|r|=0.06` to `|r|=0.53`, with most around 0.4 (we should confirm that, because it seems fishy that PCS was detected as an effect but was not modeled as a large predictor). An effect size of `|r|=0.4` for most predictors seems unrealistically high to expect, and we should aim to build a pipeline capable of detecting more subtle effects than that. An approximate `|r|=0.3` can be achieved by mixing in a predictor scaled to 8% of normal scale.

I can imagine it is plausible to cut down to as few as two self-report, one behavioral, and one neural measure per intervention, plus sex and age. That would yield 10 different variables. At the other end, we might want 10 self-report, two behavioral, and five neural measures per intervention tested, plus 6 different demographic variables--a total of 40 variables. Let's see how these would perform, as well as a mid-range of 20 predictor variables. In each case we'll restrict to three valid predictors per intervention.

In [None]:


def run_2_group_predictor_analysis(total_predictor_count, outcome_measures, analysis_data_imputed, effect_size, hyperparameter_optimizer,
                                        custom_interaction_effects=None
                                        ):
    """Score, refit and report the two-group model for the 'd_bf' outcome.

    Runs a 5-outer-fold do_scoring_loop using hyperparameter_optimizer for the
    per-fold hyperparameter selection, prints the fold scores and their mean,
    refits the best model on all rows, and compares the fitted predictors
    against the planted (non-zero) interaction effects.

    Of the parameters, only hyperparameter_optimizer is read by the current
    body; total_predictor_count, outcome_measures, analysis_data_imputed,
    effect_size and custom_interaction_effects are accepted but unused.

    Returns the mean of the outer-fold scores.
    """
    # NOTE(review): predictor_data_nona, outcome_measures_nona,
    # group_assignments_nona and interaction_effect_df are neither parameters
    # nor locals, so they are looked up as notebook globals at call time.
    # Confirm they are defined before this function is called.

    ### Try out CV with simple gridsearch
    X_nona = predictor_data_nona
    y_d_bf = outcome_measures_nona['d_bf']

    scoring_data = do_scoring_loop(
        X=X_nona,
        y=y_d_bf,
        groups=group_assignments_nona,
        hyperparameter_selection_on_fold=hyperparameter_optimizer,
        outer_folds=5,
    )

    scores, best_models, best_params_df_list, raw_cv_results_list = (
        scoring_data[key]
        for key in ('scores', 'best_models', 'best_params_df_list', 'raw_cv_results_list')
    )

    print("scores:", scores, sep="\n")
    overall_score = np.mean(scores)
    print("overall_score:", overall_score, sep="\n")

    # Pick the best configuration across all folds and refit it on every row
    cv_summary = summarize_overall_df_results(raw_cv_results_list)
    best_model = get_best_model(cv_summary)
    final_fit = do_final_fit(X=X_nona, y=y_d_bf, final_model=best_model)
    final_results = present_model_results(X=X_nona, final_fit=final_fit, y=y_d_bf)

    #flag rows of final_results whose predictor is a planted effect,
    #either the bare predictor name or its '*nisan' interaction term
    has_planted_effect = interaction_effect_df.interaction_effect != 0
    base_regressors = interaction_effect_df.predictor[has_planted_effect]
    regressors_to_check = []
    for suffix in ['', '*nisan']:
        for base_name in base_regressors:
            regressors_to_check.append(base_name + suffix)
    final_results['planned_regression'] = final_results['predictor'].isin(regressors_to_check)

    present_results_vs_ground_truth_cors(predictor_data_nona, outcome_measures_nona, group_assignments_nona, final_results, base_regressors)

    return overall_score




In [None]:


# def run_2_group_predictor_analysis(total_predictor_count, outcome_measures, analysis_data_imputed, effect_size, hyperparameter_optimizer,
#                                         custom_interaction_effects=None
#                                         ):

#set np random seed
np.random.seed(3161527)

group_names = ['ichi','ni','san']
#assign each row randomly to a group
group_assignments_3 = np.random.choice(group_names,analysis_data_imputed.shape[0])

#two-group analysis, comparing ichi vs ni and san
# group_assignments_3 is a fixed-width '<U4' string array, so writing the
# five-character label 'nisan' into a copy of it would be silently truncated to
# 'nisa'. np.where builds a new array wide enough for the pooled label.
group_assignments_2 = np.where(group_assignments_3=='ichi','ichi','nisan')


#synthetic outcomes
# NOTE: this rebinds outcome_measures, so re-running the cell applies
# generate_synthetic_dev_outcomes to its own previous output.
outcome_measures = generate_synthetic_dev_outcomes(outcome_measures)

#create a limited set of predictors
# analysis_data_smol = analysis_data_imputed.iloc[:,0:total_predictor_count]

# # add synthetic primary and interaction effects

# if custom_interaction_effects is None:
#     #set up the interaction effects
#     #0.08 will give us correlations around 0.3 between the interaction effects and the outcome
#     custom_interaction_effects_g1 = [0]*analysis_data_smol.shape[1]
#     custom_interaction_effects_g1[0] = effect_size
#     custom_interaction_effects_g1[1] = effect_size
#     custom_interaction_effects_g1[2] = -effect_size


#     custom_interaction_effects = {'nisan':custom_interaction_effects_g1}




# synthetic_data = generate_synthetic_dev_data(analysis_data_smol, group_assignments_2,outcome_measures, group_interaction_effects = custom_interaction_effects)
# interaction_effect_df = synthetic_data['X_weights']
# NOTE(review): the original cell had a no-op `outcome_measures = outcome_measures`
# here. Presumably it should pick up the outcomes produced by
# generate_synthetic_dev_data once that call is re-enabled -- confirm.

# Set up outcome measures and group assignment one-hot

outcome_measures = calculate_outcome_changes(outcome_measures)


# group_assignment_onehots = pd.get_dummies(group_assignments_2).loc[:,['nisan']]

# predictor_data = set_up_interactions(analysis_data_smol, group_assignment_onehots)


#remove any NA values for this outcome measure in both the predictor data and the outcome data
outcome_nas = outcome_measures['d_bf'].isna()

outcome_measures_nona = outcome_measures.loc[~outcome_nas,:]
# predictor_data_nona = predictor_data.loc[~outcome_nas,:]
# group_assignment_onehots_nonan = group_assignment_onehots.loc[~outcome_nas,:]
# group_assignments_nona = group_assignments_2[~outcome_nas]


In [None]:
# Inspect the d_bf column; its NaN rows are the ones dropped to build outcome_measures_nona.
outcome_measures['d_bf']

In [9]:
# Display the overall score for each (n_features, effect_size) combination.
# NOTE(review): overall_scores is not assigned in any cell visible here --
# confirm the cell that builds it still exists, otherwise this fails on
# Restart & Run All.
overall_scores

Unnamed: 0,n_features,effect_size,overall_score
0,15.0,0.08,-0.078301
1,15.0,0.1,-0.121002
2,15.0,0.12,0.004276
3,15.0,0.14,0.054448
4,15.0,0.16,0.101453
5,20.0,0.08,-0.090946
6,20.0,0.1,-0.045115
7,20.0,0.12,-0.033768
8,20.0,0.14,0.035614
9,20.0,0.16,0.119037
