In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import sklearn
import sklearn.decomposition as skdec
import matplotlib.pyplot as plt
import factor_analyzer
from rpy2.robjects.packages import importr










In [2]:
task_df = pd.read_csv('taskdata_imputed.csv')
del task_df['Unnamed: 0']

# Cleaning the Data

### What Ian did:
1 select variables, choose from:
- noDDM
- EZ ddm
- hddm

2 clean variables * NOTE THIS IS BEING DONE FULL DATASET (task + survey) *
- transform_remove_skew
 - for positively skewed (skew > 1) vars: 
  - if the variable's min is <= 0, shift it so the min is 1; then positive_subset = log(shifted(var)). Outliers are removed from a temporary copy of the positive subset, which is only used to re-check the skew.
  - keep those w/ |new_skew| < thresh, drop those that are still too skewed
 - for negatively skewed (skew < -1) vars: 
  - negative_subset = log(negative_subset.max()+1 - negative_subset) 
  - reflecting around max+1 creates a right-skewed distribution with a min of 1, which is then log-transformed
  - then remove outliers, keep successful transforms, drop failures
- remove_outliers
- remove_correlated_task_variables ????

selected_variables_clean = transform_remove_skew(selected_variables)

selected_variables_clean = remove_outliers(selected_variables_clean)

selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)

3 impute data
- missForest

4 separate task & survey

# Test EFA appropriateness

### 1. Sample size (participants:variables)

In [3]:
# Participants-per-variable ratio, compared against rule-of-thumb cut-offs.
ncases, nvars = task_df.shape
sample_ratio = ncases / nvars
# The checks are independent: every message whose cut-off is met is printed.
for cutoff_met, verdict in (
    (sample_ratio > 3, 'good enough for a pilot'),
    (sample_ratio >= 5, 'reasonable for a full analysis, but not ideal'),
    (sample_ratio >= 20, 'good enough to publish with!'),
):
    if cutoff_met:
        print(verdict)

good enough for a pilot


### 2. Bartlett test, want p < 0.05 (or 0.01, or 0.001 ...)

from https://www.statisticshowto.datasciencecentral.com/bartletts-test/#BTs :

Bartlett’s test for Sphericity compares your correlation matrix (a matrix of Pearson correlations) to the identity matrix. In other words, it checks if there is a redundancy between variables that can be summarized with some factors

In [4]:
chi_square_value,p=factor_analyzer.calculate_bartlett_sphericity(task_df)
chi_square_value, p

(39472.44094151645, 0.0)

In [5]:
# NOTE(review): scipy.stats.bartlett is Bartlett's test for equal variances across
# the samples it is given, not Bartlett's test of sphericity, so these statistics
# are not expected to match factor_analyzer.calculate_bartlett_sphericity.
# Variant 1: every column (variable) of task_df is passed as one sample.
chi_square_value1, p1 = ss.bartlett(*[task_df[col].values for col in task_df.columns])
print(chi_square_value1)
print(p1)
# Variant 2: every row of task_df.values is passed as one sample.
chi_square_value2, p2 = ss.bartlett(*task_df.values)
print(chi_square_value2)
print(p2)

680469.5429067654
0.0
6620.585584576937
0.0


### 3. Kaiser-Meyer-Olkin (KMO) Test, want a score >= 0.6

from https://www.statisticshowto.datasciencecentral.com/kaiser-meyer-olkin/ :

Kaiser-Meyer-Olkin (KMO) Test is a measure of how suited your data is for Factor Analysis. The test measures sampling adequacy for each variable in the model and for the complete model. The statistic is a measure of the proportion of variance among variables that might be common variance. The lower the proportion, the more suited your data is to Factor Analysis.

KMO returns values between 0 and 1. A rule of thumb for interpreting the statistic:

KMO values between 0.8 and 1 indicate the sampling is adequate.

KMO values less than 0.6 indicate the sampling is not adequate and that remedial action should be taken. Some authors put this value at 0.5, so use your own judgment for values between 0.5 and 0.6.

KMO Values close to zero means that there are large partial correlations compared to the sum of correlations. In other words, there are widespread correlations which are a large problem for factor analysis

For reference, Kaiser put the following values on the results:

* 0.00 to 0.49 unacceptable.
* 0.50 to 0.59 miserable.
* 0.60 to 0.69 mediocre.
* 0.70 to 0.79 middling.
* 0.80 to 0.89 meritorious.
* 0.90 to 1.00 marvelous.

In [6]:
kmo_all,kmo_model=factor_analyzer.calculate_kmo(task_df)
kmo_model



0.7569843212073796

#### Other implementation of Bartlett and KMO
https://github.com/Sarmentor/KMO-Bartlett-Tests-Python/blob/master/tests_correlation.py

In [7]:
def bartlett_sphericity(dataset, corr_method="pearson"):
    r"""Bartlett's test of sphericity on the correlation matrix of ``dataset``.

    The statistic is ``-(n - 1 - (2p + 5) / 6) * ln|R|`` where ``R`` is the
    correlation matrix, ``n`` the number of rows and ``p`` the number of
    columns. It is referred to a chi-square distribution with
    ``p (p - 1) / 2`` degrees of freedom.

    Parameters
    ----------
    dataset : dataframe, mandatory (numerical or ordinal variables)
        One row per instance, one column per variable.
    corr_method : {'pearson', 'spearman'}, optional
        Passed to ``dataset.corr(method=...)``.

    Returns
    -------
    out : namedtuple ``Bartlett_Sphericity_Test_Results(chi2, ddl, pvalue)``
        ``chi2`` is the test statistic, ``ddl`` the degrees of freedom and
        ``pvalue`` the upper-tail probability ``P(X >= chi2)``.

    Raises
    ------
    ValueError
        If the determinant of the correlation matrix is not strictly positive
        (the logarithm is undefined).

    Notes
    -----
    When n / p is greater than 5 the ratio is printed and a ``UserWarning``
    is issued, advising to use the KMO test instead.

    References
    ----------
    [1] Bartlett, M. S., (1951), The Effect of Standardization on a chi square
    Approximation in Factor Analysis, Biometrika, 38, 337-344.
    [2] R. Sarmento and V. Costa, (2017)
    "Comparative Approaches to Using R and Python for Statistical Data Analysis", IGI-Global.

    Examples
    --------
    >>> bartlett_sphericity(survey_data, corr_method="spearman")  # doctest: +SKIP
    Bartlett_Sphericity_Test_Results(chi2=..., ddl=45.0, pvalue=...)
    """

    import numpy as np
    import scipy.stats as stats
    import warnings as warnings
    import collections

    #Dimensions of the Dataset
    n = dataset.shape[0]
    p = dataset.shape[1]
    n_p_ratio = n / p

    # log|R| via slogdet: the plain determinant of a large correlation matrix can
    # underflow to 0.0 even though its logarithm is perfectly representable.
    sign, log_det = np.linalg.slogdet(dataset.corr(method=corr_method).values)
    if sign <= 0:
        raise ValueError("correlation matrix determinant is not positive; "
                         "Bartlett's sphericity statistic is undefined")
    chi2 = - (n - 1 - (2 * p + 5) / 6) * log_det
    #Freedom Degree
    ddl = p * (p - 1) / 2
    # p-value: upper-tail probability (survival function). The density
    # (chi2.pdf) is not a probability and must not be used here.
    pvalue = stats.chi2.sf(chi2, ddl)

    Result = collections.namedtuple("Bartlett_Sphericity_Test_Results", ["chi2", "ddl", "pvalue"], rename=False)

    #Output of the results - named tuple
    result = Result(chi2=chi2, ddl=ddl, pvalue=pvalue)

    #Output of the function
    if n_p_ratio > 5:
        print("n_p_ratio: {0:8.2f}".format(n_p_ratio))
        warnings.warn("NOTE: we advise  to  use  this  test  only  if  the number of instances (n) divided by the number of variables (p) is lower than 5. Please try the KMO test, for example.")

    return result

def kmo(dataset_corr):
    r"""Kaiser-Meyer-Olkin (KMO) measure of sampling adequacy.

    Compares the squared off-diagonal correlations with the squared
    off-diagonal partial correlations (derived from the inverse of the
    correlation matrix), overall and per variable.

    Parameters
    ----------
    dataset_corr : array-like, shape (p, p)
        Correlation matrix of the dataset (ndarray, DataFrame or nested list).

    Returns
    -------
    out : namedtuple ``KMO_Test_Results(value, per_variable)``
        ``value`` is the overall KMO statistic; ``per_variable`` is a list with
        one KMO value per variable, in the column order of ``dataset_corr``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the correlation matrix is singular and cannot be inverted.

    References
    ----------
    [1] Kaiser, H. F. (1970). A second generation little jiffy. Psychometrika, 35(4), 401-415.
    [2] Kaiser, H. F. (1974). An index of factorial simplicity. Psychometrika, 39(1), 31-36.
    [3] R. Sarmento and V. Costa, (2017)
    "Comparative Approaches to Using R and Python for Statistical Data Analysis", IGI-Global

    Examples
    --------
    >>> kmo(survey_data.corr(method="spearman"))  # doctest: +SKIP
    KMO_Test_Results(value=..., per_variable=[...])
    """

    import numpy as np
    import collections

    # Convert first so that DataFrames, ndarrays and nested lists are all accepted.
    corr = np.asarray(dataset_corr, dtype=float)

    #inverse of the correlation matrix
    corr_inv = np.linalg.inv(corr)

    #partial correlation matrix: -inv[i, j] / sqrt(inv[i, i] * inv[j, j])
    inv_diag = np.diagonal(corr_inv)
    partial = -corr_inv / np.sqrt(np.outer(inv_diag, inv_diag))
    # Mirror the upper triangle onto the lower one so the matrix is exactly symmetric.
    partial = np.triu(partial) + np.triu(partial, 1).T

    # Squared off-diagonal entries only (the diagonals carry no information here).
    corr_sq = np.square(corr)
    np.fill_diagonal(corr_sq, 0.0)
    partial_sq = np.square(partial)
    np.fill_diagonal(partial_sq, 0.0)

    #KMO value
    kmo_num = corr_sq.sum()
    kmo_value = kmo_num / (kmo_num + partial_sq.sum())

    #KMO per variable (diagonal of the spss anti-image matrix)
    col_corr = corr_sq.sum(axis=0)
    col_partial = partial_sq.sum(axis=0)
    kmo_j = list(col_corr / (col_corr + col_partial))

    Result = collections.namedtuple("KMO_Test_Results", ["value", "per_variable"])

    #Output of the results - named tuple
    return Result(value=kmo_value, per_variable=kmo_j)

In [8]:
chi2_spear,ddl_spear,pvalue_spear = bartlett_sphericity(task_df, corr_method="spearman")
print(chi2_spear,ddl_spear,pvalue_spear )
chi2_son,ddl_son,pvalue_son = bartlett_sphericity(task_df, corr_method="pearson")
print(chi2_son,ddl_son,pvalue_son)
print('pearson = same as the factor analyzer!')

40151.519350792536 9730.0 0.0
39345.32680860707 9730.0 0.0
pearson = same as the factor analyzer!


In [9]:
dataset_corr_spear = task_df.corr(method="spearman")
value_spear,per_variable_spear = kmo(dataset_corr_spear)
print(value_spear)

dataset_corr_son = task_df.corr(method="pearson")
value_son,per_variable_son = kmo(dataset_corr_son)
print(value_son)
print('pearson = same as the factor analyzer!')

0.7720771290329701
0.7576891243402228
pearson = same as the factor analyzer!


# Perform EFA

## 1. sklearn

In [None]:
# Fit a 5-factor model with scikit-learn.
fa = skdec.FactorAnalysis(n_components = 5)
fa.fit(task_df)
# NOTE(review): per the scikit-learn docs, FactorAnalysis.score_samples returns the
# log-likelihood of each sample, not factor scores (fa.transform gives the latent
# scores) — so z_scores are z-scored log-likelihoods; confirm this is intended.
z_scores = ss.zscore(fa.score_samples(task_df))
# for idx, score in enumerate(fa.score_samples(task_df)):
#     print(score, z_scores[idx])
    
fa.components_

In [None]:
factors = skdec.FactorAnalysis(svd_method='lapack').fit(task_df)
transformed_factors = factors.transform(task_df)

factors1 = skdec.FactorAnalysis(tol=1e-10).fit(task_df)

In [None]:
efa_df = pd.DataFrame(factors.components_, columns=task_df.columns)

In [None]:
transformed_df = pd.DataFrame(transformed_factors)

In [None]:
factors.score(task_df)

In [None]:
factors1.score(task_df)

In [None]:
pd.DataFrame(factors1.get_covariance())

In [None]:
efa_df1 = pd.DataFrame(factors1.components_, columns=task_df.columns)

In [None]:
efa_df1

## 2. using factor_analyzer

In [None]:
fa = factor_analyzer.FactorAnalyzer(rotation=None)
fa.fit(task_df) #140 gotten from output of FactorAnalysis().fit(task_df)

In [None]:
efa_df2 = pd.DataFrame(fa.loadings_)
efa_df2

In [None]:
DVs = task_df.columns.values

In [None]:
for DV in DVs:
    print(DV)

## Figuring out transformed vars

In [7]:
clean_df = pd.read_csv('meaningful_variables_clean.csv')
del clean_df['Unnamed: 0']

In [8]:
orig_df = pd.read_csv('meaningful_variables.csv')
del orig_df['Unnamed: 0']

## vars taken from .out as successfully or unsuccessfully positively or negatively transformed into normal

****************************************
** Successfully transformed 31 positively skewed variables:
adaptive_n_back.avg_rt
adaptive_n_back.mean_load
angling_risk_task_always_sunny.release_coef_of_variation
attention_network_task.congruent_rt
attention_network_task.neutral_rt
bickel_titrator.hyp_discount_rate_large
bickel_titrator.hyp_discount_rate_medium
bickel_titrator.hyp_discount_rate_small
choice_reaction_time.avg_rt
columbia_card_task_hot.gain_sensitivity
dietary_decision.health_sensitivity
dospert_eb_survey.health_safety
dospert_rt_survey.ethical
hierarchical_rule.avg_rt
kirby.hyp_discount_rate_large
kirby.hyp_discount_rate_medium
kirby.hyp_discount_rate_small
local_global_letter.global_congruent_rt
motor_selective_stop_signal.proactive_control_rt
shift_task.model_beta
simon.congruent_sd_rt
simon.incongruent_sd_rt
simple_reaction_time.avg_rt
stim_selective_stop_signal.SSRT
stop_signal.SSRT_low
stop_signal.proactive_SSRT_speeding
threebytwo.avg_rt
threebytwo.cue_switch_cost_rt_100.0
threebytwo.task_switch_cost_rt_900.0
tower_of_london.avg_move_time
writing_task.positive_probability
****************************************
****************************************
Dropping 2 positively skewed data that could not be transformed successfully:
dickman_survey.dysfunctional
impulsive_venture_survey.impulsiveness
****************************************
****************************************
** Successfully transformed 26 negatively skewed variables:
attention_network_task.conflict_acc
attention_network_task.orienting_acc
columbia_card_task_cold.loss_sensitivity
dietary_decision.taste_sensitivity
dot_pattern_expectancy.AY-BY_acc
dot_pattern_expectancy.BX-BY_acc
dot_pattern_expectancy.BX-BY_rt
dot_pattern_expectancy.acc
go_nogo.acc
holt_laury_survey.beta
holt_laury_survey.prob_weighting
local_global_letter.conflict_acc
mpq_control_survey.control
recent_probes.acc
recent_probes.proactive_interference_acc
shape_matching.acc
shift_task.model_decay
simon.acc
simon.congruent_acc
simon.incongruent_acc
simon.simon_acc
stim_selective_stop_signal.ignore_acc
stroop.acc
stroop.stroop_acc
ten_item_personality_survey.agreeableness
threebytwo.acc
****************************************
****************************************
Dropping 8 negatively skewed data that could not be transformed successfully:
psychological_refractory_period_two_choices.task1_acc
choice_reaction_time.acc
motor_selective_stop_signal.ignore_acc
attention_network_task.acc
psychological_refractory_period_two_choices.task2_acc
information_sampling_task.Fixed_Win_acc
directed_forgetting.acc
local_global_letter.acc
****************************************
**************************************************
Dropping 25 variables with correlations above 0.85
**************************************************
angling_risk_task_always_sunny.release_score
angling_risk_task_always_sunny.keep_score
attention_network_task.neutral_rt.logTr
attention_network_task.congruent_rt.logTr
attention_network_task.incongruent_rt
dot_pattern_expectancy.avg_rt
go_nogo.dprime
hierarchical_rule.score
kirby.hyp_discount_rate_medium.logTr
kirby.percent_patient_large
kirby.percent_patient
kirby.percent_patient_small
kirby.percent_patient_medium
kirby.hyp_discount_rate_small.logTr
local_global_letter.congruent_rt
local_global_letter.global_congruent_rt.logTr
local_global_letter.incongruent_harm_acc
local_global_letter.incongruent_rt
local_global_letter.local_congruent_rt
probabilistic_selection.value_sensitivity
simon.incongruent_acc.ReflogTr
simon.congruent_avg_rt
simon.incongruent_avg_rt
stroop.congruent_rt
stroop.incongruent_rt

In [10]:
success_pos_vars = '''adaptive_n_back.avg_rt
adaptive_n_back.mean_load
angling_risk_task_always_sunny.release_coef_of_variation
attention_network_task.congruent_rt
attention_network_task.neutral_rt
bickel_titrator.hyp_discount_rate_large
bickel_titrator.hyp_discount_rate_medium
bickel_titrator.hyp_discount_rate_small
choice_reaction_time.avg_rt
columbia_card_task_hot.gain_sensitivity
dietary_decision.health_sensitivity
dospert_eb_survey.health_safety
dospert_rt_survey.ethical
hierarchical_rule.avg_rt
kirby.hyp_discount_rate_large
kirby.hyp_discount_rate_medium
kirby.hyp_discount_rate_small
local_global_letter.global_congruent_rt
motor_selective_stop_signal.proactive_control_rt
shift_task.model_beta
simon.congruent_sd_rt
simon.incongruent_sd_rt
simple_reaction_time.avg_rt
stim_selective_stop_signal.SSRT
stop_signal.SSRT_low
stop_signal.proactive_SSRT_speeding
threebytwo.avg_rt
threebytwo.cue_switch_cost_rt_100.0
threebytwo.task_switch_cost_rt_900.0
tower_of_london.avg_move_time
writing_task.positive_probability'''.split('\n')

fail_pos_vars = '''
dickman_survey.dysfunctional
impulsive_venture_survey.impulsiveness'''.split('\n')

success_neg_vars = '''
attention_network_task.conflict_acc
attention_network_task.orienting_acc
columbia_card_task_cold.loss_sensitivity
dietary_decision.taste_sensitivity
dot_pattern_expectancy.AY-BY_acc
dot_pattern_expectancy.BX-BY_acc
dot_pattern_expectancy.BX-BY_rt
dot_pattern_expectancy.acc
go_nogo.acc
holt_laury_survey.beta
holt_laury_survey.prob_weighting
local_global_letter.conflict_acc
mpq_control_survey.control
recent_probes.acc
recent_probes.proactive_interference_acc
shape_matching.acc
shift_task.model_decay
simon.acc
simon.congruent_acc
simon.incongruent_acc
simon.simon_acc
stim_selective_stop_signal.ignore_acc
stroop.acc
stroop.stroop_acc
ten_item_personality_survey.agreeableness
threebytwo.acc'''.split('\n')

# Negatively skewed variables that were dropped because they could not be
# transformed successfully (all 8 names copied from the .out log quoted above).
# strip() prevents the leading newline from producing an empty first entry.
fail_neg_vars = '''
psychological_refractory_period_two_choices.task1_acc
choice_reaction_time.acc
motor_selective_stop_signal.ignore_acc
attention_network_task.acc
psychological_refractory_period_two_choices.task2_acc
information_sampling_task.Fixed_Win_acc
directed_forgetting.acc
local_global_letter.acc'''.strip().split('\n')

over_correlated_vars='''
angling_risk_task_always_sunny.release_score
angling_risk_task_always_sunny.keep_score
attention_network_task.neutral_rt.logTr
attention_network_task.congruent_rt.logTr
attention_network_task.incongruent_rt
dot_pattern_expectancy.avg_rt
go_nogo.dprime
hierarchical_rule.score
kirby.hyp_discount_rate_medium.logTr
kirby.percent_patient_large
kirby.percent_patient
kirby.percent_patient_small
kirby.percent_patient_medium
kirby.hyp_discount_rate_small.logTr
local_global_letter.congruent_rt
local_global_letter.global_congruent_rt.logTr
local_global_letter.incongruent_harm_acc
local_global_letter.incongruent_rt
local_global_letter.local_congruent_rt
probabilistic_selection.value_sensitivity
simon.incongruent_acc.ReflogTr
simon.congruent_avg_rt
simon.incongruent_avg_rt
stroop.congruent_rt
stroop.incongruent_rt'''.split('\n')

In [11]:

orig_colums =  [col for col in orig_df.columns]

all_vars = success_pos_vars + fail_pos_vars + success_neg_vars +fail_neg_vars
# for var in all_vars:
#     if var != '':
#         if var in orig_colums:
#             print('yay! ' + var + ' is in the original')
#         else:
#             print('BOOO! ' + var + ' is NOT in the original')
            
# for var in orig_colums:
#     if var != '':
#         if var in all_vars:
#             print('yay! ' + var + 'has been transformed if necessary')
#         else:
#             print('BOOO! ' + var + ' was not touched!')
    

NameError: name 'orig_df' is not defined

In [None]:
columns = list(clean_df.columns)  # column names of the "cleaned" data

# One pass per skew direction: (header to print, variable names, suffix the transform appends).
for header, names, suffix in (
    ('SUCCESSFULL POSITIVELY SKEWED', success_pos_vars, '.logTr'),
    ('SUCCESSFUL NEGATIVELY SKEWED', success_neg_vars, '.ReflogTr'),
):
    print(header)
    for var in names:
        # Variables listed as over-correlated (under the raw or the transformed
        # name) are expected to be missing, so they are not reported.
        if var in over_correlated_vars:
            continue
        tmp_var = var + suffix
        if tmp_var in over_correlated_vars:
            continue
        if tmp_var in columns:
            print('yes! ' + tmp_var + ' has been kept as a column')
        elif var in columns:
            print('weirdly, ' + var + ' has been kept as a column, and not ' + tmp_var)
        else:
            print('no, ' + tmp_var + ' is no longer a column (for a mysterious reason!?)')
# for var in fail_pos_vars:
#     tmp_var = var+'.logTr'
#     if (var+'.logTr') in columns:
#         print('yes!, ' + tmp_var + ' has been kept as a column')
#     else:
#         print('no, ' + tmp_var + ' is no longer a column (for a mysterious reason!?)')

selected_variables_clean = transform_remove_skew(selected_variables)

selected_variables_clean = remove_outliers(selected_variables_clean)

selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)

selected_variables_clean.to_csv(path.join(directory, 'meaningful_variables_clean.csv'))

In [9]:
def _swap_in_transformed(data, transformed, direction, suffix, threshold, verbose):
    # Replace the columns of `transformed` in `data` with the suffixed versions whose skew is now acceptable.
    # The skew is re-checked on an outlier-trimmed copy, but the untrimmed
    # transformed values are what end up in the returned frame.
    trimmed = remove_outliers(transformed)
    successful = transformed.loc[:, abs(trimmed.skew()) < threshold]
    # sorted() keeps the report deterministic (set order is arbitrary).
    dropped_vars = sorted(set(transformed.columns) - set(successful.columns))
    if verbose:
        print('*'*40)
        print('** Successfully transformed %s %s skewed variables:' % (len(successful.columns), direction))
        print('\n'.join(successful.columns))
        print('*'*40)
        print('*'*40)
        print('Dropping %s %s skewed data that could not be transformed successfully:' % (len(dropped_vars), direction))
        print('\n'.join(dropped_vars))
        print('*'*40)
    # Every original skewed column is removed; only the successful transforms come back.
    data = data.drop(transformed.columns, axis=1)
    return pd.concat([data, successful.add_suffix(suffix)], axis=1)


def transform_remove_skew(data, threshold=1, 
                          positive_skewed=None,
                          negative_skewed=None,
                          verbose=True):
    """Log-transform skewed columns and drop those that remain skewed.

    Positively skewed columns are log-transformed (after shifting columns
    whose minimum is <= 0 so that their minimum becomes 1) and renamed with a
    '.logTr' suffix. Negatively skewed columns are reflected around max + 1,
    log-transformed and renamed with a '.ReflogTr' suffix. A transformed column
    is kept only if the absolute skew of its outlier-trimmed version (see
    `remove_outliers`) is below `threshold`; otherwise the variable is dropped.

    Parameters
    ----------
    data : DataFrame of numeric variables (one column per variable).
    threshold : skew magnitude above which a column counts as skewed.
    positive_skewed, negative_skewed : optional boolean column selectors;
        by default they are `data.skew() > threshold` and
        `data.skew() < -threshold`.
    verbose : when True, print which variables were transformed / dropped.

    Returns
    -------
    A new DataFrame (the input is not modified) with columns sorted by name.
    """
    data = data.copy()
    if positive_skewed is None or negative_skewed is None:
        skewness = data.skew()  # computed once, used for both directions
        if positive_skewed is None:
            positive_skewed = skewness>threshold
        if negative_skewed is None:
            negative_skewed = skewness<-threshold
    positive_subset = data.loc[:,positive_skewed]
    negative_subset = data.loc[:,negative_skewed]

    # log transform for positive skew
    # The shift is a float Series: writing float offsets into an integer Series
    # is deprecated in recent pandas.
    column_min = positive_subset.min()
    shift = pd.Series(0.0, index=positive_subset.columns)
    shift_variables = column_min<=0
    shift[shift_variables] = 1 - column_min[shift_variables]
    positive_subset = np.log(positive_subset+shift)
    data = _swap_in_transformed(data, positive_subset, 'positively', '.logTr',
                                threshold, verbose)

    # reflected log transform for negative skew (max + 1 - x has a minimum of 1)
    negative_subset = np.log(negative_subset.max()+1-negative_subset)
    data = _swap_in_transformed(data, negative_subset, 'negatively', '.ReflogTr',
                                threshold, verbose)
    return data.sort_index(axis = 1)

In [10]:
def remove_outliers(data, quantile_range = 2.5):
    '''Replace outliers with NaN, column by column.

    A value is an outlier when it lies more than `quantile_range` times the
    inter-quartile range (Q3 - Q1) below or above the column *median*.
    Quantiles ignore missing values; existing NaNs are left untouched.

    Parameters
    ----------
    data : DataFrame of numeric columns.
    quantile_range : multiple of the IQR allowed on each side of the median.

    Returns
    -------
    A new DataFrame with the same index and columns; the input is not modified.
    '''
    quartiles = data.quantile([.25, .5, .75])
    iqr = quartiles.loc[.75] - quartiles.loc[.25]
    lowlimit = quartiles.loc[.5] - quantile_range*iqr
    highlimit = quartiles.loc[.5] + quantile_range*iqr
    # Label-aligned comparison; mask() upcasts integer columns when it has to
    # insert NaN, which an in-place assignment on `data.values` cannot do.
    is_outlier = data.lt(lowlimit, axis=1) | data.gt(highlimit, axis=1)
    return data.mask(is_outlier)

In [11]:
def remove_correlated_task_variables(data, threshold=.85):
    """Drop variables that are highly correlated with another variable of the same task.

    The task of a column is the part of its name before the first '.'.
    Within each task the columns are scanned in order: a column that is still
    present keeps only those columns whose absolute correlation with it is
    below `threshold`, so of any over-correlated pair the earlier column wins.
    A NaN correlation fails the `< threshold` comparison, so a column whose
    correlation with a surviving earlier column is undefined is removed too.

    Parameters
    ----------
    data : DataFrame whose column names look like '<task>.<variable>'.
    threshold : absolute correlation at or above which a variable is removed.

    Returns
    -------
    A new DataFrame without the removed columns (the input is not modified).
    The removed names are printed.
    """
    # Group columns by their exact task name; a '^task' regex would also pick
    # up other tasks that merely share the prefix.
    task_of_column = np.array([i.split('.')[0] for i in data.columns])
    columns_to_remove = []
    for task in np.unique(task_of_column):
        task_data = data.loc[:, task_of_column == task]
        corr_mat = task_data.corr()
        # Zero only the diagonal (self-correlation). Replacing every 1 by 0 would
        # also hide perfectly correlated duplicate variables.
        corr_values = corr_mat.values.copy()
        np.fill_diagonal(corr_values, 0)
        corr_mat = pd.DataFrame(corr_values, index=corr_mat.index, columns=corr_mat.columns)
        i=0
        while i < corr_mat.shape[0]:
            kept_indices = np.where(abs(corr_mat.iloc[:,i])<threshold)[0]
            corr_mat = corr_mat.iloc[kept_indices,kept_indices]
            i+=1
        # Preserve column order so the report is deterministic.
        columns_to_remove += [c for c in task_data.columns if c not in corr_mat.columns]
    print( '*' * 50)
    print('Dropping %s variables with correlations above %s' % (len(columns_to_remove), threshold))
    print( '*' * 50)
    print('\n'.join(columns_to_remove))
    # Drop by exact name. Routing this through drop_vars would treat the names
    # as regular expressions (substring matches) and, when nothing needs
    # removing, would silently apply drop_vars' whole default drop list.
    return data.drop(columns_to_remove, axis=1)

In [12]:
def drop_vars(data, drop_vars = (), saved_vars = ()):
    """Drop every column whose name matches one of the given regular expressions.

    Parameters
    ----------
    data : DataFrame.
    drop_vars : sequence of regex patterns; a column is dropped when any
        pattern is found anywhere in its name (``DataFrame.filter(regex=...)``
        semantics). When empty, the built-in default list below is used.
    saved_vars : sequence of regex patterns; matching columns are kept even if
        they also match `drop_vars`. When given, the result is sorted by
        column name.

    Returns
    -------
    A new DataFrame without the dropped columns (the input is not modified).
    """
    if len(drop_vars) == 0:
        # variables that are calculated without regard to their actual interest
        basic_vars = [r"\.missed_percent$",r"\.acc$",r"\.avg_rt_error$",r"\.std_rt_error$",r"\.avg_rt$",r"\.std_rt$"]
        #unnecessary ddm params
        ddm_vars = [r'.*\.(EZ|hddm)_(drift|thresh|non_decision).+$']
        # variables that are of theoretical interest, but we aren't certain enough to include in 2nd stage analysis
        exploratory_vars = [r"\.congruency_seq", r"\.post_error_slowing$"]
        # task variables that are irrelevent to second stage analysis, either because they are correlated
        # with other DV's or are just of no interest. Each row is a task
        task_vars = ["demographics", # demographics
                    "(keep|release)_loss_percent", # angling risk task
                    ".first_order", "bis11_survey.total", # bis11
                    "bis_bas_survey.BAS_total", 
                    "dietary_decision.prop_healthy_choice", # dietary decision
                    "dot_pattern_expectancy.*errors", # DPX errors
                    "eating_survey.total", # eating total score
                    "five_facet_mindfulness_survey.total", 
                    r"\.risky_choices$", r"\.number_of_switches", # holt and laury
                    "boxes_opened$", # information sampling task
                    "_total_points$", # IST
                    r"\.go_acc$", r"\.nogo_acc$", r"\.go_rt$", "go_nogo.*error.*", #go_nogo
                    # NOTE: the comma after the next pattern used to be missing, which
                    # silently concatenated it with "kirby.percent_patient".
                    "discount_titrate.hyp_discount_rate", "discount_titrate.hyp_discount_rate_(glm|nm)",  #delay discounting
                    "kirby.percent_patient","kirby.hyp_discount_rate$",  "kirby.exp_discount.*", 
                    r"\.warnings$", "_notnow$", "_now$", #kirby and delay discounting
                    "auc", # bickel
                    "local_global_letter.*error.*", # local global errors
                    "PRP_slowing", # PRP_two_choices
                    "shape_matching.*prim.*", # shape matching prime measures
                    "sensation_seeking_survey.total", # SSS
                    "DDS", "DNN", "DSD", "SDD", "SSS", "DDD", "stimulus_interference_rt", # shape matching
                    "shift_task.*errors", "shift_task.model_fit", "shift_task.conceptual_responses", #shift task
                    "shift_task.fail_to_maintain_set", 'shift_task.perseverative_responses', # shift task continued
                     "go_acc","stop_acc","go_rt_error","go_rt_std_error", "go_rt","go_rt_std", # stop signal
                     "stop_rt_error","stop_rt_error_std","SS_delay", "^stop_signal.SSRT$", # stop signal continue
                     "stop_signal.*errors", "inhibition_slope", # stop signal continued
                     "stroop.*errors", # stroop
                     "threebytwo.*inhibition", # threebytwo
                     "num_correct", "weighted_performance_score", # tower of london
                     "sentiment_label" ,# writing task
                     "log_ll", "match_pct", "min_rss", #fit indices
                     "num_trials", "num_stop_trials"#num trials
                    ]
        drop_vars = basic_vars + exploratory_vars + task_vars + ddm_vars
    drop_pattern = '|'.join(drop_vars)
    final_data = data.drop(data.filter(regex=drop_pattern).columns, axis = 1)
    if len(saved_vars) > 0 :
        saved_columns = data.filter(regex='|'.join(saved_vars))
        # Only re-attach saved columns that were actually dropped; joining a
        # column that is still present would raise on the overlapping name.
        restored = saved_columns.loc[:, ~saved_columns.columns.isin(final_data.columns)]
        final_data = final_data.join(restored).sort_index(axis = 1)
    return final_data

In [13]:
selected_vars_clean = transform_remove_skew(orig_df)

****************************************
** Successfully transformed 38 positively skewed variables:
adaptive_n_back.mean_load
angling_risk_task_always_sunny.release_coef_of_variation
attention_network_task.conflict_rt
attention_network_task.incongruent_rt
attention_network_task.neutral_rt
bickel_titrator.hyp_discount_rate_large
bickel_titrator.hyp_discount_rate_medium
bickel_titrator.hyp_discount_rate_small
bis11_survey.Motor
choice_reaction_time.avg_rt
columbia_card_task_cold.gain_sensitivity
columbia_card_task_hot.gain_sensitivity
directed_forgetting.proactive_interference_acc
dospert_eb_survey.health_safety
dot_pattern_expectancy.AX_rt
dot_pattern_expectancy.BX_rt
dot_pattern_expectancy.avg_rt
holt_laury_survey.beta
kirby.hyp_discount_rate_large
kirby.hyp_discount_rate_medium
kirby.hyp_discount_rate_small
motor_selective_stop_signal.ignore_rt_error_std
motor_selective_stop_signal.proactive_control_rt
shape_matching.avg_rt
shift_task.model_beta
simon.avg_rt
simon.congruent_avg_rt
si

  if __name__ == '__main__':
  if __name__ == '__main__':


In [14]:
print(len([i for i in selected_vars_clean.columns if '.logTr' in i]))
print(len([i for i in selected_vars_clean.columns if '.ReflogTr' in i]))


38
22


In [15]:
failed_dropped = '''
impulsive_venture_survey.impulsiveness
dickman_survey.dysfunctional
holt_laury_survey.prob_weighting
simon.congruent_acc
shape_matching.acc
information_sampling_task.Fixed_Win_acc
psychological_refractory_period_two_choices.task2_acc
motor_selective_stop_signal.ignore_acc
dot_pattern_expectancy.acc
stroop.acc
psychological_refractory_period_two_choices.task1_acc
simon.acc
directed_forgetting.acc'''.split('\n')

In [16]:
for var in failed_dropped:
    if var in selected_vars_clean.columns:
        print(var)

selected_variables_clean = transform_remove_skew(selected_variables)

selected_variables_clean = remove_outliers(selected_variables_clean)

selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)

selected_variables_clean.to_csv(path.join(directory, 'meaningful_variables_clean.csv'))

In [17]:
selected_vars_clean = remove_outliers(selected_vars_clean)
selected_vars_clean = remove_correlated_task_variables(selected_vars_clean)

  if __name__ == '__main__':
  if __name__ == '__main__':


**************************************************
Dropping 24 variables with correlations above 0.85
**************************************************
angling_risk_task_always_sunny.release_score
angling_risk_task_always_sunny.keep_score
attention_network_task.incongruent_rt.logTr
attention_network_task.congruent_rt
attention_network_task.neutral_rt.logTr
dot_pattern_expectancy.avg_rt.logTr
go_nogo.dprime
hierarchical_rule.score
kirby.percent_patient_large
kirby.percent_patient
kirby.percent_patient_small
kirby.percent_patient_medium
kirby.hyp_discount_rate_medium.logTr
local_global_letter.incongruent_harm_acc.ReflogTr
local_global_letter.global_congruent_rt
local_global_letter.congruent_rt
local_global_letter.local_congruent_rt
local_global_letter.incongruent_rt
probabilistic_selection.avoid_trial_rt
probabilistic_selection.value_sensitivity
simon.congruent_avg_rt.logTr
simon.incongruent_avg_rt.logTr
stroop.incongruent_rt
stroop.congruent_rt.logTr


In [18]:
over_correlated_vars = '''angling_risk_task_always_sunny.keep_score
angling_risk_task_always_sunny.release_score
attention_network_task.incongruent_rt.logTr
attention_network_task.congruent_rt
attention_network_task.neutral_rt.logTr
dot_pattern_expectancy.avg_rt.logTr
go_nogo.dprime
hierarchical_rule.score
kirby.percent_patient_medium
kirby.percent_patient_small
kirby.hyp_discount_rate_medium.logTr
kirby.percent_patient_large
kirby.percent_patient
local_global_letter.local_congruent_rt
local_global_letter.global_congruent_rt
local_global_letter.incongruent_harm_acc.ReflogTr
local_global_letter.congruent_rt
local_global_letter.incongruent_rt
probabilistic_selection.avoid_trial_rt
probabilistic_selection.value_sensitivity
simon.congruent_avg_rt.logTr
simon.incongruent_avg_rt.logTr
stroop.congruent_rt.logTr
stroop.incongruent_rt'''.split('\n')

In [19]:
for var in over_correlated_vars:
    if var in selected_vars_clean.columns:
        print(var)

In [34]:
import rpy2.robjects
from rpy2.robjects import pandas2ri, Formula
from rpy2.robjects.packages import importr
from selfregulation.utils.utils import get_info
pandas2ri.activate()

def missForest(data, lib_loc='/Library/Frameworks/R.framework/Versions/3.6/Resources/library'):
    """Impute missing values with the R package missForest (called through rpy2).

    Parameters
    ----------
    data : DataFrame with missing values; its index and columns are reused for
        the imputed frame.
    lib_loc : directory of the R library that contains missForest. Defaults to
        the path this notebook was written against; pass another path (or None,
        which presumably lets R use its default library paths — confirm against
        the rpy2 `importr` docs) on a different machine.

    Returns
    -------
    (imputed_df, error) : the imputed DataFrame and the error object returned
        by R's missForest.
    """
    # Local name differs from the function name so the function is not shadowed.
    r_missforest = importr('missForest', lib_loc=lib_loc)
    data_complete, error = r_missforest.missForest(data)
    # NOTE(review): the transpose assumes the converted R result is laid out as
    # variables x observations — confirm for the installed rpy2 version.
    imputed_df = pd.DataFrame(np.matrix(data_complete).T, index=data.index, columns=data.columns)
    return imputed_df, error

In [35]:
selected_variables_imputed, error = missForest(selected_vars_clean)

RRuntimeError: Error in library.dynam(lib, package, package.lib) : 
  shared object ‘randomForest.dylib’ not found


In [29]:
import rpy2.rinterface
# NOTE(review): the RuntimeError recorded under this cell ("Options cannot be set
# once R has been initialized") shows the embedded R was already running when this
# line executed — presumably started by an earlier rpy2.robjects import. To set
# init options, run this in a fresh kernel before anything else touches rpy2.robjects.
rpy2.rinterface.set_initoptions((b'rpy2', b'--no-save', b'--no-restore', b'--quiet'))
from rpy2.robjects.packages import importr
base = importr('base')
# Show the library search paths R is using.
print(base._libPaths())

RuntimeError: Options cannot be set once R has been initialized.