In [1]:
import matplotlib
# imports
from glob import glob
import numpy as np
from os import makedirs, path, remove
import pickle
import random
from shutil import copyfile, copytree, rmtree
import subprocess
import time

from dimensional_structure.results import Results
from dimensional_structure.cross_results_plots import (plot_corr_heatmap, 
                                                       plot_glasso_edge_strength,
                                                       plot_cross_within_prediction,
                                                       plot_cross_relationship,
                                                       plot_BIC,
                                                       plot_cross_silhouette,
                                                       plot_cross_communality)
from dimensional_structure.cross_results_utils import run_cross_prediction
from dimensional_structure.DA_plots import plot_DA
from dimensional_structure.EFA_plots import plot_EFA
from dimensional_structure.EFA_test_retest import (calc_EFA_retest,
                                                   plot_EFA_change, 
                                                   plot_EFA_retest, 
                                                   plot_cross_EFA_retest)
from dimensional_structure.HCA_plots import plot_HCA
from dimensional_structure.prediction_plots import (plot_results_prediction,
                                                    plot_prediction, 
                                                    plot_prediction_scatter,
                                                    plot_prediction_comparison,
                                                    plot_factor_fingerprint)
from dimensional_structure.prediction_utils import run_group_prediction
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_info, get_recent_dataset

Using TensorFlow backend.
  from pandas.core import datetools


In [2]:
# Notebook-level switches.
# verbose: when True, the cells below print progress banners.
# dataset: name of the dataset to analyse; None means "use the most recent one".
verbose = True
dataset = None

In [3]:
# Resolve the dataset of interest.
# `dataset` ends up as the full path <base_directory>/Data/<dataset name>, and
# `datafile` is the final path component (the dataset's directory name).
basedir = get_info('base_directory')
# Identity test: None is a singleton, so `is None` is the correct comparison.
if dataset is None:
    dataset = get_recent_dataset()
dataset = path.join(basedir, 'Data', dataset)
datafile = path.basename(dataset)

In [4]:
# Display the resolved dataset name as a sanity check before running the analysis.
datafile

'Complete_02-22-2020'

In [5]:
# label subsets

# Names applied to the demographic-analysis factors (results.DA.name_factors).
demographic_factor_names = ['Drug Use',
                            'Mental Health',
                            'Problem Drinking',
                            'Daily Smoking',
                            'Binge Drinking',
                            'Lifetime Smoking',
                            'Obesity',
                            'Income / Life Milestones']

# One entry per variable subset.
#   name   : subset label, also used in output file names
#   regex  : passed to Results as filter_regex
#   <rotation>_factor_names / <rotation>_cluster_names :
#            labels looked up per rotation ('oblimin', 'varimax') by the
#            analysis loop; a missing key, None or an empty list means
#            "do not rename".
#   predict: flag kept with each subset (not read in the cells shown here).
subsets = [
    {'name': 'task',
     'regex': 'task',
     'oblimin_cluster_names': ['Conflict Processing',
                               'Information Processing',
                               'Shifting',
                               'Speeded Information Processing',
                               'Inhibition-Related Threshold',
                               'Caution',
                               'Perc/Resp',
                               'Inhibition-Related Perc/Resp',
                               'NA1',
                               'Discounting',
                               'NA2',
                               'Cold/Model-Based',
                               'Hot/Model-Free',
                               'NA3',
                               'NA4'],
     'oblimin_factor_names': ['Speeded IP', 'Strategic IP',
                              'Perc / Resp', 'Caution',
                              'Discounting'],
     'varimax_cluster_names': None,
     'varimax_factor_names': ['Speeded IP', 'Strategic IP',
                              'Perc / Resp', 'Caution',
                              'Discounting'],
     'predict': True},
    {'name': 'survey',
     'regex': 'survey',
     # 'Goal-Directedness' was previously misspelled here; it now matches the
     # spelling used in oblimin_factor_names below.
     'oblimin_cluster_names': ['Financial Risk-Taking',
                               'Eating',
                               'Behavioral Approach',
                               'Behavioral Inhibition',
                               'Mindfulness',
                               'Impulsivity',
                               'Goal-Directedness',
                               'Ethical/Health Risk-Taking',
                               'Risk Perception',
                               'Sensation Seeking',
                               'Sociability',
                               'Reward Sensitivity'],
     'oblimin_factor_names': ['Sensation Seeking', 'Emotional Control',
                              'Mindfulness', 'Impulsivity',
                              'Reward Sensitivity', 'Goal-Directedness',
                              'Risk Perception', 'Eating Control',
                              'Ethical Risk-Taking', 'Social Risk-Taking',
                              'Financial Risk-Taking', 'Agreeableness'],
     'varimax_cluster_names': None,
     'varimax_factor_names': None,
     'predict': True},
    {'name': 'main_subset',
     'regex': 'main',
     'oblimin_cluster_names': [],
     'oblimin_factor_names': [],
     'predict': False},
    {'name': 'all',
     'regex': '.',
     'oblimin_cluster_names': [],
     'oblimin_factor_names': [],
     'predict': False},
]

# Only subsets whose name is listed here are analysed; None disables filtering.
selected_subsets = ['task', 'survey']

# bootstrap is forwarded to the demographic and EFA analyses;
# boot_iter is forwarded to the Results constructor.
bootstrap = True
boot_iter = 1000

In [6]:
# Build, fit, name and save a Results object for each requested subset.
# `name` and `results` are left in the notebook namespace for the cells below.
results = None
all_results = None
# Random 16-bit run identifier handed to the first Results object.
ID = str(random.getrandbits(16))
# create/run results for each subset
for subset in [subsets[0]]: ##CHANGE ONCE YOU CAN GET THIS IN A STRAIGHT RUN THROUGH
    name = subset['name']
    # Decide whether to skip *before* logging, so the verbose banner is only
    # printed for subsets that are actually analysed.
    if selected_subsets is not None and name not in selected_subsets:
        continue
    if verbose:
        print('*'*79)
        print('SUBSET: %s' % name.upper())
        print('*'*79)

    print('*'*79)
    print('Analyzing Subset: %s' % name)
    # ****************************************************************************
    # Load Data
    # ****************************************************************************
    # run dimensional analysis
    start = time.time()  # start timestamp; not read again in this cell
    results = Results(datafile=datafile,
                      dist_metric='abscorrelation',
                      name=subset['name'],
                      filter_regex=subset['regex'],
                      boot_iter=boot_iter,
                      ID=ID,
                      residualize_vars=['Age', 'Sex'])
    results.run_demographic_analysis(verbose=verbose, bootstrap=bootstrap)
    for rotate in ['oblimin', 'varimax']:
        results.run_EFA_analysis(rotate=rotate,
                                 verbose=verbose,
                                 bootstrap=bootstrap)
        results.run_clustering_analysis(rotate=rotate,
                                        verbose=verbose,
                                        run_graphs=False)
        c = results.EFA.get_c()
        # name factors and clusters
        # Labels are optional per rotation: a missing key, None or an empty
        # list leaves the default names in place.
        factor_names = subset.get('%s_factor_names' % rotate, None)
        cluster_names = subset.get('%s_cluster_names' % rotate, None)
        if factor_names:
            results.EFA.name_factors(factor_names, rotate=rotate)
        if cluster_names:
            results.HCA.name_clusters(cluster_names, inp='EFA%s_%s' % (c, rotate))
    # Carry the second '_'-separated piece of results.ID forward as the run ID
    # (presumably so later subsets reuse the same identifier - TODO confirm).
    ID = results.ID.split('_')[1]
    results.DA.name_factors(demographic_factor_names)
    if verbose:
        print('Saving Subset: %s' % name)
    id_file = results.save_results()
    # ***************************** saving ****************************************
    # copy latest results and prediction to higher directory
    copyfile(id_file, path.join(path.dirname(results.get_output_dir()),
                                '%s_results.pkl' % name))


*******************************************************************************
SUBSET: TASK
*******************************************************************************
*******************************************************************************
Analyzing Subset: task
Getting dataset: /SRO/Data/Complete_02-22-2020...:
file: meaningful_variables_imputed.csv 
 
Getting dataset: /SRO/Data/Complete_02-22-2020...:
file: meaningful_variables_clean.csv 
 
*******************************************************************************
Running demographics
*******************************************************************************
Is the data adequate for factor analysis? Yes
Determining Optimal Dimensionality
Best Components:  {'c_metric-BIC': 8}
Creating Factor Tree
No 8 factor solution computed yet! Computing...
Determining Higher Order Factors
# of components not specified, using BIC determined #
*******************************************************************************
Runn

In [7]:
# Inspect the per-cluster loadings that the HCA object reports for the fitted EFA.
results.HCA.get_cluster_loading(results.EFA)

OrderedDict([('cluster0', 0    0.035808
              1    0.035140
              2    0.038311
              3    0.022224
              4    0.032278
              5    0.019283
              6    0.637637
              7    0.143033
              dtype: float64), ('cluster1', 0    0.020307
              1    0.045307
              2    0.055493
              3    0.754076
              4    0.041883
              5    0.026116
              6    0.028908
              7    0.019630
              dtype: float64), ('cluster2', 0    0.043987
              1    0.188524
              2    0.104175
              3    0.018932
              4    0.064326
              5    0.585827
              6    0.028443
              7    0.022844
              dtype: float64), ('cluster3', 0    0.053541
              1    0.136329
              2    0.080923
              3    0.047624
              4    0.043498
              5    0.282215
              6    0.030133
              7    0.011614
  

In [8]:
# Toggle for the plotting cell below: set to False to skip figure generation.
run_plot = True

In [9]:
    # ****************************************************************************
    # Plotting
    # ****************************************************************************
    # Draws the DA, EFA, EFA-retest and HCA figures for the subset currently held
    # in `name` / `results`. Output format is controlled by dpi / ext / size.
    dpi = 300
    ext = 'png'
    size = 4.6
    if run_plot:
        if verbose:
            print('*'*79)
            print('Plotting Subset: %s' % name)
        # Reload saved results when nothing is in memory or the in-memory
        # object's ID does not contain the subset name.
        if results is None or name not in results.ID:
            results = load_results(datafile, name=name)[name]
        plot_dir = results.get_plot_dir()
        DA_plot_dir = path.join(plot_dir, 'DA')
        EFA_plot_dir = path.join(plot_dir, 'EFA')
        HCA_plot_dir = path.join(plot_dir, 'HCA')
        # Defined here but not created or used in this cell.
        prediction_plot_dir = path.join(plot_dir, 'prediction')
        makedirs(DA_plot_dir, exist_ok=True)
        makedirs(EFA_plot_dir, exist_ok=True)
        makedirs(HCA_plot_dir, exist_ok=True)

        # set up kws for plotting functions
        # Column names are split on '.' and the first piece is kept as the
        # task/survey identifier.
        tasks = np.unique([i.split('.')[0] for i in results.data.columns])
        if name == 'task':
            plot_task_kws = {'task_sublists': {'tasks': [t for t in tasks if 'survey' not in t]}}
        elif name == 'survey':
            plot_task_kws = {'task_sublists': {'surveys': [t for t in tasks if 'survey' in t]}}
        else:
            plot_task_kws = {}

        # Arguments forwarded to plot_HCA as drop_list / double_drop_list, keyed
        # by (subset name, rotation); unlisted combinations get (None, None).
        # Built once because it does not depend on the rotation loop.
        drop_list = {('task', 'oblimin'): ([1,5,8,9,12,15],[2,4,6,14]),
                     ('survey', 'oblimin'): ([0,2,4,6,8,10], None)}

        # Plot DA
        if verbose:
            print("** Plotting DA **")
        plot_DA(results, DA_plot_dir, verbose=verbose, size=size, dpi=dpi, ext=ext)

        for rotate in ['oblimin', 'varimax']:
            rotate_plot_dir = path.join(EFA_plot_dir, rotate)

            # Plot EFA
            if verbose:
                print("** Plotting EFA %s **" % rotate)
            plot_EFA(results, EFA_plot_dir, rotate=rotate,
                     verbose=verbose, size=size, dpi=dpi,
                     ext=ext, plot_task_kws=plot_task_kws)

            # Plot EFA retest
            # NOTE(review): the saved output of this cell ends with
            # "TypeError: transform_remove_skew() got an unexpected keyword
            # argument 'drop_failed'" after the oblimin EFA plots. That looks
            # like a version mismatch between the installed analysis packages,
            # presumably hit somewhere in this retest path - confirm and align
            # the package versions before relying on a full run.
            combined, *the_rest = calc_EFA_retest(results, rotate=rotate)
            plot_EFA_retest(combined=combined,
                            plot_dir=rotate_plot_dir,
                            size=size, dpi=dpi, ext=ext)
            plot_EFA_change(combined=combined,
                            plot_dir=rotate_plot_dir,
                            size=size, dpi=dpi, ext=ext)

            # Plot HCA
            if verbose:
                print("** Plotting HCA %s **" % rotate)
            drop1, drop2 = drop_list.get((name, rotate), (None, None))
            # Use the cell-level `ext` rather than a hard-coded 'png' so all
            # figures follow the same format setting.
            plot_HCA(results, HCA_plot_dir, rotate=rotate,
                     drop_list=drop1, double_drop_list=drop2,
                     size=size, dpi=dpi, ext=ext)

*******************************************************************************
Plotting Subset: task
** Plotting DA **
Plotting Distributions


  (prop.get_family(), self.defaultFamily[fontext]))


Plotting factor correlations
Plotting factor bars
** Plotting EFA oblimin **
Plotting communality


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Plotting factor bars
Plotting factor heatmap
Plotting factor correlations
using correct transfer_scores


TypeError: transform_remove_skew() got an unexpected keyword argument 'drop_failed'