### Generate the post hoc dataset for analysis

In [1]:
import math
import time
import parmap
import pickle
import multiprocessing
from imagen_posthocloader import *
import matplotlib.pyplot as plt
import seaborn as sns
from plot_results_posthoc import *
%matplotlib inline

In [2]:
# Reserve roughly one third of the machine's logical CPUs for the parallel
# jobs further down (num_cores is what the parmap calls pass as pm_processes).
num_cores = multiprocessing.cpu_count()
print(f'Available CPU cores: {num_cores}')
# Integer division gives the same value as math.floor(num_cores/3); the clamp
# keeps the worker count at 1 or more on hosts with fewer than three CPUs,
# where the plain division would yield 0.
num_cores = max(1, num_cores // 3)
print(f'Set CPU cores: {num_cores}')

Available CPU cores: 48
Set CPU cores: 16


In [3]:
# Absolute path of the IMAGEN data directory. In the cells visible here it is
# only referenced by the commented-out save snippet at the end of the notebook.
DATA_DIR = "/ritter/share/data/IMAGEN"
# Loader object on which all set_*/to_*/read_*/get_* calls below are made.
# IMAGEN_posthoc is presumably provided by the star-import of
# imagen_posthocloader — confirm.
posthoc = IMAGEN_posthoc()

### 1. Load the [INSTRUMENT](https://imagen-europe.com/resources/imagen-dataset/documentation/) data

#### collect the selected instrument files from IMAGEN_RAW and store in posthoc

Please refer to <i>set_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b> 

In [4]:
# Instrument
# Each call loads one instrument table through posthoc.set_INSTRUMENT(); the
# trailing "#, save=True)" shows the disabled variant that also saves the table.
# Commented lines that contain only an instrument name (e.g. DAWBA) have no
# set_INSTRUMENT call in this cell.
## Demographic
# NOTE: PBQ and GEN are loaded here but are not included in inst_list in the
# cell that builds the combined instrument table.
PBQ   = posthoc.set_INSTRUMENT('PBQ')#, save=True)
GEN   = posthoc.set_INSTRUMENT('GEN')#, save=True)
LEQ   = posthoc.set_INSTRUMENT('LEQ')#, save=True)
# DAWBA
# CANTAB
## Psychological (label taken from the inst_list comments; placement of this
## header relative to DAWBA/CANTAB is a guess — TODO confirm)
NEO   = posthoc.set_INSTRUMENT('NEO')#, save=True)
SURPS = posthoc.set_INSTRUMENT('SURPS')#, save=True)
TCI = posthoc.set_INSTRUMENT('TCI')#, save=True)
BSI = posthoc.set_INSTRUMENT('BSI')#, save=True)
# KIRBY
# BIS-11
# CSI
# PHQ
# CES-D
# ANXDX
# CAPE
# SDQ
# IRI
# RRS
# PALP
## Social
# The plain 'CTQ' load is disabled; 'CTQ_MD' is loaded instead.
# CTQ   = posthoc.set_INSTRUMENT('CTQ')#, save=True)
CTQ_MD = posthoc.set_INSTRUMENT('CTQ_MD')#, save=True)
CTS   = posthoc.set_INSTRUMENT('CTS')#, save=True)
PANAS = posthoc.set_INSTRUMENT('PANAS')#, save=True)
# MINI5
## Substance Use
MAST = posthoc.set_INSTRUMENT('MAST')#, save=True)
FTND  = posthoc.set_INSTRUMENT('FTND')#, save=True)
# DAST
# SCID
# RAPI
# DMQ
# Bully Questionnaire
# ESPAD
# TLFB
# AUDIT

### 2. Load the HDF5 data

#### collect the HDF5 files from h5files and save in posthoc

Please refer to <i>set_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [5]:
# HDF5: load the 'Binge' table through posthoc.set_HDF5().
# The trailing "#, save=True)" is the disabled variant that also saves it.
BINGE = posthoc.set_HDF5('Binge')#, save=True)

In [6]:
# Inspect the column names of the Binge HDF5 table loaded above
BINGE.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 3. Load the RUN data

#### collect the RUN file from results and save in posthoc

Please refer to <i>set_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [7]:
# RUN: load the run results from the given CSV (path is relative to the
# notebook's working directory) through posthoc.set_RUN().
# The trailing "#, save=True)" is the disabled variant that also saves it.
RUN = posthoc.set_RUN('../../results/holdout_all-tp-clean_run.csv')#, save=True)

In [8]:
# Inspect the column names of the RUN table (not the HDF5 table)
RUN.columns

Index(['i', 'o', 'io', 'technique', 'Session', 'Trial', 'path', 'n_samples',
       'n_samples_cc', 'i_is_conf', 'o_is_conf', 'Model', 'model_SVM-rbf__C',
       'model_SVM-rbf__gamma', 'runtime', 'model_SVM-lin__C',
       'model_GB__learning_rate', 'model_LR__C', 'train_score', 'valid_score',
       'test_score', 'roc_auc', 'holdout_score', 'holdout_roc_auc', 'dataset',
       'ID', 'true_label', 'prediction', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN'],
      dtype='object')

### 4. Save the INSTRUMENT data

#### collect the instrument files from posthoc into one file

Please refer to <i>to_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [9]:
# Collect the per-instrument tables to combine, grouped by category.
# PBQ and GEN are loaded in the instrument cell but are not part of this list.
inst_list = [LEQ,                      # Demographic
             NEO, SURPS, TCI, BSI,     # Psychological
             CTQ_MD, CTS, PANAS,       # Social
             MAST, FTND]               # Substance use
# Combine them into one instrument table through posthoc.to_INSTRUMENT();
# the trailing "#, save=True)" is the disabled variant that also saves it.
INST = posthoc.to_INSTRUMENT(inst_list)#, save=True)

In [10]:
# Feature columns of the combined instrument table: everything after the
# first two columns (listed as 'ID' and 'Session' when INST.columns is shown
# later in the notebook). Print how many there are, followed by the names.
col_INST = INST.columns[2:].tolist()
print(len(col_INST), col_INST)

65 ['Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity', 'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve', 'Disorderliness vs. Regimentation', 'Total Novelty Seeking score', 'Somatization mean', 'Obsession-Compulsion mean', 'Interpersonal Sensitivity mean', 'Depression mean', 'Anxiety mean', 'Hostility mean', 'Phobic Anxiety mean', 'Paranoid Ideation mean', 'Psychoticism 

### 5. Read the INSTRUMENT data

#### read the instrument files from posthoc into one file

Please refer to <i>read_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [11]:
# Read the combined instrument table back from 'IMAGEN_INSTRUMENT.csv'
# (presumably the file written by to_INSTRUMENT(..., save=True) — confirm).
INST = posthoc.read_INSTRUMENT('IMAGEN_INSTRUMENT.csv')

In [12]:
# Inspect the column names of the instrument table read from file
INST.columns

Index(['ID', 'Session', 'Family valence', 'Accident valence',
       'Sexuality valence', 'Autonomy valence', 'Devience valence',
       'Relocation valence', 'Distress valence', 'Noscale valence',
       'Overall valence', 'Family mean frequency', 'Accident mean frequency',
       'Sexuality mean frequency', 'Autonomy mean frequency',
       'Devience mean frequency', 'Relocation mean frequency',
       'Distress mean frequency', 'Noscale mean frequency',
       'Overall mean frequency', 'Openness mean', 'Conscientiousness mean',
       'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean',
       'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean',
       'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity',
       'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve',
       'Disorderliness vs. Regimentation', 'Total Novelty Seeking score',
       'Somatization mean', 'Obsession-Compulsion mean',
       'Interpersonal Sensitivity mean'

### 6. Read the HDF5 data

#### collect the hdf5 files from posthoc into one file

Please refer to <i>to_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [13]:
# Build the HDF5 table from 'all_Binge.csv' through posthoc.to_HDF5();
# the trailing "#, save=True)" is the disabled variant that also saves it.
HDF5 = posthoc.to_HDF5('all_Binge.csv')#, save=True)

#### read the HDF5 files from posthoc into one file

Please refer to <i>read_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [14]:
# Read the HDF5 table back from 'IMAGEN_HDF5.csv'; this rebinds HDF5
# (presumably the file written by to_HDF5(..., save=True) — confirm).
HDF5 = posthoc.read_HDF5('IMAGEN_HDF5.csv')

In [15]:
# Inspect the column names of the HDF5 table read from file
HDF5.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 7. Read the RUN data

#### select the ROI of the RUN file from posthoc into one file

Please refer to <i>to_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [16]:
# Column names of RUN before the column selection in the following cells
RUN.columns

Index(['i', 'o', 'io', 'technique', 'Session', 'Trial', 'path', 'n_samples',
       'n_samples_cc', 'i_is_conf', 'o_is_conf', 'Model', 'model_SVM-rbf__C',
       'model_SVM-rbf__gamma', 'runtime', 'model_SVM-lin__C',
       'model_GB__learning_rate', 'model_LR__C', 'train_score', 'valid_score',
       'test_score', 'roc_auc', 'holdout_score', 'holdout_roc_auc', 'dataset',
       'ID', 'true_label', 'prediction', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN'],
      dtype='object')

In [17]:
# Columns of the RUN table to keep, written as three groups that are
# concatenated in order: identifiers / run description, probability columns,
# then prediction and label columns.
COL = (
    ['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model']
    + ['TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob']
    + ['Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction']
)

In [18]:
# Build the reduced RUN table from 'all_RUN.csv', passing the COL column list,
# through posthoc.to_RUN(); this rebinds RUN.
# The trailing "#, save = True)" is the disabled variant that also saves it.
RUN = posthoc.to_RUN('all_RUN.csv', COL)#, save = True)

In [19]:
# Inspect the column names of the reduced RUN table
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

#### read the RUN files from posthoc into one file

Please refer to <i>read_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [20]:
# Read the RUN table back from 'IMAGEN_RUN.csv'; this rebinds RUN
# (presumably the file written by to_RUN(..., save=True) — confirm).
RUN = posthoc.read_RUN('IMAGEN_RUN.csv')

In [21]:
# Inspect the column names of the RUN table read from file
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

### 8. Save the post hoc dataset

#### set the dataset for analysis of diagnosis (X:FU3 == y:FU3)

Please refer to <i>to_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [22]:
# File names of the three saved tables (HDF5, INSTRUMENT, RUN) to combine
DATA = ['IMAGEN_HDF5.csv', 'IMAGEN_INSTRUMENT.csv', 'IMAGEN_RUN.csv']
# Build the post hoc table from them through posthoc.to_posthoc();
# the trailing "#, save=True)" is the disabled variant that also saves it.
FU3 = posthoc.to_posthoc(DATA)#, save=True)

In [23]:
# Inspect the column names of the post hoc table (HDF5, RUN and instrument columns)
FU3.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial',
       'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN', 'true_label', 'prediction', 'Family valence',
       'Accident valence', 'Sexuality valence', 'Autonomy valence',
       'Devience valence', 'Relocation valence', 'Distress valence',
       'Noscale valence', 'Overall valence', 'Family mean frequency',
       'Accident mean frequency', 'Sexuality mean frequency',
       'Autonomy mean frequency', 'Devience mean frequency',
       'Relocation mean frequency', 'Distress mean frequency',
       'Noscale mean frequency', 'Overall mean frequency', 'Openness mean',
       'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean',
       'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean',
       'Impulsivity mean', 'Sensation seeking mean',
       'Exploratory excitability vs. 

In [24]:
# Spot-check a single row of the post hoc table (positional index 1000)
FU3.iloc[1000]

ID                                               37058553
Session                                               FU3
y                                                   Binge
Dataset                                          Training
Sex                                                  Male
                                                ...      
MAST total                                             11
MAST Alcohol dependency symptoms                        3
MAST sum                                                8
Likelihood of nicotine dependence child    less dependent
FTND Sum                                                0
Name: 1000, Length: 89, dtype: object

#### set the dataset for analysis of prognosis (X:FU3 != y:FU3)

Please refer to <i>read_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [25]:
# Read the post hoc table back from 'IMAGEN_posthoc.csv'; this rebinds FU3
# (presumably the file written by to_posthoc(..., save=True) — confirm).
FU3 = posthoc.read_posthoc('IMAGEN_posthoc.csv')

In [26]:
# Show only the rows whose 'Dataset' value is 'Holdout'
FU3.groupby('Dataset').get_group('Holdout')

Unnamed: 0,ID,Session,y,Dataset,Sex,Site,Class,Trial,dataset,io,...,Psychological Aggression mean,Sexual Coercion mean,Positive Affect Score,Negative Affect Score,MAST flag,MAST total,MAST Alcohol dependency symptoms,MAST sum,Likelihood of nicotine dependence child,FTND Sum
2600,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,0,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2601,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,1,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2602,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,2,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2603,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,3,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2604,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,4,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5451,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,2,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5452,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,3,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5453,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,4,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5454,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,5,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0


In [27]:
# Summarise the post hoc table: per-column non-null counts and dtypes
FU3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 89 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   ID                                           5456 non-null   int64  
 1   Session                                      5456 non-null   object 
 2   y                                            5456 non-null   object 
 3   Dataset                                      5456 non-null   object 
 4   Sex                                          5456 non-null   object 
 5   Site                                         5456 non-null   object 
 6   Class                                        5456 non-null   object 
 7   Trial                                        5456 non-null   int64  
 8   dataset                                      5456 non-null   object 
 9   io                                           5456 non-null   object 
 10  

In [28]:
# Print every column name of the post hoc table as a plain list
print(list(FU3.columns))

['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial', 'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction', 'Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity', 'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve', 'Disorderliness vs. Regimen

In [29]:
# Spot-check the same row (positional index 1000) after reading from file
print(FU3.iloc[1000])

ID                                               37058553
Session                                               FU3
y                                                   Binge
Dataset                                          Training
Sex                                                  Male
                                                ...      
MAST total                                             11
MAST Alcohol dependency symptoms                        3
MAST sum                                                8
Likelihood of nicotine dependence child    less dependent
FTND Sum                                                0
Name: 1000, Length: 89, dtype: object


### 9. Get the SHAP value

(To do) merge the commands below into one method, to_SHAP(), in imagen_posthocloader.py

#### Diagnosis: X:FU3 to y:FU3 in holdout set

<b> Load the data and the model </b>

In [30]:
# Collect the models found under the fu3 -> fu3 results directories matching
# this glob-style pattern; the structure of the return value is defined by
# get_model() and is not visible here.
MODELS = posthoc.get_model("../../results/newlbls-clean-fu3-espad-fu3-19a-binge-*/*/")

In [31]:
# Name of the FU3 holdout .h5 file (despite the variable name, it is a file
# name, not a directory).
holdout_dir = "newholdout-clean-fu3-espad-fu3-19a-binge-n102.h5"
# load the holdout data: features, their column names, and a list of further
# arrays (contents of ho_list are defined by get_holdout_data — not visible here)
ho_X, ho_X_col_names, ho_list = posthoc.get_holdout_data(holdout_dir, group=True)
# print(f"Holdout dataset: {ho_X.shape}, {len(ho_X_col_names)}, "
#       f"{ho_list[0].shape}, {ho_list[1].shape}")

In [32]:
# Build the SHAP input list for the holdout set, passing "SVM-RBF" as the
# model selector to get_list()
ho_INPUT = posthoc.get_list(MODELS, ho_X, "SVM-RBF")
# print(f"Number of training set: {len(tr_INPUT)}\n\n" # , One example: {tr_INPUT[0:1]}\n\n"
# Report how many inputs were generated
print(f"Number of holdout set: {len(ho_INPUT)}")#, {ho_INPUT}")

Number of holdout set: 7


<b> Compute the SHAP value </b>

In [None]:
# # One by one
# INPUT = tr_INPUT[0]
# start_time = time.time()
# _ = posthoc.get_SHAP(INPUT, 'FU3')
# print("--- %s seconds ---" % (time.time() - start_time))

In [33]:
# Multi processing
INPUT = ho_INPUT
start_time = time.time()
# NOTE: the parmap call is commented out, so the elapsed time printed below
# covers no SHAP computation. Re-enable the next line to actually compute the
# FU3 SHAP values with num_cores worker processes.
# _ = parmap.map(posthoc.get_SHAP, INPUT, 'FU3', pm_pbar=True, pm_processes=num_cores)
print("--- %s seconds ---" % (time.time() - start_time))

--- 8.797645568847656e-05 seconds ---


#### Prognosis: X:FU2 to y:FU3 in holdout set

<b> Load the data and the model </b>

In [34]:
# Collect the models found under the fu2 -> fu3 results directories matching
# this glob-style pattern; this rebinds MODELS from the FU3 section.
MODELS = posthoc.get_model("../../results/newlbls-clean-fu2-espad-fu3-19a-binge-*/*/")

In [35]:
# Name of the FU2 holdout .h5 file (a file name, not a directory)
holdout_dir = "newholdout-clean-fu2-espad-fu3-19a-binge-n102.h5"
# load the holdout data; this rebinds ho_X, ho_X_col_names and ho_list
ho_X, ho_X_col_names, ho_list = posthoc.get_holdout_data(holdout_dir, group=True)
# print(f"Holdout dataset: {ho_X.shape}, {len(ho_X_col_names)}, "
#       f"{ho_list[0].shape}, {ho_list[1].shape}")

In [36]:
# Build the SHAP input list for the holdout set, passing "SVM-LIN" as the
# model selector to get_list().
# NOTE(review): the previous comment here said "ONLY SVM-rbf", which does not
# match the "SVM-LIN" argument — confirm which model is intended for FU2.
ho_INPUT = posthoc.get_list(MODELS, ho_X, "SVM-LIN")
# print(f"Number of training set: {len(tr_INPUT)}\n\n" # , One example: {tr_INPUT[0:1]}\n\n"
# Report how many inputs were generated
print(f"Number of holdout set: {len(ho_INPUT)}")#, {ho_INPUT}")

Number of holdout set: 7


<b> Compute the SHAP value </b>

In [37]:
# # One by one
# INPUT = tr_INPUT[0]
# start_time = time.time()
# _ = posthoc.get_SHAP(INPUT, 'FU2')
# print("--- %s seconds ---" % (time.time() - start_time))

In [38]:
# Multi processing
INPUT = ho_INPUT
start_time = time.time()
# NOTE: the parmap call is commented out, so the elapsed time printed below
# covers no SHAP computation. Re-enable the next line to actually compute the
# FU2 SHAP values with num_cores worker processes.
# _ = parmap.map(posthoc.get_SHAP, INPUT, 'FU2', pm_pbar=True, pm_processes=num_cores)
print("--- %s seconds ---" % (time.time() - start_time))

--- 9.965896606445312e-05 seconds ---


#### Prognosis: X:BL to y:FU3 in holdout set

<b> Load the data and the model </b>

In [39]:
# Collect the models found under the bl -> fu3 results directories matching
# this glob-style pattern; this rebinds MODELS from the FU2 section.
MODELS = posthoc.get_model("../../results/newlbls-clean-bl-espad-fu3-19a-binge-*/*/")

In [40]:
# Name of the BL holdout .h5 file (a file name, not a directory)
holdout_dir = "newholdout-clean-bl-espad-fu3-19a-binge-n102.h5"
# load the holdout data; this rebinds ho_X, ho_X_col_names and ho_list
ho_X, ho_X_col_names, ho_list = posthoc.get_holdout_data(holdout_dir, group=True)
# print(f"Holdout dataset: {ho_X.shape}, {len(ho_X_col_names)}, "
#       f"{ho_list[0].shape}, {ho_list[1].shape}")

In [41]:
# Build the SHAP input list for the holdout set, passing "LR" as the model
# selector to get_list().
# NOTE(review): the previous comment here said "ONLY SVM-rbf", which does not
# match the "LR" argument — confirm which model is intended for BL.
ho_INPUT = posthoc.get_list(MODELS, ho_X, "LR")
# print(f"Number of training set: {len(tr_INPUT)}\n\n" # , One example: {tr_INPUT[0:1]}\n\n"
# Report how many inputs were generated
print(f"Number of holdout set: {len(ho_INPUT)}")#, {ho_INPUT}")

Number of holdout set: 7


<b> Compute the SHAP value </b>

In [42]:
# # One by one
# INPUT = tr_INPUT[0]
# start_time = time.time()
# _ = posthoc.get_SHAP(INPUT, 'BL')
# print("--- %s seconds ---" % (time.time() - start_time))

In [43]:
# Multi processing
INPUT = ho_INPUT
start_time = time.time()
# NOTE: the parmap call is commented out, so the elapsed time printed below
# covers no SHAP computation. Re-enable the next line to actually compute the
# BL SHAP values with num_cores worker processes.
# _ = parmap.map(posthoc.get_SHAP, INPUT, 'BL', pm_pbar=True, pm_processes=num_cores)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.00011110305786132812 seconds ---


#### Diagnosis: X:FU3 to y:FU3 in training set

(to do)

In [None]:
# MODELS = posthoc.get_model("../../results/newlbls-clean-fu3-espad-fu3-19a-binge-*/*/")
# train_dir = "newlbls-clean-fu3-espad-fu3-19a-binge-n650.h5"
# # load the training data
# tr_X, tr_X_col_names, tr_list = SHAP.get_train_data(train_dir, group=True)
# print(f"Training dataset: {tr_X.shape}, {len(tr_X_col_names)}, {tr_list[0].shape}")
# # generate the SHAP input list of the training
# tr_INPUT = SHAP.get_list(MODELS, tr_X)

### 10. Save the mean|SHAP| value

#### load the feature derivatives and mean, std |SHAP value|

Please refer to <i>to_abs_SHAP()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [4]:
def _shap_files(model, session, n_trials=7):
    """Return the saved-SHAP file names for one model type and session.

    Names follow the pattern '<model><trial>_<session>.sav' with trial running
    from 0 to n_trials - 1, e.g. 'GB0_FU3.sav' ... 'GB6_FU3.sav'.
    """
    return [f"{model}{trial}_{session}.sav" for trial in range(n_trials)]


# Generating the names replaces 84 hand-typed literals, where a single typo
# would silently point at the wrong file; the resulting lists are unchanged.

# FU3
H5_FU3 = "newholdout-clean-fu3-espad-fu3-19a-binge-n102.h5"
GB_FU3 = _shap_files("GB", "FU3")
LR_FU3 = _shap_files("LR", "FU3")
SVM_lin_FU3 = _shap_files("SVM-lin", "FU3")
SVM_rbf_FU3 = _shap_files("SVM-rbf", "FU3")

# FU2
H5_FU2 = "newholdout-clean-fu2-espad-fu3-19a-binge-n102.h5"
GB_FU2 = _shap_files("GB", "FU2")
LR_FU2 = _shap_files("LR", "FU2")
SVM_lin_FU2 = _shap_files("SVM-lin", "FU2")
SVM_rbf_FU2 = _shap_files("SVM-rbf", "FU2")

# BL
H5_BL = "newholdout-clean-bl-espad-fu3-19a-binge-n102.h5"
GB_BL = _shap_files("GB", "BL")
LR_BL = _shap_files("LR", "BL")
SVM_lin_BL = _shap_files("SVM-lin", "BL")
SVM_rbf_BL = _shap_files("SVM-rbf", "BL")

In [45]:
# For each session, concatenate the SHAP file-name lists of the four model
# types (GB, LR, SVM-lin, SVM-rbf) and pass them with the session's holdout
# .h5 name to posthoc.to_abs_SHAP(). SHAP is a scratch name rebound per session.
# The trailing "#, save=True)" is the disabled variant that also saves the result.
# NOTE(review): a later cell rebinds SVM_rbf_FU3 to the result of
# posthoc.to_SHAP(); re-running this cell after that one would no longer
# concatenate the original file-name list.

# FU3
SHAP = GB_FU3+LR_FU3+SVM_lin_FU3+SVM_rbf_FU3
FU3_SHAP = posthoc.to_abs_SHAP(H5_FU3, SHAP, "FU3")#, save=True)

# FU2
SHAP = GB_FU2+LR_FU2+SVM_lin_FU2+SVM_rbf_FU2
FU2_SHAP = posthoc.to_abs_SHAP(H5_FU2, SHAP, "FU2")#, save=True)

# BL
SHAP = GB_BL+LR_BL+SVM_lin_BL+SVM_rbf_BL
BL_SHAP = posthoc.to_abs_SHAP(H5_BL, SHAP, "BL")#, save=True)

#### load the mean of mean, std |SHAP value|

Please refer to <i>to_mofm_SHAP()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [46]:
# Read the per-session SHAP tables back from file; this rebinds the
# FU3_SHAP / FU2_SHAP / BL_SHAP names assigned in the previous cell.
FU3_SHAP = posthoc.read_SHAP('all_FU3_SHAP.csv')
FU2_SHAP = posthoc.read_SHAP('all_FU2_SHAP.csv')
BL_SHAP = posthoc.read_SHAP('all_BL_SHAP.csv')

#### sorted SHAP in SVM-rbf

In [None]:
# Read 'all_mofm_abs_SHAP.csv', then feed the table through
# to_sorted_mean_SHAP() once per session (FU3, FU2, BL) for the 'SVM-rbf'
# model; each call receives the result of the previous one.
# The trailing "#, save=True)" on the last call is the disabled saving variant.
DF = posthoc.read_SHAP('all_mofm_abs_SHAP.csv')
DF = posthoc.to_sorted_mean_SHAP(DF, 'SVM-rbf', 'FU3')
DF = posthoc.to_sorted_mean_SHAP(DF, 'SVM-rbf', 'FU2')
DF = posthoc.to_sorted_mean_SHAP(DF, 'SVM-rbf', 'BL')#, save=True)

In [None]:
# Display the table produced by the three to_sorted_mean_SHAP() calls above
DF

#### load and save the SHAP value subject by subject

In [5]:
# Load the SHAP values of every saved model file, session by session.
# Call order matches the unrolled form: FU3, then FU2, then BL and, within
# each session, GB, LR, SVM-lin, SVM-rbf. The trailing "#, save=True)" is the
# disabled variant that also saves the result.
for h5_name, file_groups in (
    (H5_FU3, (GB_FU3, LR_FU3, SVM_lin_FU3, SVM_rbf_FU3)),
    (H5_FU2, (GB_FU2, LR_FU2, SVM_lin_FU2, SVM_rbf_FU2)),
    (H5_BL, (GB_BL, LR_BL, SVM_lin_BL, SVM_rbf_BL)),
):
    for file_group in file_groups:
        for MD in file_group:
            posthoc.load_SHAP(h5_name, MD)#, save=True)

<b> Case: SVM-rbf in FU3 </b>

In [6]:
# Read the seven per-trial SVM-rbf SHAP tables for FU3 (trials 0-6), in order.
SVM_rbf_list = [posthoc.read_SHAP(f'all_SVM-rbf{trial}_FU3_SHAP.csv')
                for trial in range(7)]
# Keep the individual per-trial names bound as well.
(SVM_rbf0_FU3, SVM_rbf1_FU3, SVM_rbf2_FU3, SVM_rbf3_FU3,
 SVM_rbf4_FU3, SVM_rbf5_FU3, SVM_rbf6_FU3) = SVM_rbf_list
# Combine them through posthoc.to_SHAP(); the trailing "#, save=True)" is the
# disabled variant that also saves the result.
# NOTE(review): this rebinds SVM_rbf_FU3, which earlier holds the list of
# '.sav' file names — cells that use that list cannot be re-run after this one.
SVM_rbf_FU3 = posthoc.to_SHAP(SVM_rbf_list, 'all_SVM-rbf_FU3_SHAP.csv')#, save=True)

In [7]:
# Display the combined SVM-rbf FU3 SHAP table returned by to_SHAP()
SVM_rbf_FU3

Unnamed: 0,ID,Session,Trial,Model,Class,T1w_cor_bankssts-lh-volume,T1w_cor_caudalanteriorcingulate-lh-volume,T1w_cor_caudalmiddlefrontal-lh-volume,T1w_cor_cuneus-lh-volume,T1w_cor_entorhinal-lh-volume,...,DTI_SFO-R_Average,DTI_SLF_Average,DTI_SLF-L_Average,DTI_SLF-R_Average,DTI_SS_Average,DTI_SS-L_Average,DTI_SS-R_Average,DTI_UNC_Average,DTI_UNC-L_Average,DTI_UNC-R_Average
0,1163495,FU3,0,SVM-rbf,AAM,0.008,0.024,0.000,1.000000e-03,-3.000000e-03,...,-3.000000e-03,0.002,-0.005,0.000,0.000,-4.000000e-03,0.001,-0.014,0.003,-0.010
1,1938036,FU3,0,SVM-rbf,AAM,0.000,0.003,0.001,3.000000e-03,0.000000e+00,...,0.000000e+00,-0.002,0.000,0.000,0.000,0.000000e+00,0.002,0.001,0.001,-0.005
2,2103894,FU3,0,SVM-rbf,AAM,0.000,-0.003,-0.001,0.000000e+00,1.000000e-03,...,-2.000000e-03,0.003,0.011,-0.003,-0.008,-7.000000e-03,0.000,-0.002,-0.005,-0.005
3,2766073,FU3,0,SVM-rbf,AAM,0.006,0.007,0.000,6.000000e-03,-6.000000e-03,...,-1.000000e-03,-0.004,0.003,0.000,0.000,-7.000000e-03,0.000,0.000,-0.002,-0.010
4,3504454,FU3,0,SVM-rbf,HC,0.010,-0.008,0.014,-1.387779e-18,5.204170e-19,...,-2.775558e-18,-0.003,0.000,0.000,0.001,5.000000e-03,-0.005,-0.007,-0.011,-0.008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,95957207,FU3,6,SVM-rbf,HC,0.003,0.009,0.000,2.000000e-03,1.300000e-02,...,5.000000e-03,0.018,0.044,-0.002,-0.001,-6.000000e-03,0.000,0.003,0.002,0.004
98,97739048,FU3,6,SVM-rbf,HC,0.000,0.002,0.002,0.000000e+00,-5.000000e-03,...,-4.000000e-03,-0.012,-0.036,0.002,0.014,2.100000e-02,0.005,0.004,0.002,0.004
99,99217838,FU3,6,SVM-rbf,AAM,-0.006,-0.060,-0.002,-6.000000e-03,3.000000e-03,...,0.000000e+00,0.006,0.023,-0.006,-0.017,1.387779e-18,-0.001,-0.011,0.011,-0.009
100,99677574,FU3,6,SVM-rbf,HC,0.009,-0.009,-0.001,-3.000000e-03,-5.000000e-03,...,-1.000000e-03,-0.001,0.015,0.004,-0.002,-4.000000e-03,0.000,-0.001,0.005,-0.002


### 11. Save the Summary Statistics

#### load the dataset

In [None]:
# HDF5
# NOTE(review): the earlier read_HDF5() call reads 'IMAGEN_HDF5.csv', while
# this one reads 'all_Binge.csv' — confirm which file is intended here.
HDF5 = posthoc.read_HDF5('all_Binge.csv')
# INSTRUMENT
INST = posthoc.read_INSTRUMENT('IMAGEN_INSTRUMENT.csv')

In [None]:
# FU3: restrict both tables to the rows of session 'FU3', then attach the
# instrument columns to the HDF5 rows. The left join on ID and Session keeps
# every HDF5 row, including those without a matching instrument row.
HDF5_FU3 = HDF5.groupby('Session').get_group('FU3')
INST_FU3 = INST.groupby('Session').get_group('FU3')
SS_FU3 = HDF5_FU3.merge(INST_FU3, how='left', on=['ID', 'Session'])

In [None]:
# Summarise the merged FU3 table: per-column non-null counts and dtypes
SS_FU3.info()

In [None]:
# Keep every column of SS_FU3 except the two at positions 66 and 70.
# NOTE(review): presumably these are the non-numeric 'MAST flag' and
# 'Likelihood of nicotine dependence child' columns — TODO confirm, and
# prefer selecting by name so a change in column order cannot shift them.
SS_FU3_Col = [name for pos, name in enumerate(SS_FU3.columns) if pos not in (66, 70)]

In [None]:
# Summary-statistics table: SS_FU3 restricted to the selected columns
SS = SS_FU3[SS_FU3_Col]
# Display it
SS

In [None]:
# Descriptive statistics of the summary-statistics table
SS.describe()

In [None]:
# save_path = f"{DATA_DIR}/posthoc/IMAGEN_Binge_FU3_SS_ver02.csv"
# if not os.path.isdir(os.path.dirname(save_path)):
#     os.makedirs(os.path.dirname(save_path))
# SS.to_csv(save_path, index=None)