### Generate the post hoc dataset for analysis

In [1]:
from imagen_posthocloader import *

In [2]:
# Root directory of the IMAGEN data share.
# NOTE(review): DATA_DIR is not referenced by any other cell visible in this
# notebook — confirm whether IMAGEN_posthoc() is meant to receive it.
DATA_DIR = "/ritter/share/data/IMAGEN"
# Loader object; its set_*/to_*/read_* methods are used by every cell below.
posthoc = IMAGEN_posthoc()

### 1. Load the [INSTRUMENT](https://imagen-europe.com/resources/imagen-dataset/documentation/) data

#### collect the selected instrument files from IMAGEN_RAW and store in posthoc

Please refer to <i>set_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b> 

In [3]:
# Instrument
# Each active line loads one instrument table through posthoc.set_INSTRUMENT().
# The trailing "#, save=True)" is a disabled optional argument
# (presumably it writes the all_*.csv file — confirm in imagen_posthocloader.py).
# Lines that contain only a commented instrument name are not loaded here.
# NOTE: PBQ and GEN are loaded here but are not part of inst_list in the
# "Save the INSTRUMENT data" section.
## Demographic
PBQ   = posthoc.set_INSTRUMENT('PBQ')#, save=True)
GEN   = posthoc.set_INSTRUMENT('GEN')#, save=True)
LEQ   = posthoc.set_INSTRUMENT('LEQ')#, save=True)
# DAWBA
# CANTAB
# NEO, SURPS, TCI and BSI are grouped as "Psychological" in inst_list.
NEO   = posthoc.set_INSTRUMENT('NEO')#, save=True)
SURPS = posthoc.set_INSTRUMENT('SURPS')#, save=True)
TCI = posthoc.set_INSTRUMENT('TCI')#, save=True)
BSI = posthoc.set_INSTRUMENT('BSI')#, save=True)
# KIRBY
# BIS-11
# CSI
# PHQ
# CES-D
# ANXDX
# CAPE
# SDQ
# IRI
# RRS
# PALP
## Social
# CTQ   = posthoc.set_INSTRUMENT('CTQ')#, save=True)
CTQ_MD = posthoc.set_INSTRUMENT('CTQ_MD')#, save=True)
CTS   = posthoc.set_INSTRUMENT('CTS')#, save=True)
PANAS = posthoc.set_INSTRUMENT('PANAS')#, save=True)
# MINI5
## Substance Use
MAST = posthoc.set_INSTRUMENT('MAST')#, save=True)
FTND  = posthoc.set_INSTRUMENT('FTND')#, save=True)
# DAST
# SCID
# RAPI
# DMQ
# Bully Questionnaire
# ESPAD
# TLFB
# AUDIT

### 2. Load the HDF5 data

#### collect the HDF5 files from h5files and save in posthoc

Please refer to <i>set_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [4]:
# Hdf5
BINGE = posthoc.set_HDF5('Binge')#, save=True)

In [5]:
# general information of the hdf5
BINGE.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 3. Load the RUN data

#### collect the RUN file from results and save in posthoc

Please refer to <i>set_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [6]:
# RUN
RUN = posthoc.set_RUN('../../results/holdout_all-tp-clean_run.csv')#, save=True)

In [7]:
# general information of the RUN
RUN.columns

Index(['i', 'o', 'io', 'technique', 'Session', 'Trial', 'path', 'n_samples',
       'n_samples_cc', 'i_is_conf', 'o_is_conf', 'Model', 'model_SVM-rbf__C',
       'model_SVM-rbf__gamma', 'runtime', 'model_SVM-lin__C',
       'model_GB__learning_rate', 'model_LR__C', 'train_score', 'valid_score',
       'test_score', 'roc_auc', 'holdout_score', 'holdout_roc_auc', 'dataset',
       'ID', 'true_label', 'prediction', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN'],
      dtype='object')

### 4. Save the INSTRUMENT data

#### collect the instrument files from posthoc into one file

Please refer to <i>to_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [8]:
# Instrument tables to combine, grouped by category.
# PBQ and GEN are loaded earlier in the notebook but are not included here.
inst_list = [LEQ,                      # Demographic
             NEO, SURPS, TCI, BSI,     # Psychological
             CTQ_MD, CTS, PANAS,       # Social
             MAST, FTND]               # Substance use
# Combine the listed tables into one frame; "#, save=True)" is the disabled
# optional argument (presumably writes the combined file — confirm).
INST = posthoc.to_INSTRUMENT(inst_list)#, save=True)

In [9]:
# general information of the instrument
# selected ROI
col_INST = list(INST.columns[2:].values)
print(len(col_INST), col_INST)

65 ['Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity', 'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve', 'Disorderliness vs. Regimentation', 'Total Novelty Seeking score', 'Somatization mean', 'Obsession-Compulsion mean', 'Interpersonal Sensitivity mean', 'Depression mean', 'Anxiety mean', 'Hostility mean', 'Phobic Anxiety mean', 'Paranoid Ideation mean', 'Psychoticism 

### 5. Read the INSTRUMENT data

#### read the instrument files from posthoc into one file

Please refer to <i>read_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [10]:
INST = posthoc.read_INSTRUMENT('IMAGEN_INSTRUMENT.csv')

In [11]:
# general information of the instrument
INST.columns

Index(['ID', 'Session', 'Family valence', 'Accident valence',
       'Sexuality valence', 'Autonomy valence', 'Devience valence',
       'Relocation valence', 'Distress valence', 'Noscale valence',
       'Overall valence', 'Family mean frequency', 'Accident mean frequency',
       'Sexuality mean frequency', 'Autonomy mean frequency',
       'Devience mean frequency', 'Relocation mean frequency',
       'Distress mean frequency', 'Noscale mean frequency',
       'Overall mean frequency', 'Openness mean', 'Conscientiousness mean',
       'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean',
       'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean',
       'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity',
       'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve',
       'Disorderliness vs. Regimentation', 'Total Novelty Seeking score',
       'Somatization mean', 'Obsession-Compulsion mean',
       'Interpersonal Sensitivity mean'

### 6. Read the HDF5 data

#### collect the hdf5 files from posthoc into one file

Please refer to <i>to_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [12]:
HDF5 = posthoc.to_HDF5('all_Binge.csv')#, save=True)

#### read the HDF5 files from posthoc into one file

Please refer to <i>read_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [13]:
HDF5 = posthoc.read_HDF5('IMAGEN_HDF5.csv')

In [14]:
# general information of the hdf5
HDF5.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 7. Read the RUN data

#### select the ROI of the RUN file from posthoc into one file

Please refer to <i>to_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [15]:
RUN.columns

Index(['i', 'o', 'io', 'technique', 'Session', 'Trial', 'path', 'n_samples',
       'n_samples_cc', 'i_is_conf', 'o_is_conf', 'Model', 'model_SVM-rbf__C',
       'model_SVM-rbf__gamma', 'runtime', 'model_SVM-lin__C',
       'model_GB__learning_rate', 'model_LR__C', 'train_score', 'valid_score',
       'test_score', 'roc_auc', 'holdout_score', 'holdout_roc_auc', 'dataset',
       'ID', 'true_label', 'prediction', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN'],
      dtype='object')

In [16]:
COL = ['ID','Session','Trial','dataset','io','technique','Model',
       'TP prob','TN prob','FP prob','FN prob','T prob','F prob','Prob',
       'Predict TF','Model PN','Label PN','true_label','prediction']

In [17]:
RUN = posthoc.to_RUN('all_RUN.csv', COL)#, save = True)

In [18]:
# general information of the run
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

#### read the RUN files from posthoc into one file

Please refer to <i>read_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [19]:
RUN = posthoc.read_RUN('IMAGEN_RUN.csv')

In [20]:
# general information of the run
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

### 8. Save the post hoc dataset

#### set the dataset for analysis of diagnosis (X:FU3 == y:FU3)

Please refer to <i>to_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [21]:
DATA = ['IMAGEN_HDF5.csv', 'IMAGEN_INSTRUMENT.csv', 'IMAGEN_RUN.csv']
FU3 = posthoc.to_posthoc(DATA)#, save=True)

In [22]:
# general information of the post hoc dataset
FU3.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial',
       'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN', 'true_label', 'prediction', 'Family valence',
       'Accident valence', 'Sexuality valence', 'Autonomy valence',
       'Devience valence', 'Relocation valence', 'Distress valence',
       'Noscale valence', 'Overall valence', 'Family mean frequency',
       'Accident mean frequency', 'Sexuality mean frequency',
       'Autonomy mean frequency', 'Devience mean frequency',
       'Relocation mean frequency', 'Distress mean frequency',
       'Noscale mean frequency', 'Overall mean frequency', 'Openness mean',
       'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean',
       'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean',
       'Impulsivity mean', 'Sensation seeking mean',
       'Exploratory excitability vs. 

In [23]:
FU3.iloc[1000]

ID                                               37058553
Session                                               FU3
y                                                   Binge
Dataset                                          Training
Sex                                                  Male
                                                ...      
MAST total                                             11
MAST Alcohol dependency symptoms                        3
MAST sum                                                8
Likelihood of nicotine dependence child    less dependent
FTND Sum                                                0
Name: 1000, Length: 89, dtype: object

#### set the dataset for analysis of prognosis (X:FU3 != y:FU3)

Please refer to <i>read_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [24]:
FU3 = posthoc.read_posthoc('IMAGEN_posthoc.csv')

In [25]:
FU3.groupby('Dataset').get_group('Holdout')

Unnamed: 0,ID,Session,y,Dataset,Sex,Site,Class,Trial,dataset,io,...,Psychological Aggression mean,Sexual Coercion mean,Positive Affect Score,Negative Affect Score,MAST flag,MAST total,MAST Alcohol dependency symptoms,MAST sum,Likelihood of nicotine dependence child,FTND Sum
2600,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,0,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2601,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,1,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2602,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,2,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2603,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,3,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
2604,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,4,Holdout set,X-Binge,...,0.3125,0.0,31.0,20.0,positive alchololism screening,17.0,3.0,14.0,less dependent,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5451,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,2,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5452,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,3,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5453,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,4,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0
5454,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,5,Holdout set,X-Binge,...,0.2500,0.0,35.0,13.0,,,,,less dependent,0.0


In [26]:
FU3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 89 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   ID                                           5456 non-null   int64  
 1   Session                                      5456 non-null   object 
 2   y                                            5456 non-null   object 
 3   Dataset                                      5456 non-null   object 
 4   Sex                                          5456 non-null   object 
 5   Site                                         5456 non-null   object 
 6   Class                                        5456 non-null   object 
 7   Trial                                        5456 non-null   int64  
 8   dataset                                      5456 non-null   object 
 9   io                                           5456 non-null   object 
 10  

In [27]:
# general information of the post hoc dataset
print(list(FU3.columns))

['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial', 'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction', 'Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Exploratory excitability vs. Stoic rigidity', 'Impulsiveness vs. Reflection', 'Extravagance vs. Reserve', 'Disorderliness vs. Regimen

In [28]:
print(FU3.iloc[1000])

ID                                               37058553
Session                                               FU3
y                                                   Binge
Dataset                                          Training
Sex                                                  Male
                                                ...      
MAST total                                             11
MAST Alcohol dependency symptoms                        3
MAST sum                                                8
Likelihood of nicotine dependence child    less dependent
FTND Sum                                                0
Name: 1000, Length: 89, dtype: object


### 9. Save the mean|SHAP| value

CAUTION: The SHAP values must be generated in advance.

#### load the SHAP

Please refer to <i>load_SHAP()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [29]:
# load the SHAP value
# Every call reads one "<model><index>_multi.sav" file through posthoc.load_SHAP()
# and its result is unpacked into a (mean, std) pair.
# Four model types (GB, LR, SVM-lin, SVM-rbf) are loaded, each with indices 0-6.
# NOTE(review): the meaning of the index 0-6 is not visible here — presumably one
# file per trained model/trial; confirm against the SHAP generation step.
# GB
mean_GB0, std_GB0 = posthoc.load_SHAP("GB0_multi.sav")
mean_GB1, std_GB1 = posthoc.load_SHAP("GB1_multi.sav")
mean_GB2, std_GB2 = posthoc.load_SHAP("GB2_multi.sav")
mean_GB3, std_GB3 = posthoc.load_SHAP("GB3_multi.sav")
mean_GB4, std_GB4 = posthoc.load_SHAP("GB4_multi.sav")
mean_GB5, std_GB5 = posthoc.load_SHAP("GB5_multi.sav")
mean_GB6, std_GB6 = posthoc.load_SHAP("GB6_multi.sav")
# LR
mean_LR0, std_LR0 = posthoc.load_SHAP("LR0_multi.sav")
mean_LR1, std_LR1 = posthoc.load_SHAP("LR1_multi.sav")
mean_LR2, std_LR2 = posthoc.load_SHAP("LR2_multi.sav")
mean_LR3, std_LR3 = posthoc.load_SHAP("LR3_multi.sav")
mean_LR4, std_LR4 = posthoc.load_SHAP("LR4_multi.sav")
mean_LR5, std_LR5 = posthoc.load_SHAP("LR5_multi.sav")
mean_LR6, std_LR6 = posthoc.load_SHAP("LR6_multi.sav")
# SVM-lin (file names use "SVM-lin", variable names use "SVM_lin")
mean_SVM_lin0, std_SVM_lin0 = posthoc.load_SHAP("SVM-lin0_multi.sav")
mean_SVM_lin1, std_SVM_lin1 = posthoc.load_SHAP("SVM-lin1_multi.sav")
mean_SVM_lin2, std_SVM_lin2 = posthoc.load_SHAP("SVM-lin2_multi.sav")
mean_SVM_lin3, std_SVM_lin3 = posthoc.load_SHAP("SVM-lin3_multi.sav")
mean_SVM_lin4, std_SVM_lin4 = posthoc.load_SHAP("SVM-lin4_multi.sav")
mean_SVM_lin5, std_SVM_lin5 = posthoc.load_SHAP("SVM-lin5_multi.sav")
mean_SVM_lin6, std_SVM_lin6 = posthoc.load_SHAP("SVM-lin6_multi.sav")
# SVM-rbf (file names use "SVM-rbf", variable names use "SVM_rbf")
mean_SVM_rbf0, std_SVM_rbf0 = posthoc.load_SHAP("SVM-rbf0_multi.sav")
mean_SVM_rbf1, std_SVM_rbf1 = posthoc.load_SHAP("SVM-rbf1_multi.sav")
mean_SVM_rbf2, std_SVM_rbf2 = posthoc.load_SHAP("SVM-rbf2_multi.sav")
mean_SVM_rbf3, std_SVM_rbf3 = posthoc.load_SHAP("SVM-rbf3_multi.sav")
mean_SVM_rbf4, std_SVM_rbf4 = posthoc.load_SHAP("SVM-rbf4_multi.sav")
mean_SVM_rbf5, std_SVM_rbf5 = posthoc.load_SHAP("SVM-rbf5_multi.sav")
mean_SVM_rbf6, std_SVM_rbf6 = posthoc.load_SHAP("SVM-rbf6_multi.sav")

In [30]:
# load the holdout data
# NOTE: despite its name, holdout_dir holds an HDF5 file name, not a directory.
holdout_dir = "newholdout-clean-fu3-espad-fu3-19a-binge-n102.h5"
# Only ho_X_col_names is used later (as the 'Col names' column of DF_SHAP);
# ho_X and ho_list are not referenced by any other cell visible in this notebook.
ho_X, ho_X_col_names, ho_list = posthoc.get_holdout_data(holdout_dir, group=False)
# Disabled debug print of the loaded shapes:
# print(f"Holdout dataset: {ho_X.shape}, {len(ho_X_col_names)}, "
#       f"{ho_list[0].shape}, {ho_list[1].shape}")

In [31]:
# Build DF_SHAP (nothing is written to disk in this cell).
# Columns: the 28 per-model mean arrays, then the 28 per-model std arrays,
# then the holdout feature names under 'Col names'.
# NOTE(review): assumes every mean/std array has the same length as
# ho_X_col_names so that rows line up with feature names — confirm.
DF_SHAP = pd.DataFrame(
    {'GB0 mean': mean_GB0,
     'GB1 mean': mean_GB1,
     'GB2 mean': mean_GB2,
     'GB3 mean': mean_GB3,
     'GB4 mean': mean_GB4,
     'GB5 mean': mean_GB5,
     'GB6 mean': mean_GB6,
     'LR0 mean': mean_LR0,
     'LR1 mean': mean_LR1,
     'LR2 mean': mean_LR2,
     'LR3 mean': mean_LR3,
     'LR4 mean': mean_LR4,
     'LR5 mean': mean_LR5,
     'LR6 mean': mean_LR6,
     'SVM_lin0 mean': mean_SVM_lin0,
     'SVM_lin1 mean': mean_SVM_lin1,
     'SVM_lin2 mean': mean_SVM_lin2,
     'SVM_lin3 mean': mean_SVM_lin3,
     'SVM_lin4 mean': mean_SVM_lin4,
     'SVM_lin5 mean': mean_SVM_lin5,
     'SVM_lin6 mean': mean_SVM_lin6,
     'SVM_rbf0 mean': mean_SVM_rbf0,
     'SVM_rbf1 mean': mean_SVM_rbf1,
     'SVM_rbf2 mean': mean_SVM_rbf2,
     'SVM_rbf3 mean': mean_SVM_rbf3,
     'SVM_rbf4 mean': mean_SVM_rbf4,
     'SVM_rbf5 mean': mean_SVM_rbf5,
     'SVM_rbf6 mean': mean_SVM_rbf6,
     'GB0 std': std_GB0,
     'GB1 std': std_GB1,
     'GB2 std': std_GB2,
     'GB3 std': std_GB3,
     'GB4 std': std_GB4,
     'GB5 std': std_GB5,
     'GB6 std': std_GB6,
     'LR0 std': std_LR0,
     'LR1 std': std_LR1,
     'LR2 std': std_LR2,
     'LR3 std': std_LR3,
     'LR4 std': std_LR4,
     'LR5 std': std_LR5,
     'LR6 std': std_LR6,
     'SVM_lin0 std': std_SVM_lin0,
     'SVM_lin1 std': std_SVM_lin1,
     'SVM_lin2 std': std_SVM_lin2,
     'SVM_lin3 std': std_SVM_lin3,
     'SVM_lin4 std': std_SVM_lin4,
     'SVM_lin5 std': std_SVM_lin5,
     'SVM_lin6 std': std_SVM_lin6,
     'SVM_rbf0 std': std_SVM_rbf0,
     'SVM_rbf1 std': std_SVM_rbf1,
     'SVM_rbf2 std': std_SVM_rbf2,
     'SVM_rbf3 std': std_SVM_rbf3,
     'SVM_rbf4 std': std_SVM_rbf4,
     'SVM_rbf5 std': std_SVM_rbf5,
     'SVM_rbf6 std': std_SVM_rbf6,
     'Col names': ho_X_col_names}
)

In [32]:
DF_SHAP

Unnamed: 0,GB0 mean,GB1 mean,GB2 mean,GB3 mean,GB4 mean,GB5 mean,GB6 mean,LR0 mean,LR1 mean,LR2 mean,...,SVM_lin5 std,SVM_lin6 std,SVM_rbf0 std,SVM_rbf1 std,SVM_rbf2 std,SVM_rbf3 std,SVM_rbf4 std,SVM_rbf5 std,SVM_rbf6 std,Col names
0,0.000000,0.000000,0.000000,0.000284,0.000000,0.002814,0.000000,0.000588,0.002971,0.006118,...,0.006673,0.011833,0.006345,0.002279,0.004085,0.006211,0.004912,0.002000,0.007592,T1w_cor_bankssts-lh-volume
1,0.000029,0.005127,0.023265,0.001578,0.009637,0.001892,0.005137,0.009167,0.000284,0.009284,...,0.001308,0.006127,0.014738,0.010316,0.012958,0.015548,0.015661,0.013116,0.006958,T1w_cor_caudalanteriorcingulate-lh-volume
2,0.000804,0.012402,0.002216,0.008206,0.000118,0.003980,0.000824,0.009745,0.015833,0.004412,...,0.016619,0.027380,0.007733,0.004556,0.002881,0.003649,0.005856,0.003943,0.002670,T1w_cor_caudalmiddlefrontal-lh-volume
3,0.000000,0.000000,0.000000,0.000000,0.000500,0.002882,0.000794,0.002010,0.001804,0.006029,...,0.002623,0.009764,0.005828,0.002467,0.002663,0.001886,0.002824,0.002417,0.002627,T1w_cor_cuneus-lh-volume
4,0.000667,0.009020,0.001539,0.011284,0.003735,0.000971,0.000000,0.011000,0.011304,0.009333,...,0.002642,0.033080,0.008751,0.013435,0.009926,0.010371,0.009716,0.017496,0.013574,T1w_cor_entorhinal-lh-volume
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,0.003461,0.003157,0.004039,0.014971,0.002598,0.006971,0.000000,0.015814,0.002333,0.003147,...,0.016935,0.021165,0.006419,0.010762,0.010780,0.013397,0.009839,0.009730,0.006767,DTI_SS-L_Average
715,0.000755,0.000000,0.002667,0.001314,0.000000,0.000000,0.001373,0.019206,0.001814,0.002157,...,0.013622,0.019012,0.003654,0.002327,0.004168,0.002224,0.002580,0.003212,0.002545,DTI_SS-R_Average
716,0.000000,0.004010,0.000000,0.005167,0.003402,0.001657,0.000000,0.003324,0.010235,0.008578,...,0.024814,0.005384,0.007020,0.002617,0.003950,0.002619,0.011159,0.003840,0.004482,DTI_UNC_Average
717,0.000000,0.002971,0.001686,0.002206,0.000000,0.000627,0.000000,0.001265,0.003088,0.015833,...,0.005472,0.012711,0.009566,0.010366,0.007827,0.007766,0.004413,0.009363,0.005065,DTI_UNC-L_Average


In [33]:
# Optional, left disabled: persist DF_SHAP as CSV without the index.
# The target is an absolute path on the shared drive.
# NOTE(review): the next section calls posthoc.read_SHAP('all_mean_SHAP.csv');
# presumably it reads the file written here — confirm before a fresh run.
# DF_SHAP.to_csv("/ritter/share/data/IMAGEN/posthoc/all_mean_SHAP.csv", index=None)

#### read the SHAP value

In [6]:
DF_SHAP = posthoc.read_SHAP('all_mean_SHAP.csv')

In [7]:
DF_SHAP

Unnamed: 0,GB0 mean,GB1 mean,GB2 mean,GB3 mean,GB4 mean,GB5 mean,GB6 mean,LR0 mean,LR1 mean,LR2 mean,...,SVM_lin5 std,SVM_lin6 std,SVM_rbf0 std,SVM_rbf1 std,SVM_rbf2 std,SVM_rbf3 std,SVM_rbf4 std,SVM_rbf5 std,SVM_rbf6 std,Col names
0,0.000000,0.000000,0.000000,0.000284,0.000000,0.002814,0.000000,0.000588,0.002971,0.006118,...,0.006673,0.011833,0.006345,0.002279,0.004085,0.006211,0.004912,0.002000,0.007592,T1w_cor_bankssts-lh-volume
1,0.000029,0.005127,0.023265,0.001578,0.009637,0.001892,0.005137,0.009167,0.000284,0.009284,...,0.001308,0.006127,0.014738,0.010316,0.012958,0.015548,0.015661,0.013116,0.006958,T1w_cor_caudalanteriorcingulate-lh-volume
2,0.000804,0.012402,0.002216,0.008206,0.000118,0.003980,0.000824,0.009745,0.015833,0.004412,...,0.016619,0.027380,0.007733,0.004556,0.002881,0.003649,0.005856,0.003943,0.002670,T1w_cor_caudalmiddlefrontal-lh-volume
3,0.000000,0.000000,0.000000,0.000000,0.000500,0.002882,0.000794,0.002010,0.001804,0.006029,...,0.002623,0.009764,0.005828,0.002467,0.002663,0.001886,0.002824,0.002417,0.002627,T1w_cor_cuneus-lh-volume
4,0.000667,0.009020,0.001539,0.011284,0.003735,0.000971,0.000000,0.011000,0.011304,0.009333,...,0.002642,0.033080,0.008751,0.013435,0.009926,0.010371,0.009716,0.017496,0.013574,T1w_cor_entorhinal-lh-volume
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,0.003461,0.003157,0.004039,0.014971,0.002598,0.006971,0.000000,0.015814,0.002333,0.003147,...,0.016935,0.021165,0.006419,0.010762,0.010780,0.013397,0.009839,0.009730,0.006767,DTI_SS-L_Average
715,0.000755,0.000000,0.002667,0.001314,0.000000,0.000000,0.001373,0.019206,0.001814,0.002157,...,0.013622,0.019012,0.003654,0.002327,0.004168,0.002224,0.002580,0.003212,0.002545,DTI_SS-R_Average
716,0.000000,0.004010,0.000000,0.005167,0.003402,0.001657,0.000000,0.003324,0.010235,0.008578,...,0.024814,0.005384,0.007020,0.002617,0.003950,0.002619,0.011159,0.003840,0.004482,DTI_UNC_Average
717,0.000000,0.002971,0.001686,0.002206,0.000000,0.000627,0.000000,0.001265,0.003088,0.015833,...,0.005472,0.012711,0.009566,0.010366,0.007827,0.007766,0.004413,0.009363,0.005065,DTI_UNC-L_Average


#### add the subtype of SHAP

Added columns: Modality, Type, Lobe Region, Value, and — for each of GB, LR, SVM lin, and SVM rbf — an "All mean" and an "All std" column.

In [8]:
# Names of the per-index SHAP "mean" columns in DF_SHAP, one list per model
# type; every list covers the indices 0 through 6 in ascending order.
GB_c = [f'GB{idx} mean' for idx in range(7)]
LR_c = [f'LR{idx} mean' for idx in range(7)]
SVM_lin_c = [f'SVM_lin{idx} mean' for idx in range(7)]
SVM_rbf_c = [f'SVM_rbf{idx} mean' for idx in range(7)]

In [9]:
def type_check(col):
    """Classify a feature column name by its second '_'-separated token.

    Returns "Cortical region" for 'cor', "Subcortical region" for 'subcor',
    and "DTI region" for any other token.
    """
    labels = {'cor': "Cortical region", 'subcor': "Subcortical region"}
    kind = col.split('_')[1]
    return labels.get(kind, "DTI region")

def lobe_region(col):
    """Map a feature column name to the lobe label used for grouping.

    The second '_'-separated token selects the branch:
      * 'subcor' -> 'Subcortical region'
      * 'cor'    -> lobe looked up from the region name (third token, text
                    before the first '-'); unknown regions give 'Other'
      * anything else -> 'DTI region'
    """
    tokens = col.split('_')
    kind = tokens[1]
    if kind == 'subcor':
        return 'Subcortical region' # To do: no finer grouping yet
    if kind != 'cor':
        return 'DTI region' # To do: no finer grouping yet

    # Cortical region names grouped by lobe; the groups do not overlap.
    lobes = {
        'Temporal lobe': ('bankssts', 'entorhinal', 'fusiform', 'inferiortemporal',
                          'middletemporal', 'parahippocampal', 'superiortemporal',
                          'temporalpole', 'transversetemporal'),
        'Frontal lobe': ('caudalmiddlefrontal', 'lateralorbitofrontal', 'paracentral',
                         'parsopercularis', 'parsorbitalis', 'parstriangularis',
                         'precentral', 'rostralmiddlefrontal', 'superiorfrontal',
                         'medialorbitofrontal', 'frontalpole'),
        'Parietal lobe': ('inferiorparietal', 'postcentral', 'precuneus',
                          'superiorparietal', 'supramarginal'),
        'Occipital lobe': ('cuneus', 'lateraloccipital', 'pericalcarine', 'lingual'),
        'Cingulate cortex': ('caudalanteriorcingulate', 'isthmuscingulate',
                             'posteriorcingulate', 'rostralanteriorcingulate'),
        'Insula cortex': ('insula',),
    }
    region = tokens[2].split('-')[0]
    for lobe, members in lobes.items():
        if region in members:
            return lobe
    return 'Other'

In [10]:
# Add descriptive columns derived from each feature name in 'Col names'.
# Modality: text before the first '_' of the name.
DF_SHAP['Modality'] = [i.split('_')[0] for i in DF_SHAP['Col names']]
# Type / Lobe Region: labels returned by type_check() and lobe_region().
DF_SHAP['Type'] = [type_check(i) for i in DF_SHAP['Col names']]
DF_SHAP['Lobe Region'] = [lobe_region(i) for i in DF_SHAP['Col names']]
# Value: take the text after the last '-', then the text after the last '_' of that.
DF_SHAP['Value'] = [i.split('-')[-1].split('_')[-1] for i in DF_SHAP['Col names']]
# Row-wise mean over the seven per-index "mean" columns of each model type.
DF_SHAP['GB_All mean'] = DF_SHAP[GB_c].mean(axis=1)
DF_SHAP['LR_All mean'] = DF_SHAP[LR_c].mean(axis=1)
DF_SHAP['SVM_lin_All mean'] = DF_SHAP[SVM_lin_c].mean(axis=1)
DF_SHAP['SVM_rbf_All mean'] = DF_SHAP[SVM_rbf_c].mean(axis=1)
# Row-wise standard deviation over the same seven "mean" columns
# (the per-index "... std" columns are not used here).
DF_SHAP['GB_All std'] = DF_SHAP[GB_c].std(axis=1)
DF_SHAP['LR_All std'] = DF_SHAP[LR_c].std(axis=1)
DF_SHAP['SVM_lin_All std'] = DF_SHAP[SVM_lin_c].std(axis=1)
DF_SHAP['SVM_rbf_All std'] = DF_SHAP[SVM_rbf_c].std(axis=1)

In [11]:
DF_SHAP.columns

Index(['GB0 mean', 'GB1 mean', 'GB2 mean', 'GB3 mean', 'GB4 mean', 'GB5 mean',
       'GB6 mean', 'LR0 mean', 'LR1 mean', 'LR2 mean', 'LR3 mean', 'LR4 mean',
       'LR5 mean', 'LR6 mean', 'SVM_lin0 mean', 'SVM_lin1 mean',
       'SVM_lin2 mean', 'SVM_lin3 mean', 'SVM_lin4 mean', 'SVM_lin5 mean',
       'SVM_lin6 mean', 'SVM_rbf0 mean', 'SVM_rbf1 mean', 'SVM_rbf2 mean',
       'SVM_rbf3 mean', 'SVM_rbf4 mean', 'SVM_rbf5 mean', 'SVM_rbf6 mean',
       'GB0 std', 'GB1 std', 'GB2 std', 'GB3 std', 'GB4 std', 'GB5 std',
       'GB6 std', 'LR0 std', 'LR1 std', 'LR2 std', 'LR3 std', 'LR4 std',
       'LR5 std', 'LR6 std', 'SVM_lin0 std', 'SVM_lin1 std', 'SVM_lin2 std',
       'SVM_lin3 std', 'SVM_lin4 std', 'SVM_lin5 std', 'SVM_lin6 std',
       'SVM_rbf0 std', 'SVM_rbf1 std', 'SVM_rbf2 std', 'SVM_rbf3 std',
       'SVM_rbf4 std', 'SVM_rbf5 std', 'SVM_rbf6 std', 'Col names', 'Modality',
       'Type', 'Lobe Region', 'Value', 'GB_All mean', 'LR_All mean',
       'SVM_lin_All mean', 'SVM_rbf_Al

#### sorted SHAP in SVM-rbf

In [16]:
# SVM rbf
# Group rows by Modality, Type, Lobe Region and Value (all ascending); within
# each group, order by 'SVM_rbf_All mean' from largest to smallest.
# sort_values returns a new frame, so DF_SHAP itself keeps its row order.
DF_SHAP2 = DF_SHAP.sort_values(by=['Modality','Type','Lobe Region','Value','SVM_rbf_All mean'],
                               ascending=[True,True,True,True,False])

In [17]:
DF_SHAP2

Unnamed: 0,GB0 mean,GB1 mean,GB2 mean,GB3 mean,GB4 mean,GB5 mean,GB6 mean,LR0 mean,LR1 mean,LR2 mean,...,Lobe Region,Value,GB_All mean,LR_All mean,SVM_lin_All mean,SVM_rbf_All mean,GB_All std,LR_All std,SVM_lin_All std,SVM_rbf_All std
703,0.055108,0.028941,0.054186,0.000422,0.024147,0.023451,0.027333,0.031980,0.003912,0.015147,...,DTI region,Average,0.030513,0.018289,0.020291,0.014091,0.019034,0.009884,0.012087,0.002053
702,0.011667,0.010451,0.024108,0.034824,0.008225,0.001343,0.016892,0.023520,0.036637,0.023922,...,DTI region,Average,0.015359,0.026227,0.025499,0.011401,0.011127,0.008767,0.007335,0.001315
700,0.012765,0.000000,0.012196,0.003176,0.001373,0.002020,0.006657,0.019049,0.021412,0.019186,...,DTI region,Average,0.005455,0.019769,0.020207,0.009944,0.005224,0.006587,0.007136,0.001582
669,0.012461,0.025696,0.005059,0.026598,0.002049,0.018206,0.006618,0.004294,0.026451,0.010853,...,DTI region,Average,0.013812,0.013245,0.014489,0.009814,0.009938,0.008304,0.006452,0.001385
711,0.014686,0.017078,0.002637,0.017500,0.008598,0.006196,0.014980,0.020294,0.027961,0.012049,...,DTI region,Average,0.011668,0.017836,0.019559,0.008720,0.005835,0.006683,0.007025,0.002434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Subcortical region,volume,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
102,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Subcortical region,volume,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
103,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Subcortical region,volume,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
105,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,Subcortical region,volume,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
DF = DF_SHAP2

In [14]:
# For each SVM-rbf index 0-6, build a list of [feature name, mean, std]
# entries from DF and order it by descending mean. The sort is stable, so
# entries with equal means keep the row order of DF.
rbf0 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf0 mean'], DF['SVM_rbf0 std'])),
              key=lambda entry: -entry[1])
rbf1 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf1 mean'], DF['SVM_rbf1 std'])),
              key=lambda entry: -entry[1])
rbf2 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf2 mean'], DF['SVM_rbf2 std'])),
              key=lambda entry: -entry[1])
rbf3 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf3 mean'], DF['SVM_rbf3 std'])),
              key=lambda entry: -entry[1])
rbf4 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf4 mean'], DF['SVM_rbf4 std'])),
              key=lambda entry: -entry[1])
rbf5 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf5 mean'], DF['SVM_rbf5 std'])),
              key=lambda entry: -entry[1])
rbf6 = sorted(map(list, zip(DF['Col names'], DF['SVM_rbf6 mean'], DF['SVM_rbf6 std'])),
              key=lambda entry: -entry[1])

In [15]:
# Split every ranked [name, mean, std] list into three parallel lists.
rbf0_name = [name for name, _mean, _std in rbf0]
rbf0_mean = [mean for _name, mean, _std in rbf0]
rbf0_std = [std for _name, _mean, std in rbf0]
rbf1_name = [name for name, _mean, _std in rbf1]
rbf1_mean = [mean for _name, mean, _std in rbf1]
rbf1_std = [std for _name, _mean, std in rbf1]
rbf2_name = [name for name, _mean, _std in rbf2]
rbf2_mean = [mean for _name, mean, _std in rbf2]
rbf2_std = [std for _name, _mean, std in rbf2]
rbf3_name = [name for name, _mean, _std in rbf3]
rbf3_mean = [mean for _name, mean, _std in rbf3]
rbf3_std = [std for _name, _mean, std in rbf3]
rbf4_name = [name for name, _mean, _std in rbf4]
rbf4_mean = [mean for _name, mean, _std in rbf4]
rbf4_std = [std for _name, _mean, std in rbf4]
rbf5_name = [name for name, _mean, _std in rbf5]
rbf5_mean = [mean for _name, mean, _std in rbf5]
rbf5_std = [std for _name, _mean, std in rbf5]
rbf6_name = [name for name, _mean, _std in rbf6]
rbf6_mean = [mean for _name, mean, _std in rbf6]
rbf6_std = [std for _name, _mean, std in rbf6]

In [38]:
# Side-by-side ranking table for the seven SVM-rbf indices: first the ranked
# feature names, then the matching means, then the matching stds.
Name = pd.DataFrame({
    # ranked feature names
    **{'SVM rbf0 name': rbf0_name, 'SVM rbf1 name': rbf1_name,
       'SVM rbf2 name': rbf2_name, 'SVM rbf3 name': rbf3_name,
       'SVM rbf4 name': rbf4_name, 'SVM rbf5 name': rbf5_name,
       'SVM rbf6 name': rbf6_name},
    # means in the same ranked order
    **{'sorted SVM rbf0 mean': rbf0_mean, 'sorted SVM rbf1 mean': rbf1_mean,
       'sorted SVM rbf2 mean': rbf2_mean, 'sorted SVM rbf3 mean': rbf3_mean,
       'sorted SVM rbf4 mean': rbf4_mean, 'sorted SVM rbf5 mean': rbf5_mean,
       'sorted SVM rbf6 mean': rbf6_mean},
    # stds in the same ranked order
    **{'sorted SVM rbf0 std': rbf0_std, 'sorted SVM rbf1 std': rbf1_std,
       'sorted SVM rbf2 std': rbf2_std, 'sorted SVM rbf3 std': rbf3_std,
       'sorted SVM rbf4 std': rbf4_std, 'sorted SVM rbf5 std': rbf5_std,
       'sorted SVM rbf6 std': rbf6_std},
})

In [39]:
# Display the assembled table: names, sorted means and sorted stds for rbf0 .. rbf6.
Name

Unnamed: 0,SVM rbf0 name,SVM rbf1 name,SVM rbf2 name,SVM rbf3 name,SVM rbf4 name,SVM rbf5 name,SVM rbf6 name,sorted SVM rbf0 mean,sorted SVM rbf1 mean,sorted SVM rbf2 mean,...,sorted SVM rbf4 mean,sorted SVM rbf5 mean,sorted SVM rbf6 mean,sorted SVM rbf0 std,sorted SVM rbf1 std,sorted SVM rbf2 std,sorted SVM rbf3 std,sorted SVM rbf4 std,sorted SVM rbf5 std,sorted SVM rbf6 std
0,T1w_subcor_Right-Amygdala_volume,T1w_subcor_CC_Anterior_volume,T1w_cor_paracentral-rh-thicknessstd,T1w_subcor_Left-Inf-Lat-Vent_mean,T1w_subcor_Brain-Stem_mean,T1w_subcor_CC_Anterior_mean,T1w_subcor_Right-Inf-Lat-Vent_mean,0.017480,0.016343,0.016510,...,0.017892,0.018549,0.018098,0.024026,0.022702,0.024666,0.022756,0.029348,0.026813,0.024646
1,T1w_subcor_Right-Hippocampus_mean,T1w_subcor_Right-Cerebellum-Cortex_volume,T1w_cor_rostralanteriorcingulate-rh-thickness,T1w_subcor_Right-Inf-Lat-Vent_mean,T1w_cor_parahippocampal-lh-volume,T1w_cor_lateraloccipital-rh-thickness,T1w_cor_entorhinal-rh-meancurv,0.015196,0.015706,0.015304,...,0.017353,0.018118,0.015647,0.020027,0.021490,0.021876,0.023760,0.026730,0.024739,0.021854
2,T1w_cor_lateraloccipital-rh-meancurv,DTI_SCC_Average,T1w_subcor_CC_Anterior_volume,DTI_SCC_Average,T1w_cor_lateraloccipital-rh-meancurv,T1w_subcor_CC_Central_mean,T1w_subcor_Right-Hippocampus_mean,0.014843,0.014706,0.015265,...,0.017235,0.017304,0.015069,0.019325,0.022761,0.022255,0.023929,0.024377,0.022457,0.022298
3,T1w_subcor_Brain-Stem_mean,T1w_subcor_Left-Cerebellum-Cortex_volume,T1w_cor_pericalcarine-lh-meancurv,T1w_cor_rostralanteriorcingulate-rh-thickness,DTI_SCC_Average,T1w_cor_lateralorbitofrontal-lh-meancurv,T1w_cor_caudalmiddlefrontal-rh-foldind,0.014275,0.014196,0.014980,...,0.016431,0.016912,0.013569,0.019293,0.019680,0.021650,0.021688,0.021567,0.022779,0.020617
4,T1w_cor_transversetemporal-lh-thicknessstd,T1w_cor_lingual-lh-thicknessstd,T1w_subcor_Right-Amygdala_volume,T1w_subcor_Left-choroid-plexus_volume,T1w_cor_transversetemporal-lh-thicknessstd,T1w_cor_parahippocampal-rh-area,T1w_subcor_Left-Hippocampus_mean,0.014186,0.013314,0.014824,...,0.015980,0.016451,0.013510,0.019598,0.017946,0.019753,0.020583,0.023639,0.023653,0.018880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,T1w_subcor_5th-Ventricle_mean,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
715,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,T1w_subcor_Left-WM-hypointensities_mean,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
716,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,T1w_subcor_Right-WM-hypointensities_mean,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
717,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,T1w_subcor_Left-non-WM-hypointensities_mean,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [40]:
# Export is disabled: uncomment the line below to write the `Name` table to CSV
# (no index column is written).
# Name.to_csv("/ritter/share/data/IMAGEN/posthoc/IMAGEN_SHAP_SVM_rbf.csv", index=None)

### 10. Save the Summary Statistics

#### load the dataset

In [41]:
# HDF5: table returned by posthoc.read_HDF5 for 'all_Binge.csv'
# (presumably the file produced by set_HDF5('Binge', save=True) — TODO confirm).
HDF5 = posthoc.read_HDF5('all_Binge.csv')
# INSTRUMENT: table returned by posthoc.read_INSTRUMENT for 'IMAGEN_INSTRUMENT.csv'.
INST = posthoc.read_INSTRUMENT('IMAGEN_INSTRUMENT.csv')

In [42]:
# FU3: keep only the rows of each table whose Session is 'FU3'
# (get_group raises KeyError if a table has no such session).
HDF5_FU3 = HDF5.groupby(by='Session').get_group('FU3')
INST_FU3 = INST.groupby(by='Session').get_group('FU3')
# Left join on (ID, Session): every HDF5 row is kept, instrument columns are
# NaN where no matching instrument row exists.
SS_FU3 = HDF5_FU3.merge(INST_FU3, how='left', on=['ID', 'Session'])

In [43]:
# Print the merged FU3 frame's column names, non-null counts and dtypes.
SS_FU3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 752 entries, 0 to 751
Data columns (total 72 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   ID                                           752 non-null    int64  
 1   Session                                      752 non-null    object 
 2   y                                            752 non-null    object 
 3   Dataset                                      752 non-null    object 
 4   Sex                                          752 non-null    object 
 5   Site                                         752 non-null    object 
 6   Class                                        752 non-null    object 
 7   Family valence                               749 non-null    float64
 8   Accident valence                             749 non-null    float64
 9   Sexuality valence                            749 non-null    float64
 10  Au

In [44]:
# Keep every column of SS_FU3 except the two at positions 66 and 70.
# NOTE(review): the selection is positional, so it silently picks different
# columns if the merged frame's column order changes — confirm which two
# columns are meant to be dropped and consider selecting them by name.
SS_FU3_Col = [
    column
    for position, column in enumerate(SS_FU3.columns)
    if position not in (66, 70)
]

In [45]:
# Restrict the merged FU3 frame to the retained columns, then display it.
SS = SS_FU3[SS_FU3_Col]
SS

Unnamed: 0,ID,Session,y,Dataset,Sex,Site,Class,Family valence,Accident valence,Sexuality valence,...,Injury mean,Negotiation mean,Psychological Aggression mean,Sexual Coercion mean,Positive Affect Score,Negative Affect Score,MAST total,MAST Alcohol dependency symptoms,MAST sum,FTND Sum
0,112288,FU3,Binge,Training,Female,Berlin,AAM,0.2,1.25,0.428571,...,0.125,13.833333,0.0000,0.0,42.0,15.0,14.0,2.0,12.0,1.0
1,215284,FU3,Binge,Training,Male,Dresden,HC,-0.4,0.50,-0.428571,...,0.000,0.000000,0.0000,0.0,14.0,13.0,7.0,1.0,6.0,0.0
2,297685,FU3,Binge,Training,Male,Dresden,AAM,-1.0,-1.00,-0.142857,...,0.000,4.750000,1.1250,0.0,25.0,13.0,14.0,2.0,12.0,0.0
3,308867,FU3,Binge,Training,Female,Dresden,AAM,-1.0,0.75,-0.714286,...,0.000,2.333333,0.0000,0.0,35.0,21.0,12.0,2.0,10.0,0.0
4,469693,FU3,Binge,Training,Male,London,AAM,-1.2,0.75,-0.285714,...,0.000,12.333333,0.0000,0.0,22.0,23.0,11.0,1.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,95957207,FU3,Binge,Holdout,Female,Dublin,HC,,,,...,,,,,,,,,,0.0
748,97739048,FU3,Binge,Holdout,Male,Berlin,HC,-0.2,0.75,0.142857,...,0.000,7.000000,0.5625,0.0,39.0,15.0,7.0,1.0,6.0,0.0
749,99217838,FU3,Binge,Holdout,Female,Nottingham,AAM,,,,...,0.000,15.833333,0.0000,0.0,,,0.0,0.0,0.0,0.0
750,99677574,FU3,Binge,Holdout,Male,Paris,HC,-0.8,0.00,0.000000,...,0.000,3.666667,0.2500,0.0,22.0,30.0,,,,0.0


In [46]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of SS.
SS.describe()

Unnamed: 0,ID,Family valence,Accident valence,Sexuality valence,Autonomy valence,Devience valence,Relocation valence,Distress valence,Noscale valence,Overall valence,...,Injury mean,Negotiation mean,Psychological Aggression mean,Sexual Coercion mean,Positive Affect Score,Negative Affect Score,MAST total,MAST Alcohol dependency symptoms,MAST sum,FTND Sum
count,752.0,749.0,749.0,749.0,749.0,749.0,749.0,749.0,749.0,749.0,...,725.0,725.0,725.0,725.0,745.0,745.0,539.0,539.0,539.0,752.0
mean,49390500.0,-0.411482,0.542724,-0.238604,-0.241489,-0.82777,-0.052514,-0.217623,0.688474,-0.137688,...,0.022414,7.098161,0.919828,0.375,26.957047,20.522148,11.020408,1.818182,9.202226,0.422872
std,28505040.0,0.411371,0.418303,0.354788,0.308481,0.40771,0.463365,0.363445,0.607904,0.222015,...,0.162675,5.787,1.553866,1.168372,7.666312,6.682496,5.353798,1.342875,4.479395,1.21329
min,112288.0,-2.0,-2.0,-1.571429,-2.0,-2.0,-2.0,-1.666667,-2.0,-1.871795,...,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0
25%,25503600.0,-0.6,0.25,-0.428571,-0.5,-1.0,-0.333333,-0.5,0.333333,-0.25641,...,0.0,2.666667,0.0,0.0,21.0,15.0,9.0,1.0,8.0,0.0
50%,47904910.0,-0.4,0.5,-0.285714,-0.25,-1.0,0.0,-0.166667,0.666667,-0.128205,...,0.0,5.666667,0.3125,0.0,26.0,20.0,11.0,2.0,10.0,0.0
75%,74123480.0,-0.2,0.75,0.0,0.0,-0.666667,0.333333,0.0,1.0,0.0,...,0.0,10.5,1.1875,0.0625,32.0,26.0,14.0,3.0,11.0,0.0
max,99930020.0,1.0,1.5,1.0,1.0,1.333333,1.333333,1.333333,2.0,0.948718,...,2.3125,25.0,11.1875,10.9375,50.0,40.0,42.0,6.0,39.0,7.0


In [47]:
# Export is disabled: uncomment the lines below to write SS to CSV under
# DATA_DIR/posthoc, creating that directory first if it does not exist.
# save_path = f"{DATA_DIR}/posthoc/IMAGEN_Binge_FU3_SS_ver02.csv"
# if not os.path.isdir(os.path.dirname(save_path)):
#     os.makedirs(os.path.dirname(save_path))
# SS.to_csv(save_path, index=None)