# Prepare the full ABIDE pheno dataset
This includes:

- adding the motion parameters from niak preprocessing
- adding the corrected ADOS severity scores from Budha
- removing any duplicates
- removing any entries for which there are no files on disk
- replacing any -9999 or -999 values with NaN

In [1]:
import os
import re
import glob
import numpy as np
import pandas as pd
from scipy import io as sio

In [2]:
# Locations used throughout this notebook
# -- output: where the merged phenotypic table is written
out_path = '/data1/abide/Pheno/full_merged_pheno.csv'
# -- tabular inputs: phenotypic table, motion QC table, ADOS .mat file
pheno_path = '/data1/abide/Pheno/Phenotypic_V1_0b.csv'
motion_path = '/data1/abide/Pheno/qc_motion_all_abide.csv'
ados_path = '/data1/abide/Pheno/ADOS_Budha.mat'
# -- imaging inputs: directory of stability maps and the glob pattern for
#    one subject's file (the placeholder takes the zero-padded, 7-digit
#    integer subject ID)
data_path = '/data1/abide/Out/Scores/sc07/time/stability_maps'
data_template = '*_fmri_{:07d}_session_1_run1_stability_maps.nii.gz'

In [3]:
# Prepare the motion data
motion = pd.read_csv(motion_path)
# Rename the first column (the subject/run identifier) to SUB_ID.
# rename() is used instead of writing into motion.columns.values, which
# mutates the Index's backing array in place.
motion = motion.rename(columns={motion.columns[0]: 'SUB_ID'})
# Some subjects have additional runs, but we only care about the first run.
# astype(str) keeps a missing identifier from raising; it simply won't match.
is_first_run = motion['SUB_ID'].astype(str).str.contains('_session_1_run1',
                                                         regex=False)
# Take an explicit copy so the column assignment below acts on an
# independent frame rather than on a slice of the original table
motion = motion[is_first_run.values].copy()
# Of the remaining cases, reduce the identifier to the integer subject ID:
# the run of digits that directly follows an 'X'
sub_ids = motion['SUB_ID'].astype(str).str.extract(r'(?<=X)([0-9]+)',
                                                   expand=False)
if sub_ids.isnull().any():
    # Fail with the offending identifiers instead of an opaque error
    raise ValueError('Could not parse a subject ID from: {}'.format(
        motion.loc[sub_ids.isnull().values, 'SUB_ID'].tolist()))
motion['SUB_ID'] = sub_ids.astype(int)

In [4]:
# Read the phenotypic table and inner-join it with the motion table on
# SUB_ID, so that only subjects present in both tables are kept
pheno = pd.merge(pd.read_csv(pheno_path), motion, how='inner', on='SUB_ID')

In [5]:
# Remove anything that doesn't point to an existing file.
# For every subject, glob for its stability map inside data_path; a subject
# is kept when at least one file matches the pattern.
# The mask is an explicit boolean array: a plain (possibly empty) list would
# be read as a column selection by pheno[...] when there are no rows.
# Only the SUB_ID column is iterated, since nothing else of the row is used.
is_there = np.array(
    [bool(glob.glob(os.path.join(data_path,
                                 data_template.format(int(sub_id)))))
     for sub_id in pheno['SUB_ID']],
    dtype=bool)
# Copy so that columns added later are written to an independent frame
pheno = pheno[is_there].copy()

In [6]:
# Load the ADOS file
tmp = sio.loadmat(ados_path)
# Flatten the three arrays to 1-D; they are paired up element by element
# below (assumes all three have the same length - TODO confirm)
ados_sc = np.array(tmp['ados_social_comm_severity']).flatten()
ados_rb = np.array(tmp['ados_stereo_behav_severity']).flatten()
ados_sub = np.array(tmp['subj_ID']).flatten()
# pheno may be a filtered view of an earlier frame; work on an explicit copy
pheno = pheno.copy()
# Add the new columns to the pheno file, NaN for subjects without ADOS data
pheno['ADOS_SOCOM_SEV'] = np.nan
pheno['ADOS_STBEH_SEV'] = np.nan
# Add the ados values to pheno.
# Rows are selected with a boolean mask through .loc: the removed .ix
# indexer treated the positions returned by np.where as index *labels*,
# which no longer agree with positions once rows have been filtered out.
for sub, sc_val, rb_val in zip(ados_sub, ados_sc, ados_rb):
    is_sub = (pheno['SUB_ID'] == sub).values
    pheno.loc[is_sub, 'ADOS_SOCOM_SEV'] = sc_val
    pheno.loc[is_sub, 'ADOS_STBEH_SEV'] = rb_val

In [7]:
# Replace missing values with nan.
# The sentinels are listed both as numbers and as strings: columns parsed as
# numeric hold -9999 / -999 as numbers, which a string-only replacement
# never matches, while text columns hold them as strings.
pheno = pheno.replace([-9999, -999, '-9999', '-999'], np.nan)

In [8]:
# Write the merged table to disk, leaving out the row index
pheno.to_csv(path_or_buf=out_path, index=False)