In [1]:
from glob import glob
import os
import pandas as pd

import requests
import parse

In [3]:
# List the working directory to see which inputs are present (bare `ls` relies on IPython's automagic).
ls

clean-csv/               fcp-indi.gz              process-csv.ipynb
example_repository/      fcp-info.ipynb           transform/
fcp-indi/                load/
fcp-indi-new.gz          process-clean-csv.ipynb


In [8]:
# Collect the CSV listings in the working directory, in sorted order, leaving
# out any file whose name contains 'clean', 'Brain' or 'ABIDE'.
excluded_keys = ['clean', 'Brain', 'ABIDE']
csv_files = []
for candidate in sorted(glob('*.csv')):
    if all(key not in candidate for key in excluded_keys):
        csv_files.append(candidate)
csv_files

['ACPI.csv',
 'ADHD200.csv',
 'CORR.csv',
 'HypnosisBarrios.csv',
 'RocklandSample.csv']

In [4]:
def process_csv(filename):
    """Reduce a raw ``url,id`` listing to the rows worth keeping.

    The file is read without a header row as two string columns, ``url`` and
    ``id``, and its name and shape are printed. Rows are grouped by ``id``;
    for each group:

    * if any URL contains ``'T1.mgz'``, only those URLs are kept, each with
      an ``annot`` URL formed by replacing ``'T1.mgz'`` with ``'aseg.mgz'``;
    * otherwise every URL of the group is kept with ``annot`` set to NaN.

    Parameters
    ----------
    filename : str
        Path to a header-less CSV whose two columns are ``url`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``annot`` and ``id`` (always present, even when no
        row survives), one row per kept URL, in ``groupby`` order of ``id``.
    """
    columns = ['url', 'annot', 'id']
    df = pd.read_csv(filename, header=None, names=['url', 'id'], dtype=str)
    print(filename, df.shape)
    info = []
    for group_id, group in df.groupby('id'):
        urls = group['url'].values
        t1_urls = [val for val in urls if 'T1.mgz' in val]
        if t1_urls:
            for t1_url in t1_urls:
                annot = t1_url.replace('T1.mgz', 'aseg.mgz')
                info.append(dict(url=t1_url, annot=annot, id=group_id))
        else:
            # The `pd.np` alias no longer exists in pandas >= 2.0; a plain
            # float NaN yields the same missing-value marker without it.
            for val in urls:
                info.append(dict(url=val, annot=float('nan'), id=group_id))
    # Passing `columns` keeps the schema stable when `info` is empty, so
    # callers selecting ['url', 'id'] do not hit a KeyError.
    return pd.DataFrame(info, columns=columns)

In [9]:
# Clean every raw listing, write a header-less "<name>-clean.csv" holding only
# the url and id columns, and stack the cleaned frames into `all_df`.
frames = []
for raw_name in csv_files:
    cleaned = process_csv(raw_name)
    clean_name = raw_name.replace('.csv', '-clean.csv')
    cleaned[['url', 'id']].to_csv(clean_name, index=False, header=False)
    print(clean_name, cleaned.shape)
    frames.append(cleaned)
all_df = pd.concat(frames)
all_df.shape

ACPI.csv (129, 2)
ACPI-clean.csv (129, 3)
ADHD200.csv (1927, 2)
ADHD200-clean.csv (973, 3)
CORR.csv (3203, 2)
CORR-clean.csv (3153, 3)
HypnosisBarrios.csv (10, 2)
HypnosisBarrios-clean.csv (10, 3)
RocklandSample.csv (532, 2)
RocklandSample-clean.csv (532, 3)


(4797, 3)

In [106]:
# Load the combined listing, reading every column as a string.
# NOTE(review): no cell shown here writes 'clean-csv/all.csv' — confirm how it is produced.
# NOTE(review): the name `csv` shadows the standard-library `csv` module from here on.
csv = pd.read_csv('clean-csv/all.csv', dtype=str)

In [107]:
# Path templates, relative to the bucket root, with named `{...}` fields for
# the variable path components. A later cell prefixes each one with the bucket
# URL to build `url_patterns`, which `get_session` matches via `parse.parse`.
# Only templates containing a `{session_id}` field can supply a session;
# `get_session` falls back to '1' otherwise.
# NOTE(review): `{study:w}` uses parse's `w` format type (letters, digits,
# underscore per the parse docs) — confirm that restriction is intended.
patterns = ['data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/mprage.nii.gz',
            'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{site_suffix}_{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/surfaces/freesurfer/5.3/{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/IBA_TRT/freesurfer_gpu/{subj_id}-session_{session_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/IBA_TRT/freesurfer/{subj_id}-session_{session_id}/mri/T1.mgz',
            'data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/anat.nii.gz',
            'data/Projects/{study:w}/sub-{subj_id}/ses-{session_id}/anat/sub-{subj_id}_ses-{session_id}_T1w.nii.gz',
            'data/Projects/{study:w}/RawData/{subj_id}/{session_id}/{deface_id}/{deface_id2}_defaced.nii.gz',
            'data/Projects/INDI/{study}/RawData/sub-{subj_id}/anat/sub-{subj_id}_T1w.nii.gz',
           ]


In [108]:
# URL template used to pull the project name out of a T1 URL: `project_id`
# captures the path segment after ".../data/Projects/", `frag` the remainder.
project_id = "https://s3.amazonaws.com/fcp-indi/data/Projects/{project_id}/{frag}"


def get_project(string):
    """Return the lower-cased project name embedded in a T1 URL.

    Parameters
    ----------
    string : str
        URL expected to match the module-level ``project_id`` template.

    Returns
    -------
    str
        The ``project_id`` field captured by the template, lower-cased.

    Raises
    ------
    ValueError
        If ``string`` does not match the template. ``parse.parse`` returns
        ``None`` in that case, which used to surface as an opaque
        ``AttributeError`` on ``None.named``.
    """
    parsed = parse.parse(project_id, string)
    if parsed is None:
        raise ValueError('URL does not match the project template: %r' % (string,))
    return parsed.named['project_id'].lower()


csv['project'] = csv.T1url.apply(get_project)
csv.head()

Unnamed: 0,T1url,id,project
0,https://s3.amazonaws.com/fcp-indi/data/Project...,28031,acpi
1,https://s3.amazonaws.com/fcp-indi/data/Project...,28032,acpi
2,https://s3.amazonaws.com/fcp-indi/data/Project...,28033,acpi
3,https://s3.amazonaws.com/fcp-indi/data/Project...,28034,acpi
4,https://s3.amazonaws.com/fcp-indi/data/Project...,28035,acpi


In [109]:
# Turn each relative path template into a full URL template by prepending the bucket root.
url_patterns = []
for relative_pattern in patterns:
    url_patterns.append("https://s3.amazonaws.com/fcp-indi/" + relative_pattern)

In [110]:
def get_session(url):
    """Extract the session identifier from an image URL.

    Every template in ``url_patterns`` is tried against ``url``. A template
    contributes only when it matches *and* captures a non-empty
    ``session_id`` field; if several do, the last one wins, as before.

    Previously any later template that matched without a ``session_id``
    field reset the result to ``None``, discarding a session already found
    by an earlier template. Such matches are now ignored.

    Parameters
    ----------
    url : str
        Full URL of an image.

    Returns
    -------
    str
        The captured session identifier, or ``'1'`` when no template
        supplies one.
    """
    session = None
    for pattern in url_patterns:
        parsed = parse.parse(pattern, url)
        if not parsed:
            continue
        candidate = parsed.named.get('session_id')
        if candidate:
            # Only a real capture may replace what has been found so far.
            session = candidate
    return session if session else '1'

In [111]:
# Derive a session label for every T1 URL, then show the distinct labels as a sanity check.
sessions = csv['T1url'].apply(get_session)
sessions.unique()

array(['1', '2', '3', '10', '4', '5', '6', '7', '8', '9', 'clg_4',
       'clg_4R', 'dsc_2', 'nfb_3', 'nfb_2', 'clg_2R', 'clg_2'], dtype=object)

In [112]:
# Attach the derived session labels to the listing as a new column.
csv['session_id'] = sessions

In [97]:
# Group the listing rows by (project, id).
# NOTE(review): this cell's execution count (97) is lower than that of the cell
# loading `csv` (106), so `ids` may have been built from an earlier `csv`
# object — re-run the notebook in order to confirm.
ids = csv.groupby(by=['project', 'id'])

In [131]:
# Number of session_id entries in each (project, id) group.
count = ids['session_id'].count()
# Empty Series that will hold one session count per row of `csv`.
count_series = pd.Series()

In [132]:
# Per-row session count: how many rows share this row's (project, id) pair and
# have a session_id. `transform('count')` returns the group count aligned to
# `csv`'s index in one vectorised step, replacing the `iterrows` loop that
# grew the Series one element at a time.
count_series = csv.groupby(['project', 'id'])['session_id'].transform('count')

In [135]:
# Store the per-row session counts as a new column (assignment aligns on the row index).
csv['session_count'] = count_series

In [138]:
# Give the columns their final names by mapping old names to new ones instead
# of assigning a positional list, so a change in column order or an extra
# column cannot silently mislabel the data. The selection pins the output
# order and raises a KeyError if an expected column is missing.
csv = csv.rename(columns={'T1url': 't1_url', 'id': 'participant_id'})
csv = csv[['t1_url', 'participant_id', 'project', 'session_id', 'session_count']]

In [139]:
# Write the annotated listing without the row index. `to_csv` documents
# `index` as a bool, so pass False explicitly rather than None.
csv.to_csv('clean-csv/all-session.csv', index=False)