# Data Wrangling Runs

In this notebook we'll write some code that will tell us about the distribution of data in different runs across the project.

In [1]:
dataset_path = '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC'

In [2]:
import glob
fnames = glob.glob('{}/*/*/*/*.nii.gz'.format(dataset_path))

In [3]:
len(fnames)

2714

In [4]:
print(fnames[:20])

['/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_rec-refaced_run-1_T1w.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/func/sub-10703_ses-1_task-rest_run-1_bold.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/dwi/sub-10703_ses-1_run-1_dwi.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_rec-refaced_run-1_T1w.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/func/sub-10703_ses-2_task-rest_run-1_bold.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/dwi/sub-10703_ses-2_run-1_dwi.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_rec-refaced_run-1_T1w.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/func/sub-10753_ses-1_task-rest_run-1_bold.nii.gz', '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/dwi/sub-10753_ses-

In [5]:
runs = [x for x in fnames if 'run' in x]

In [6]:
len(runs)

2714

In [5]:
runs[:20]

['/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/func/sub-10703_ses-1_task-rest_run-1_bold.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/dwi/sub-10703_ses-1_run-1_dwi.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/func/sub-10703_ses-2_task-rest_run-1_bold.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/dwi/sub-10703_ses-2_run-1_dwi.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_rec-refaced_run-1_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/func/sub-10753_ses-1_task-rest_run-1_bold.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/dwi/sub-10

In [6]:
import pandas as pd

df = pd.DataFrame(runs, columns=['source'])

In [7]:
def parse_slashes(source, ix):
    
    fields = source.split('/')
    
    return fields[ix]

In [8]:
parse_slashes('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_rec-refaced_run-1_T1w.nii.gz', 8)

'ses-1'

In [9]:
df

Unnamed: 0,source
0,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
1,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
2,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
3,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
4,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
...,...
2709,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
2710,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
2711,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...
2712,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...


In [52]:
df['session'] = df.apply(lambda row: parse_slashes(row['source'], 8), axis=1)
df['subject'] = df.apply(lambda row: parse_slashes(row['source'], 7), axis=1)
df['folder'] = df.apply(lambda row: parse_slashes(row['source'], 9), axis=1)

In [53]:
df

Unnamed: 0,source,session,folder,run,entity,task,subject
0,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-1,anat,run-1,T1w,,sub-10703
1,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-1,func,run-1,bold,rest,sub-10703
2,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-1,dwi,run-1,dwi,,sub-10703
3,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-2,anat,run-1,T1w,,sub-10703
4,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-2,func,run-1,bold,rest,sub-10703
...,...,...,...,...,...,...,...
2709,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-1,func,run-1,bold,rest,sub-20082
2710,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-1,dwi,run-1,dwi,,sub-20082
2711,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-2,anat,run-1,T1w,,sub-20082
2712,/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/H...,ses-2,func,run-1,bold,rest,sub-20082


In [12]:
def parse_run(source):
    
    run_index = source.index('run')
    if run_index:
        run_end = run_index + 5
        return source[run_index:run_end]
    else:
        return None

In [13]:
parse_run('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_rec-refaced_run-1_T1w.nii.gz')

'run-1'

In [14]:
df['run'] = df.apply(lambda row: parse_run(row['source']), axis=1)

In [33]:
def parse_entity(source):
    
    entity_end = source.index('.nii')
    entity_start = entity_end
    underscore = False
    
    while not underscore:
        
        if source[entity_start] == '_':
            break
        else:
            entity_start -= 1

    if entity_start <= entity_end:
        return source[entity_start+1:entity_end]
    else:
        return None

In [34]:
parse_entity('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/func/sub-20294_ses-1_task-rest_run-1_bold.nii.gz')

'bold'

In [35]:
df['entity'] = df.apply(lambda row: parse_entity(row['source']), axis=1)

In [45]:
def parse_task(source):
    
    try:
        task_start = source.index('task')
        task_end = task_start
        underscore = False
    
        while not underscore:

            if source[task_end] == '_':
                break
            else:
                task_end += 1

        if task_start <= task_end:
            return source[task_start+5:task_end]
        else:
            return None
    except:
        return None

In [46]:
parse_task('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/func/sub-20294_ses-1_task-rest_run-1_bold.nii.gz')

'rest'

In [47]:
parse_task('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/dwi/sub-20294_ses-1_run-1_dwi.nii.gz')

In [48]:
df['task'] = df.apply(lambda row: parse_task(row['source']), axis=1)

In [49]:
set(df['run'].values.tolist())

{'run-1', 'run-2', 'run-3'}

In [55]:
import qgrid
qgrid_widget = qgrid.show_grid(df, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [56]:
df.to_csv('hrc_entities.csv', index=False)