In [1]:
import os
import glob
import re
import pandas as pd

In [2]:
import numpy as np

OK, maybe for each subject and wave we check:
 - the number of appropriate input files
 - the number of appropriate output files

In [3]:
file_dir = '/gpfs/projects/sanlab/shared/DEV/bids_data/derivatives/fmriprep_2022/'

In [4]:
#iterate through subjects
subj_folder_list = [p.replace(file_dir,"") for p in glob.glob(file_dir + '/sub-DEV*/',recursive=False)]
subj_folder_list.sort()
output_dict_list = []

#count out the number of files for each task in (1) unsmoothed input and (2) smoothed output
task_list = ["ROC","WTP","SST"]
task_correct_acq_set = {
    "ROC":{1,2,3,4},"WTP":{1,2,3,4},"SST":{1}
}

for subj_folder in subj_folder_list:
    #for each subject count the smoothed files by wave and category
    subj_fullpath = file_dir + subj_folder
    wave_list = [p.replace(subj_fullpath,"") for p in glob.glob(subj_fullpath + "ses-wave*")]
    
    #iterate through waves - in most cases, just two
    for wave in wave_list:
        subj_wave_dict = {}
        subj_wave_dict['subj'] = subj_folder
        subj_wave_dict['wave'] = wave
        subj_wave_fullpath = subj_fullpath + wave
        
        #raw_input_list = [p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/sub*bold_space-MNI152NLin2009cAsym_preproc*")]
        #smoothed_output_list = [p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/s6_sub*")]
        

        task_counts_input = {}
        task_counts_smoothed = {}
        task_diff = {}
        
        for task in task_list:
            #check for input matching the canonical number of acqs
            #we should have precisely four, with the values 1, 2, 3, 4 as acquisition groups
            #except for SST which has a smaller dictionary
            
            #match on just the regularly named files
            glob_key = subj_wave_fullpath + "/func/sub*" + task + "*_acq-?_space-MNI152NLin2009cAsym_desc-preproc_bold*"
            task_input_files_regular = [p.replace(subj_wave_fullpath,"") for p in glob.glob(glob_key)]
            acq_groups_input = set([int(re.search("acq-(\d)",fn).group(1)) for fn in task_input_files_regular])
            subj_wave_dict[task + "_missing_input_acq_bool"] = acq_groups_input!=task_correct_acq_set[task]
            subj_wave_dict[task + "_missing_input_acq"] = task_correct_acq_set[task].difference(acq_groups_input)

            #allow matches on irregularly named files
            task_input_files_all = [p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/sub*" + task + "*_acq-*_space-MNI152NLin2009cAsym_desc-preproc_bold*")]
            acq_groups_input_all = set([int(re.search("acq-(\d)",fn).group(1)) for fn in task_input_files_all])
            subj_wave_dict[task + "_missing_entirely_input_acq_bool"] = acq_groups_input_all!=task_correct_acq_set[task]
            subj_wave_dict[task + "_missing_entirely_input_acq"] = task_correct_acq_set[task].difference(acq_groups_input_all)


            #check for output matching the input
            task_output_files = [p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/s6_sub*" + task + "*_acq-?_space-MNI152NLin2009cAsym_desc-preproc_bold*")]
            acq_groups_output = set([int(re.search("acq-(\d)",fn).group(1)) for fn in task_output_files])
            subj_wave_dict[task + "_missing_output_acq_bool"] = acq_groups_output!=task_correct_acq_set[task]
            subj_wave_dict[task + "_missing_output_acq"] = task_correct_acq_set[task].difference(acq_groups_output)
            

        #subj_wave_dict["any_missing_output"] = sum(task_counts_input.values()) - sum(task_counts_smoothed.values())
        subj_wave_dict["any_missing_input_acq_bool"] = any([subj_wave_dict[t + "_missing_input_acq_bool"] for t in task_list])
        subj_wave_dict["any_missing_entirely_input_acq_bool"] = any([subj_wave_dict[t + "_missing_entirely_input_acq_bool"] for t in task_list])
        subj_wave_dict["any_missing_output_acq_bool"] = any([subj_wave_dict[t + "_missing_output_acq_bool"] for t in task_list])
        #print details if not all input files were successfully smoothed
        if sum(task_counts_smoothed.values())<sum(task_counts_input.values()):
            print(subj_folder + ", " + wave)
            print("input:")
            print(task_counts_input)
            print("smoothed:")
            print(task_counts_smoothed)
            print(subj_wave_dict["any_missing_acq_bool"])

        #list specifically which "aacq"
            
        output_dict_list = output_dict_list + [subj_wave_dict]

        
missing_report_df = pd.DataFrame(output_dict_list)

In [5]:
subj_wave_dict['ROC_missing_input_acq']

set()

In [6]:
missing_report_df.loc[210,:]

subj                                   sub-DEV116/
wave                                     ses-wave1
ROC_missing_input_acq_bool                   False
ROC_missing_input_acq                           {}
ROC_missing_entirely_input_acq_bool          False
ROC_missing_entirely_input_acq                  {}
ROC_missing_output_acq_bool                  False
ROC_missing_output_acq                          {}
WTP_missing_input_acq_bool                   False
WTP_missing_input_acq                           {}
WTP_missing_entirely_input_acq_bool          False
WTP_missing_entirely_input_acq                  {}
WTP_missing_output_acq_bool                  False
WTP_missing_output_acq                          {}
SST_missing_input_acq_bool                   False
SST_missing_input_acq                           {}
SST_missing_entirely_input_acq_bool          False
SST_missing_entirely_input_acq                  {}
SST_missing_output_acq_bool                  False
SST_missing_output_acq         

In [7]:
missing_report_df.loc[214,:]

subj                                   sub-DEV118/
wave                                     ses-wave1
ROC_missing_input_acq_bool                   False
ROC_missing_input_acq                           {}
ROC_missing_entirely_input_acq_bool          False
ROC_missing_entirely_input_acq                  {}
ROC_missing_output_acq_bool                  False
ROC_missing_output_acq                          {}
WTP_missing_input_acq_bool                    True
WTP_missing_input_acq                          {3}
WTP_missing_entirely_input_acq_bool           True
WTP_missing_entirely_input_acq                 {3}
WTP_missing_output_acq_bool                   True
WTP_missing_output_acq                      {3, 4}
SST_missing_input_acq_bool                   False
SST_missing_input_acq                           {}
SST_missing_entirely_input_acq_bool          False
SST_missing_entirely_input_acq                  {}
SST_missing_output_acq_bool                  False
SST_missing_output_acq         

In [8]:
missing_report_df.any_missing_output_acq_bool.value_counts()

False    517
True      27
Name: any_missing_output_acq_bool, dtype: int64

In [9]:
missing_report_df.loc[missing_report_df.any_missing_output_acq_bool>0,:]

Unnamed: 0,subj,wave,ROC_missing_input_acq_bool,ROC_missing_input_acq,ROC_missing_entirely_input_acq_bool,ROC_missing_entirely_input_acq,ROC_missing_output_acq_bool,ROC_missing_output_acq,WTP_missing_input_acq_bool,WTP_missing_input_acq,...,WTP_missing_output_acq,SST_missing_input_acq_bool,SST_missing_input_acq,SST_missing_entirely_input_acq_bool,SST_missing_entirely_input_acq,SST_missing_output_acq_bool,SST_missing_output_acq,any_missing_input_acq_bool,any_missing_entirely_input_acq_bool,any_missing_output_acq_bool
1,sub-DEV001/,ses-wave2,False,{},False,{},False,{},False,{},...,{},True,{1},True,{1},True,{1},True,True,True
7,sub-DEV007/,ses-wave1,False,{},False,{},False,{},True,"{1, 2, 3, 4}",...,"{1, 2, 3, 4}",False,{},False,{},False,{},True,True,True
33,sub-DEV020/,ses-wave2,True,"{3, 4}",True,"{3, 4}",True,"{3, 4}",False,{},...,{},False,{},False,{},False,{},True,True,True
46,sub-DEV027/,ses-wave2,False,{},False,{},False,{},False,{},...,{},True,{1},True,{1},True,{1},True,True,True
82,sub-DEV048/,ses-wave1,True,{4},True,{4},True,{4},False,{},...,{},False,{},False,{},False,{},True,True,True
108,sub-DEV061/,ses-wave1,False,{},False,{},False,{},False,{},...,{},True,{1},True,{1},True,{1},True,True,True
109,sub-DEV061/,ses-wave2,False,{},False,{},False,{},False,{},...,{},True,{1},True,{1},True,{1},True,True,True
149,sub-DEV082/,ses-wave1,False,{},False,{},False,{},True,{4},...,{4},False,{},False,{},False,{},True,True,True
170,sub-DEV094/,ses-wave1,True,{4},True,{4},True,{4},False,{},...,{},False,{},False,{},False,{},True,True,True
177,sub-DEV098/,ses-wave1,True,"{3, 4}",True,"{3, 4}",True,"{3, 4}",False,{},...,{},False,{},False,{},False,{},True,True,True


In [10]:
missing_report_df

Unnamed: 0,subj,wave,ROC_missing_input_acq_bool,ROC_missing_input_acq,ROC_missing_entirely_input_acq_bool,ROC_missing_entirely_input_acq,ROC_missing_output_acq_bool,ROC_missing_output_acq,WTP_missing_input_acq_bool,WTP_missing_input_acq,...,WTP_missing_output_acq,SST_missing_input_acq_bool,SST_missing_input_acq,SST_missing_entirely_input_acq_bool,SST_missing_entirely_input_acq,SST_missing_output_acq_bool,SST_missing_output_acq,any_missing_input_acq_bool,any_missing_entirely_input_acq_bool,any_missing_output_acq_bool
0,sub-DEV001/,ses-wave1,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
1,sub-DEV001/,ses-wave2,False,{},False,{},False,{},False,{},...,{},True,{1},True,{1},True,{1},True,True,True
2,sub-DEV004/,ses-wave1,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
3,sub-DEV004/,ses-wave2,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
4,sub-DEV005/,ses-wave1,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,sub-DEV310/,ses-wave2,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
540,sub-DEV311/,ses-wave1,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
541,sub-DEV311/,ses-wave2,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False
542,sub-DEV312/,ses-wave1,False,{},False,{},False,{},False,{},...,{},False,{},False,{},False,{},False,False,False


In [11]:
missing_report_df.groupby(['wave','SST_missing_input_acq_bool','SST_missing_output_acq_bool']).subj.count().sort_index()

wave       SST_missing_input_acq_bool  SST_missing_output_acq_bool
ses-wave1  False                       False                          281
           True                        True                             4
ses-wave2  False                       False                          251
           True                        True                             5
ses-wave3  False                       True                             2
           True                        True                             1
Name: subj, dtype: int64

In [12]:
missing_report_df.groupby(['wave','ROC_missing_input_acq_bool','ROC_missing_output_acq_bool']).subj.count().sort_index()

wave       ROC_missing_input_acq_bool  ROC_missing_output_acq_bool
ses-wave1  False                       False                          278
           True                        True                             7
ses-wave2  False                       False                          254
           True                        True                             2
ses-wave3  False                       True                             2
           True                        True                             1
Name: subj, dtype: int64

In [13]:
missing_report_df.groupby(['wave','WTP_missing_input_acq_bool','WTP_missing_output_acq_bool']).subj.count().sort_index()

wave       WTP_missing_input_acq_bool  WTP_missing_output_acq_bool
ses-wave1  False                       False                          277
           True                        True                             8
ses-wave2  False                       False                          253
           True                        True                             3
ses-wave3  False                       True                             1
           True                        True                             2
Name: subj, dtype: int64

In [14]:
missing_report_df.loc[(missing_report_df.ROC_missing_output_acq_bool) | (missing_report_df.WTP_missing_input_acq_bool),:]

Unnamed: 0,subj,wave,ROC_missing_input_acq_bool,ROC_missing_input_acq,ROC_missing_entirely_input_acq_bool,ROC_missing_entirely_input_acq,ROC_missing_output_acq_bool,ROC_missing_output_acq,WTP_missing_input_acq_bool,WTP_missing_input_acq,...,WTP_missing_output_acq,SST_missing_input_acq_bool,SST_missing_input_acq,SST_missing_entirely_input_acq_bool,SST_missing_entirely_input_acq,SST_missing_output_acq_bool,SST_missing_output_acq,any_missing_input_acq_bool,any_missing_entirely_input_acq_bool,any_missing_output_acq_bool
7,sub-DEV007/,ses-wave1,False,{},False,{},False,{},True,"{1, 2, 3, 4}",...,"{1, 2, 3, 4}",False,{},False,{},False,{},True,True,True
33,sub-DEV020/,ses-wave2,True,"{3, 4}",True,"{3, 4}",True,"{3, 4}",False,{},...,{},False,{},False,{},False,{},True,True,True
82,sub-DEV048/,ses-wave1,True,{4},True,{4},True,{4},False,{},...,{},False,{},False,{},False,{},True,True,True
149,sub-DEV082/,ses-wave1,False,{},False,{},False,{},True,{4},...,{4},False,{},False,{},False,{},True,True,True
170,sub-DEV094/,ses-wave1,True,{4},True,{4},True,{4},False,{},...,{},False,{},False,{},False,{},True,True,True
177,sub-DEV098/,ses-wave1,True,"{3, 4}",True,"{3, 4}",True,"{3, 4}",False,{},...,{},False,{},False,{},False,{},True,True,True
192,sub-DEV106/,ses-wave1,False,{},False,{},False,{},True,{4},...,{4},False,{},False,{},False,{},True,True,True
214,sub-DEV118/,ses-wave1,False,{},False,{},False,{},True,{3},...,"{3, 4}",False,{},False,{},False,{},True,True,True
229,sub-DEV126/,ses-wave1,True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",...,"{1, 2, 3, 4}",True,{1},True,{1},True,{1},True,True,True
230,sub-DEV126/,ses-wave2,True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",True,"{1, 2, 3, 4}",...,"{1, 2, 3, 4}",True,{1},True,{1},True,{1},True,True,True


In [15]:
subj_wave_fullpath = file_dir + "sub-DEV118/ses-wave1/"
print(subj_wave_fullpath)
[p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/sub*WTP*bold_space-MNI152NLin2009cAsym_preproc*")]

/gpfs/projects/sanlab/shared/DEV/bids_data/derivatives/fmriprep_2022/sub-DEV118/ses-wave1/


[]

In [16]:
subj_wave_fullpath = file_dir + "sub-DEV116/ses-wave1/"
print(subj_wave_fullpath)
[p.replace(subj_wave_fullpath,"") for p in glob.glob(subj_wave_fullpath + "/func/sub*WTP*bold_space-MNI152NLin2009cAsym_preproc*")]

/gpfs/projects/sanlab/shared/DEV/bids_data/derivatives/fmriprep_2022/sub-DEV116/ses-wave1/


[]

In [17]:
#specifically which "acq" exist for this subject?

## Get a list of all the missing acq runs

In [18]:
missing_report_df.columns

Index(['subj', 'wave', 'ROC_missing_input_acq_bool', 'ROC_missing_input_acq',
       'ROC_missing_entirely_input_acq_bool', 'ROC_missing_entirely_input_acq',
       'ROC_missing_output_acq_bool', 'ROC_missing_output_acq',
       'WTP_missing_input_acq_bool', 'WTP_missing_input_acq',
       'WTP_missing_entirely_input_acq_bool', 'WTP_missing_entirely_input_acq',
       'WTP_missing_output_acq_bool', 'WTP_missing_output_acq',
       'SST_missing_input_acq_bool', 'SST_missing_input_acq',
       'SST_missing_entirely_input_acq_bool', 'SST_missing_entirely_input_acq',
       'SST_missing_output_acq_bool', 'SST_missing_output_acq',
       'any_missing_input_acq_bool', 'any_missing_entirely_input_acq_bool',
       'any_missing_output_acq_bool'],
      dtype='object')

In [19]:
waves_with_missing_input = missing_report_df.loc[:,[
    'ROC_missing_input_acq',
    'WTP_missing_input_acq','SST_missing_input_acq'
                       ]].any(1)
waves_with_missing_data = waves_with_missing_input |  (missing_report_df.any_missing_output_acq_bool>0)


In [20]:
pd.set_option('display.max_rows', None)

### Just the items with missing input data

In [21]:
missing_report_df.columns

Index(['subj', 'wave', 'ROC_missing_input_acq_bool', 'ROC_missing_input_acq',
       'ROC_missing_entirely_input_acq_bool', 'ROC_missing_entirely_input_acq',
       'ROC_missing_output_acq_bool', 'ROC_missing_output_acq',
       'WTP_missing_input_acq_bool', 'WTP_missing_input_acq',
       'WTP_missing_entirely_input_acq_bool', 'WTP_missing_entirely_input_acq',
       'WTP_missing_output_acq_bool', 'WTP_missing_output_acq',
       'SST_missing_input_acq_bool', 'SST_missing_input_acq',
       'SST_missing_entirely_input_acq_bool', 'SST_missing_entirely_input_acq',
       'SST_missing_output_acq_bool', 'SST_missing_output_acq',
       'any_missing_input_acq_bool', 'any_missing_entirely_input_acq_bool',
       'any_missing_output_acq_bool'],
      dtype='object')

In [22]:


waves_with_missing_data_info = (missing_report_df.
    loc[waves_with_missing_input]
    .loc[:,['subj','wave','ROC_missing_input_acq',
            'WTP_missing_input_acq','SST_missing_input_acq',
            'ROC_missing_entirely_input_acq',
            'WTP_missing_entirely_input_acq','SST_missing_entirely_input_acq',
            'any_missing_output_acq_bool'
           ]]
                          )

display(waves_with_missing_data_info)

waves_with_missing_data_info.to_csv("waves_with_missing_input_data.csv")

Unnamed: 0,subj,wave,ROC_missing_input_acq,WTP_missing_input_acq,SST_missing_input_acq,ROC_missing_entirely_input_acq,WTP_missing_entirely_input_acq,SST_missing_entirely_input_acq,any_missing_output_acq_bool
1,sub-DEV001/,ses-wave2,{},{},{1},{},{},{1},True
7,sub-DEV007/,ses-wave1,{},"{1, 2, 3, 4}",{},{},"{1, 2, 3, 4}",{},True
33,sub-DEV020/,ses-wave2,"{3, 4}",{},{},"{3, 4}",{},{},True
46,sub-DEV027/,ses-wave2,{},{},{1},{},{},{1},True
82,sub-DEV048/,ses-wave1,{4},{},{},{4},{},{},True
108,sub-DEV061/,ses-wave1,{},{},{1},{},{},{1},True
109,sub-DEV061/,ses-wave2,{},{},{1},{},{},{1},True
149,sub-DEV082/,ses-wave1,{},{4},{},{},{4},{},True
170,sub-DEV094/,ses-wave1,{4},{},{},{4},{},{},True
177,sub-DEV098/,ses-wave1,"{3, 4}",{},{},"{3, 4}",{},{},True


and what about...can we get a list of the waves that have no missing input data but seem to have missing out put data for some reason?

A lto of the wave2 will be based on problems in the prior category in wave1 that caused wave 2 to be aborted too.

So we'll ignore wave 1 and just focus on wave 1 for now...


In [23]:
waves_with_good_input_but_missing_output = missing_report_df.loc[(
    (missing_report_df.any_missing_input_acq_bool==False) & 
    (missing_report_df.any_missing_output_acq_bool==True) &
    (missing_report_df.wave=='ses-wave1')
),:]

waves_with_good_input_but_missing_output

Unnamed: 0,subj,wave,ROC_missing_input_acq_bool,ROC_missing_input_acq,ROC_missing_entirely_input_acq_bool,ROC_missing_entirely_input_acq,ROC_missing_output_acq_bool,ROC_missing_output_acq,WTP_missing_input_acq_bool,WTP_missing_input_acq,...,WTP_missing_output_acq,SST_missing_input_acq_bool,SST_missing_input_acq,SST_missing_entirely_input_acq_bool,SST_missing_entirely_input_acq,SST_missing_output_acq_bool,SST_missing_output_acq,any_missing_input_acq_bool,any_missing_entirely_input_acq_bool,any_missing_output_acq_bool


Finally we should have a look at the wave2's as well.

In [24]:
waves_with_good_input_but_missing_output = missing_report_df.loc[(
    (missing_report_df.any_missing_input_acq_bool==False) & 
    (missing_report_df.any_missing_output_acq_bool==True) &
    (missing_report_df.wave=='ses-wave2')
),:]

waves_with_good_input_but_missing_output

Unnamed: 0,subj,wave,ROC_missing_input_acq_bool,ROC_missing_input_acq,ROC_missing_entirely_input_acq_bool,ROC_missing_entirely_input_acq,ROC_missing_output_acq_bool,ROC_missing_output_acq,WTP_missing_input_acq_bool,WTP_missing_input_acq,...,WTP_missing_output_acq,SST_missing_input_acq_bool,SST_missing_input_acq,SST_missing_entirely_input_acq_bool,SST_missing_entirely_input_acq,SST_missing_output_acq_bool,SST_missing_output_acq,any_missing_input_acq_bool,any_missing_entirely_input_acq_bool,any_missing_output_acq_bool
