In [None]:
#This script audits the attachments stored on flywheel, either at the acquisition or session level.
#Inputs:
    #enrollment sheets pulled from axis, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/
    #data from Flywheel accessed using the CLI 
#Outputs:
    #Binary audit of attachments related to task data (.log, .json, .tsv) on Flywheel 
    #Binary audit of variability on Flywheel 

In [3]:
import flywheel
import pandas as pd
import re 
# Flywheel SDK client used by every query in the cells below.
# NOTE(review): no API key is passed here, so this presumably relies on
# credentials stored by a prior CLI login -- confirm before running elsewhere.
fw = flywheel.Client()

In [26]:
# Look up the Flywheel project to audit by its label.
EFProj = fw.projects.find_first('label=EFR01')

# Stop here with a clear message when nothing matched, instead of failing
# later with an AttributeError when the subjects are requested from None.
if EFProj is None:
    raise ValueError(
        "No Flywheel project with label 'EFR01' was found; "
        "check the label and this account's permissions."
    )

In [5]:
# Walk every subject in the project and gather all of their sessions
# into one flat list.
subjects = EFProj.subjects()

sessions = [session for subject in subjects for session in subject.sessions()]

In [7]:
# Load the axis enrollment sheets for T1 and T2 with every column read as text,
# dropping the scan-date column from each as soon as it is read.
# Adjust the file paths as necessary for where the enrollment sheets live.
axis_t1 = pd.read_csv('/axis_enroll_t1.csv', dtype=str).drop(columns=['scan_1_date'])
axis_t2 = pd.read_csv('/axis_enroll_t2.csv', dtype=str).drop(columns=['scan_2_date'])

In [8]:
# Extract the scan-ID column of each enrollment sheet as a plain list of strings.
# str() is applied to every entry, so a missing ID becomes the text 'nan'.
t1_scan = list(map(str, axis_t1['scan_id_timepoint_1'].tolist()))
t2_scan = list(map(str, axis_t2['scan_id_t2'].tolist()))

In [9]:
# Collect (session label, file name) pairs for every file whose name contains
# 'log' on the frac-no-back acquisitions. The files are read from the
# acquisition objects themselves, not from the session's own attachments.
log_files = []

for session in sessions:
    session_label = session.label
    for acquisition in session.acquisitions():
        is_task_acquisition = (
            'func_task-fracnoback_run-02' in acquisition.label
            or 'ABCD_fMRI_frac-no-back-run1' in acquisition.label
        )
        if not is_task_acquisition:
            continue
        file_names = [attached.name for attached in acquisition.files]
        log_files.extend(
            (session_label, file_name) for file_name in file_names if 'log' in file_name
        )

In [1]:
#print((log_files))

In [10]:
# Binary audit of the T1 scans: 1 when at least one log file was collected
# for the scan ID, 0 otherwise.
# The session labels are gathered into a set once, rather than rebuilding the
# full list of labels for every scan ID that is checked.
sessions_with_log = {entry[0] for entry in log_files}

has_log = [(scan_id, 1 if scan_id in sessions_with_log else 0) for scan_id in t1_scan]

In [16]:
#print(has_log)

[('11012', 1), ('10960', 0), ('10959', 0), ('11138', 1), ('11000', 1), ('11003', 1), ('11140', 1), ('11115', 0), ('nan', 0), ('11209', 1), ('11126', 1), ('11112', 0), ('11316', 1), ('11143', 1), ('11142', 1), ('11249', 0), ('11127', 1), ('11290', 1), ('11291', 0), ('nan', 0), ('11208', 1), ('11206', 1), ('11159', 1), ('11242', 1), ('11207', 1), ('11160', 1), ('11161', 0), ('11146', 1), ('11132', 1), ('11184', 1), ('11147', 1), ('11221', 0), ('nan', 0), ('nan', 0), ('11180', 1), ('11210', 1), ('11211', 1), ('11205', 1), ('11233', 1), ('11217', 1), ('nan', 0), ('nan', 0), ('11238', 1), ('11276', 1), ('11302', 1), ('11289', 1), ('nan', 0), ('nan', 0), ('11264', 1), ('11324', 1), ('nan', 0), ('11320', 1), ('11370', 1), ('nan', 0), ('11321', 1), ('11319', 1), ('nan', 0), ('11332', 1), ('nan', 0), ('nan', 0), ('11334', 0), ('11365', 1), ('11375', 1), ('nan', 0), ('11351', 0), ('11359', 1), ('11404', 1), ('11405', 0), ('11366', 1), ('11392', 1), ('11396', 0), ('11376', 0), ('11381', 1), ('nan

In [11]:
#save list as a dataframe
t1_audit = pd.DataFrame(has_log, columns =['scanid', 'has_log'])
t1_audit = t1_audit.astype(str)

Unnamed: 0,scanid,has_log
0,11012,1
1,10960,0
2,10959,0
3,11138,1
4,11000,1
5,11003,1
6,11140,1
7,11115,0
8,,0
9,11209,1


In [12]:
## now, let's check for various attachments at the session level
import re


def _session_tokens(file_names, marker):
    """Return the second '_'-delimited token of each file name containing `marker`.

    Each matching name is split on '_' (at most two splits) and element 1 is
    kept; for the events/aslcontext files this is the ``ses-*`` part.
    Names without any underscore have no second token, so they are skipped
    instead of raising IndexError.

    Parameters
    ----------
    file_names : iterable of str
        File names to scan.
    marker : str
        Substring a file name must contain to be considered.

    Returns
    -------
    list of str
        One token per matching file name, in input order (duplicates kept).
    """
    tokens = []
    for file_name in file_names:
        if marker not in file_name:
            continue
        parts = file_name.split('_', 2)
        if len(parts) > 1:
            tokens.append(parts[1])
    return tokens


# names of every file attached directly to a session
EFnames = [attachment["name"] for session in sessions for attachment in session["files"]]

# session tokens for each attachment type
# (all files that have the events.json should also have the events.tsv)
has_json_list = _session_tokens(EFnames, "bold_events.json")
has_tsv_list = _session_tokens(EFnames, "bold_events.tsv")
has_asl_list = _session_tokens(EFnames, "aslcontext")
# NOTE(review): variability names are split exactly like the others, and the
# token is later compared against the bare scan ID (no "ses-" prefix) --
# confirm that element 1 of those file names really is the bare ID.
has_variability_list = _session_tokens(EFnames, "variability")

# T1 scan IDs, bare and with the "ses-" prefix used in the Flywheel file names
all_sessions_list = list(t1_scan)
all_sessions_list_ses = ["ses-" + str(scan_id) for scan_id in all_sessions_list]

# sets give constant-time membership checks; the *_list variables stay lists
# because later cells use them
json_found = set(has_json_list)
tsv_found = set(has_tsv_list)
asl_found = set(has_asl_list)
variability_found = set(has_variability_list)

# binary audits: (ID, 1) when an attachment of that type was found, else (ID, 0)
has_json = [(item, 1 if item in json_found else 0) for item in all_sessions_list_ses]
has_tsv = [(item, 1 if item in tsv_found else 0) for item in all_sessions_list_ses]
has_asl = [(item, 1 if item in asl_found else 0) for item in all_sessions_list_ses]
# variability is checked against the bare scan IDs, without the "ses-" prefix
has_variability = [(item, 1 if item in variability_found else 0) for item in all_sessions_list]
        

In [15]:
# Keep only the 0/1 flag from each (session, flag) pair so the values can be
# added to the audit table as plain columns.
has_tsv_reformat = [pair[1] for pair in has_tsv]
has_json_reformat = [pair[1] for pair in has_json]


In [16]:
# Add the events.json and events.tsv flags to the T1 task-audit table.
t1_audit = t1_audit.assign(has_json=has_json_reformat, has_tsv=has_tsv_reformat)

In [17]:
# Variability audit table for T1: one row per scan ID with its 0/1 flag.
variability_columns = ['scanid', 'has_variability']
t1_var_audit = pd.DataFrame(data=has_variability, columns=variability_columns)

In [18]:
# Write both T1 audit tables to disk as comma-separated files, without the index.
# NOTE(review): the second file name spells "variabilty"; it is left as-is in
# case anything downstream expects that exact name.
t1_task_csv = '/Users/krmurtha/Desktop/EF_DATA_FREEZE/audits/EF_T1_task_audit.csv'
t1_variability_csv = '/Users/krmurtha/Desktop/EF_DATA_FREEZE/audits/EF_T1_variabilty_audit.csv'

t1_audit.to_csv(t1_task_csv, sep=',', index=False)
t1_var_audit.to_csv(t1_variability_csv, sep=',', index=False)

In [21]:
# Binary audit of the T2 scans: 1 when at least one log file was collected
# for the scan ID, 0 otherwise.
# The session labels are gathered into a set once, rather than rebuilding the
# full list of labels for every scan ID that is checked.
sessions_with_log = {entry[0] for entry in log_files}

has_log2 = [(scan_id, 1 if scan_id in sessions_with_log else 0) for scan_id in t2_scan]

In [22]:
# Turn the (scan ID, flag) pairs into the T2 task-audit table.
log_columns = ['scanid', 'has_log']
t2_audit = pd.DataFrame(data=has_log2, columns=log_columns)

In [23]:
## now, let's check for various attachments at the session level for T2

# T2 scan IDs, bare and with the "ses-" prefix used in the Flywheel file names
all_sessions_list2 = list(t2_scan)
all_sessions_list_ses2 = ["ses-" + str(scan_id) for scan_id in all_sessions_list2]

# binary audits against the attachment lists built in the T1 attachment cell:
# (ID, 1) when the ID appears in the list, (ID, 0) otherwise
has_json2 = [(item, 1 if item in has_json_list else 0) for item in all_sessions_list_ses2]
has_tsv2 = [(item, 1 if item in has_tsv_list else 0) for item in all_sessions_list_ses2]
has_asl2 = [(item, 1 if item in has_asl_list else 0) for item in all_sessions_list_ses2]
# variability is checked against the bare scan IDs, without the "ses-" prefix
has_variability2 = [(item, 1 if item in has_variability_list else 0) for item in all_sessions_list2]
        

In [25]:
# Keep only the 0/1 flag from each (session, flag) pair so the values can be
# added to the T2 audit table as plain columns.
has_tsv_reformat2 = [pair[1] for pair in has_tsv2]
has_json_reformat2 = [pair[1] for pair in has_json2]

In [26]:
# Add the events.json and events.tsv flags to the T2 task-audit table.
t2_audit = t2_audit.assign(has_json=has_json_reformat2, has_tsv=has_tsv_reformat2)

In [24]:
# Variability audit table for T2: one row per scan ID with its 0/1 flag.
variability_columns = ['scanid', 'has_variability']
t2_var_audit = pd.DataFrame(data=has_variability2, columns=variability_columns)

In [28]:
# Write both T2 audit tables to disk as comma-separated files, without the index.
# NOTE(review): the second file name spells "variabilty"; it is left as-is in
# case anything downstream expects that exact name.
t2_task_csv = '/Users/krmurtha/Desktop/EF_DATA_FREEZE/audits/EF_T2_task_audit.csv'
t2_variability_csv = '/Users/krmurtha/Desktop/EF_DATA_FREEZE/audits/EF_T2_variabilty_audit.csv'

t2_audit.to_csv(t2_task_csv, sep=',', index=False)
t2_var_audit.to_csv(t2_variability_csv, sep=',', index=False)