# Querying Flywheel for the June 30th, 2021 Datafreeze
**Project:** ExtraLong <br>
**Author:** Katja Zoner <br>
**Date:** 07/30/2021 <br>

# Step 0: Setup

In [539]:
import os
import sys
import logging
import numpy as np
import pandas as pd
import flywheel


# Step 1: Read in superset csv, convert to dataframe, and clean.

In [567]:
# Load the superset csv of scans into a dataframe.
csv = "./csv/all_long_scans_oracle_cleaned.csv"
superset = pd.read_csv(csv).assign(
    # Parse the date-of-scan column into datetimes so it can be compared
    # against the cutoff dates used further down.
    doscan=lambda frame: pd.to_datetime(frame["doscan"])
)
superset

Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
0,613,11660,700205 - Worden,2003-05-23,1,6,1240.0,,,,C2
1,615,11518,700205 - Worden,2003-05-27,1,2,356.0,2.0,2.0,,C2
2,597,11382,700205 - Worden,2003-07-17,1,4,260.0,2.0,2.0,2.0,C2
3,598,11565,700205 - Worden,2003-07-24,1,7,291.0,1.0,1.0,2.0,I7
4,599,11530,700205 - Worden,2003-07-29,1,4,331.0,1.0,2.0,2.0,C2
...,...,...,...,...,...,...,...,...,...,...,...
4252,11820,82039,833922 - EvolPsy,2021-07-12,5,5,367.0,2.0,2.0,2.0,
4253,11803,132782,842909 - TRANSCENDS_D1,2021-07-12,3,6,307.0,1.0,1.0,2.0,
4254,11804,132782,842909 - TRANSCENDS_D1,2021-07-19,4,6,307.0,1.0,1.0,2.0,
4255,11805,132782,842909 - TRANSCENDS_D1,2021-07-30,5,6,307.0,1.0,1.0,2.0,


# Step 2: Get list of scan protocols to include in 2021 data freeze.

In [570]:
# Read the csv that lists which scan protocols to include in the data freeze.
csv = "./csv/protocols_for_inclusion.csv"
inclusion_df = pd.read_csv(csv)

# A blank `include` value counts as "include". Assign the filled column back
# explicitly: an in-place fillna on `inclusion_df.include` acts on an
# intermediate object and does not update the frame under pandas Copy-on-Write.
inclusion_df["include"] = inclusion_df["include"].fillna(True)

# Keep the protocol names whose `include` flag is True.
protocols = list(inclusion_df.scan_protocol[inclusion_df.include == True])
protocols

['808689 - AGGY',
 '808922 - MGI2_PENN',
 '808799 - DAY2',
 '807360 - Olf Lifespan',
 '810336 - Big GO',
 '810336 - Go2 Supplement',
 '810211 - FNDM',
 'B10218 - MGI2_PITT',
 '817628 - EFDO',
 '816281 - NODRA',
 '810336 - GO3 FOLLOW UP',
 '810336 - Go3',
 '816275 - ONM',
 '815814 - Conte',
 '822937 - HARMONY',
 '818028 - Effort',
 '818621 - SYRP',
 '820690 - phASL',
 '825940 - GluCEST in Psychosis',
 '822831 - GRMPY',
 '825834 - satterttPiloting',
 '829502 - MOTIVE',
 '834246 - 22qmidline',
 '833922 - EvolPsy']

# Step 3: Generate csv of scans to include in ExtraLong 2021 Data Freeze

In [None]:
def filterBySessionCount(df, thresh):
    """
    Keep only subjects with at least `thresh` scans and renumber their timepoints.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per scan. Must contain the columns `bblid` (subject id) and
        `doscan` (date of scan). The input frame is not modified.
    thresh : int
        Minimum number of rows a subject needs in `df` to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by (`bblid`, `doscan`) in which
        `ntimepoints` is the number of rows the subject has in `df`, and
        `timepoint` is the 1-based position of the scan within the subject,
        in scan-date order.
    """
    # Sort first so that the per-subject numbering below follows scan date
    # rather than whatever row order the caller happened to pass in.
    ordered = df.sort_values(["bblid", "doscan"]).copy()

    # Number of scans per subject, aligned to every row of that subject.
    # Rows with a missing bblid get NaN here and are dropped by the comparison.
    scan_counts = ordered.groupby("bblid")["bblid"].transform("size")
    keep = scan_counts >= thresh

    result = ordered[keep].copy()
    result["ntimepoints"] = scan_counts[keep].astype("int64")
    result["timepoint"] = result.groupby("bblid").cumcount() + 1

    return result

In [571]:
# Stage 1: restrict the superset to scans whose protocol is in the inclusion list.
df = superset.loc[superset["scan_protocol"].isin(protocols)]

# Stage 2: drop every scan acquired on or after the data freeze cutoff date.
cutoff_date = pd.to_datetime("2021-07-01")
df = df.loc[df.doscan < cutoff_date]

# Stage 3: keep only subjects that still have at least two scans.
df = filterBySessionCount(df, 2)
df


Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
1059,3931,10180,808799 - DAY2,2010-10-11,1,2,566.0,1.0,2.0,2.0,IS4
1172,4451,10180,807360 - Olf Lifespan,2011-01-31,2,2,569.0,1.0,2.0,2.0,IS1
1818,6776,10410,810211 - FNDM,2012-04-24,1,2,556.0,2.0,2.0,2.0,IS1
1846,6843,10410,810211 - FNDM,2012-05-11,2,2,557.0,2.0,2.0,2.0,IS1
1112,4185,11176,808799 - DAY2,2010-11-23,1,2,337.0,1.0,1.0,2.0,IS1
...,...,...,...,...,...,...,...,...,...,...,...
3734,10739,139272,825940 - GluCEST in Psychosis,2018-03-29,8,8,282.0,2.0,2.0,2.0,IS4
2474,8461,139490,810336 - Big GO,2013-08-30,1,2,105.0,1.0,2.0,2.0,IS2
3601,10564,139490,815814 - Conte,2017-04-29,2,2,149.0,1.0,2.0,2.0,IS4
2464,8410,139553,810336 - Big GO,2013-08-23,1,2,107.0,2.0,2.0,2.0,IS2


# Step 4: Add in information for scans previously in ExtraLong 2019

In [572]:
# Load the csv that maps scanids to ExtraLong session labels.
mapping_csv = "./csv/scanid_to_seslabel_10-16-2019.csv"
mapping = pd.read_csv(mapping_csv)

# Relabel the four columns by position (the last one is stored as `sesid`),
# then put the identifier columns first.
mapping.columns = ["project", "bblid", "scanid", "sesid"]
mapping = mapping.loc[:, ["bblid", "scanid", "sesid", "project"]]

# Store both identifier columns as 64-bit integers.
mapping = mapping.astype({"bblid": "int64", "scanid": "int64"})

In [573]:
# Start from the filtered scans: drop `scanstat` and add empty `acqid` /
# `filename` placeholders that the Flywheel search fills in later.
xl = df.drop(columns="scanstat").assign(acqid="", filename="")

# Left-join the scanid-sesid mapping so scans present in the mapping csv pick
# up their `sesid` and `project`; scans without a match keep NaN in both.
xl = xl.merge(mapping, how='left', on=['scanid','bblid'])

# Put the columns in their final order.
xl = xl.loc[:, [
    'bblid', 'scanid', 'sesid', 'scan_protocol', 'project',
    'filename', 'acqid', 'doscan', 'timepoint', 'ntimepoints',
    'scanage_months', 'sex', 'race', 'ethnic',
]]
xl.sort_values(by=['project'])

Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
288,16098,5049,AGGY2,808689 - AGGY,AGGY_808689,,,2011-05-19,2,2,160.0,2.0,2.0,
2294,111873,4109,AGGY1,808689 - AGGY,AGGY_808689,,,2010-11-12,1,2,151.0,1.0,2.0,2.0
239,15701,4325,AGGY2,808689 - AGGY,AGGY_808689,,,2011-01-04,2,2,137.0,1.0,2.0,
238,15701,3694,AGGY1,808689 - AGGY,AGGY_808689,,,2010-08-17,1,2,132.0,1.0,2.0,
2722,122124,2917,AGGY1,808689 - AGGY,AGGY_808689,,,2010-02-24,1,2,137.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,135085,11650,,834246 - 22qmidline,,,,2021-02-11,3,3,222.0,2.0,1.0,2.0
3098,139181,8470,,810336 - Big GO,,,,2013-08-31,1,2,162.0,2.0,2.0,
3099,139181,11619,,833922 - EvolPsy,,,,2021-01-11,2,2,250.0,2.0,2.0,
3104,139272,8985,,816275 - ONM,,,,2014-07-22,5,8,238.0,2.0,2.0,2.0


In [530]:
# Rows that received a project label from the mapping merge were part of the
# previous ExtraLong freeze. Their earliest scan date is used as the lower
# date threshold for the new data freeze.
old = xl[xl.project.notnull()].copy()

# Reduce only the `doscan` column: taking the min of the whole frame also
# reduces the mixed-type object columns, which is wasted work and can raise.
earliest = old["doscan"].min()
print(earliest)
old.sort_values('doscan')

2009-10-28 00:00:00


Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
2133,130896,2542,AGGY1,808689 - AGGY,AGGY_808689,,,2009-10-28,1,6,144.0,2.0,2.0,2.0
177,80812,2646,PNC1,810336 - Big GO,PNC_CS_810336,,,2009-12-05,1,2,247.0,2.0,2.0,2.0
179,80854,2675,PNC1,810336 - Big GO,PNC_CS_810336,,,2009-12-17,1,2,242.0,1.0,1.0,2.0
241,82232,2706,PNC1,810336 - Big GO,PNC_CS_810336,,,2010-01-07,1,2,228.0,2.0,1.0,2.0
216,81982,2723,PNC1,810336 - Big GO,PNC_CS_810336,,,2010-01-09,1,2,231.0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079,98535,11254,11254,822831 - GRMPY,GRMPY_822831,,,2019-05-07,3,4,221.0,1.0,1.0,1.0
1611,114723,11256,11256,822831 - GRMPY,GRMPY_822831,,,2019-05-09,2,2,328.0,1.0,2.0,2.0
1998,126921,11262,11262,822831 - GRMPY,GRMPY_822831,,,2019-05-15,2,3,226.0,1.0,2.0,2.0
1530,112200,11349,motive1,829502 - MOTIVE,MOTIVE,,,2019-08-07,6,7,261.0,1.0,1.0,2.0


# Step 5: Locating Scans on Flywheel

In [547]:

# Disabled: single-project lookup.
# PROJECT_LABEL = "ExtraLong"

# Flywheel project labels flagged as BIDSified.
IS_BIDSIFIED = [
    "ExtraLong",
    "AGGY_808689",
    "CONTE_815814",
    "GRMPY_822831",
    "MOTIVE",
    "SYRP_818621",
    "NEFF_818028",
    "PNC_CS_810336",
]

# Flywheel project labels flagged as deprecated.
DEPRECATED_PROJECTS = [
    "DAY2_808799",
    "FNDM1_810211",
    "FNDM2_810211",
    "NODRA_816281",
    "ONM_816275",
]

# Scan protocols flagged as deprecated; used below to classify failed lookups.
DEPRECATED_PROTOCOLS = [
    "808799 - DAY2",
    "810211 - FNDM",
    "816281 - NODRA",
    "816275 - ONM",
    "807360 - Olf Lifespan",
    "808922 - MGI2_PENN",
    "820690 - phASL",
    "825834 - satterttPiloting",
    "822937 - HARMONY",
    "817628 - EFDO",
    "825940 - GluCEST in Psychosis",
]

# Create the Flywheel client used by every query below.
fw = flywheel.Client()
assert fw, "Your Flywheel CLI credentials aren't set!"

# Disabled: fetch the ExtraLong project object.
# xlProject = fw.projects.find_first('label="{}"'.format(PROJECT_LABEL))
# assert xlProject, "Project not found!"

In [685]:
def hasBIDS(f):
    '''
    Helper function to check if a file's BIDS metadata names a T1w image.

    Parameters
    ----------
    f : object exposing an `info` mapping
        (presumably a Flywheel file entry -- confirm against callers).

    Returns
    -------
    bool
        True when the last '_'-separated piece of f.info['BIDS']['Filename']
        is exactly 'T1w.nii.gz'. False when the metadata is missing or not
        shaped as expected (no 'BIDS' / 'Filename' key, non-dict values,
        non-string filename).
    '''
    try:
        bids_filename = f.info['BIDS']['Filename']
        return bids_filename.split('_')[-1] == 'T1w.nii.gz'
    # Only absorb the errors that absent or malformed metadata can produce;
    # anything else (e.g. KeyboardInterrupt) must propagate to the caller.
    except (AttributeError, KeyError, TypeError):
        return False


In [730]:
# Heuristics used to pick out a valid T1w image among an acquisition's files,
# across all projects in ExtraLong. Each takes an acquisition and returns the
# list of matching file names (possibly empty).

# Files must be strictly larger than this to count
# (presumably a size in bytes -- confirm against Flywheel's `size` field).
MIN_SIZE = 6000000


def _large_nifti_names(acq, name_fragment=None):
    """Names of nifti files above MIN_SIZE, optionally requiring `name_fragment` in the lower-cased name."""
    names = []
    for f in acq.files:
        if f.type != 'nifti':
            continue
        if name_fragment is not None and name_fragment not in f.name.lower():
            continue
        if f.size > MIN_SIZE:
            names.append(f.name)
    return names


def find_from_bids(acq):
    """Files whose BIDS metadata marks them as T1w and that exceed MIN_SIZE."""
    names = []
    for f in acq.files:
        if hasBIDS(f) and f.size > MIN_SIZE:
            names.append(f.name)
    return names


def find_from_old_xl(acq):
    """Every nifti file that exceeds MIN_SIZE."""
    return _large_nifti_names(acq)


def find_by_mprage_in_fname(acq):
    """Nifti files above MIN_SIZE whose name contains 'mprage' (case-insensitive)."""
    return _large_nifti_names(acq, 'mprage')


def find_by_other_fname(acq):
    """Nifti files above MIN_SIZE whose name contains 't1w_mpr_vnav_4' (case-insensitive)."""
    return _large_nifti_names(acq, 't1w_mpr_vnav_4')


def find_by_acq_label(acq):
    """Nifti files above MIN_SIZE, but only when the acquisition label is a known T1w label."""
    valid_acq_labels = [
        'ABCD_T1w_MPR_vNav',
        'anat-T1w_acq-vNavBC_seq-ABCD__MPRAGE',
    ]
    if acq.label not in valid_acq_labels:
        return []
    return _large_nifti_names(acq)


# Heuristic functions to try, in order, when looking for the T1w image in an
# acquisition. find_from_bids and find_from_old_xl are deliberately left out.
HEURISTICS = [
    find_by_mprage_in_fname,
    find_by_acq_label,
    find_by_other_fname,
]


In [727]:
def find_t1w_in_results(results):
    """
    Pick the T1w image out of the acquisitions returned by a Flywheel search.

    Parameters
    ----------
    results : list
        Search results for one session; each item must expose
        `.project.label`, `.session.label` and `.acquisition.id`.

    Returns
    -------
    tuple or None
        (project, acqid, filename) when exactly one T1w image is identified,
        or as soon as an acquisition has exactly one BIDS-tagged T1w file.
        (project, None, "multiple") when several candidate images are found.
        None when `results` is empty or no candidate is found; in the latter
        case the session's nifti file names are printed.
    """
    # Nothing to inspect: report "not found" rather than failing on results[0].
    if not results:
        return None

    first_project = results[0].project.label
    # All results come from one session query, so the first label is used
    # for the warnings below.
    session_label = results[0].session.label

    all_files = []
    t1w_info = []

    # Loop through all acquisitions found by the query.
    for res in results:
        # Record the project of this particular result: a Flywheel-wide query
        # can return acquisitions from more than one project.
        project = res.project.label
        acqid = res.acquisition.id
        acq = fw.get_acquisition(acqid)

        # Remember every nifti in the session for the "not found" report.
        all_files.extend(f.name for f in acq.files if f.type == 'nifti')

        # A single BIDS-tagged T1w image wins outright; other results are ignored.
        files = find_from_bids(acq)
        if len(files) == 1:
            return project, acqid, files[0]

        # Otherwise try each heuristic in order and stop at the first that matches.
        for heuristic_func in HEURISTICS:
            files = heuristic_func(acq)
            if files:
                t1w_info.extend((project, acqid, fname) for fname in files)
                break

    # Exactly one candidate: return (project, acqid, filename).
    if len(t1w_info) == 1:
        return t1w_info[0]

    if len(t1w_info) > 1:
        print(f'Warning: Multiple T1w images found for session: {session_label}!')
        return (first_project, None, "multiple")

    # No candidate in any acquisition: list what was there to help debugging.
    print(f'Warning: No T1w image found for session: {session_label}!')
    print(all_files)
    return None

In [657]:
def query_flywheel_for_acqs(scan):
    """
    Search Flywheel for the acquisitions belonging to one scan.

    `scan` is a dataframe row with `project`, `bblid`, `sesid` and `scanid`.
    When the row has a project label, the ExtraLong project is searched first
    using the `sub-<bblid>` / `ses-<sesid>` labels. If that search is skipped
    or returns nothing, all of Flywheel is searched by raw bblid / scanid.
    Returns whatever the last search performed returned.
    """
    def search(structured_query):
        # At most 100 acquisitions are requested per query.
        return fw.search(
            {'structured_query': structured_query, 'return_type': 'acquisition'},
            size=100,
        )

    results = None

    # The scan already has a project label: look inside ExtraLong first.
    if not pd.isnull(scan.project):
        results = search(
            f'project.label == ExtraLong AND '
            f'subject.label == sub-{scan.bblid} AND '
            f'session.label == ses-{scan.sesid} '
        )

    if results:
        return results

    # Not labelled, or nothing found in ExtraLong: query all of Flywheel.
    return search(
        f'session.label == {scan.scanid} AND '
        f'subject.label == {scan.bblid} '
    )

In [740]:
################## Loop through XL dataframe and attempt to locate each T1w image on Flywheel. ##################

# Rows that could not be located are collected in a list and turned into a
# dataframe once, after the loop. (DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every call.)
failed_rows = []

# Per-heuristic hit counters. NOTE(review): nothing in this cell updates them.
useCounts = {h: 0 for h in HEURISTICS}

# Search Flywheel for each scan that does not have an acquisition id yet.
for index, scan in xl[xl.acqid == ""].iterrows():

    # # Skip scans that we know aren't there
    # if scan.scan_protocol in DEPRECATED_PROTOCOLS:
    #     continue

    # Get list of acquisitions matching bblid/sesid/scanid.
    results = query_flywheel_for_acqs(scan)

    # Session not on Flywheel at all.
    if results is None or len(results) == 0:
        print(f"Warning: scanid {scan.scanid} not found on Flywheel.")
        failed_rows.append(scan)
        continue

    # (project label, acquisition id, filename) for the acquisition holding the T1w image.
    t1w_info = find_t1w_in_results(results)

    # Session found, but no T1w image in it.
    if t1w_info is None:
        failed_rows.append(scan)
        continue

    project, acqid, filename = t1w_info
    xl.loc[index, "project"] = project

    if acqid is None:
        # Project located but no single T1w image could be chosen (filename is
        # the "multiple" marker). Record the marker on the failed row as well,
        # so `failed.filename == 'multiple'` works on a fresh run.
        failed_scan = scan.copy()
        failed_scan["project"] = project
        if filename is not None:
            failed_scan["filename"] = filename
            xl.loc[index, "filename"] = filename
        failed_rows.append(failed_scan)
    else:
        print(t1w_info)
        xl.loc[index, "acqid"] = acqid
        xl.loc[index, "filename"] = filename

# Dataframe of scans that couldn't be found, with the same columns as xl and
# indexed by the row's label in xl.
failed = pd.DataFrame(failed_rows, columns=xl.columns)

['B0map_v4_11_e2.nii.gz', 'B0map_v4_11_e1.nii.gz', 'B0map_v4_11.nii.gz', 'B0map_v4_11_e2_Eq_1.nii.gz', 'DTI_2x32_35_14.nii.gz', 'DTI_64Combined_MB2_v2_10.nii.gz', 'localizer_1_i00003.nii.gz', 'localizer_1_i00002.nii.gz', 'localizer_1_i00001.nii.gz', 'DTI_2x32_36_15.nii.gz', 'bbl1_idemo2_210_9.nii.gz', 'bbl1_frac2back1_231_8.nii.gz', 'ep2d_se_pcasl_PHC_1200ms_6.nii.gz', 'epi_singlerep_advshim_7.nii.gz', 'ep2d_se_pcasl_PHC_1200ms_5.nii.gz', 'bbl1_restbold1_124_13.nii.gz', 'MPRAGE_TI1110_ipat2_moco3_3.nii.gz', 'MPRAGE_NAVprotocol_2.nii.gz', 'B0map_v4_12_ph.nii.gz']
['ABCD_T1w_MPR_vNav_4_i00097.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00065.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00001.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00225.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00193.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00161.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00129.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00289.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00257.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00705.nii.gz', 'ABCD_T1w_MPR_vNav_4_i00737.nii.gz', 'ABCD_T1w_MPR_vNav_4_i0

# Step 5: Locating Scans on Flywheel - RESULTS
## Tried to locate `2297` scans
## Successfully located `2174` scans
## Failed to locate `483` scans

In [743]:
# Inspect xl after the search loop has written project / acqid / filename for the scans it located.
xl


Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
0,10180,3931,,808799 - DAY2,,,,2010-10-11,1,2,566.0,1.0,2.0,2.0
1,10180,4451,,807360 - Olf Lifespan,,,,2011-01-31,2,2,569.0,1.0,2.0,2.0
2,10410,6776,FNDM11,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM11_T1w.nii.gz,5d9e0666a54d350038b39e11,2012-04-24,1,2,556.0,2.0,2.0,2.0
3,10410,6843,FNDM21,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM21_T1w.nii.gz,5d9e0972a54d350044c11e33,2012-05-11,2,2,557.0,2.0,2.0,2.0
4,11176,4185,,808799 - DAY2,,,,2010-11-23,1,2,337.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107,139272,10739,,825940 - GluCEST in Psychosis,,,,2018-03-29,8,8,282.0,2.0,2.0,2.0
3108,139490,8461,PNC1,810336 - Big GO,ExtraLong,sub-139490_ses-PNC1_T1w.nii.gz,5d9cd114a54d350039b28ece,2013-08-30,1,2,105.0,1.0,2.0,2.0
3109,139490,10564,CONTE1,815814 - Conte,ExtraLong,sub-139490_ses-CONTE1_acq-moco_T1w.nii.gz,5d9ca3cfa54d350028b04b1d,2017-04-29,2,2,149.0,1.0,2.0,2.0
3110,139553,8410,PNC1,810336 - Big GO,ExtraLong,sub-139553_ses-PNC1_T1w.nii.gz,5d9d14b1a54d350042bcbddd,2013-08-23,1,2,107.0,2.0,2.0,2.0


In [744]:
# Show the per-heuristic hit counters.
# NOTE(review): the only increment of useCounts in this notebook is commented
# out (inside find_t1w_in_results), so every counter is still 0 here.
useCounts


{<function __main__.find_by_mprage_in_fname(acq)>: 0,
 <function __main__.find_by_acq_label(acq)>: 0,
 <function __main__.find_by_other_fname(acq)>: 0}

In [745]:
# Partition xl by whether the search recorded an acquisition id:
# rows with an empty acqid go to foundproj, rows with one go to foundT1w.
foundproj = xl.loc[xl.acqid == ""].copy()
foundT1w = xl.loc[xl.acqid != ""].copy()

# Summary of the search.
#print(f"Number of subjects: {len(freeze.bblid.unique())}")
print(
    f'Tried to located {len(xl)} scans on Flywheel.',
    f'Found {len(foundT1w)} T1w scans.',
    f'Failed to find {len(failed)} scans.',
    f'\nScans that were located, broken down by project:',
    sep='\n',
)
foundT1w.value_counts('project')

Tried to located 3112 scans on Flywheel.
Found 2622 T1w scans.
Found 490 sessions without T1w scans.
Failed to find 490 scans.

Scans that were located, broken down by project:


project
ExtraLong             2339
MOTIVE                 102
Evolution_833922        74
NEFF_818028             39
PNC_CS_810336           27
22q_Midline_834246      22
EONSX_810366             8
PNC_LG_810336            3
GRMPY_822831             3
SYRP_818621              2
AGGY_808689              2
CONTE_815814             1
dtype: int64

In [746]:
# Break the failed scans down by cause.
# Scans whose protocol is listed as deprecated:
failed_bc_deprecated = failed.loc[failed["scan_protocol"].isin(DEPRECATED_PROTOCOLS)]
# Scans for which several candidate T1w images were found:
failed_bc_multiple = failed.loc[failed["filename"] == 'multiple']
# Everything else: scanid appears in neither of the two groups above.
failed_bc_other = failed.loc[
    ~(
        failed["scanid"].isin(failed_bc_multiple["scanid"])
        | failed["scanid"].isin(failed_bc_deprecated["scanid"])
    )
]
print(
    f'{len(failed)} scans failed',
    f'{len(failed_bc_deprecated)} scans failed because project is deprecated',
    f'{len(failed_bc_multiple)} scans failed due to multiple t1w images found',
    f'{len(failed_bc_other)} scans failed bc other',
    sep='\n',
)


#failed = failed.drop(failed[~failed.scanid.isin(failed2.scanid)].index)
# 

490 scans failed
443 scans failed because project is deprecated
6 scans failed due to multiple t1w images found
41 scans failed bc other


In [593]:
# Count the scans that could not be located, per scan_protocol.
# (The printed heading says "project", but the grouping column is scan_protocol.)
print(f'Scans that were not located, broken down by project:')
failed.value_counts('scan_protocol')


Scans that were not located, broken down by project:


scan_protocol
820690 - phASL                   119
816275 - ONM                     108
807360 - Olf Lifespan             73
808922 - MGI2_PENN                35
825834 - satterttPiloting         31
808799 - DAY2                     24
818028 - Effort                   23
825940 - GluCEST in Psychosis     18
822937 - HARMONY                  16
818621 - SYRP                     15
817628 - EFDO                     10
829502 - MOTIVE                   10
810336 - Go3                       6
810211 - FNDM                      6
B10218 - MGI2_PITT                 6
822831 - GRMPY                     4
816281 - NODRA                     3
815814 - Conte                     3
810336 - Go2 Supplement            3
833922 - EvolPsy                   3
834246 - 22qmidline                1
808689 - AGGY                      1
dtype: int64

In [770]:
# 1. Apply the data freeze threshold date first, so that scans older than the
#    threshold cannot count towards a subject's session total.
freeze = foundT1w[foundT1w.doscan >= earliest]

# 2. Then keep only subjects that still have 2+ scans located on Flywheel.
#    Doing this last keeps timepoint / ntimepoints consistent with the rows
#    that actually end up in the freeze.
freeze = filterBySessionCount(freeze, 2)
freeze

Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
2,10410,6776,FNDM11,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM11_T1w.nii.gz,5d9e0666a54d350038b39e11,2012-04-24,1,2,556.0,2.0,2.0,2.0
3,10410,6843,FNDM21,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM21_T1w.nii.gz,5d9e0972a54d350044c11e33,2012-05-11,2,2,557.0,2.0,2.0,2.0
7,11186,3395,DAY21,808799 - DAY2,ExtraLong,sub-11186_ses-DAY21_T1w.nii.gz,5d9dfb8ba54d350042bd8142,2010-06-10,1,2,534.0,1.0,1.0,2.0
8,11186,6378,FNDM21,810211 - FNDM,ExtraLong,sub-11186_ses-FNDM21_T1w.nii.gz,5d9e0c47a54d350042bd95b4,2012-02-03,2,2,554.0,1.0,1.0,2.0
10,11242,3360,DAY21,808799 - DAY2,ExtraLong,sub-11242_ses-DAY21_T1w.nii.gz,5d9dfb97a54d35003eb7f374,2010-06-02,1,2,694.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,139272,10109,10109,822831 - GRMPY,ExtraLong,sub-139272_ses-10109_T1w.nii.gz,5d9cac7ea54d35003eb749f4,2016-04-07,6,6,259.0,2.0,2.0,2.0
3108,139490,8461,PNC1,810336 - Big GO,ExtraLong,sub-139490_ses-PNC1_T1w.nii.gz,5d9cd114a54d350039b28ece,2013-08-30,1,2,105.0,1.0,2.0,2.0
3109,139490,10564,CONTE1,815814 - Conte,ExtraLong,sub-139490_ses-CONTE1_acq-moco_T1w.nii.gz,5d9ca3cfa54d350028b04b1d,2017-04-29,2,2,149.0,1.0,2.0,2.0
3110,139553,8410,PNC1,810336 - Big GO,ExtraLong,sub-139553_ses-PNC1_T1w.nii.gz,5d9d14b1a54d350042bcbddd,2013-08-23,1,2,107.0,2.0,2.0,2.0


# Step 6: Export final datafreeze as csv

In [774]:
# NOTE(review): the cell that normalises the sesid column appears further down
# in this notebook, although its execution count (773) precedes this cell's
# (774). Make sure it has run before exporting.

# Order the rows by subject, then by scan date. Rebinding avoids an in-place
# sort on a frame that was produced by filtering another frame.
freeze = freeze.sort_values(by=['bblid', 'doscan'])

# Output path of the data freeze. The csv has always been written to this
# fixed name; the cutoff-date based name that used to be computed here was
# never used (and its ISO timestamp contains ':' characters).
fname = "./csv/ExtraLong-Datafreeze-20210831.csv"
freeze.to_csv(fname, index=False)

In [761]:
# Show the rows of freeze that have a non-empty acqid.
freeze[freeze.acqid != ""]


Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
16,11399,8767,,818028 - Effort,NEFF_818028,MPRAGE_TI1100_ipat2_2.nii.gz,5c9e6630f546b6002aef9013,2014-02-27,4,4,458.0,2.0,2.0,2.0
26,11599,8555,,818028 - Effort,NEFF_818028,MPRAGE_TI1100_ipat2_3.nii.gz,5c9e663bf546b6002aef904d,2013-10-07,3,3,521.0,2.0,2.0,2.0
35,11801,8591,,818028 - Effort,NEFF_818028,MPRAGE_TI1100_ipat2_2.nii.gz,5c9e6639f546b60028eee5e1,2013-10-23,3,3,399.0,1.0,1.0,2.0
54,12202,8751,,818028 - Effort,NEFF_818028,MPRAGE_TI1100_ipat2_2.nii.gz,5c9e6630f546b60039efc75b,2014-02-18,3,3,412.0,2.0,2.0,2.0
104,13190,9846,NEFF1,818028 - Effort,ExtraLong,sub-13190_ses-NEFF1_acq-moco_T1w.nii.gz,5d9e055ba54d35003eb7f94c,2015-09-22,4,4,388.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,139272,10109,10109,822831 - GRMPY,ExtraLong,sub-139272_ses-10109_T1w.nii.gz,5d9cac7ea54d35003eb749f4,2016-04-07,6,6,259.0,2.0,2.0,2.0
3108,139490,8461,PNC1,810336 - Big GO,ExtraLong,sub-139490_ses-PNC1_T1w.nii.gz,5d9cd114a54d350039b28ece,2013-08-30,1,2,105.0,1.0,2.0,2.0
3109,139490,10564,CONTE1,815814 - Conte,ExtraLong,sub-139490_ses-CONTE1_acq-moco_T1w.nii.gz,5d9ca3cfa54d350028b04b1d,2017-04-29,2,2,149.0,1.0,2.0,2.0
3110,139553,8410,PNC1,810336 - Big GO,ExtraLong,sub-139553_ses-PNC1_T1w.nii.gz,5d9d14b1a54d350042bcbddd,2013-08-23,1,2,107.0,2.0,2.0,2.0


In [773]:
import string

# Session-label prefix to use for scans that have no sesid yet, keyed by the
# Flywheel project label the scan was found in.
SESSION_LABEL_DICT = {
    "22q_Midline_834246": "22QMID",
    "AGGY_808689": "AGGY",
    "CONTE_815814": "CONTE",
    "EONSX_810366": "EONSX",
    "GRMPY_822831": "GRMPY",
    "MOTIVE": "MOTIVE",
    "NEFF_818028": "NEFF",
    "PNC_CS_810336": "PNC",
    "PNC_LG_810336": "PNC",
    "SYRP_818621": "SYRP",
    "Evolution_833922": "EVOL"
}


def _normalized_sesid(scan):
    """Return the cleaned session label for one row of the freeze dataframe."""
    label = str(scan.sesid)

    # All numeric sesid's from old ExtraLong should be GRMPY.
    if label.isdecimal():
        return "GRMPY"

    # Upper-case motive so it matches the other labels.
    if label == 'motive1':
        return "MOTIVE"

    # No sesid yet: derive the label from the project the scan was found in.
    if label.lower() == 'nan':
        return SESSION_LABEL_DICT[scan.project]

    # Existing label: strip every trailing digit.
    # NOTE(review): this also removes digits that belong to the study name,
    # e.g. 'DAY21' becomes 'DAY' -- confirm that is intended.
    return label.rstrip(string.digits)


# Clean the sesid field for the entire dataframe.
for index, scan in freeze.iterrows():
    freeze.loc[index, "sesid"] = _normalized_sesid(scan)
freeze

Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,filename,acqid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
2,10410,6776,FNDM,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM11_T1w.nii.gz,5d9e0666a54d350038b39e11,2012-04-24,1,2,556.0,2.0,2.0,2.0
3,10410,6843,FNDM,810211 - FNDM,ExtraLong,sub-10410_ses-FNDM21_T1w.nii.gz,5d9e0972a54d350044c11e33,2012-05-11,2,2,557.0,2.0,2.0,2.0
7,11186,3395,DAY,808799 - DAY2,ExtraLong,sub-11186_ses-DAY21_T1w.nii.gz,5d9dfb8ba54d350042bd8142,2010-06-10,1,2,534.0,1.0,1.0,2.0
8,11186,6378,FNDM,810211 - FNDM,ExtraLong,sub-11186_ses-FNDM21_T1w.nii.gz,5d9e0c47a54d350042bd95b4,2012-02-03,2,2,554.0,1.0,1.0,2.0
10,11242,3360,DAY,808799 - DAY2,ExtraLong,sub-11242_ses-DAY21_T1w.nii.gz,5d9dfb97a54d35003eb7f374,2010-06-02,1,2,694.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,139272,10109,GRMPY,822831 - GRMPY,ExtraLong,sub-139272_ses-10109_T1w.nii.gz,5d9cac7ea54d35003eb749f4,2016-04-07,6,6,259.0,2.0,2.0,2.0
3108,139490,8461,PNC,810336 - Big GO,ExtraLong,sub-139490_ses-PNC1_T1w.nii.gz,5d9cd114a54d350039b28ece,2013-08-30,1,2,105.0,1.0,2.0,2.0
3109,139490,10564,CONTE,815814 - Conte,ExtraLong,sub-139490_ses-CONTE1_acq-moco_T1w.nii.gz,5d9ca3cfa54d350028b04b1d,2017-04-29,2,2,149.0,1.0,2.0,2.0
3110,139553,8410,PNC,810336 - Big GO,ExtraLong,sub-139553_ses-PNC1_T1w.nii.gz,5d9d14b1a54d350042bcbddd,2013-08-23,1,2,107.0,2.0,2.0,2.0


In [89]:
# comparison of old vs new run
# Load the scans a previous run could not find on Flywheel.
oldFailed = pd.read_csv('./csv/scans-not-found-on-flywheel.csv')

# Scans that failed in both runs (inner join on scanid).
common = failed.merge(oldFailed,on=['scanid'])

# Scans that failed before but are not in the current failed set; save and display them.
# NOTE(review): the output file name contains a space ('additional_found _scans.csv') -- confirm that is intended.
found = oldFailed[(~oldFailed.scanid.isin(common.scanid))]
found.to_csv('additional_found _scans.csv',index=False)
found


Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,projectid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
477,128079,11639,,833922 - EvolPsy,,,2021-01-27,2,3,210.0,1.0,2.0,2.0
478,89279,11550,,834246 - 22qmidline,,,2020-10-06,10,11,263.0,1.0,2.0,2.0
479,89279,11699,,843329 - LongGluCEST,,,2021-03-23,11,11,268.0,1.0,2.0,2.0
480,91422,11753,,843329 - LongGluCEST,,,2021-05-18,5,5,309.0,1.0,2.0,2.0
481,111720,11766,,843329 - LongGluCEST,,,2021-06-03,2,2,269.0,1.0,1.0,2.0
482,114738,11706,,843329 - LongGluCEST,,,2021-03-30,5,5,237.0,2.0,5.0,2.0
483,115783,11788,,843329 - LongGluCEST,,,2021-06-15,3,3,323.0,2.0,2.0,2.0
484,116354,11774,,843329 - LongGluCEST,,,2021-06-08,7,7,283.0,2.0,2.0,2.0
485,118864,11783,,843329 - LongGluCEST,,,2021-06-14,6,6,237.0,2.0,5.0,1.0
486,121085,11716,,843329 - LongGluCEST,,,2021-04-10,10,10,284.0,2.0,1.0,2.0
