# Locating scans for the LongitudinalDiffusion Project on Flywheel
Get Flywheel **acqid** and **filename** for `anat`, `dwi`, and `fmap` files from all subjects/sessions in the following projects:
- 22Q_812481
- 22q_Midline_834246
- EFR01
- EONSX_810366
- Evolution_833922
- GRMPY_822831
- PNC_CS_810336
- PNC_LG_810336
- SYRP_818621

**Project:** LongitudinalDiffusion <br>
**Author:** Katja Zoner <br>
**Date:** 02/24/2022 <br>

In [102]:
from tqdm import tqdm
from datetime import datetime
import numpy as np
import pandas as pd
import flywheel

In [25]:
# Get client
fw = flywheel.Client()
assert fw, "Your Flywheel CLI credentials aren't set!"

In [180]:
PROJECT_LIST = [
    "22Q_812481",
    "22q_Midline_834246",
    "EFR01",
    "EONSX_810366",
    "Evolution_833922",
    "GRMPY_822831",
    "PNC_CS_810336",
    "PNC_LG_810336",
    "SYRP_818621"
]

BIDSIFIED_LIST = [
    "22Q_812481",
    "EFR01",
    "GRMPY_822831",
    "PNC_CS_810336",
    "PNC_LG_810336",
    "SYRP_818621"
]

In [26]:
def get_project(fw, label):
    '''
    Helper function to retrieve Flywheel project given project name.
    '''
    project = fw.projects.find_first('label="{}"'.format(label))
    assert project, f"Project '{label}' not found on Flywheel!"  
    return project

In [163]:
def get_from_bidsified(project, info):
    '''
    Loop through all sessions in a BIDSIFIED project to gather dict of 
    all niftis and associated files under the anat, dwi, and fmap folders.
    '''
    # Loop through subjects in project
    for sub in tqdm(project.subjects(), desc=f"Subjects processed", unit="subject", position=0):
        
        # Loop through sessions in subject
        for ses in sub.sessions():
            ses = ses.reload()
            
            # Loop through acquisitions in session
            for acq in ses.acquisitions():
                acq = acq.reload()

                # Loop through files in acquisition
                for f in acq.files:
                    
                    # If file is one of nifti/bval/bvec, try to get bids folder data.
                    if f.type in ['nifti', 'bval', 'bvec']:  
                        
                        try:
                            bids_folder = f.info['BIDS']['Folder']
                            # Add file metadata to info dict if file falls under one of anat, dwi, fmap BIDS dirs.
                            if bids_folder in ['anat', 'dwi', 'fmap']:
                                modality = f.info['BIDS']['Filename'].split(".")[0].split("_")[-1]
                                info[f.id] =  [project.label, sub.label, ses.label, acq.id, bids_folder, modality, f.name]

                        # KeyError if file info doesn't have BIDS entry; TypeError if it has BIDS that isn't a dict 
                        except (KeyError, TypeError) as e:
                        # except KeyError:
                            pass


            

## 1. Build dictionary of scan data for each anat/dwi/fmap file in the bidsified projects

In [164]:
# Create info dict with entries for each scan.
info = {}

In [178]:
done = []
# Loop through each BIDSIFIED project and locate scans. 
# Note: this takes a while, esp for projects with many subjects!
for project_label in BIDSIFIED_LIST:
    project = get_project(fw, project_label,)
    get_from_bidsified(project, info)
    done.append(project_label)
    print(f"Finished locating scans from project {project_label}.")

Subjects processed: 100%|██████████| 1601/1601 [3:00:16<00:00,  6.76s/subject]  


In [181]:
len(info)

28768

## 2. Convert dict to dataframe and clean data

In [264]:
def printCounts(df):
    print(f"Subject count: {len(df.bblid.unique())}")
    print(f"Sessions count: {len(df.sesid.unique())}")

In [224]:
# Convert info dict to pandas dataframe, rename columns, remove fileid col
df = pd.DataFrame.from_dict(info, orient='index').reset_index()

# Rename columns
df.columns=['fileid', 'project', 'bblid', 'sesid', 'acqid', 'folder', 'modality', 'filename']

# Remove fileid column
del df['fileid']

# Convert bblid and sesid to str
df['bblid'] = df.bblid.astype(str)
df['sesid'] = df.sesid.astype(str)

# WHAT TO DO WITH NON-DIGIT SESSION LABELS --> for now, removing these rows
df = df[df.sesid.apply(lambda x: x.isnumeric())]

# Strip additional leading zero from sesid (22Q project issue)
df.loc[df.project == '22Q_812481', 'sesid'] = df.loc[df.project == '22Q_812481', 'sesid'].apply(lambda x: x.lstrip('0'))

# Add leading zeros to bblid and sesid so ids are 6 and 5 digits long respectively
df['bblid'] = df.bblid.apply(lambda x: x.zfill(6))
df['sesid'] = df.sesid.astype(str).apply(lambda x: x.zfill(5))

df

Unnamed: 0,project,bblid,sesid,acqid,folder,modality,filename
0,22Q_812481,016533,09422,5c8fb15df546b6002ebda7f0,fmap,phasediff,B0map_onesizefitsall_v4_7_ph.nii.gz
1,22Q_812481,016533,09422,5c8fb15df546b6002ebda7f1,fmap,magnitude2,B0map_onesizefitsall_v4_6_e2.nii.gz
2,22Q_812481,016533,09422,5c8fb15df546b6002ebda7f1,fmap,magnitude1,B0map_onesizefitsall_v4_6_e1.nii.gz
3,22Q_812481,016533,09422,5c8fb15df546b6002fbdafc9,anat,T1w,MPRAGE_TI1100_ipat2_2.nii.gz
4,22Q_812481,016533,09422,5c8fb15df546b6002bbd9d23,dwi,dwi,DTI_2x32_35_10.nii.gz
...,...,...,...,...,...,...,...
28763,PNC_CS_810336,080010,02894,5d94bbf7a54d350042b4b970,fmap,magnitude2,B0map_onesizefitsall_v2_10_e2.nii.gz
28764,PNC_CS_810336,080010,02894,5d94bbf7a54d350042b4b970,fmap,magnitude1,B0map_onesizefitsall_v2_10_e1.nii.gz
28765,PNC_CS_810336,080010,02894,5d94bbf8a54d350042b4b971,fmap,phase1,B0map_onesizefitsall_v2_11_e1_ph.nii.gz
28766,PNC_CS_810336,080010,02894,5d94bbf8a54d350042b4b971,fmap,phase2,B0map_onesizefitsall_v2_11_e2_ph.nii.gz


In [222]:
# Print count of subjects with one of anat/dwi/fmap scans located by this script
printCounts(df)

Subject count: 1922
Sessions count: 2986


## 3. QC Dataframe --> check for 2+ sessions, and presence of T1w, dwi, fmap modalities

In [243]:
def get_t1w(session):
    return session[session.modality == "T1w"]

def get_dwi(session):
    return session[session.folder == "dwi"]

def get_fmap(session):
    return session[session.folder == "fmap"]

In [261]:
def get_required_files_from_valid_sessions(df):
    '''
    Filter to only include sessions that contain a T1w image, and scans under both fmap and dwi bids folders.
    Note: Could add better QC checks for project level dwi/fmap requirements, but this was easiest for a short deadline.
    '''
    required = pd.DataFrame(columns=df.columns)

    for sesid in df.sesid.unique():
        session_df = df[df.sesid == sesid]
        assert len(session_df.project.unique()) == 1, f"Session label {sesid} was found under multiple projects!"
        assert len(session_df.bblid.unique()) == 1, f"Session label {sesid} was found under multiple subjects!."
        
        t1w = get_t1w(session_df)
        dwi = get_dwi(session_df)
        fmap = get_fmap(session_df)

        if not t1w.empty and not dwi.empty and not fmap.empty:
            required = pd.concat([required, t1w, dwi,fmap], ignore_index=True)
        # else:
        #     print(f"Session {sesid} does not have all required scans")
    return required
    

In [295]:
def filter_by_session_count(df, thresh):
    '''
    Filter dataframe to only include subjects who have <threshold> or more scans.
    '''

    # Get session counts for each subject (bblid).
    bblid_counts = df.groupby(["bblid", "sesid"]).size().reset_index().groupby(["bblid"]).size().reset_index()
    bblid_counts.columns = ["bblid", "session_count"]
    bblid_counts.sort_values("session_count")

    # Get list of bblids that should be included
    include_bblids = bblid_counts.bblid[bblid_counts["session_count"]>=thresh]

    # Filter dataframe to only include subjects that meet session requirements
    df = df[df["bblid"].isin(include_bblids)].copy()

    # Update ntimepoints column to indicate number of sessions in ExtraLong 2021 for each subject.
    for bblid in df.bblid.unique():

        # Get subject's number of timepoints in ExtraLong 2021
        num_tps = bblid_counts[bblid_counts.bblid == bblid].session_count.item()

        # Update ntimeponts in ExtraLong dataframe
        df.loc[df.bblid==bblid, 'ntimepoints'] = num_tps

    # Update timepoint column to indicate timepoint number for each subject
    df["timepoint"] = df.groupby("bblid").cumcount()+1

    return df.sort_values(["bblid","sesid"])

In [272]:
# 1. Filter to only include sessions with T1w, dwi, and fmaps
upload_df = get_required_files_from_valid_sessions(df)
upload_df


Unnamed: 0,project,bblid,sesid,acqid,folder,modality,filename
0,22Q_812481,016533,09422,5c8fb15df546b6002fbdafc9,anat,T1w,MPRAGE_TI1100_ipat2_2.nii.gz
1,22Q_812481,016533,09422,5c8fb15df546b6002bbd9d23,dwi,dwi,DTI_2x32_35_10.nii.gz
2,22Q_812481,016533,09422,5c8fb15df546b6002bbd9d23,dwi,dwi,DTI_2x32_35_10.bvec
3,22Q_812481,016533,09422,5c8fb15df546b6002bbd9d23,dwi,dwi,DTI_2x32_35_10.bval
4,22Q_812481,016533,09422,5c8fb15df546b6002ebda7f2,dwi,dwi,DTI_2x32_36_11.nii.gz
...,...,...,...,...,...,...,...
25859,PNC_CS_810336,080010,02894,5d94bbf8a54d35003eb2d010,dwi,dwi,DTI_2x32_36_9.bval
25860,PNC_CS_810336,080010,02894,5d94bbf7a54d350042b4b970,fmap,magnitude2,B0map_onesizefitsall_v2_10_e2.nii.gz
25861,PNC_CS_810336,080010,02894,5d94bbf7a54d350042b4b970,fmap,magnitude1,B0map_onesizefitsall_v2_10_e1.nii.gz
25862,PNC_CS_810336,080010,02894,5d94bbf8a54d350042b4b971,fmap,phase1,B0map_onesizefitsall_v2_11_e1_ph.nii.gz


In [None]:
print(f"After excluding sessions without T1w, dwi, and fmp, we have:")
printCounts(upload_df)

After excluding sessions without T1w, dwi, and fmp, we have:
Subject count: 1687
Sessions count: 2599


In [304]:
# # 2. Final filter on resulting dataframe to only include subjects with 2+ valid sessions.
upload_df = filter_by_session_count(upload_df, 2)
upload_df

Unnamed: 0,project,bblid,sesid,acqid,folder,modality,filename,ntimepoints,timepoint
267,22Q_812481,015305,08932,5c8fb164f546b6002cbd9f9a,anat,T1w,MPRAGE_TI1100_ipat2_2.nii.gz,2.0,1
268,22Q_812481,015305,08932,5c8fb164f546b6002fbdafeb,dwi,dwi,DTI_2x32_36_11.nii.gz,2.0,2
269,22Q_812481,015305,08932,5c8fb164f546b6002fbdafeb,dwi,dwi,DTI_2x32_36_11.bvec,2.0,3
270,22Q_812481,015305,08932,5c8fb164f546b6002fbdafeb,dwi,dwi,DTI_2x32_36_11.bval,2.0,4
271,22Q_812481,015305,08932,5c8fb164f546b6002fbdafec,dwi,dwi,DTI_2x32_35_10.nii.gz,2.0,5
...,...,...,...,...,...,...,...,...,...
3795,PNC_LG_810336,139272,10040,5c76ef2dba2580003839f9fe,dwi,dwi,DTI_2x32_35_13.bvec,2.0,3
3796,PNC_LG_810336,139272,10040,5c76ef2dba2580003839f9fe,dwi,dwi,DTI_2x32_35_13.bval,2.0,4
3797,PNC_LG_810336,139272,10040,5c76ef2dba2580002938f366,fmap,phasediff,B0map_v4_9_ph.nii.gz,2.0,5
3798,PNC_LG_810336,139272,10040,5c76ef2dba2580003839f9f7,fmap,magnitude2,B0map_v4_8_e2.nii.gz,2.0,6


In [305]:
print(f"After excluding subjects with less than 2 sessions we have:")
printCounts(upload_df)

After excluding subjects with less than 2 sessions we have:
Subject count: 601
Sessions count: 1513


In [307]:
# Export dataframe to csv
filename = f"scans_for_download_upload_{datetime.now().isoformat()}.csv"
upload_df.to_csv("../../data/organize/" + filename, index=False)