# Querying Flywheel for the June 30th, 2021 Datafreeze
**Project:** ExtraLong <br>
**Author:** Katja Zoner <br>
**Date:** 07/30/2021 <br>

# Step 0: Setup

In [2]:
import os
import sys
import logging
import numpy as np
import pandas as pd
import flywheel


# Step 1: Read in superset csv, convert to dataframe, and clean.

In [3]:
# Load the superset of longitudinal scans and parse the date-of-scan column
# ("doscan") into a datetime dtype as part of the same expression.
csv = "all_long_scans_oracle_cleaned.csv"
superset = pd.read_csv(csv).assign(
    doscan=lambda scans: pd.to_datetime(scans["doscan"])
)
superset

Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
0,613,11660,700205 - Worden,2003-05-23,1,6,1240.0,,,,C2
1,615,11518,700205 - Worden,2003-05-27,1,2,356.0,2.0,2.0,,C2
2,597,11382,700205 - Worden,2003-07-17,1,4,260.0,2.0,2.0,2.0,C2
3,598,11565,700205 - Worden,2003-07-24,1,7,291.0,1.0,1.0,2.0,I7
4,599,11530,700205 - Worden,2003-07-29,1,4,331.0,1.0,2.0,2.0,C2
...,...,...,...,...,...,...,...,...,...,...,...
4252,11820,82039,833922 - EvolPsy,2021-07-12,5,5,367.0,2.0,2.0,2.0,
4253,11803,132782,842909 - TRANSCENDS_D1,2021-07-12,3,6,307.0,1.0,1.0,2.0,
4254,11804,132782,842909 - TRANSCENDS_D1,2021-07-19,4,6,307.0,1.0,1.0,2.0,
4255,11805,132782,842909 - TRANSCENDS_D1,2021-07-30,5,6,307.0,1.0,1.0,2.0,


# Step 2: Get list of scan protocols to include in 2021 data freeze.

In [4]:
# Read the protocol inclusion csv.
csv = "protocols_for_inclusion.csv"
inclusion_df = pd.read_csv(csv)

# A blank "include" cell is treated as True. The filled column is assigned
# back explicitly: calling fillna(inplace=True) on the attribute-accessed
# column is a chained in-place update, which does not modify inclusion_df
# under pandas Copy-on-Write and would silently drop every blank row below.
inclusion_df["include"] = inclusion_df["include"].fillna(True)

# Protocol labels whose "include" value is True.
protocols = inclusion_df.loc[
    inclusion_df["include"] == True, "scan_protocol"  # noqa: E712
].tolist()
protocols

['808689 - AGGY',
 '808922 - MGI2_PENN',
 '808799 - DAY2',
 '807360 - Olf Lifespan',
 '810336 - Big GO',
 '810336 - Go2 Supplement',
 '810211 - FNDM',
 'B10218 - MGI2_PITT',
 '817628 - EFDO',
 '816281 - NODRA',
 '810336 - GO3 FOLLOW UP',
 '810336 - Go3',
 '816275 - ONM',
 '815814 - Conte',
 '822937 - HARMONY',
 '818028 - Effort',
 '818621 - SYRP',
 '820690 - phASL',
 '825940 - GluCEST in Psychosis',
 '822831 - GRMPY',
 '825834 - satterttPiloting',
 '829502 - MOTIVE',
 '834246 - 22qmidline',
 '833922 - EvolPsy',
 '843329 - LongGluCEST']

# Step 3: Generate csv of scans to include in ExtraLong 2021 Data Freeze

In [8]:
# NOTE: filterBySessionCount is defined in a cell further down this notebook;
# that cell must be executed before this one.

# 1. Scans whose protocol appears in the inclusion list.
in_included_protocol = superset["scan_protocol"].isin(protocols)

# 2. Scans acquired strictly before the data freeze cutoff date.
cutoff_date = pd.to_datetime("2021-07-01")
before_cutoff = superset["doscan"] < cutoff_date

# 3. Of the scans passing both filters, keep only subjects with 2+ scans.
df = filterBySessionCount(superset[in_included_protocol & before_cutoff], 2)
df


Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
1059,3931,10180,808799 - DAY2,2010-10-11,1,2,566.0,1.0,2.0,2.0,IS4
1172,4451,10180,807360 - Olf Lifespan,2011-01-31,2,2,569.0,1.0,2.0,2.0,IS1
1818,6776,10410,810211 - FNDM,2012-04-24,1,2,556.0,2.0,2.0,2.0,IS1
1846,6843,10410,810211 - FNDM,2012-05-11,2,2,557.0,2.0,2.0,2.0,IS1
1112,4185,11176,808799 - DAY2,2010-11-23,1,2,337.0,1.0,1.0,2.0,IS1
...,...,...,...,...,...,...,...,...,...,...,...
3734,10739,139272,825940 - GluCEST in Psychosis,2018-03-29,8,8,282.0,2.0,2.0,2.0,IS4
2474,8461,139490,810336 - Big GO,2013-08-30,1,2,105.0,1.0,2.0,2.0,IS2
3601,10564,139490,815814 - Conte,2017-04-29,2,2,149.0,1.0,2.0,2.0,IS4
2464,8410,139553,810336 - Big GO,2013-08-23,1,2,107.0,2.0,2.0,2.0,IS2


In [7]:
def filterBySessionCount(df, thresh):
    """Keep subjects with at least `thresh` scans and renumber their sessions.

    Args:
        df: DataFrame with one row per scan. Must contain a "bblid" column
            (subject identifier) and a "doscan" column (date of scan).
        thresh: Minimum number of rows a bblid needs in `df` to be kept.

    Returns:
        A new DataFrame sorted by bblid then doscan, containing only the rows
        of subjects that meet the threshold. "ntimepoints" is set to the
        subject's row count in `df`, and "timepoint" numbers each subject's
        scans 1..n in doscan order. `df` itself is not modified.
    """
    # Scans per subject, aligned row-for-row with df.
    scan_counts = df["bblid"].map(df["bblid"].value_counts())

    # Rows with a missing bblid map to NaN, compare False, and are dropped.
    meets_threshold = scan_counts >= thresh
    kept = df[meets_threshold].copy()
    kept["ntimepoints"] = scan_counts[meets_threshold].astype("int64")

    # Sort BEFORE numbering: cumcount follows row order, so numbering an
    # unsorted frame would assign timepoints by input position, not scan date.
    kept = kept.sort_values(["bblid", "doscan"])
    kept["timepoint"] = kept.groupby("bblid").cumcount() + 1

    return kept

# Step 4: Add in information for scans previously in ExtraLong 2019

In [9]:
# Load the ExtraLong sesid <-> scanid mapping csv.
mapping_csv = "scanid_to_seslabel_10-16-2019.csv"
mapping = pd.read_csv(mapping_csv)

# Relabel the four columns by position (the session-label column becomes
# "sesid"), reorder them, and cast both id columns to int64 in one chain.
mapping.columns = ["project", "bblid", "scanid", "sesid"]
mapping = mapping[["bblid", "scanid", "sesid", "project"]].astype(
    {"bblid": "int64", "scanid": "int64"}
)

In [10]:
# Start from the filtered scans without the scanstat column, and add an
# empty-string projectid column.
xl = df.drop(columns="scanstat").assign(projectid="")

# Left-join the scanid-sesid mapping so scans present in it pick up their
# ExtraLong sesid and project; scans absent from it get NaN in both columns.
xl = xl.merge(mapping, how='left', on=['scanid', 'bblid'])

# Select and order the output columns.
xl = xl.loc[:, [
    'bblid', 'scanid', 'sesid', 'scan_protocol', 'project', 'projectid',
    'doscan', 'timepoint', 'ntimepoints', 'scanage_months',
    'sex', 'race', 'ethnic',
]]
xl

Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,projectid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
0,10180,3931,,808799 - DAY2,,,2010-10-11,1,2,566.0,1.0,2.0,2.0
1,10180,4451,,807360 - Olf Lifespan,,,2011-01-31,2,2,569.0,1.0,2.0,2.0
2,10410,6776,FNDM11,810211 - FNDM,FNDM1_810211,,2012-04-24,1,2,556.0,2.0,2.0,2.0
3,10410,6843,FNDM21,810211 - FNDM,FNDM2_810211,,2012-05-11,2,2,557.0,2.0,2.0,2.0
4,11176,4185,,808799 - DAY2,,,2010-11-23,1,2,337.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116,139272,10739,,825940 - GluCEST in Psychosis,,,2018-03-29,8,8,282.0,2.0,2.0,2.0
3117,139490,8461,PNC1,810336 - Big GO,PNC_CS_810336,,2013-08-30,1,2,105.0,1.0,2.0,2.0
3118,139490,10564,CONTE1,815814 - Conte,CONTE_815814,,2017-04-29,2,2,149.0,1.0,2.0,2.0
3119,139553,8410,PNC1,810336 - Big GO,PNC_CS_810336,,2013-08-23,1,2,107.0,2.0,2.0,2.0


# Step 5: Locating Scans on Flywheel

In [11]:
# Read the Flywheel API key from the FLYWHEEL_API_KEY environment variable
# rather than storing it in the notebook. When the variable is unset, the
# client is created without arguments so it uses the Flywheel CLI login.
# SECURITY: a key was previously hard-coded in this cell; it should be revoked
# and rotated, and removed from version-control history.
API_KEY = os.environ.get("FLYWHEEL_API_KEY")
PROJECT_LABEL = "ExtraLong"

# Original-project labels that getProjectInfo does not fall back to when a
# scan is missing from ExtraLong.
DEPRECATED_PROJECTS = [
    "DAY2_808799",
    "FNDM1_810211",
    "FNDM2_810211",
    "NODRA_816281",
    "ONM_816275"
]

# Scan protocol labels; presumably the protocols behind the deprecated
# projects above. Not referenced by the cells shown here. TODO confirm use.
DEPRECATED_PROTOCOLS = [
    "808799 - DAY2",
    "810211 - FNDM",
    "816281 - NODRA",
    "816275 - ONM"
]

# Get client
fw = flywheel.Client(API_KEY) if API_KEY else flywheel.Client()
assert fw, "Your Flywheel CLI credentials aren't set!"

# Get project object
xlProject = fw.projects.find_first('label="{}"'.format(PROJECT_LABEL))
assert xlProject, "Project not found!"



In [13]:
def queryFlywheel(query):
    """Run a structured Flywheel search and return the first hit's project.

    Uses the module-level Flywheel client `fw`.

    Args:
        query: Structured query string passed to `fw.search`.

    Returns:
        `[project_label, project_id]` taken from the first search result, or
        None when the search returns no results.
    """
    results = fw.search({'structured_query': query, 'return_type': 'project'}, size=100)

    if not results:
        return None

    # Report ambiguous matches using the query itself. The previous message
    # read `bblid`/`scanid` globals set by another cell and raised NameError
    # whenever this function was called from anywhere else.
    if len(results) > 1:
        print(f"WARNING: Found {len(results)} results for query: {query}")

    first_project = results[0].project
    return [first_project.label, first_project.id]
        
def getProjectInfo(scan):
    """Look up the Flywheel project label and id for one scan row.

    Args:
        scan: Row-like object exposing `project`, `bblid`, `sesid` and
            `scanid` attributes.

    Returns:
        Whatever `queryFlywheel` returns for the last query that was run:
        a `[project, projectid]` pair, or a falsy value when nothing matched.
    """
    # No project label yet: search all of Flywheel by raw scanid / bblid.
    if pd.isnull(scan.project):
        return queryFlywheel(
            f'session.label == {scan.scanid} AND subject.label == {scan.bblid} '
        )

    # The scan already carries a project label: look in ExtraLong first.
    subject_and_session = (
        f'subject.label == sub-{scan.bblid} AND session.label == ses-{scan.sesid} '
    )
    found = queryFlywheel(f'project.label == ExtraLong AND {subject_and_session}')

    # Stop here on a hit, or when the original project is deprecated.
    if found or scan.project in DEPRECATED_PROJECTS:
        return found

    # Otherwise fall back to searching the scan's original project.
    return queryFlywheel(f'project.label == {scan.project} AND {subject_and_session}')

In [14]:
# Index labels of xl rows whose scan could not be located on Flywheel.
failed_index = []

# For each entry in the ExtraLong dataframe, get Flywheel project label and id
for index, scan in xl.iterrows():

    # Module-level names read by queryFlywheel's multiple-results warning;
    # they must be refreshed on every iteration.
    bblid = scan.bblid
    scanid = scan.scanid
    sesid = scan.sesid

    proj_info = getProjectInfo(scan)

    # If results were found, add project and projectid to original xl dataframe.
    if proj_info:
        xl.loc[index, "project"] = proj_info[0]
        xl.loc[index, "projectid"] = proj_info[1]

    # If scan could not be found on Flywheel, remember its row label
    else:
        failed_index.append(index)

# Build the failed dataframe in one step. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame each time.
# Failed rows are never modified in the loop, so selecting them from xl here
# gives the same rows while keeping xl's column dtypes.
failed = xl.loc[failed_index].copy()

# Step 5: Locating Scans on Flywheel - RESULTS
## Tried to locate `3121` scans
## Successfully located `2628` scans
## Failed to locate `493` scans

In [40]:
# A scan counts as located only if the lookup loop wrote a Flywheel project id
# for it (projectid starts out as an empty string). The "project" column is
# not a reliable indicator: it is pre-filled from the 2019 sesid mapping, so a
# mapped scan that was NOT found on Flywheel would still have a non-null
# project and be counted both here and in `failed`.
worked = xl[xl.projectid != ""].copy()
print(f'Tried to located {len(xl)} scans on Flywheel.')
print(f'Found {len(worked)} scans.')
print(f'Failed to find {len(failed)} scans.')

print('\nScans that were located, broken down by project:')
worked.value_counts('project')


Tried to located 3121 scans on Flywheel.
Found 2628 scans.
Failed to find 493 scans.

Scans that were located, broken down by project:


project
ExtraLong             2339
MOTIVE                 103
Evolution_833922        77
NEFF_818028             39
PNC_CS_810336           27
22q_Midline_834246      21
EONSX_810366             9
PNC_LG_810336            4
GRMPY_822831             4
SYRP_818621              2
AGGY_808689              2
CONTE_815814             1
dtype: int64

In [39]:
def _protocol_sort_key(protocol):
    """Sort key: the second '-'-separated field of a scan protocol label."""
    return protocol.split('-')[1]


# Unique protocols among located and not-located scans, ordered by that key.
worked_protocols = sorted(worked.scan_protocol.unique(), key=_protocol_sort_key)
failed_protocols = sorted(failed.scan_protocol.unique(), key=_protocol_sort_key)

# Protocols for which no scan at all was located.
all_failed = [
    protocol for protocol in failed_protocols
    if protocol not in worked_protocols
]

print('Scans that were not located, broken down by project:')
failed.value_counts('scan_protocol')

Scans that were not located, broken down by project:


scan_protocol
820690 - phASL                   119
816275 - ONM                     108
807360 - Olf Lifespan             73
808922 - MGI2_PENN                35
825834 - satterttPiloting         31
808799 - DAY2                     24
825940 - GluCEST in Psychosis     18
822937 - HARMONY                  16
818621 - SYRP                     13
817628 - EFDO                     10
843329 - LongGluCEST               8
818028 - Effort                    8
810211 - FNDM                      6
B10218 - MGI2_PITT                 6
829502 - MOTIVE                    5
816281 - NODRA                     3
810336 - Go2 Supplement            3
815814 - Conte                     2
822831 - GRMPY                     1
810336 - Go3                       1
833922 - EvolPsy                   1
834246 - 22qmidline                1
808689 - AGGY                      1
dtype: int64

In [17]:
# Re-apply the session-count filter to the located scans: a subject stays in
# the freeze only with 2+ scans found on Flywheel. This also recomputes the
# timepoint / ntimepoints columns for the reduced set.
freeze = filterBySessionCount(worked, thresh=2)
freeze

Unnamed: 0,bblid,scanid,sesid,scan_protocol,project,projectid,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
2,10410,6776,FNDM11,810211 - FNDM,ExtraLong,5d76a90c0f98b7005e7e5744,2012-04-24,1,2,556.0,2.0,2.0,2.0
3,10410,6843,FNDM21,810211 - FNDM,ExtraLong,5d76a90c0f98b7005e7e5744,2012-05-11,2,2,557.0,2.0,2.0,2.0
7,11186,3395,DAY21,808799 - DAY2,ExtraLong,5d76a90c0f98b7005e7e5744,2010-06-10,1,2,534.0,1.0,1.0,2.0
8,11186,6378,FNDM21,810211 - FNDM,ExtraLong,5d76a90c0f98b7005e7e5744,2012-02-03,2,2,554.0,1.0,1.0,2.0
10,11242,3360,DAY21,808799 - DAY2,ExtraLong,5d76a90c0f98b7005e7e5744,2010-06-02,1,2,694.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,139272,10109,10109,822831 - GRMPY,ExtraLong,5d76a90c0f98b7005e7e5744,2016-04-07,6,6,259.0,2.0,2.0,2.0
3117,139490,8461,PNC1,810336 - Big GO,ExtraLong,5d76a90c0f98b7005e7e5744,2013-08-30,1,2,105.0,1.0,2.0,2.0
3118,139490,10564,CONTE1,815814 - Conte,ExtraLong,5d76a90c0f98b7005e7e5744,2017-04-29,2,2,149.0,1.0,2.0,2.0
3119,139553,8410,PNC1,810336 - Big GO,ExtraLong,5d76a90c0f98b7005e7e5744,2013-08-23,1,2,107.0,2.0,2.0,2.0


# Step 6: Export final datafreeze as csv

In [30]:
# Order rows by subject, then scan date, and write the freeze without the index.
# NOTE(review): cutoff_date is a pandas Timestamp, so isoformat() yields a
# date-and-time string (with ':' characters) in the filename - confirm this
# is the intended name before changing it.
freeze = freeze.sort_values(by=['bblid', 'doscan'])
freeze.to_csv(f"ExtraLong-datafreeze-{cutoff_date.isoformat()}.csv", index=False)

In [35]:
# Order the not-found scans by protocol, then subject, and write them out
# without the index.
failed = failed.sort_values(by=['scan_protocol', 'bblid'])
failed.to_csv("scans-not-found-on-flywheel.csv", index=False)