# Querying Flywheel for the June 30th, 2021 Data Freeze
**Project:** ExtraLong <br>
**Author:** Katja Zoner <br>
**Date:** 07/30/2021 <br>

# Step 0: Setup

In [32]:
import os
import sys
import logging
import numpy as np
import pandas as pd
import flywheel


# Step 1: Read in superset csv, convert to dataframe, and clean.

In [33]:
# Load the superset of scans; the scan date column (doscan) is parsed into
# timestamps so it can be compared against the freeze cutoff later on.
csv = "all_long_scans_oracle_cleaned.csv"
superset = pd.read_csv(csv).assign(
    doscan=lambda frame: pd.to_datetime(frame["doscan"])
)
superset

Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
0,613,11660,700205 - Worden,2003-05-23,1,6,1240.0,,,,C2
1,615,11518,700205 - Worden,2003-05-27,1,2,356.0,2.0,2.0,,C2
2,597,11382,700205 - Worden,2003-07-17,1,4,260.0,2.0,2.0,2.0,C2
3,598,11565,700205 - Worden,2003-07-24,1,7,291.0,1.0,1.0,2.0,I7
4,599,11530,700205 - Worden,2003-07-29,1,4,331.0,1.0,2.0,2.0,C2
...,...,...,...,...,...,...,...,...,...,...,...
4252,11820,82039,833922 - EvolPsy,2021-07-12,5,5,367.0,2.0,2.0,2.0,
4253,11803,132782,842909 - TRANSCENDS_D1,2021-07-12,3,6,307.0,1.0,1.0,2.0,
4254,11804,132782,842909 - TRANSCENDS_D1,2021-07-19,4,6,307.0,1.0,1.0,2.0,
4255,11805,132782,842909 - TRANSCENDS_D1,2021-07-30,5,6,307.0,1.0,1.0,2.0,


# Step 2: Get list of scan protocols to include in 2021 data freeze.

In [27]:
# Read the protocol inclusion csv.
csv = "protocols_for_inclusion.csv"
inclusion_df = pd.read_csv(csv)

# A blank `include` cell is treated as "include this protocol".
# The filled column is assigned back explicitly: calling
# `inclusion_df.include.fillna(..., inplace=True)` acts on an intermediate
# Series and, under pandas copy-on-write, leaves inclusion_df unchanged.
inclusion_df["include"] = inclusion_df["include"].fillna(True)

# Keep only the protocols flagged for inclusion.
protocols = list(inclusion_df.scan_protocol[inclusion_df.include == True])
protocols

['808689 - AGGY',
 '808922 - MGI2_PENN',
 '808799 - DAY2',
 '807360 - Olf Lifespan',
 '810336 - Big GO',
 '810336 - Go2 Supplement',
 '810211 - FNDM',
 'B10218 - MGI2_PITT',
 '817628 - EFDO',
 '816281 - NODRA',
 '810336 - GO3 FOLLOW UP',
 '810336 - Go3',
 '816275 - ONM',
 '815814 - Conte',
 '822937 - HARMONY',
 '818028 - Effort',
 '818621 - SYRP',
 '820690 - phASL',
 '825940 - GluCEST in Psychosis',
 '822831 - GRMPY',
 '825834 - satterttPiloting',
 '829502 - MOTIVE',
 '834246 - 22qmidline',
 '833922 - EvolPsy',
 '843329 - LongGluCEST']

# Step 3: Generate csv of scans to include in ExtraLong 2021 Data Freeze

In [106]:
# 1. Filter the superset by protocol (only include scan_protocols from inclusion csv).
df = superset[superset["scan_protocol"].isin(protocols)]

# 2. Filter by data freeze cutoff date (keep scans dated strictly before 2021-07-01).
cutoff_date = pd.to_datetime("2021-07-01")
df = df[df.doscan < cutoff_date]

# Get scan counts for each subject (bblid).
bblid_counts = df.groupby(["bblid"]).size().reset_index()
bblid_counts.columns = ["bblid", "scan_count"]

# Get list of bblids that should be included
include_bblids = bblid_counts.bblid[bblid_counts["scan_count"]>1]

# 3. Filter by subject's session count (only include subjects with 2+ scans).
# Take an explicit copy: later cells assign columns on df, and doing that on
# a boolean-mask slice of superset raises SettingWithCopyWarning.
df = df[df["bblid"].isin(include_bblids)].copy()
df


Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
713,2143,13128,808922 - MGI2_PENN,2009-03-03,8,10,333.0,1.0,1.0,2.0,IS3
721,2052,12583,808922 - MGI2_PENN,2009-03-20,1,5,705.0,1.0,1.0,2.0,IS1
726,2124,13585,808922 - MGI2_PENN,2009-04-03,1,4,699.0,1.0,1.0,2.0,IS1
734,2172,14587,808689 - AGGY,2009-04-23,1,2,135.0,1.0,,,IS6
735,2171,14523,808922 - MGI2_PENN,2009-04-24,1,6,351.0,1.0,1.0,2.0,IS1
...,...,...,...,...,...,...,...,...,...,...,...
4236,11796,107712,834246 - 22qmidline,2021-06-22,2,2,296.0,2.0,5.0,2.0,IS1
4237,11799,98422,829502 - MOTIVE,2021-06-25,4,4,254.0,1.0,2.0,2.0,IS1
4238,11800,126555,833922 - EvolPsy,2021-06-25,2,2,275.0,2.0,5.0,2.0,IS4
4240,11801,132176,829502 - MOTIVE,2021-06-28,2,2,303.0,2.0,2.0,2.0,ISI


In [107]:
# Order each subject's scans chronologically BEFORE numbering them, so that
# the timepoint index follows the scan date rather than whatever row order df
# happened to arrive in. Rebinding df (instead of sorting in place) also
# avoids mutating a frame that an earlier cell displayed.
df = df.sort_values(["bblid", "doscan"])

# Update ntimepoints column to indicate number of sessions in ExtraLong 2021
# for each subject: count the rows per bblid and broadcast the count back
# onto every row of that subject (vectorized; no per-subject Python loop).
df["ntimepoints"] = df["bblid"].map(df["bblid"].value_counts())

# Update timepoint column to indicate timepoint number (1-based) for each subject
df["timepoint"] = df.groupby("bblid").cumcount() + 1


Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanstat
4099,11585,135085,833922 - EvolPsy,2020-11-17,2,3,219.0,2.0,1.0,2.0,IS1
4143,11650,135085,834246 - 22qmidline,2021-02-11,3,3,222.0,2.0,1.0,2.0,IS1
2445,8361,135484,810336 - Big GO,2013-08-18,1,2,196.0,1.0,1.0,2.0,IS1
3037,9627,135484,810336 - Go3,2015-06-27,2,2,218.0,1.0,1.0,2.0,IS1
2442,8347,138788,810336 - Go2 Supplement,2013-08-17,1,2,160.0,1.0,2.0,2.0,IS1
2942,9478,138788,810336 - Go3,2015-04-25,2,2,180.0,1.0,2.0,2.0,IS4
2478,8470,139181,810336 - Big GO,2013-08-31,1,2,162.0,2.0,2.0,,IS1
4120,11619,139181,833922 - EvolPsy,2021-01-11,2,2,250.0,2.0,2.0,,IS1
2475,8456,139272,810336 - Go2 Supplement,2013-08-30,1,8,227.0,2.0,2.0,2.0,IS2
2540,8631,139272,815814 - Conte,2013-11-13,2,8,230.0,2.0,2.0,2.0,IS4


# Step 4: Locate scans on Flywheel

In [131]:
# Build the Flywheel lookup table from the freeze dataframe: drop the two
# columns that are not needed here and append empty placeholders that the
# Flywheel query fills in with each scan's project label and project id.
xl = (
    df
    .drop(columns=["scanstat", "scanage_months"])
    .assign(project="", projectid="")
)
xl

Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,sex,race,ethnic,project,projectid
1059,3931,10180,808799 - DAY2,2010-10-11,1,2,1.0,2.0,2.0,,
1172,4451,10180,807360 - Olf Lifespan,2011-01-31,2,2,1.0,2.0,2.0,,
1818,6776,10410,810211 - FNDM,2012-04-24,1,2,2.0,2.0,2.0,,
1846,6843,10410,810211 - FNDM,2012-05-11,2,2,2.0,2.0,2.0,,
1112,4185,11176,808799 - DAY2,2010-11-23,1,2,1.0,1.0,2.0,,
...,...,...,...,...,...,...,...,...,...,...,...
3734,10739,139272,825940 - GluCEST in Psychosis,2018-03-29,8,8,2.0,2.0,2.0,,
2474,8461,139490,810336 - Big GO,2013-08-30,1,2,1.0,2.0,2.0,,
3601,10564,139490,815814 - Conte,2017-04-29,2,2,1.0,2.0,2.0,,
2464,8410,139553,810336 - Big GO,2013-08-23,1,2,2.0,2.0,2.0,,


In [79]:
# Read the Flywheel API key from the environment rather than hardcoding it:
# a key stored in a notebook is exposed to anyone who can read the file or
# its version history. (Any key that was previously committed here should be
# revoked and regenerated from the Flywheel user profile.)
API_KEY = os.environ.get("FW_API_KEY")
PROJECT_LABEL = "ExtraLong"

# Get client. When FW_API_KEY is not set, fall back to the credentials saved
# by the Flywheel CLI (`fw login`), which is what a bare Client() uses.
fw = flywheel.Client(API_KEY) if API_KEY else flywheel.Client()
assert fw, "Your Flywheel CLI credentials aren't set!"

# Get project object (currently disabled)
#project = fw.projects.find_first('label="{}"'.format(PROJECT_LABEL))
#assert project, "Project not found!"

In [137]:
def getProjectInfo(bblid, scanid):
    """Look up the Flywheel project for one scan and record it in `xl`.

    Searches Flywheel (via the module-level client `fw`) for the session whose
    label equals `scanid` under the subject whose label equals `bblid`.

    When exactly one result comes back, its project label and project id are
    written into the "project" and "projectid" columns of every row of the
    module-level dataframe `xl` whose scanid matches. Otherwise a warning is
    printed and `xl` is left untouched.

    Args:
        bblid: Subject identifier, matched against the Flywheel subject label.
        scanid: Scan identifier, matched against the Flywheel session label.

    Returns:
        None. The result is stored in `xl` as a side effect.
    """
    structured_query = f'session.label == {scanid} AND subject.label == {bblid} '
    search_request = {'structured_query': structured_query, 'return_type': 'project'}
    hits = fw.search(search_request, size=100)

    # Anything other than a single unambiguous match is reported, not recorded.
    if len(hits) != 1:
        print(f"WARNING: Found {len(hits)} results for bblid: {bblid}, scanid: {scanid}")
        return

    # Add project label and id to the rows belonging to this scan.
    rows_for_scan = xl["scanid"] == scanid
    xl.loc[rows_for_scan, "project"] = hits[0].project.label
    xl.loc[rows_for_scan, "projectid"] = hits[0].project.id

In [138]:
# Query Flywheel once per scan in the ExtraLong dataframe; getProjectInfo
# writes the project label and id back into xl. Only the two identifier
# columns are needed, so they are snapshotted and walked in parallel.
for bblid, scanid in zip(xl["bblid"].tolist(), xl["scanid"].tolist()):
    getProjectInfo(bblid, scanid)



In [153]:
# Scan protocols for which at least one scan was matched to a Flywheel project.
it_worked = xl.loc[xl["project"] != "", "scan_protocol"].unique()

# Project labels found among all scans of those protocols; an empty string in
# the result means some scans of an otherwise-matched protocol were not found.
xl.loc[xl["scan_protocol"].isin(it_worked), "project"].unique()

array(['SYRP_818621', 'MOTIVE', 'GRMPY_822831', '', '22q_Midline_834246',
       'PNC_CS_810336', 'EONSX_810366', 'Evolution_833922'], dtype=object)

In [162]:
# Count scans per (Flywheel project, scan protocol) pair, ordered by protocol,
# and save the mapping. Scans with no match keep the empty project label.
study_to_protocol_map = (
    xl.groupby(["project", "scan_protocol"])
    .size()
    .reset_index(name="scan_count")
    .sort_values("scan_protocol")
    .rename(columns={"project": "flywheel_project"})
)
study_to_protocol_map.to_csv("fwproject_to_scanprotocol.csv", index=False)

In [160]:
# Inspect the lookup results for a single protocol.
xl.loc[xl["scan_protocol"] == '834246 - 22qmidline']

Unnamed: 0,scanid,bblid,scan_protocol,doscan,timepoint,ntimepoints,sex,race,ethnic,project,projectid
4137,11638,20160,834246 - 22qmidline,2021-01-26,3,3,2.0,2.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4200,11732,20786,834246 - 22qmidline,2021-04-23,2,2,2.0,2.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4098,11586,87538,834246 - 22qmidline,2020-11-17,7,7,1.0,2.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4077,11550,89279,834246 - 22qmidline,2020-10-06,10,11,1.0,2.0,2.0,,
4163,11677,93242,834246 - 22qmidline,2021-03-06,9,9,1.0,1.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4179,11701,94333,834246 - 22qmidline,2021-03-26,3,3,2.0,1.0,1.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4152,11661,94378,834246 - 22qmidline,2021-02-20,2,3,1.0,2.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4215,11760,96659,834246 - 22qmidline,2021-05-25,7,7,1.0,1.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4177,11698,100079,834246 - 22qmidline,2021-03-23,5,5,1.0,1.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
4151,11662,103035,834246 - 22qmidline,2021-02-20,3,4,1.0,1.0,2.0,22q_Midline_834246,5f5f75415079889f551bd6c5
