In [1]:
## Import Python Packages and scripts
import sys, os, subprocess
import gen3

from gen3.auth import Gen3Auth # authentication SDK class
from gen3.query import Gen3Query # query SDK class

In [None]:
credentials = rf"C:\Users\Wahaj Sajid\Desktop\Research\credentials.json"
api = "https://data.midrc.org"

In [3]:
auth = Gen3Auth(api, refresh_file=credentials) # authentication class
query = Gen3Query(auth) # query class

In [4]:
## "case" query parameters
## In this example, we're going to filter our patient cohort by asking for:
# female Asian patients in an age range that tested positive for COVID-19.

# demographic attributes / filters
race = "Asian"
sex = "Female"
min_age = 0
max_age = 75

# clinical attributes / filters
covid19_positive = "false"

# fields to return.  
fields = ["submitter_id", # "submitter_id" here is the case/patient's unique identifier in the database
    "project_id" # this is the "project" that the patient belongs to. by default, queries run across all projects
]

In [5]:
# Set a limit
limit = 500  # Download only 100 cases
## Run the query using the guppy graphQL service
data = query.raw_data_download(
                    data_type="case",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"race": race}},
                            {"=": {"sex": sex}},
                            {">=": {"age_at_index": min_age}},
                            {"<=": {"age_at_index": max_age}},
                            {"=": {"covid19_positive": "No"}}
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "submitter_id" in data[0]:
    data = data[:limit]
    case_ids = [i['submitter_id'] for i in data] ## make a list of the case (patient) IDs returned
    print("Query returned {} case IDs.".format(len(data)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

[2025-08-13 16:28:49,825][  ERROR] backoff: gave up call gen3.auth._write_to_file(<gen3.auth.Gen3Auth object at 0x000001687DF58E90>, C:\Users\Wahaj Sajid/.cache/gen3/token_cache_750b4eb9e1145ccf60ffdf0af86340697e7d394de4e2a420bc189e9c0e3cda81, eyJhbGciOiJSUzI1NiIsImtpZCI6ImZlbmNlX2tleV9rZXkiLCJ0eXAiOiJKV1QifQ.eyJwdXIiOiJhY2Nlc3MiLCJpc3MiOiJodHRwczovL2RhdGEubWlkcmMub3JnL3VzZXIiLCJhdWQiOlsiaHR0cHM6Ly9kYXRhLm1pZHJjLm9yZy91c2VyIiwib3BlbmlkIiwiYWRtaW4iLCJnb29nbGVfY3JlZGVudGlhbHMiLCJmZW5jZSIsImRhdGEiLCJ1c2VyIiwiZ29vZ2xlX3NlcnZpY2VfYWNjb3VudCIsImdvb2dsZV9saW5rIiwiZ2E0Z2hfcGFzc3BvcnRfdjEiXSwiaWF0IjoxNzU1MDg0NTI4LCJleHAiOjE3NTUwODgxMjgsImp0aSI6ImVhOWY3OGE2LTViYzktNDQxNy1iZDk2LTI2ZmIwZjkzZWU3MCIsInNjb3BlIjpbIm9wZW5pZCIsImFkbWluIiwiZ29vZ2xlX2NyZWRlbnRpYWxzIiwiZmVuY2UiLCJkYXRhIiwidXNlciIsImdvb2dsZV9zZXJ2aWNlX2FjY291bnQiLCJnb29nbGVfbGluayIsImdhNGdoX3Bhc3Nwb3J0X3YxIl0sImNvbnRleHQiOnsidXNlciI6eyJuYW1lIjoiMDAwMC0wMDAyLTEwOTgtOTgzMSIsImlzX2FkbWluIjpmYWxzZSwiZ29vZ2xlIjp7InByb3h5X2dyb3VwIjpudWxsfX19LCJhe

In [6]:
## "data_file" query parameters
## In this example, we're asking for files from CT imaging studies of the chest

# imaging_study attributes / filters
source_node = "ct_series_file" # this will limit the files returned to those that are CT series
loinc_system = "Chest" # this is the LOINC-harmonized "body part examined" in the imaging study

# fields to return.  
fields = [
    "project_id", # this is the "project" that the file belongs to. by default, queries run across all projects
    "case_ids", # this is the "submitter_id" of the patient the file is associated with (the patient ID)
    "object_id", # this is the unique identifier (GUID) for a file in MIDRC which can be used to access/download the file
    "source_node", # this is the name of the node in the MIDRC data model under which the file is stored
    "file_name",
    "file_size"
]

In [7]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"source_node": source_node}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "object_id" in data[0]:
    object_ids = [i['object_id'] for i in data] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 966 data files with 966 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-003289'], 'project_id': 'Open-R1', 'file_name': '419639-003289/1.2.826.0.1.3680043.10.474.419639.294050514746867089723650867701/1.2.826.0.1.3680043.10.474.419639.194962688010589557896821390458.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/5638d128-a5fd-4f68-849d-70780195da5f', 'file_size': 50564126}]


In [8]:
source_nodes = ["ct_series_file", "annotation_file","dicom_annotation_file"]

In [9]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"in": {"source_node": source_nodes}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0:
    object_ids = [i['object_id'] for i in data if 'object_id' in i] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 1751 data files with 1751 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-002460'], 'project_id': 'Open-R1', 'file_name': 'dg.MD1R__0d7657f7-fbbf-4be3-baa7-5a404ae4808f__1.2.826.0.1.3680043.10.474.419639.198854991532662577544267984030__seg.dcm', 'source_node': 'dicom_annotation_file', 'object_id': 'dg.MD1R/6e69c0f8-05c3-4b64-a1c9-cabd4bdde401', 'file_size': 958192}]


In [11]:
## Make a new directory for downloaded files
os.makedirs('downloads', exist_ok=True)

In [10]:
output_dir = rf"C:\Users\Wahaj Sajid\Desktop\Asian Women COVID 19 negative (Age  0 to 75) download data"

In [None]:
## Run the "gen3 drs-pull object" command to download one of the files
object_id = object_ids[0]
cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
os.system(cmd)

In [None]:
## Simple loop to download all files and keep track of success and failures
object_id = object_ids[0]
success,failure,other=[],[],[]
count,total = 0,len(object_ids)
for object_id in object_ids:
    count+=1
    cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
    stout = subprocess.run(cmd, shell=True, capture_output=True)
    print("Progress ({}/{}): {}".format(count,total,stout.stdout))
    if "failed" in str(stout.stdout):
        failure.append(object_id)
    elif "successfully" in str(stout.stdout):
        success.append(object_id)
    else:
        other.append(object_id)          

Progress (59/1751): b''
Progress (60/1751): b''
Progress (61/1751): b''
Progress (62/1751): b''
Progress (63/1751): b''
Progress (64/1751): b''
Progress (65/1751): b''
Progress (66/1751): b''
Progress (67/1751): b''
Progress (68/1751): b''
Progress (69/1751): b''
Progress (70/1751): b''
Progress (72/1751): b''
Progress (73/1751): b''
Progress (74/1751): b''
Progress (75/1751): b''
Progress (76/1751): b''
Progress (77/1751): b''
Progress (78/1751): b''
Progress (79/1751): b''
Progress (80/1751): b''
Progress (81/1751): b''
Progress (82/1751): b''
Progress (83/1751): b''
Progress (84/1751): b''
Progress (85/1751): b''
Progress (86/1751): b''
Progress (87/1751): b''
Progress (88/1751): b''
Progress (89/1751): b''
Progress (90/1751): b''
Progress (91/1751): b''
Progress (92/1751): b''
Progress (93/1751): b''
Progress (94/1751): b''
Progress (95/1751): b''
Progress (96/1751): b''
Progress (97/1751): b''
Progress (98/1751): b''
Progress (99/1751): b''
Progress (100/1751): b''
Progress (101/1