In [2]:
## Import Python Packages and scripts
import json
import sys, os, subprocess
import gen3

from gen3.auth import Gen3Auth # authentication SDK class
from gen3.query import Gen3Query # query SDK class

In [3]:
# Path to the credentials file handed to Gen3Auth as `refresh_file`.
# Set the MIDRC_CREDENTIALS environment variable to use a different file without
# editing the notebook; the original local path remains the default.
credentials = os.environ.get(
    "MIDRC_CREDENTIALS",
    r"C:\Users\Wahaj Sajid\Desktop\Research\credentials.json",
)
# Base URL of the data commons that authentication and queries run against.
api = "https://data.midrc.org"

In [4]:
# Authentication object: built from the commons URL and the credentials file.
auth = Gen3Auth(endpoint=api, refresh_file=credentials)
# Query client: every query below goes through this authenticated object.
query = Gen3Query(auth)

In [5]:
## Parameters for the "case" (patient) query
## The cohort in this example is described by the values below:
# female Asian patients inside an age range, plus a COVID-19 positivity value.

# --- demographic attributes / filters ---
sex = "Female"
race = "Asian"
min_age, max_age = 79, 89

# --- clinical attributes / filters ---
covid19_positive = "True"

# --- fields returned for each matching case ---
fields = [
    # the case/patient's unique identifier in the database
    "submitter_id",
    # the "project" the patient belongs to; by default, queries run across all projects
    "project_id",
]

In [6]:
## Run the "case" query using the guppy service
# NOTE(review): `covid19_positive` is defined alongside the other cohort parameters
# but is not part of this filter, so the cohort is not restricted by COVID-19
# status. Add a clause for it here if that restriction is intended (confirm the
# field name and accepted values against the data dictionary first).
data = query.raw_data_download(
    data_type="case",
    fields=fields,
    filter_object={
        "AND": [
            {"=": {"race": race}},
            {"=": {"sex": sex}},
            {">=": {"age_at_index": min_age}},
            {"<=": {"age_at_index": max_age}},
        ]
    },
    sort_fields=[{"submitter_id": "asc"}],
)

# Always rebind case_ids, even when nothing comes back. Otherwise a rerun with
# different parameters that returns no rows would leave the previous cohort in the
# kernel and the file queries below would silently use it.
case_ids = [row["submitter_id"] for row in data if "submitter_id" in row]

if case_ids:
    print("Query returned {} case IDs.".format(len(data)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 223 case IDs.
Data is a list with rows like this:
	 [{'project_id': 'Open-R1', 'submitter_id': '639127-003079'}]


In [7]:
## Parameters for the "data_file" query
## This example requests files that belong to CT imaging studies of the chest

# --- imaging_study attributes / filters ---
# the LOINC-harmonized "body part examined" in the imaging study
loinc_system = "Chest"
# limits the files returned to those that are CT series
source_node = "ct_series_file"

# --- fields returned for each matching file ---
fields = [
    # the "project" the file belongs to; by default, queries run across all projects
    "project_id",
    # the "submitter_id" of the patient the file is associated with (the patient ID)
    "case_ids",
    # the unique identifier (GUID) for a file in MIDRC, used to access/download the file
    "object_id",
    # the name of the node in the MIDRC data model under which the file is stored
    "source_node",
    "file_name",
    "file_size",
]

In [8]:
# note that "data_type" has changed from "case" (example above) to "data_file".
# This is the name of the Elasticsearch index being queried.
# The filter keeps files of one source node and body part that belong to the
# patients selected by the case query (case_ids).
data = query.raw_data_download(
    data_type="data_file",
    fields=fields,
    filter_object={
        "AND": [
            {"=": {"source_node": source_node}},
            {"=": {"loinc_system": loinc_system}},
            {"IN": {"case_ids": case_ids}},
        ]
    },
    sort_fields=[{"submitter_id": "asc"}],
)

# Always rebind object_ids (empty when nothing matched) so a stale list from an
# earlier run can never reach the download cells. Rows are checked individually
# instead of trusting that the first row is representative.
object_ids = [row["object_id"] for row in data if "object_id" in row]

if object_ids:
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 354 data files with 354 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-003994'], 'project_id': 'Open-R1', 'file_name': '419639-003994/1.2.826.0.1.3680043.10.474.419639.199624900112098574746851092487/1.2.826.0.1.3680043.10.474.419639.178807703596765069001100631802.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/f6f56e26-0b9c-499a-9127-bfa6e6cf71d3', 'file_size': 61626224}]


In [9]:
# Source nodes accepted by the next file query: the CT series node plus the two
# annotation nodes listed here.
source_nodes = [
    "ct_series_file",
    "annotation_file",
    "dicom_annotation_file",
]

In [10]:
# Same "data_file" query as before, but matching any of the nodes in `source_nodes`
# (an "in" clause) instead of a single `source_node`.
data = query.raw_data_download(
    data_type="data_file",
    fields=fields,
    filter_object={
        "AND": [
            {"in": {"source_node": source_nodes}},
            {"=": {"loinc_system": loinc_system}},
            {"IN": {"case_ids": case_ids}},
        ]
    },
    sort_fields=[{"submitter_id": "asc"}],
)

# Rebind object_ids unconditionally. Previously an empty result left the list from
# the single-node query in place, so the download cells would have fetched that
# older selection without any warning.
object_ids = [row["object_id"] for row in data if "object_id" in row]

if len(data) > 0:
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 756 data files with 756 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-006498'], 'project_id': 'Open-R1', 'file_name': 'dg.MD1R__a2f4d0ed-c5f7-44bc-8422-5301406534c7__1.2.826.0.1.3680043.10.474.419639.101193335321403130038940578907__seg.dcm', 'source_node': 'dicom_annotation_file', 'object_id': 'dg.MD1R/c02d8bc1-7ba9-4f04-9cd3-81826862da8e', 'file_size': 1290156}]


In [11]:
## Ensure a "downloads" folder exists under the current working directory
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(name='downloads', exist_ok=True)

In [12]:
# Directory handed to `gen3 drs-pull object --output-dir` in the download cells.
# Set the MIDRC_OUTPUT_DIR environment variable to download somewhere else without
# editing the notebook; the original local path remains the default.
output_dir = os.environ.get(
    "MIDRC_OUTPUT_DIR",
    r"C:\Users\Wahaj Sajid\Desktop\Research\Research Project\MIDRC\downloads",
)
# Create the directory that is actually used for downloads; creating only a relative
# "downloads" folder does not guarantee that this path exists.
os.makedirs(output_dir, exist_ok=True)

In [13]:
## Run the "gen3 drs-pull object" command to download one of the files
object_id = object_ids[0]
# Pass the command as an argument list instead of a shell string: paths containing
# spaces need no manual quoting, and an object_id returned by the query service is
# never interpreted by a shell.
cmd = [
    "gen3",
    "--auth", credentials,
    "--endpoint", "data.midrc.org",
    "drs-pull", "object", object_id,
    "--output-dir", output_dir,
]
# Last expression of the cell: the process exit status (0 means the command succeeded).
subprocess.run(cmd).returncode

0

In [None]:
## Simple loop to download all files and keep track of success and failures


def _classify_drs_pull(stdout_text, object_id):
    """Classify one `gen3 drs-pull object` run as "success", "failure" or "other".

    The command reports its result as a JSON object such as
    ``{"succeeded": ["<id>"], "failed": []}``. Because that report always contains
    the word "failed" as a key, searching the raw output for "failed" marks every
    successful download as a failure. The report is therefore parsed and the
    object_id is looked up in its lists.

    Args:
        stdout_text: Decoded standard output of the command.
        object_id: The object that was requested.

    Returns:
        "success" or "failure" when a report lists the object accordingly;
        "other" when a report exists but does not mention the object.
        Without any parsable report, the original wording check is applied
        ("failed" -> "failure", "successfully" -> "success", else "other").
    """
    saw_report = False
    # The report may be preceded by log lines, so scan from the end of the output.
    for line in reversed(stdout_text.splitlines()):
        candidate = line.strip()
        if not candidate.startswith("{"):
            continue
        try:
            report = json.loads(candidate)
        except ValueError:
            continue
        if not isinstance(report, dict):
            continue
        saw_report = True
        if object_id in (report.get("succeeded") or []):
            return "success"
        if object_id in (report.get("failed") or []):
            return "failure"
    if saw_report:
        return "other"
    # No JSON report at all: fall back to the wording of the log output.
    if "failed" in stdout_text:
        return "failure"
    if "successfully" in stdout_text:
        return "success"
    return "other"


success,failure,other=[],[],[]
outcome_lists = {"success": success, "failure": failure, "other": other}
count,total = 0,len(object_ids)
for object_id in object_ids:
    count+=1
    # Argument list (no shell): paths with spaces need no quoting and the object_id
    # coming from the query service is never interpreted by a shell.
    cmd = [
        "gen3",
        "--auth", credentials,
        "--endpoint", "data.midrc.org",
        "drs-pull", "object", object_id,
        "--output-dir", output_dir,
    ]
    stout = subprocess.run(cmd, capture_output=True)
    print("Progress ({}/{}): {}".format(count,total,stout.stdout))
    # Decode leniently so unexpected bytes in the log output cannot abort the loop.
    outcome = _classify_drs_pull(stout.stdout.decode("utf-8", errors="replace"), object_id)
    outcome_lists[outcome].append(object_id)

Progress (1/756): b'{"succeeded": ["dg.MD1R/c02d8bc1-7ba9-4f04-9cd3-81826862da8e"], "failed": []}\r\n'
Progress (2/756): b'{"succeeded": ["dg.MD1R/f76e5030-b542-437e-8c33-3a43637062c9"], "failed": []}\r\n'
Progress (3/756): b'[2025-08-10 17:36:56,252][CRITICAL] 419639-004250/1.2.826.0.1.3680043.10.474.419639.618144258865779130534258903248/1.2.826.0.1.3680043.10.474.419639.299189889092767238771794258172.zip had an issue while being unpackaged: [Errno 2] No such file or directory: \'C:\\\\Users\\\\Wahaj Sajid\\\\Desktop\\\\Research\\\\Research Project\\\\MIDRC\\\\downloads\\\\419639-004250\\\\1.2.826.0.1.3680043.10.474.419639.618144258865779130534258903248\\\\1.2.826.0.1.3680043.10.474.419639.299189889092767238771794258172\\\\1.2.826.0.1.3680043.10.474.419639.177184149712432440781913301644.dcm\'\r\n[2025-08-10 17:36:56,252][  ERROR] Object dg.MD1R/71323203-3346-44a6-b8d7-c18196369fdd download failed.\r\n[2025-08-10 17:36:56,252][  ERROR] One or more objects have failed to be downloaded. 