In [4]:
## Import Python Packages and scripts
import sys, os, subprocess
import gen3

from gen3.auth import Gen3Auth # authentication SDK class
from gen3.query import Gen3Query # query SDK class

In [5]:
credentials = rf"C:\Users\Wahaj Sajid\Desktop\Research\credentials.json"
api = "https://data.midrc.org" 

In [6]:
auth = Gen3Auth(api, refresh_file=credentials) # authentication class
query = Gen3Query(auth) # query class

In [11]:
## "case" query parameters
## In this example, we're going to filter our patient cohort by asking for:
# female Asian patients in an age range that tested positive for COVID-19.

# demographic attributes / filters
race = "Asian"
sex = "Female"
min_age = 0
max_age = 79

# clinical attributes / filters
covid19_positive = "True"

# fields to return.  
fields = ["submitter_id", # "submitter_id" here is the case/patient's unique identifier in the database
    "project_id" # this is the "project" that the patient belongs to. by default, queries run across all projects
]

In [13]:
# Set a limit
limit = 1000  # Download only 100 cases
## Run the query using the guppy graphQL service
data = query.raw_data_download(
                    data_type="case",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"race": race}},
                            {"=": {"sex": sex}},
                            {">=": {"age_at_index": min_age}},
                            {"<=": {"age_at_index": max_age}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "submitter_id" in data[0]:
    data = data[:limit]
    case_ids = [i['submitter_id'] for i in data] ## make a list of the case (patient) IDs returned
    print("Query returned {} case IDs.".format(len(data)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 1000 case IDs.
Data is a list with rows like this:
	 [{'project_id': 'Open-R1', 'submitter_id': '302028-009456'}]


In [14]:
## "data_file" query parameters
## In this example, we're asking for files from CT imaging studies of the chest

# imaging_study attributes / filters
source_node = "ct_series_file" # this will limit the files returned to those that are CT series
loinc_system = "Chest" # this is the LOINC-harmonized "body part examined" in the imaging study

# fields to return.  
fields = [
    "project_id", # this is the "project" that the file belongs to. by default, queries run across all projects
    "case_ids", # this is the "submitter_id" of the patient the file is associated with (the patient ID)
    "object_id", # this is the unique identifier (GUID) for a file in MIDRC which can be used to access/download the file
    "source_node", # this is the name of the node in the MIDRC data model under which the file is stored
    "file_name",
    "file_size"
]

In [15]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"source_node": source_node}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "object_id" in data[0]:
    object_ids = [i['object_id'] for i in data] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 1692 data files with 1692 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-003289'], 'project_id': 'Open-R1', 'file_name': '419639-003289/1.2.826.0.1.3680043.10.474.419639.294050514746867089723650867701/1.2.826.0.1.3680043.10.474.419639.194962688010589557896821390458.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/5638d128-a5fd-4f68-849d-70780195da5f', 'file_size': 50564126}]


In [16]:
source_nodes = ["ct_series_file", "annotation_file","dicom_annotation_file"]

In [17]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"in": {"source_node": source_nodes}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0:
    object_ids = [i['object_id'] for i in data if 'object_id' in i] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 3141 data files with 3141 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-002460'], 'project_id': 'Open-R1', 'file_name': 'dg.MD1R__0d7657f7-fbbf-4be3-baa7-5a404ae4808f__1.2.826.0.1.3680043.10.474.419639.198854991532662577544267984030__seg.dcm', 'source_node': 'dicom_annotation_file', 'object_id': 'dg.MD1R/6e69c0f8-05c3-4b64-a1c9-cabd4bdde401', 'file_size': 958192}]


In [11]:
## Make a new directory for downloaded files
os.makedirs('downloads', exist_ok=True)

In [18]:
output_dir = rf"C:\Users\Wahaj Sajid\Desktop\Asian Women (Age  0 to 79) download data"

In [19]:
## Run the "gen3 drs-pull object" command to download one of the files
object_id = object_ids[0]
cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
os.system(cmd)

0

In [None]:
## Simple loop to download all files and keep track of success and failures
object_id = object_ids[0]
success,failure,other=[],[],[]
count,total = 0,len(object_ids)
for object_id in object_ids:
    count+=1
    cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
    stout = subprocess.run(cmd, shell=True, capture_output=True)
    print("Progress ({}/{}): {}".format(count,total,stout.stdout))
    if "failed" in str(stout.stdout):
        failure.append(object_id)
    elif "successfully" in str(stout.stdout):
        success.append(object_id)
    else:
        other.append(object_id)             

Progress (1/3141): b'[2025-08-11 22:07:13,293][CRITICAL] IOError opening C:\\Users\\Wahaj Sajid\\Desktop\\Asian Women (Age  0 to 79) download data\\dg.MD1R__0d7657f7-fbbf-4be3-baa7-5a404ae4808f__1.2.826.0.1.3680043.10.474.419639.198854991532662577544267984030__seg.dcm for writing: (\'Connection broken: IncompleteRead(105739 bytes read, 852453 more expected)\', IncompleteRead(105739 bytes read, 852453 more expected))\r\n[2025-08-11 22:07:13,293][  ERROR] Object dg.MD1R/6e69c0f8-05c3-4b64-a1c9-cabd4bdde401 download failed.\r\n[2025-08-11 22:07:13,293][  ERROR] One or more objects have failed to be downloaded. Details:\r\n{"succeeded": [], "failed": ["dg.MD1R/6e69c0f8-05c3-4b64-a1c9-cabd4bdde401"]}\r\n'
Progress (2/3141): b''
Progress (3/3141): b'{"succeeded": ["dg.MD1R/81a6c3cd-2db3-43f4-a23b-48efced1992f"], "failed": []}\r\n'
Progress (4/3141): b'[2025-08-11 22:35:05,001][CRITICAL] 419639-003512/1.2.826.0.1.3680043.10.474.419639.301259917926451449400645667409/1.2.826.0.1.3680043.10.474.