In [30]:
## Import Python Packages and scripts
import sys, os, subprocess
import gen3

from gen3.auth import Gen3Auth # authentication SDK class
from gen3.query import Gen3Query # query SDK class

In [31]:
credentials = rf"C:\Users\Wahaj Sajid\Desktop\Research\credentials.json"
api = "https://data.midrc.org"

In [32]:
auth = Gen3Auth(api, refresh_file=credentials) # authentication class
query = Gen3Query(auth) # query class

In [33]:
## "case" query parameters
## In this example, we're going to filter our patient cohort by asking for:
# female Asian patients in an age range that tested positive for COVID-19.

# demographic attributes / filters
race = "Asian"
sex = "Female"
min_age = 75
max_age = 100

# clinical attributes / filters
covid19_positive = "No"

# fields to return.  
fields = ["submitter_id", # "submitter_id" here is the case/patient's unique identifier in the database
    "project_id" # this is the "project" that the patient belongs to. by default, queries run across all projects
]

In [34]:
# Set a limit
limit = 500  # Download only 100 cases
## Run the query using the guppy graphQL service
data = query.raw_data_download(
                    data_type="case",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"race": race}},
                            {"=": {"sex": sex}},
                            {">=": {"age_at_index": min_age}},
                            {"<=": {"age_at_index": max_age}},
                            {"=": {"covid19_positive": covid19_positive}}
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "submitter_id" in data[0]:
    data = data[:limit]
    case_ids = [i['submitter_id'] for i in data] ## make a list of the case (patient) IDs returned
    print("Query returned {} case IDs.".format(len(data)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

[2025-08-14 17:56:23,066][  ERROR] backoff: gave up call gen3.auth._write_to_file(<gen3.auth.Gen3Auth object at 0x0000023CF03D6F50>, C:\Users\Wahaj Sajid/.cache/gen3/token_cache_750b4eb9e1145ccf60ffdf0af86340697e7d394de4e2a420bc189e9c0e3cda81, eyJhbGciOiJSUzI1NiIsImtpZCI6ImZlbmNlX2tleV9rZXkiLCJ0eXAiOiJKV1QifQ.eyJwdXIiOiJhY2Nlc3MiLCJpc3MiOiJodHRwczovL2RhdGEubWlkcmMub3JnL3VzZXIiLCJhdWQiOlsiaHR0cHM6Ly9kYXRhLm1pZHJjLm9yZy91c2VyIiwib3BlbmlkIiwiYWRtaW4iLCJnb29nbGVfY3JlZGVudGlhbHMiLCJmZW5jZSIsImRhdGEiLCJ1c2VyIiwiZ29vZ2xlX3NlcnZpY2VfYWNjb3VudCIsImdvb2dsZV9saW5rIiwiZ2E0Z2hfcGFzc3BvcnRfdjEiXSwiaWF0IjoxNzU1MTc2MTgxLCJleHAiOjE3NTUxNzk3ODEsImp0aSI6ImI3NDM2MmI2LTM1MzMtNGM3ZC1hYTIzLThlM2U4MzUxN2YxYSIsInNjb3BlIjpbIm9wZW5pZCIsImFkbWluIiwiZ29vZ2xlX2NyZWRlbnRpYWxzIiwiZmVuY2UiLCJkYXRhIiwidXNlciIsImdvb2dsZV9zZXJ2aWNlX2FjY291bnQiLCJnb29nbGVfbGluayIsImdhNGdoX3Bhc3Nwb3J0X3YxIl0sImNvbnRleHQiOnsidXNlciI6eyJuYW1lIjoiMDAwMC0wMDAyLTEwOTgtOTgzMSIsImlzX2FkbWluIjpmYWxzZSwiZ29vZ2xlIjp7InByb3h5X2dyb3VwIjpudWxsfX19LCJhe

In [35]:
## "data_file" query parameters
## In this example, we're asking for files from CT imaging studies of the chest

# imaging_study attributes / filters
source_node = "ct_series_file" # this will limit the files returned to those that are CT series
loinc_system = "Chest" # this is the LOINC-harmonized "body part examined" in the imaging study

# fields to return.  
fields = [
    "project_id", # this is the "project" that the file belongs to. by default, queries run across all projects
    "case_ids", # this is the "submitter_id" of the patient the file is associated with (the patient ID)
    "object_id", # this is the unique identifier (GUID) for a file in MIDRC which can be used to access/download the file
    "source_node", # this is the name of the node in the MIDRC data model under which the file is stored
    "file_name",
    "file_size"
]

In [36]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"source_node": source_node}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "object_id" in data[0]:
    object_ids = [i['object_id'] for i in data] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 525 data files with 525 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-000448'], 'project_id': 'Open-R1', 'file_name': '419639-000448/1.2.826.0.1.3680043.10.474.419639.150745648013250387002335306014/1.2.826.0.1.3680043.10.474.419639.182484848410673202516204993978.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/924aae0b-c4f2-42c9-8c4d-a26d924fe086', 'file_size': 60588234}]


In [37]:
source_nodes = ["ct_series_file", "annotation_file","dicom_annotation_file"]

In [38]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"in": {"source_node": source_nodes}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0:
    object_ids = [i['object_id'] for i in data if 'object_id' in i] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 924 data files with 924 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-006498'], 'project_id': 'Open-R1', 'file_name': 'dg.MD1R__a2f4d0ed-c5f7-44bc-8422-5301406534c7__1.2.826.0.1.3680043.10.474.419639.101193335321403130038940578907__seg.dcm', 'source_node': 'dicom_annotation_file', 'object_id': 'dg.MD1R/c02d8bc1-7ba9-4f04-9cd3-81826862da8e', 'file_size': 1290156}]


In [39]:
output_dir = rf"C:\Users\Wahaj Sajid\Desktop\Asian Women COVID 19 negative (Age  75 to 100) download data"

In [11]:
## Run the "gen3 drs-pull object" command to download one of the files
object_id = object_ids[0]
cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
os.system(cmd)

KeyboardInterrupt: 

In [None]:
## Simple loop to download all files and keep track of success and failures
object_id = object_ids[0]
success,failure,other=[],[],[]
count,total = 0,len(object_ids)
for object_id in object_ids:
    count+=1
    cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
    stout = subprocess.run(cmd, shell=True, capture_output=True)
    print("Progress ({}/{}): {}".format(count,total,stout.stdout))
    if "failed" in str(stout.stdout):
        failure.append(object_id)
    elif "successfully" in str(stout.stdout):
        success.append(object_id)
    else:
        other.append(object_id)          

Progress (5/924): b''
Progress (6/924): b''
Progress (7/924): b''
Progress (8/924): b''
Progress (9/924): b''
Progress (10/924): b''
Progress (11/924): b''
Progress (12/924): b''
Progress (13/924): b''
Progress (14/924): b''
Progress (15/924): b''
