In [3]:
## Import Python Packages and scripts
import sys, os, subprocess
import gen3

from gen3.auth import Gen3Auth # authentication SDK class
from gen3.query import Gen3Query # query SDK class

In [4]:
credentials = rf"C:\Users\Wahaj Sajid\Desktop\Research\credentials.json"
api = "https://data.midrc.org" 

In [5]:
auth = Gen3Auth(api, refresh_file=credentials) # authentication class
query = Gen3Query(auth) # query class

In [6]:
## "case" query parameters
## In this example, we're going to filter our patient cohort by asking for:
# female Asian patients in an age range that tested positive for COVID-19.

# demographic attributes / filters
race = "Asian"
sex = "Female"
min_age = 0
max_age = 75

# clinical attributes / filters
covid19_positive = "True"

# fields to return.  
fields = ["submitter_id", # "submitter_id" here is the case/patient's unique identifier in the database
    "project_id" # this is the "project" that the patient belongs to. by default, queries run across all projects
]

In [7]:
# Set a limit
limit = 500  # Download only 100 cases
## Run the query using the guppy graphQL service
data = query.raw_data_download(
                    data_type="case",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"race": race}},
                            {"=": {"sex": sex}},
                            {">=": {"age_at_index": min_age}},
                            {"<=": {"age_at_index": max_age}},
                            {"=": {"covid19_positive": "Yes"}}
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "submitter_id" in data[0]:
    data = data[:limit]
    case_ids = [i['submitter_id'] for i in data] ## make a list of the case (patient) IDs returned
    print("Query returned {} case IDs.".format(len(data)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 500 case IDs.
Data is a list with rows like this:
	 [{'project_id': 'Open-R1', 'submitter_id': '574856-002470'}]


In [8]:
## "data_file" query parameters
## In this example, we're asking for files from CT imaging studies of the chest

# imaging_study attributes / filters
source_node = "ct_series_file" # this will limit the files returned to those that are CT series
loinc_system = "Chest" # this is the LOINC-harmonized "body part examined" in the imaging study

# fields to return.  
fields = [
    "project_id", # this is the "project" that the file belongs to. by default, queries run across all projects
    "case_ids", # this is the "submitter_id" of the patient the file is associated with (the patient ID)
    "object_id", # this is the unique identifier (GUID) for a file in MIDRC which can be used to access/download the file
    "source_node", # this is the name of the node in the MIDRC data model under which the file is stored
    "file_name",
    "file_size"
]

In [9]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"=": {"source_node": source_node}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0 and "object_id" in data[0]:
    object_ids = [i['object_id'] for i in data] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 551 data files with 551 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['10003752-ZKSeLdFwMUDRGpBEpKBFg'], 'project_id': 'Open-A1', 'file_name': '10003752-ZKSeLdFwMUDRGpBEpKBFg/2.16.840.1.114274.1818.50064025241029693839083343429121218196/2.16.840.1.114274.1818.557028016880842653110777615882296339362.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/bc44a90d-f742-4fab-9adb-1b92281b0be5', 'file_size': 66086130}]


In [10]:
source_nodes = ["ct_series_file", "annotation_file","dicom_annotation_file"]

In [11]:
# note that the field "data_type" here has changed from "case" (example above) to "data_file". This is the name of the Elasticsearch index
data = query.raw_data_download(
                    data_type="data_file",
                    fields=fields,
                    filter_object={
                        "AND": [
                            {"in": {"source_node": source_nodes}},
                            {"=": {"loinc_system": loinc_system}},
                            {"IN": {"case_ids": case_ids}},
                        ]
                    },
                    sort_fields=[{"submitter_id": "asc"}]
                )

if len(data) > 0:
    object_ids = [i['object_id'] for i in data if 'object_id' in i] ## make a list of the file object_ids returned by our query
    print("Query returned {} data files with {} object_ids.".format(len(data),len(object_ids)))
    print("Data is a list with rows like this:\n\t {}".format(data[0:1]))
else:
    print("Your query returned no data! Please, check that query parameters are valid.")

Query returned 1112 data files with 1112 object_ids.
Data is a list with rows like this:
	 [{'case_ids': ['419639-003843'], 'project_id': 'Open-R1', 'file_name': '419639-003843/1.2.826.0.1.3680043.10.474.419639.731655360684491744764346739445/1.2.826.0.1.3680043.10.474.419639.170162526659076154887335862376.zip', 'source_node': 'ct_series_file', 'object_id': 'dg.MD1R/fd43660f-384f-45bb-ad4d-d2c670934901', 'file_size': 220805282}]


In [11]:
## Make a new directory for downloaded files
os.makedirs('downloads', exist_ok=True)

In [12]:
output_dir = rf"C:\Users\Wahaj Sajid\Desktop\Asian Women COVID 19 positve (Age  0 to 75) download data"

In [None]:
## Run the "gen3 drs-pull object" command to download one of the files
object_id = object_ids[0]
cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
os.system(cmd)

In [None]:
## Simple loop to download all files and keep track of success and failures
object_id = object_ids[0]
success,failure,other=[],[],[]
count,total = 0,len(object_ids)
for object_id in object_ids:
    count+=1
    cmd = f'gen3 --auth "{credentials}" --endpoint data.midrc.org drs-pull object {object_id} --output-dir "{output_dir}"'
    stout = subprocess.run(cmd, shell=True, capture_output=True)
    print("Progress ({}/{}): {}".format(count,total,stout.stdout))
    if "failed" in str(stout.stdout):
        failure.append(object_id)
    elif "successfully" in str(stout.stdout):
        success.append(object_id)
    else:
        other.append(object_id)             

Progress (3/1112): b''
Progress (4/1112): b''
Progress (5/1112): b''
Progress (6/1112): b''
Progress (7/1112): b''
Progress (8/1112): b''
Progress (9/1112): b''
Progress (10/1112): b''
Progress (11/1112): b''
Progress (12/1112): b''
Progress (13/1112): b''
Progress (14/1112): b''
Progress (15/1112): b''
Progress (16/1112): b''
Progress (17/1112): b''
Progress (18/1112): b''
Progress (19/1112): b''
Progress (20/1112): b''
Progress (21/1112): b''
Progress (22/1112): b''
Progress (23/1112): b''
Progress (24/1112): b''
Progress (25/1112): b''
Progress (26/1112): b''
Progress (27/1112): b''
Progress (28/1112): b''
Progress (29/1112): b''
Progress (30/1112): b''
Progress (31/1112): b''
Progress (32/1112): b''
Progress (33/1112): b''
Progress (34/1112): b''
Progress (35/1112): b''
Progress (36/1112): b''
Progress (37/1112): b''
Progress (38/1112): b''
Progress (39/1112): b''
Progress (40/1112): b''
Progress (41/1112): b''
Progress (42/1112): b''
Progress (43/1112): b''
Progress (44/1112): b''