In [1]:
import requests
import json
import io
import pickle

import pandas as pd

In [2]:
# URL of the GDC API "files" endpoint; the POST requests in this notebook are sent here.
files_endpt = 'https://api.gdc.cancer.gov/files'

# Field Groups

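- [GDC API Users Guide, Appendix A: field group listing by endpoint](https://docs.gdc.cancer.gov/API/Users_Guide/Appendix_A_Available_Fields/#field-group-listing-by-endpoint)

- [GDC Encyclopedia: TCGA Barcode](https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/)

- [TCGA code tables](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables)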

In [3]:
# Query filter for the files endpoint: an "and" node combining the two clauses below.
filters = {
    "op": "and",
    "content": [
        # Clause 1: cases.primary_site is one of the four listed sites.
        {
            "op": "in",
            "content": {
                "field": "cases.primary_site",
                "value": ["Breast", "Brain", "Bronchus and lung", "Colon"],
            },
        },
        # Clause 2: files.data_format equals SVS (the value is given as a one-element list).
        {
            "op": "=",
            "content": {
                "field": "files.data_format",
                "value": ["SVS"],
            },
        },
    ],
}

In [4]:
# Request parameters for the JSON export of the file metadata.
# NOTE(review): the filter is serialized to a JSON string before being placed in the
# POST body; GDC's documented POST examples appear to pass the filter object itself --
# confirm the API accepts both forms.
json_params = dict(
    filters=json.dumps(filters),
    # Related entities to expand in each record, joined into one comma-separated string.
    # "cases.project" is listed twice, exactly as in the original parameter string.
    expand=",".join(
        [
            "cases.project",
            "cases.demographic",
            "cases.diagnoses",
            "cases.samples",
            "cases.tissue_source_site",
            "cases.project",
            "cases",
        ]
    ),
    format="json",
    # Sent as a string; presumably the maximum number of records to return -- TODO confirm.
    size="20000",
)

In [5]:
# Request the metadata in JSON format and save it to all_metadata.json.
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=json_params)
# Stop on an HTTP error status (raises requests.HTTPError) instead of saving an
# error payload as if it were metadata.
response.raise_for_status()
# Decode the body BEFORE opening the output file: open(..., 'w') truncates the file
# immediately, so a decode failure inside the `with` block would leave an empty
# all_metadata.json behind (and destroy any previous copy).
all_metadata_payload = response.json()
with open('all_metadata.json', 'w') as f:
    json.dump(all_metadata_payload, f)

In [6]:
# Request parameters for the CSV export of the file metadata; same filter, expand
# list and size as the JSON request, differing only in "format".
# NOTE(review): the filter is serialized to a JSON string before being placed in the
# POST body; GDC's documented POST examples appear to pass the filter object itself --
# confirm the API accepts both forms.
csv_params = dict(
    filters=json.dumps(filters),
    # Related entities to expand in each record, joined into one comma-separated string.
    # "cases.project" is listed twice, exactly as in the original parameter string.
    expand=",".join(
        [
            "cases.project",
            "cases.demographic",
            "cases.diagnoses",
            "cases.samples",
            "cases.tissue_source_site",
            "cases.project",
            "cases",
        ]
    ),
    format="csv",
    # Sent as a string; presumably the maximum number of records to return -- TODO confirm.
    size="20000",
)

In [7]:
# Request the metadata in CSV format, load it into a DataFrame and save it to metadata.csv.
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=csv_params)
# Stop on an HTTP error status (raises requests.HTTPError); otherwise an error body
# would be handed to read_csv below and written out as if it were metadata.
response.raise_for_status()
# The body is decoded explicitly as UTF-8 and every column is read with dtype='object',
# so pandas performs no numeric type inference on the values.
df = pd.read_csv(io.StringIO(response.content.decode('utf-8')), dtype='object')
# index=False: do not write the DataFrame's row index as an extra column.
df.to_csv('metadata.csv', index=False)