In [None]:
##  This works with HTTPX
async def foo():
    async with httpx.AsyncClient() as client:
        url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues'
        r = await client.get(url) #'https://www.example.com/')
    return r


for i in range(5):
    start = time.time()
    await asyncio.gather(*[foo() for x in range(i)])
    print(f'{i} runs in {time.time() - start} seconds')


# Call TCIA API

In [1]:
import httpx
import asyncio
import pandas as pd
import azure.storage.queue as asq
import os
import json

# variables used throughout
tciabase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query'


In [2]:
def handleRateLimits(headers):
    limits = dict([(h,int(headers[h])) for h in headers if h.find('x-rate') != -1])
    # Example : {'x-ratelimit-limit-hour': 360000, 'x-ratelimit-remaining-hour': 359961, 'x-ratelimit-limit-second': 1000, 'x-ratelimit-remaining-second': 999}
    
    # TODO: send a queue message to a queue which will 'pause' the querying from TCIA for the right time

## Get Collections

In [3]:
async def getCollectionsAsync():
    urlGetCollections = f'{tciabase}/getCollectionValues'
    async with httpx.AsyncClient() as client:
        r = await client.get(urlGetCollections)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [4]:
#tcia_collections = await getCollectionsAsync()

In [5]:
#len(tcia_collections)
#print(tcia_collections)

In [6]:

# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getPatientStudy?Collection=TCGA-GBM
# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465  
# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage?SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.306204232344341694648035234440



## Get PatientStudies in Collection

In [7]:
async def getPatientStudiesPerCollection(collection):
    urlGetPatientStudyBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getPatientStudy'
    params = {'Collection': collection}
    async with httpx.AsyncClient() as client:
        r = await client.get(urlGetPatientStudyBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [8]:
#studies = await getPatientStudiesPerCollection(collections[0]['Collection'])

In [9]:
#print(len(studies))
#print(studies[0])

In [10]:
#study_ids = [s['StudyInstanceUID'] for s in studies] # if s.find('StudyInstanceUID') != -1]

## Get Series

In [11]:
async def getSeriesPerStudy(study):
    urlGetSeriesBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries'
    params = {'StudyInstanceUID': study}
    async with httpx.AsyncClient() as client:
        r = await client.get(urlGetSeriesBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [12]:
#series = await getSeriesPerStudy(studies[0]['StudyInstanceUID'])

In [13]:
#len(series)
#print(series[0])

## Pull it all together

In [14]:
# Get all the collections. We'll operate on one at a time
tcia_collections = await getCollectionsAsync()

In [16]:
# For each collection, get the Studies, and then the Series
# Hypothesis: pandas is far more memory efficient than Python dicts so first go get every single
#   Study for all passed in collections and put into a DataFrame and THEN go get Series

for collection in tcia_collections[:2]: # iter through the dictionaries in the list
    
    # To save a LOT of time, don't rerun collections if they've already been run
    if os.path.exists(f'{collection["Collection"]}-series.json'):
        print(f'Already processed {collection["Collection"]}. Please check the data folder.')
        continue
    
    series_list = [] # list to store all the series for a collection
    counter = 0
    # Get the Studies
    studies = await getPatientStudiesPerCollection(collection['Collection'])
    print(f'studies in {collection["Collection"]} : {len(studies)}')
    #print(f'{collection["Collection"]}_studies.csv')
    study_df = pd.DataFrame.from_dict(studies)
    study_df.to_csv(f'{collection["Collection"]}_studies.csv')
    
    for study in studies[:3]:
        series = await getSeriesPerStudy(study['StudyInstanceUID'])
        counter += 1
        if counter % 50 == 0:
            print(str(counter))
        
        #create a list of series dicts (combining metadata from study)
        for s in series:
            # merge the dictionaries using ** to unpack the dictionaries (since .union is in place)
            merged_dict = {**study, **s}
            series_list.append(merged_dict)
    
    # Output results to a data folder to avoid having to burn time running again
    study_df = pd.DataFrame.from_dict(series_list)
    study_df.to_csv(f'data/{collection["Collection"]}_studies_series.csv')
   
    # Also save just the resulting list, since that can be useful, too.  :-)
    with open(f'data/{collection["Collection"]}-series.json',"w") as f:
        json.dump(series_list, f)

    

    

Already processed TCGA-GBM. Please check the data folder.
studies in LIDC-IDRI : 1308


In [22]:
import json
collection = tcia_collections[1]
with open(f'data/{collection["Collection"]}-series-test.json',"w") as f:
    json.dump(series_list, f)
    #series_list = f.read()

In [23]:
type(series_list)

list

In [26]:
# Load the list if we need to
collection = tcia_collections[1]
with open(f'data/{collection["Collection"]}-series-test.json',"r") as f:
    l = json.load(f)
type(l) # should read 'list'

In [None]:
q = asq.QueueClient.from_connection_string(conn_str='DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net', queue_name='foofoo3')

try:
    p = q.get_queue_properties()
except:
    q.create_queue()
q.send_message('Hello-There ')
r = asq.TextBase64EncodePolicy()
r.encode('TEST-THIS')

Collecting aiofiles
  Downloading aiofiles-0.5.0-py3-none-any.whl (11 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-0.5.0


In [155]:
# Works with the addition of async with 

import httpx
import asyncio
import aiofiles

async def download(url:str):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
    return resp

async def download_lots(i):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    await asyncio.gather(*[download(url) for x in range(i)])

#if __name__ == "__main__":
#    asyncio.run(download_lots))  # used outside of Jupyter when I don't have an event loop

for i in range(7):
    start = time.time()
    await download_lots(i)
    print(f'{i} runs in {time.time() - start} seconds')

print('done')

0 runs in 0.0 seconds
1 runs in 0.6469912528991699 seconds
2 runs in 0.6540677547454834 seconds
3 runs in 0.7069270610809326 seconds
4 runs in 1.1080455780029297 seconds
5 runs in 1.0079903602600098 seconds
6 runs in 1.2149596214294434 seconds
done


In [138]:
import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    resp = await httpx.get(url)
    resp.raise_for_status()
    #async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
    #    await f.write(resp.content)


async def download_all_photos(loops: str):
    #resp = httpx.get("https://jsonplaceholder.typicode.com/photos")
    #resp.raise_for_status()
    #urls = list(set(d["url"] for d in resp.json()))[:10]
    #os.makedirs(out_dir, exist_ok=True)
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues"
    await asyncio.gather(*[download(url, "bob") for x in range(loops)])


#if __name__ == "__main__":
#    asyncio.run(download_all_photos('100_photos'))

for i in range(5):
    start = time.time()
    await download_all_photos(i)
    print(f'{i} runs in {time.time() - start} seconds')

0 runs in 0.0 seconds


TypeError: object Response can't be used in 'await' expression

In [139]:
async with httpx.AsyncClient() as client:
    r = await client.get('https://www.example.com/')
r

<Response [200 OK]>

In [147]:
##  This works with HTTPX
async def foo():
    async with httpx.AsyncClient() as client:
        url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues'
        r = await client.get(url) #'https://www.example.com/')
    return r


for i in range(5):
    start = time.time()
    await asyncio.gather(*[foo() for x in range(i)])
    print(f'{i} runs in {time.time() - start} seconds')


0 runs in 0.0 seconds
1 runs in 0.655052900314331 seconds
2 runs in 0.706932544708252 seconds
3 runs in 0.7170536518096924 seconds
4 runs in 1.0249478816986084 seconds


In [78]:
import aiohttp
from aiohttp import ClientSession
import asyncio

async def call_url(x, session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465"
    
    response = await session.get(url, timeout=None)
    response_json = await response.json()
    return response_json


async def run_program(x, session):
    """Wrapper for running program in an asynchronous manner"""
    #try:
    response = await call_url(x, session)
        #print(f"Response: {json.dumps(response, indent=2)}")
    #except Exception as err:
        #print(f"Exception occured: {err}")
        #pass



In [79]:
import time
for i in range(5):
    start = time.time()
    #async with httpx.AsyncClient() as session:
    async with ClientSession as sesssion:
        await asyncio.gather(*[run_program(x,session) for x in range(i)])
   # print(f'{i} runs in {time.time() - start}')

AttributeError: __aexit__

In [None]:
    try:
        response = await session.request(method='GET', url=url)
        response.raise_for_status()
        print(f"Response status ({url}): {response.status}")
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error ocurred: {err}")

In [157]:
# Works with aiohttp but not httpx 
import aiohttp
import asyncio
import time
import httpx

async def call_url(session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    #response = await session.request(method='GET', url=url)
    response = await session.get(url=url)

    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    async with aiohttp.ClientSession() as session: #use aiohttp
    #async with httpx.AsyncClient as session:  #use httpx
        await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

1 call(s) in 0.6880629062652588 seconds
2 call(s) in 0.6539404392242432 seconds
3 call(s) in 0.6569993495941162 seconds
4 call(s) in 0.8340511322021484 seconds


In [158]:
#Works with HTTPX
import aiohttp
import asyncio
import time
import httpx

async def call_url(session = None):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues"
    response = await session.get(url=url)
    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    #async with aiohttp.ClientSession() as session: #use aiohttp
    session = httpx.AsyncClient() #use httpx
    await asyncio.gather(*[call_url(session) for x in range(i)])
    await session.aclose()
    print(f'{i} call(s) in {time.time() - start} seconds')

1 call(s) in 0.6497354507446289 seconds
2 call(s) in 0.6889312267303467 seconds
3 call(s) in 0.6750121116638184 seconds
4 call(s) in 0.8299829959869385 seconds


In [102]:
for i in range(1,5):
    start = time.time()
    async with aiohttp.ClientSession() as session:
    #async with httpx.AsyncClient as session:
        await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

0 call(s) in 0.0 seconds
1 call(s) in 7.978963136672974 seconds


In [111]:
import aiohttp
import asyncio
import time
import httpx

async def call_url(session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    #async with aiohttp.ClientSession() as session: #use aiohttp
    async with httpx.AsyncClient as session:  #use httpx
        response = await session.get(url=url)

    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

AttributeError: __aexit__

In [None]:
print(len(series_list))
print(series_list[:1])



In [None]:
study_df = pd.DataFrame.from_dict(series_list)
study_df.to_csv(f'data/{tcia_collections[0]["Collection"]}_studies_series.csv')


In [None]:
len(study_df)
with open('data/TCGA-GBM-series.json',"w") as f:
    f.write(str(series_list))

In [None]:
series_sample = series[0]    
study_sample = studies[0]    

In [None]:
 
series_fields = [x for x in series_sample]
study_fields = [x for x in study_sample]
print(len(series_fields))
print(len(study_fields))
merged = list(set(series_fields).union(set(study_fields)))

merged2 = {**study_sample, **series_sample}

print(study_sample)

In [None]:
study_sample.update(series_sample)
print(len(study_sample))
print(study_sample)

merged2 = {**study_sample, **series_sample}
print(len(merged2))
print(merged2)

In [None]:
study_sample.update(series_sample)
print(len(study_sample))
study_sample
from collections import OrderedDict
od = OrderedDict(study_sample)
od

x = []
x.append(study_sample)

df = pd.DataFrame.from_dict(x)

df.head()

In [None]:
print(series_sample)
print(study_sample)


In [None]:
df = pd.DataFrame.from_dict(studies)

In [None]:
print(len(df))

In [None]:
df.head()

In [None]:
#https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465

In [None]:
studies = [s['StudyInstanceUID'] for s in res.json()]
#for x in res.json():
#    print(x['StudyInstanceUID'])
studies

In [None]:
    for x in collections.json()[:2]:
        print(x['Collection'])

In [None]:
# Create a list with all the studies
studies = [s['StudyInstanceUID'] for s in res.json()]


#storageConnString = os.environ["AzureWebJobsStorage"]
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

#x = asq.QueueService(account_name='sjbfunctest', account_key='mykey')
#service = asq.QueueServiceClient.from_connection_string(conn_str=connection_string)
patient_studies_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='studies')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    patient_studies_queue.get_queue_properties()
except:
    patient_studies_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()


In [None]:

for study in studies:
    b64 = enc.encode(study)
    patient_studies_queue.send_message(b64)

In [None]:
len(studies)
study_id = studies[0]
study_id

In [None]:
study_id = studies[0]

urlGetSeries = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries'
params = {'StudyInstanceUID': study_id}
res = requests.get(urlGetSeries,params=params,timeout=None) #timeout=15.0)



In [None]:
res.json()[0]

In [None]:
# Create a list with all the studies
series = [s['SeriesInstanceUID'] for s in res.json()]

series



In [None]:
#storageConnString = os.environ["AzureWebJobsStorage"]
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    series_queue.get_queue_properties()
except:
    series_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()

for s in series[:1]:
    b64 = enc.encode(s)
    series_queue.send_message(b64)
    print(s)

In [None]:
# Get the zip files
url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage?SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.306204232344341694648035234440'
res = requests.get(url,timeout=None)

In [None]:
res

In [None]:
#Download files


In [None]:
   # Get the study id from the base-64 encoded incoming queue
#series_id = msg.get_body().decode('utf-8')
series_id = series[0]

series_id

In [None]:
urlGetImage = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage'
params = {'SeriesInstanceUID': series_id}
res = requests.get(urlGetImage,params=params,timeout=None) #timeout=15.0)
print(res.url)

In [None]:
import requests
import shutil

def download_file(url):
    local_filename = "foo4.zip"
    with requests.get(urlGetImage, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

In [None]:
local_filename = "food4.zip"
with requests.get(urlGetImage,params=params,timeout=None, stream=True) as r:
    with open(local_filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

In [None]:
import zipfile
import io
from io import BytesIO

file_like_object = io.BytesIO(res.content)
z = zipfile.ZipFile(file_like_object)

In [None]:
files = z.filelist
f1 = files[1]

#for f in files:
    #print(f)
    #z.read(f)
dcmbytes = z.read(f)
#dcmbytes

In [None]:
#!pip install azure.storage.blob

import azure.storage.blob as blob

In [None]:
parts = f1.filename.split('/')
dcm_names = [p for p in parts if p.find('.dcm') != -1]
if len(dcm_names) > 0:
    dcm_name = dcm_names[0]
dcm_name

In [None]:
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

b = blob.ContainerClient.from_connection_string(conn_str=storageConnString,container_name='dicoms2')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    b.get_container_properties()
except:
    b.create_container()

for f in files:
    dicom_file = z.read(f)
    parts = f.filename.split('/')
    dcm_parts = [p for p in parts if p.find('.dcm') != -1]
    if len(dcm_parts) == 1: # we have a dicom file, and only one
        dcm_name = f'{series_id}/{dcm_parts[0]}'
        print(dcm_name)
        up = b.upload_blob(data=z.read(f), name=dcm_name)
    
    
    

#upblob = b.upload_blob(data=dcmbytes,name='test3.dcm')

In [None]:
upblob.blob_name

In [None]:

storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    series_queue.get_queue_properties()
except:
    series_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()


In [None]:



    
    
    
    
    # Create a list with all the studies
    series = [s['SeriesInstanceUID'] for s in res.json()]


    storageConnString = os.environ["AzureWebJobsStorage"]
  
    series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

    # Create the queue if it doesn't exist...  by exception
    #   Which is hacky, but effective
    try:
        series_queue.get_queue_properties()
    except:
        series_queue.create_queue()

    # Must base-64 encode since... functions...
    enc = asq.TextBase64EncodePolicy()

    for s in series:
        b64 = enc.encode(s)
        series_queue.send_message(b64)

In [None]:
import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    resp = await httpx.get(url)
    resp.raise_for_status()
    async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
        await f.write(resp.content)


async def download_all_photos(out_dir: str):
    resp = await httpx.get("https://jsonplaceholder.typicode.com/photos")
    resp.raise_for_status()
    urls = list(set(d["url"] for d in resp.json()))[:100]
    os.makedirs(out_dir, exist_ok=True)
    await asyncio.gather(*[download(url, out_dir) for url in urls])


if __name__ == "__main__":
    asyncio.run(download_all_photos('100_photos'))

In [156]:
## WORKS

import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    async with httpx.AsyncClient() as session:
        resp = await session.get(url)
        resp.raise_for_status()
    async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
        await f.write(resp.content)
        
async def download_all_photos(out_dir: str):
    async with httpx.AsyncClient() as session:
        resp = await session.get("https://jsonplaceholder.typicode.com/photos")
        resp.raise_for_status()
    urls = list(set(d["url"] for d in resp.json()))[:100]
    os.makedirs(out_dir, exist_ok=True)
    await asyncio.gather(*[download(url, out_dir) for url in urls])


#if __name__ == "__main__":
#    asyncio.run(download_all_photos('100_photos'))
    
await download_all_photos('100_photos')

In [162]:
def write_stuff(f):
    f.write('And stuff with context passed to another method. ')

In [164]:
def write_stuff(f):
    f.write('And stuff with context passed to another method. ')
    
with open('foo.txt',"w") as f:
    f.write('Start with context manager inside with statement. ')
    write_stuff(f)
    f.write('And back to close the with.')


In [165]:
!pip show httpx


Name: httpx
Version: 0.12.1
Summary: The next generation HTTP client.
Home-page: https://github.com/encode/httpx
Author: Tom Christie
Author-email: tom@tomchristie.com
License: BSD
Location: c:\users\stborg\appdata\local\continuum\anaconda3\envs\fastai2\lib\site-packages
Requires: idna, chardet, certifi, h2, sniffio, urllib3, h11, hstspreload, rfc3986
Required-by: 
