In [None]:
##  This works with HTTPX
async def foo():
    async with httpx.AsyncClient() as client:
        url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues'
        r = await client.get(url) #'https://www.example.com/')
    return r


for i in range(5):
    start = time.time()
    await asyncio.gather(*[foo() for x in range(i)])
    print(f'{i} runs in {time.time() - start} seconds')


# Call TCIA API

In [7]:
import httpx
import asyncio
import pandas as pd
import azure.storage.queue as asq
import os
import json
import time

# variables used throughout
tciabase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query'


In [8]:
def handleRateLimits(headers):
    limits = dict([(h,int(headers[h])) for h in headers if h.find('x-rate') != -1])
    # Example : {'x-ratelimit-limit-hour': 360000, 'x-ratelimit-remaining-hour': 359961, 'x-ratelimit-limit-second': 1000, 'x-ratelimit-remaining-second': 999}
    
    # TODO: send a queue message to a queue which will 'pause' the querying from TCIA for the right time

## Get Collections

In [9]:
async def getCollectionsAsync():
    urlGetCollections = f'{tciabase}/getCollectionValues'
    async with httpx.AsyncClient() as client:
        r = await client.get(urlGetCollections)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [10]:
#tcia_collections = await getCollectionsAsync()

In [11]:
#len(tcia_collections)
#print(tcia_collections)

In [12]:

# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getPatientStudy?Collection=TCGA-GBM
# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465  
# https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage?SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.306204232344341694648035234440



## Get PatientStudies in Collection

In [13]:
async def getPatientStudiesPerCollection(collection, async_client):
    urlGetPatientStudyBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getPatientStudy'
    params = {'Collection': collection}
    #async with httpx.AsyncClient() as client:
    r = await async_client.get(urlGetPatientStudyBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [14]:
#studies = await getPatientStudiesPerCollection(collections[0]['Collection'])

In [15]:
#print(len(studies))
#print(studies[0])

In [16]:
#study_ids = [s['StudyInstanceUID'] for s in studies] # if s.find('StudyInstanceUID') != -1]

## Get Series

In [17]:
async def getSeriesPerStudy(study, async_client):
    urlGetSeriesBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries'
    params = {'StudyInstanceUID': study}
    #async with httpx.AsyncClient() as client:
    r = await async_client.get(urlGetSeriesBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [18]:
#series = await getSeriesPerStudy(studies[0]['StudyInstanceUID'])

In [19]:
#len(series)
#print(series[0])

In [20]:
async def getSeriesPerStudy(study, async_client):
    urlGetSeriesBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries'
    params = {'StudyInstanceUID': study}
    #async with httpx.AsyncClient() as client:
    r = await async_client.get(urlGetSeriesBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [21]:
async def getSeriesPerStudyJoinDict(study_id, study_dict, async_client):
    
    #retry three times if necessary
    for i in range(10):
        try:
            series = await getSeriesPerStudy(study_id, async_client)
            # join with the study_dict we already have
                    #create a list of series dicts (combining metadata from study)
            # merge the dictionaries using ** to unpack the dictionaries (since .union is in place)
            series_list = [] # list to store all the series for a collection

            for s in series:
                merged_dict = {**study_dict, **s}
                series_list.append(merged_dict)        
            return series_list
        except:
            time.sleep(10)
            continue
    
    # if we get here we have a problem that's happened three times
    raise Exception("Failed to get series over set of retries")
    

In [22]:
async def getInstancesPerSeries(series, async_client):
    urlGetInstancesBase = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSOPInstanceUIDs'
    params = {'SeriesInstanceUID': series}
    #async with httpx.AsyncClient() as client:
    r = await async_client.get(urlGetInstancesBase,params=params,timeout=None) #timeout=15.0)
    
    # not really needed here, but should send every time we send a request
    handleRateLimits(r.headers)
    
    if r is not None:
        #return [c['Collection'] for c in r.json()]
        return r.json()

In [23]:
async def getInstancesPerSeriesJoinDict(series_id, series_dict, async_client):
    
    #retry, if necessary
    for i in range(10):
        try:
            instances = await getInstancesPerSeries(series_id, async_client)
            # join with the study_dict we already have
                    #create a list of series dicts (combining metadata from study)
            # merge the dictionaries using ** to unpack the dictionaries (since .union is in place)
            instances_list = [] # list to store all the series for a collection

            for s in instances:
                merged_dict = {**series_dict, **s}
                instances_list.append(merged_dict)   
            return instances_list
        except:
            time.sleep(i)
            continue
    
    # if we get here we have a problem that's happened three times
    raise Exception("Failed to get series over set of retries")
    

## Pull it all together

In [24]:
# Get all the collections. We'll operate on one at a time

tcia_collections = await getCollectionsAsync()

In [25]:
print(tcia_collections)

[{'Collection': 'CPTAC-CM'}, {'Collection': 'NaF PROSTATE'}, {'Collection': 'NSCLC-Radiomics-Interobserver1'}, {'Collection': 'NSCLC Radiogenomics'}, {'Collection': 'PROSTATE-DIAGNOSIS'}, {'Collection': 'RIDER Lung PET-CT'}, {'Collection': 'TCGA-GBM'}, {'Collection': 'LGG-1p19qDeletion'}, {'Collection': 'QIN GBM Treatment Response'}, {'Collection': 'TCGA-READ'}, {'Collection': 'TCGA-LUSC'}, {'Collection': 'APOLLO'}, {'Collection': 'TCGA-KIRP'}, {'Collection': 'QIN-HEADNECK'}, {'Collection': 'CT COLONOGRAPHY'}, {'Collection': 'BREAST-DIAGNOSIS'}, {'Collection': 'PROSTATE-MRI'}, {'Collection': 'RIDER PHANTOM MRI'}, {'Collection': 'LungCT-Diagnosis'}, {'Collection': 'QIN-BRAIN-DSC-MRI'}, {'Collection': 'TCGA-LUAD'}, {'Collection': 'CBIS-DDSM'}, {'Collection': 'CPTAC-LSCC'}, {'Collection': 'LCTSC'}, {'Collection': 'RIDER Breast MRI'}, {'Collection': 'RIDER PHANTOM PET-CT'}, {'Collection': 'NSCLC-Radiomics'}, {'Collection': 'MRI-DIR'}, {'Collection': 'QIN LUNG CT'}, {'Collection': 'QIBA CT-

In [31]:
# For each collection, get the Studies, and then the Series
# Hypothesis: pandas is far more memory efficient than Python dicts so first go get every single
#   Study for all passed in collections and put into a DataFrame and THEN go get Series

already_processed_count = 0
for collection in tcia_collections: # iter through the dictionaries in the list
    
    # To save a LOT of time, don't rerun collections if they've already been run

    if os.path.exists(f'data/{collection["Collection"]}-series.json'):
        print(f'Already processed {collection["Collection"]}. Please check the data folder.')
        already_processed_count +=1 
        continue
        
    print(f'Already processed: {already_processed_count}')    
    
    #if collection['Collection'] in ['CBIS-DDSM','VICTRE','HNSCC']: #QIBA CT-1C only 486, others over 7,000, HNSCC is 1200
    #    print(f'Skipping {collection["Collection"]} because of size')
    #    continue
    
    #NOTE: GBM-DSC-MRI-DRO has ZERO series????
    
    series_list = [] # list to store all the series for a collection
    counter = 0
    # Get the Studies
    
    #with httpx.AsyncClient() as async_client: # This doesn't work with 0.12.1 of HTTPX, must be explicit
    async_client = httpx.AsyncClient()
    print('getting studies')
    studies = await getPatientStudiesPerCollection(collection['Collection'], async_client)
    print(f'studies in {collection["Collection"]} : {len(studies)}')
    #print(f'{collection["Collection"]}_studies.csv')
    study_df = pd.DataFrame.from_dict(studies)
    study_df.to_csv(f'data/{collection["Collection"]}_studies.csv')

    # Need to make this MUCH faster
#    for study in studies[:3]:
#        series = await getSeriesPerStudy(study['StudyInstanceUID'], async_client)
#        counter += 1
#        if counter % 50 == 0:
#            print(str(counter))

        #create a list of series dicts (combining metadata from study)
#        for s in series:
#            # merge the dictionaries using ** to unpack the dictionaries (since .union is in place)
#            merged_dict = {**study, **s}
#            series_list.append(merged_dict)

    # Try this instead
    study_ids = [s['StudyInstanceUID'] for s in studies]
    batchsize = 10    


    for i in range(0,len(study_ids),batchsize):
        start = time.time()                  
        trimmed_range = [y for y in range(i,i+batchsize) if y<len(study_ids)]   #only used to shorten the last batch    
        print(trimmed_range)
        #trimmed_study_ids = [study_ids[i] for i in trimmed_range]
        #print(trimmed_study_ids)
        trimmed_studies = [studies[i] for i in trimmed_range]
        #print(trimmed_studies)
        #1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465
        responses = await asyncio.gather(*[getSeriesPerStudyJoinDict(s["StudyInstanceUID"],s, async_client) for s in trimmed_studies])
        #responses = await asyncio.gather(*[getSeriesPerStudyJoinDict('1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465',s, async_client) for s in trimmed_studies])
        # unfortunately responses is a list of list of dicts, we need to flatten
        flatten_matrix = [val for sublist in responses for val in sublist] 
        #print(len(flatten_matrix))
        [series_list.append(r) for r in flatten_matrix]
        #print(len(series_list))
        print(f'Studies {i}-{i+batchsize} in {time.time() - start} seconds')  
    #print(len(series_list))    
    
    


    # Output results to a data folder to avoid having to burn time running again
    study_df = pd.DataFrame.from_dict(series_list)
    study_df.to_csv(f'data/{collection["Collection"]}_studies_series.csv')

    # Also save just the resulting list, since that can be useful, too.  :-)
    with open(f'data/{collection["Collection"]}-series.json',"w") as f:
        json.dump(series_list, f)
    
    # close the connection explicitly, until with is supported by HTTPX
    await async_client.aclose()
    
    

    

Already processed CPTAC-CM. Please check the data folder.
Already processed NaF PROSTATE. Please check the data folder.
Already processed NSCLC-Radiomics-Interobserver1. Please check the data folder.
Already processed NSCLC Radiogenomics. Please check the data folder.
Already processed PROSTATE-DIAGNOSIS. Please check the data folder.
Already processed RIDER Lung PET-CT. Please check the data folder.
Already processed TCGA-GBM. Please check the data folder.
Already processed LGG-1p19qDeletion. Please check the data folder.
Already processed QIN GBM Treatment Response. Please check the data folder.
Already processed TCGA-READ. Please check the data folder.
Already processed TCGA-LUSC. Please check the data folder.
Already processed APOLLO. Please check the data folder.
Already processed TCGA-KIRP. Please check the data folder.
Already processed QIN-HEADNECK. Please check the data folder.
Already processed CT COLONOGRAPHY. Please check the data folder.
Already processed BREAST-DIAGNOSIS.

## Handle Series Instances

In [None]:
## use existing TCIA collections
for collection in tcia_collections: # iter through the dictionaries in the list
    
    # Don't try to do items that aready exist
    if not os.path.exists(f'data/{collection["Collection"]}-series.json'):
        print(f'Cannot find csv for {collection["Collection"]}. Please check the data folder.')
        continue
        
    if collection['Collection']=="LDCT-and-Projection-data":
        print('skipping LDCT and Projection data')
        continue
        
    if os.path.exists(f'data/{collection["Collection"]}-instances.json'):
        print(f'Already processed {collection["Collection"]}. Please check the data folder.')
        continue
        
    print(f'Processing {collection["Collection"]}.')
    with open(f'data/{collection["Collection"]}-series.json', 'r') as myfile:
        data = myfile.read()

    data = data.replace("\'", "\"") # stupidly encoded my JSON with single quotes, instead of double - fix that with hack
    series=json.loads(data)
    
    async_client = httpx.AsyncClient()
 
    instance_list = []    
    #series_ids = [s['SeriesInstanceUID'] for s in series]
    batchsize = 50    

    #print(series_ids[:10])   
    #series = series[:60]
    print(f'Processing {len(series)} series.')

    for i in range(0,len(series),batchsize):               
        trimmed_range = [y for y in range(i,i+batchsize) if y<len(series)]   #only used to shorten the last batch    
        #print(f'trimmed_range is {trimmed_range}')

        trimmed_series = [series[i] for i in trimmed_range]
        #print(len(trimmed_series))
        start = time.time()   
        
      
        responses = await asyncio.gather(*[getInstancesPerSeriesJoinDict(s["SeriesInstanceUID"],s, async_client) for s in trimmed_series])
        print(f'Series {i}-{i+batchsize} in {time.time() - start} seconds')
        #print(len(responses))
        # unfortunately responses is a list of list of dicts, we need to flatten
        flatten_matrix = [val for sublist in responses for val in sublist] 
        #print(len(flatten_matrix))
        [instance_list.append(r) for r in flatten_matrix]
    
    print(len(instance_list))
    


    # Output results to a data folder to avoid having to burn time running again
    df = pd.DataFrame.from_dict(instance_list)
    df.to_csv(f'data/{collection["Collection"]}_studies_series_instances.csv')

    # Also save just the resulting list, since that can be useful, too.  :-)
    with open(f'data/{collection["Collection"]}-instances.json',"w") as f:
        json.dump(instance_list, f)
    
    # close the connection explicitly
    await async_client.aclose()
    
    

In [None]:
len(instance_list)

In [None]:
len(series)

In [None]:
print(len(study_df))
study_df.head()

In [None]:
for x in series_list[:1]:
    print(x)

In [None]:
series_list2 = list(set(series_list))

In [None]:
# delete
study_ids = study_ids[0:2]

for i in range(0,len(study_ids),batchsize):
    start = time.time()                  
    trimmed_range = [y for y in range(i,i+batchsize) if y<len(study_ids)]   #only used to shorten the last batch    
    print(trimmed_range)
    trimmed_study_ids = [study_ids[sid] for sid in trimmed_range]
    print(trimmed_study_ids)

    responses = await asyncio.gather(*[getSeriesPerStudyJoinDict(sid, async_client) for sid in trimmed_study_ids])
    # unfortunately responses is a list of list of dicts, we need to flatten
    flatten_matrix = [val for sublist in responses for val in sublist] 
    #print(len(flatten_matrix))
    [series_list.append(r) for r in flatten_matrix]
    print(len(series_list))
    print(f'Studies {i}-{i+batchsize} in {time.time() - start} seconds')  
#print(len(series_list))    

In [None]:
    start = time.time()
    await asyncio.gather(*[foo() for x in range(i)])
    print(f'{i} runs in {time.time() - start} seconds')

# Throttle requests to 50 concurrent
semaphore = asyncio.Semaphore(50)
#...
async def limit_wrap(url):
    async with semaphore:
        # do what you want
#...
results = asyncio.gather([limit_wrap(url) for url in urls])


In [None]:
study_ids = [s['StudyInstanceUID'] for s in studies]
stuff = []

l = [x for x in range(len(study_ids))]
batchsize = 20
for x in range(0,len(l),batchsize):
    #print('--------')
    trimmed = [y for y in range(x,x+batchsize) if y<len(l)]
    print(trimmed)

    for a in l[x:x+len(trimmed)]:
        stuff.append(a)
        
print(len(stuff))
print(len(study_ids))
    

In [None]:
import time
study_ids = [s['StudyInstanceUID'] for s in studies]
print(len(study_ids))


series_list = [] # list to store all the series for a collection

entire_start = time.time()
async_client = httpx.AsyncClient()
batchsize = 10
batches=2
for i in range(0,batchsize*batches,batchsize):
    start = time.time()
    responses = await asyncio.gather(*[getSeriesPerStudy("1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465", async_client) for sid in study_ids[i:i+batchsize]])
    # unfortunately responses is a list of list of dicts, we need to flatten
    flatten_matrix = [val for sublist in responses for val in sublist] 
    print(len(flatten_matrix))
    [series_list.append(r) for r in flatten_matrix]
    print(len(series_list))
    print(f'Studies {i}-{i+batchsize} in {time.time() - start} seconds')  
await async_client.aclose()
print(time.time() - entire_start)
print(len(series_list))
# 2 batches of 20 studies is 98 seconds, series len = 160
# 2 batches of 10 studies is 50 seconds, series len = 80
# 2 batches of  5 studius is 26 seconds, series len = 40
# 2 batches of  2 studies is 22 seconds, series len = 16
# 2 batches of  1 study   is 15 seconds, series len = 8
# kind of linear growth

In [None]:
print(len(responses))
print(len(series_list))

In [None]:
flatten_matrix = [val for sublist in series_list for val in sublist] 
print(len(flatten_matrix))

series_list[:2]
df = pd.DataFrame.from_dict(series_list)
print(len(df))
df.head()


In [None]:
# Need to make this MUCH faster



for study in studies[:3]:
    series = await getSeriesPerStudy(study['StudyInstanceUID'])

    #create a list of series dicts (combining metadata from study)
    for s in series:
        # merge the dictionaries using ** to unpack the dictionaries (since .union is in place)
        merged_dict = {**study, **s}
        series_list.append(merged_dict)

In [None]:
import json
collection = tcia_collections[1]
with open(f'data/{collection["Collection"]}-series-test.json',"w") as f:
    json.dump(series_list, f)
    #series_list = f.read()

In [None]:
type(series_list)

In [None]:
# Load the list if we need to
collection = tcia_collections[1]
with open(f'data/{collection["Collection"]}-series-test.json',"r") as f:
    l = json.load(f)
type(l) # should read 'list'

In [None]:
q = asq.QueueClient.from_connection_string(conn_str='DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net', queue_name='foofoo3')

try:
    p = q.get_queue_properties()
except:
    q.create_queue()
q.send_message('Hello-There ')
r = asq.TextBase64EncodePolicy()
r.encode('TEST-THIS')

In [None]:
# Works with the addition of async with 

import httpx
import asyncio
import aiofiles

async def download(url:str):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
    return resp

async def download_lots(i):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    await asyncio.gather(*[download(url) for x in range(i)])

#if __name__ == "__main__":
#    asyncio.run(download_lots))  # used outside of Jupyter when I don't have an event loop

for i in range(7):
    start = time.time()
    await download_lots(i)
    print(f'{i} runs in {time.time() - start} seconds')

print('done')

In [None]:
import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    resp = await httpx.get(url)
    resp.raise_for_status()
    #async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
    #    await f.write(resp.content)


async def download_all_photos(loops: str):
    #resp = httpx.get("https://jsonplaceholder.typicode.com/photos")
    #resp.raise_for_status()
    #urls = list(set(d["url"] for d in resp.json()))[:10]
    #os.makedirs(out_dir, exist_ok=True)
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues"
    await asyncio.gather(*[download(url, "bob") for x in range(loops)])


#if __name__ == "__main__":
#    asyncio.run(download_all_photos('100_photos'))

for i in range(5):
    start = time.time()
    await download_all_photos(i)
    print(f'{i} runs in {time.time() - start} seconds')

In [None]:
async with httpx.AsyncClient() as client:
    r = await client.get('https://www.example.com/')
r

In [None]:
##  This works with HTTPX
async def foo():
    async with httpx.AsyncClient() as client:
        url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues'
        r = await client.get(url) #'https://www.example.com/')
    return r


for i in range(5):
    start = time.time()
    await asyncio.gather(*[foo() for x in range(i)])
    print(f'{i} runs in {time.time() - start} seconds')


In [None]:
import aiohttp
from aiohttp import ClientSession
import asyncio

async def call_url(x, session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465"
    
    response = await session.get(url, timeout=None)
    response_json = await response.json()
    return response_json


async def run_program(x, session):
    """Wrapper for running program in an asynchronous manner"""
    #try:
    response = await call_url(x, session)
        #print(f"Response: {json.dumps(response, indent=2)}")
    #except Exception as err:
        #print(f"Exception occured: {err}")
        #pass



In [None]:
import time
for i in range(5):
    start = time.time()
    #async with httpx.AsyncClient() as session:
    async with ClientSession as sesssion:
        await asyncio.gather(*[run_program(x,session) for x in range(i)])
   # print(f'{i} runs in {time.time() - start}')

In [None]:
    try:
        response = await session.request(method='GET', url=url)
        response.raise_for_status()
        print(f"Response status ({url}): {response.status}")
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error ocurred: {err}")

In [None]:
# Works with aiohttp but not httpx 
import aiohttp
import asyncio
import time
import httpx

async def call_url(session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    #response = await session.request(method='GET', url=url)
    response = await session.get(url=url)

    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    async with aiohttp.ClientSession() as session: #use aiohttp
    #async with httpx.AsyncClient as session:  #use httpx
        await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

In [None]:
import aiohttp
import asyncio
import time
import httpx

async def call_url(session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollectionValues"
    #response = await session.request(method='GET', url=url)
    response = await session.get(url=url)

    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    #async with aiohttp.ClientSession() as session: #use aiohttp
    session = httpx.AsyncClient() #use httpx
    await asyncio.gather(*[call_url(session) for x in range(i)])
    await session.aclose()
    print(f'{i} call(s) in {time.time() - start} seconds')

In [None]:
for i in range(1,5):
    start = time.time()
    async with aiohttp.ClientSession() as session:
    #async with httpx.AsyncClient as session:
        await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

In [None]:
import aiohttp
import asyncio
import time
import httpx

async def call_url(session):
    url = "https://services.cancerimagingarchive.net/services/v3/TCIA/query/getCollections"
    #async with aiohttp.ClientSession() as session: #use aiohttp
    async with httpx.AsyncClient as session:  #use httpx
        response = await session.get(url=url)

    return response

for i in range(1,5):
    start = time.time() # start time for timing event
    await asyncio.gather(*[call_url(session) for x in range(i)])
    print(f'{i} call(s) in {time.time() - start} seconds')

In [None]:
print(len(series_list))
print(series_list[:1])



In [None]:
study_df = pd.DataFrame.from_dict(series_list)
study_df.to_csv(f'data/{tcia_collections[0]["Collection"]}_studies_series.csv')


In [None]:
len(study_df)
with open('data/TCGA-GBM-series.json',"w") as f:
    f.write(str(series_list))

In [None]:
series_sample = series[0]    
study_sample = studies[0]    

In [None]:
 
series_fields = [x for x in series_sample]
study_fields = [x for x in study_sample]
print(len(series_fields))
print(len(study_fields))
merged = list(set(series_fields).union(set(study_fields)))

merged2 = {**study_sample, **series_sample}

print(study_sample)

In [None]:
study_sample.update(series_sample)
print(len(study_sample))
print(study_sample)

merged2 = {**study_sample, **series_sample}
print(len(merged2))
print(merged2)

In [None]:
study_sample.update(series_sample)
print(len(study_sample))
study_sample
from collections import OrderedDict
od = OrderedDict(study_sample)
od

x = []
x.append(study_sample)

df = pd.DataFrame.from_dict(x)

df.head()

In [None]:
print(series_sample)
print(study_sample)


In [None]:
df = pd.DataFrame.from_dict(studies)

In [None]:
print(len(df))

In [None]:
df.head()

In [None]:
#https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries?Collection=TCGA-GBM&StudyInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.130563880911723253267280582465

In [None]:
studies = [s['StudyInstanceUID'] for s in res.json()]
#for x in res.json():
#    print(x['StudyInstanceUID'])
studies

In [None]:
    for x in collections.json()[:2]:
        print(x['Collection'])

In [None]:
# Create a list with all the studies
studies = [s['StudyInstanceUID'] for s in res.json()]


#storageConnString = os.environ["AzureWebJobsStorage"]
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

#x = asq.QueueService(account_name='sjbfunctest', account_key='mykey')
#service = asq.QueueServiceClient.from_connection_string(conn_str=connection_string)
patient_studies_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='studies')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    patient_studies_queue.get_queue_properties()
except:
    patient_studies_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()


In [None]:

for study in studies:
    b64 = enc.encode(study)
    patient_studies_queue.send_message(b64)

In [None]:
len(studies)
study_id = studies[0]
study_id

In [None]:
study_id = studies[0]

urlGetSeries = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getSeries'
params = {'StudyInstanceUID': study_id}
res = requests.get(urlGetSeries,params=params,timeout=None) #timeout=15.0)



In [None]:
res.json()[0]

In [None]:
# Create a list with all the studies
series = [s['SeriesInstanceUID'] for s in res.json()]

series



In [None]:
#storageConnString = os.environ["AzureWebJobsStorage"]
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    series_queue.get_queue_properties()
except:
    series_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()

for s in series[:1]:
    b64 = enc.encode(s)
    series_queue.send_message(b64)
    print(s)

In [None]:
# Get the zip files
url = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage?SeriesInstanceUID=1.3.6.1.4.1.14519.5.2.1.7695.4001.306204232344341694648035234440'
res = requests.get(url,timeout=None)

In [None]:
res

In [None]:
#Download files


In [None]:
   # Get the study id from the base-64 encoded incoming queue
#series_id = msg.get_body().decode('utf-8')
series_id = series[0]

series_id

In [None]:
urlGetImage = 'https://services.cancerimagingarchive.net/services/v3/TCIA/query/getImage'
params = {'SeriesInstanceUID': series_id}
res = requests.get(urlGetImage,params=params,timeout=None) #timeout=15.0)
print(res.url)

In [None]:
import requests
import shutil

def download_file(url):
    local_filename = "foo4.zip"
    with requests.get(urlGetImage, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

In [None]:
local_filename = "food4.zip"
with requests.get(urlGetImage,params=params,timeout=None, stream=True) as r:
    with open(local_filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

In [None]:
import zipfile
import io
from io import BytesIO

file_like_object = io.BytesIO(res.content)
z = zipfile.ZipFile(file_like_object)

In [None]:
files = z.filelist
f1 = files[1]

#for f in files:
    #print(f)
    #z.read(f)
dcmbytes = z.read(f)
#dcmbytes

In [None]:
#!pip install azure.storage.blob

import azure.storage.blob as blob

In [None]:
parts = f1.filename.split('/')
dcm_names = [p for p in parts if p.find('.dcm') != -1]
if len(dcm_names) > 0:
    dcm_name = dcm_names[0]
dcm_name

In [None]:
storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

b = blob.ContainerClient.from_connection_string(conn_str=storageConnString,container_name='dicoms2')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    b.get_container_properties()
except:
    b.create_container()

for f in files:
    dicom_file = z.read(f)
    parts = f.filename.split('/')
    dcm_parts = [p for p in parts if p.find('.dcm') != -1]
    if len(dcm_parts) == 1: # we have a dicom file, and only one
        dcm_name = f'{series_id}/{dcm_parts[0]}'
        print(dcm_name)
        up = b.upload_blob(data=z.read(f), name=dcm_name)
    
    
    

#upblob = b.upload_blob(data=dcmbytes,name='test3.dcm')

In [None]:
upblob.blob_name

In [None]:

storageConnString = 'DefaultEndpointsProtocol=https;AccountName=sjbfunctest;AccountKey=XuYBliYrXazCmfDdK2jLcaJcfqPgu8tC43TlltTMY413nusjx2N6+IvErYmVXuZfOBVgVaCQ52RObKioS9FDRg==;EndpointSuffix=core.windows.net'

series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

# Create the queue if it doesn't exist...  by exception
#   Which is hacky, but effective
try:
    series_queue.get_queue_properties()
except:
    series_queue.create_queue()

# Must base-64 encode since... functions...
enc = asq.TextBase64EncodePolicy()


In [None]:



    
    
    
    
    # Create a list with all the studies
    series = [s['SeriesInstanceUID'] for s in res.json()]


    storageConnString = os.environ["AzureWebJobsStorage"]
  
    series_queue = asq.QueueClient.from_connection_string(conn_str=storageConnString,queue_name='series')

    # Create the queue if it doesn't exist...  by exception
    #   Which is hacky, but effective
    try:
        series_queue.get_queue_properties()
    except:
        series_queue.create_queue()

    # Must base-64 encode since... functions...
    enc = asq.TextBase64EncodePolicy()

    for s in series:
        b64 = enc.encode(s)
        series_queue.send_message(b64)

In [None]:
import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    resp = await httpx.get(url)
    resp.raise_for_status()
    async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
        await f.write(resp.content)


async def download_all_photos(out_dir: str):
    resp = await httpx.get("https://jsonplaceholder.typicode.com/photos")
    resp.raise_for_status()
    urls = list(set(d["url"] for d in resp.json()))[:100]
    os.makedirs(out_dir, exist_ok=True)
    await asyncio.gather(*[download(url, out_dir) for url in urls])


if __name__ == "__main__":
    asyncio.run(download_all_photos('100_photos'))

In [None]:
## WORKS

import httpx
import asyncio
import aiofiles

import os

async def download(url:str, folder:str):
    filename = url.split("/")[-1]
    async with httpx.AsyncClient() as session:
        resp = await session.get(url)
        resp.raise_for_status()
    async with aiofiles.open(os.path.join(folder, filename), "wb") as f:
        await f.write(resp.content)
        
async def download_all_photos(out_dir: str):
    async with httpx.AsyncClient() as session:
        resp = await session.get("https://jsonplaceholder.typicode.com/photos")
        resp.raise_for_status()
    urls = list(set(d["url"] for d in resp.json()))[:100]
    os.makedirs(out_dir, exist_ok=True)
    await asyncio.gather(*[download(url, out_dir) for url in urls])


#if __name__ == "__main__":
#    asyncio.run(download_all_photos('100_photos'))
    
await download_all_photos('100_photos')

In [None]:
def write_stuff(f):
    f.write('And stuff with context passed to another method. ')

In [None]:
def write_stuff(f):
    f.write('And stuff with context passed to another method. ')
    
with open('foo.txt',"w") as f:
    f.write('Start with context manager inside with statement. ')
    write_stuff(f)
    f.write('And back to close the with.')


In [None]:
!pip show httpx


In [29]:
print("Hello! What is your name?")

Hello! What is your name?


In [30]:
input = name

NameError: name 'name' is not defined

In [None]:
print("Hello, " name "! How are you doing, " name "? My name is Bodie, " name ".")