In [1]:
# from pyspark.sql.types import BooleanType
# from pyspark.sql import functions as F

import json
import math
import os
import random
import shutil
import sys
import urllib
import urllib.error
import urllib.parse
import urllib.request
from zipfile import ZipFile

import ipywidgets as widgets
import numpy as np
import pandas as pd
import requests
from IPython.display import HTML
from IPython.display import display
from ipywidgets import Box, Layout

def hide_toggle(for_next=False):
    """Build an HTML link that shows/hides the input area of a notebook cell.

    The returned snippet defines a uniquely named JavaScript function that
    toggles the ``div.input`` element of the targeted cell, plus a link that
    calls it.

    Args:
        for_next: When false (default) the link controls the currently
            selected cell. When true it controls the cell after the selected
            one, and the selected cell's own input is hidden.

    Returns:
        An ``IPython.display.HTML`` object wrapping the script and the link.
    """
    selected_cell = """$('div.cell.code_cell.rendered.selected')"""

    # Defaults: the link toggles the very cell it is rendered in.
    fields = {
        'cell_selector': selected_cell,
        'toggle_text': 'Toggle show/hide',
        'js_hide_current': '',
    }

    if for_next:
        # Aim at the following cell and hide the code of the current one.
        fields['cell_selector'] = selected_cell + '.next()'
        fields['toggle_text'] += ' next cell'
        fields['js_hide_current'] = selected_cell + '.find("div.input").hide();'

    # A random suffix keeps several toggles on one page from clashing.
    fields['f_name'] = 'code_toggle_' + str(random.randint(1,2**64))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """

    return HTML(template.format(**fields))
# ------------------------------------------------------------------------------------------------------------------
#
# See https://wiki.cancerimagingarchive.net/display/Public/TCIA+Programmatic+Interface+%28REST+API%29+Usage+Guide for the complete list of API resources.
#
class TCIAClient:
    """Small client for the query endpoints of the TCIA REST API.

    Every request URL is built as ``<baseUrl>/<resource>/query/<endpoint>``
    followed by a URL-encoded query string. The class constants below are
    the endpoint names.
    """

    GET_IMAGE = "getImage"
    GET_MANUFACTURER_VALUES = "getManufacturerValues"
    GET_MODALITY_VALUES = "getModalityValues"
    GET_COLLECTION_VALUES = "getCollectionValues"
    GET_BODY_PART_VALUES = "getBodyPartValues"
    GET_PATIENT_STUDY = "getPatientStudy"
    GET_SERIES = "getSeries"
    GET_PATIENT = "getPatient"
    GET_SERIES_SIZE = "getSeriesSize"
    CONTENTS_BY_NAME = "ContentsByName"
    GET_SOP_INSTANCE_UID = "getSOPInstanceUIDs"

    def __init__(self, baseUrl, resource):
        """Store the service root, i.e. ``baseUrl + "/" + resource``."""
        self.baseUrl = baseUrl + "/" + resource

    @staticmethod
    def _build_url(url, queryParameters=None):
        """Return ``url`` plus a query string made of the truthy parameters.

        Parameters whose value is falsy (``None``, ``""``, ...) are dropped.
        The ``?`` separator is always appended, even with no parameters.
        """
        params = {k: v for k, v in (queryParameters or {}).items() if v}
        return url + "?%s" % urllib.parse.urlencode(params)

    def execute(self, url, queryParameters=None):
        """Send a GET request and return the ``requests`` response object.

        Args:
            url: Endpoint URL without a query string.
            queryParameters: Optional mapping of query parameters; entries
                with falsy values are omitted from the request.
        """
        return requests.get(url=self._build_url(url, queryParameters))

    def get_collection_values(self, outputFormat = "json" ):
        """Query the set of all collection names; returns the response."""
        serviceUrl = self.baseUrl + "/query/" + self.GET_COLLECTION_VALUES
        queryParameters = { "format" : outputFormat }
        return self.execute(serviceUrl , queryParameters)

    def get_series(self, collection = None , modality = None , studyInstanceUid = None , outputFormat = "json" ):
        """Query series, optionally filtered by collection, modality or study.

        Filters left as ``None`` are not sent. Returns the response.
        """
        serviceUrl = self.baseUrl + "/query/" + self.GET_SERIES
        queryParameters = {"Collection" : collection , "StudyInstanceUID" : studyInstanceUid , "Modality" : modality , "format" : outputFormat }
        return self.execute(serviceUrl , queryParameters)

    def get_sop_instance_uids(self, seriesInstanceUid, outputFormat = "json" ):
        """Query the SOP instance UIDs of one series; returns the response."""
        serviceUrl = self.baseUrl + "/query/" + self.GET_SOP_INSTANCE_UID
        # Forward the requested format like the other query methods do.
        queryParameters = {"SeriesInstanceUID" : seriesInstanceUid , "format" : outputFormat }
        return self.execute(serviceUrl , queryParameters)

    def get_image(self, seriesInstanceUid, local_path):
        """Download one series as a zip and extract it below ``local_path``.

        The archive is written to ``<local_path>/<seriesInstanceUid>.zip``,
        extracted into ``<local_path>/<seriesInstanceUid>/`` and then
        deleted. An already existing series folder is reused.

        Args:
            seriesInstanceUid: The series of images to load.
            local_path: Folder that receives the series sub-folder (callers
                in this file use collection/patient/study).

        Returns:
            True when the series was downloaded and extracted, False when
            the request failed (HTTP error status or unreachable URL).

        Raises:
            Errors while writing or unpacking the archive are not handled
            here and propagate to the caller.
        """
        serviceUrl = self.baseUrl + "/query/" + self.GET_IMAGE
        requestUrl = self._build_url(serviceUrl, { "SeriesInstanceUID" : seriesInstanceUid })

        # Process-wide: files/folders created from here on get mode 644/755.
        os.umask(0o022)

        seriesDir = os.path.join(local_path, seriesInstanceUid)
        tempZipPath = os.path.join(local_path, seriesInstanceUid + '.zip')

        # urlopen raises urllib errors, so those are what must be caught.
        try:
            zipresp = urllib.request.urlopen(requestUrl)
        except urllib.error.HTTPError as e:
            print("HTTP Error:", e.code, serviceUrl)
            return False
        except urllib.error.URLError as e:
            print("URL Error:", e.reason, serviceUrl)
            return False

        with zipresp:
            os.makedirs(seriesDir, exist_ok=True)
            try:
                # Stream to disk instead of holding the whole archive in memory.
                with open(tempZipPath, "wb") as fout:
                    shutil.copyfileobj(zipresp, fout)
                with ZipFile(tempZipPath) as zf:
                    zf.extractall(path=seriesDir)
            finally:
                # Never leave the temporary archive behind, even on failure.
                if os.path.exists(tempZipPath):
                    os.remove(tempZipPath)
        return True


## --------------------------------------- DEFINING FUNCTIONS ---------------------------------------------------- ##
def get_collection_data(collection_name):
    """Fetch the series metadata of a collection as a DataFrame.

    Queries the module-level ``tcia_client`` for all series of
    ``collection_name`` and adds a ``Collection`` column holding that name
    on every row.

    Args:
        collection_name: Name of the collection to query.

    Returns:
        A DataFrame built from the JSON payload of the series query, plus
        the ``Collection`` column.
    """
    response = tcia_client.get_series(collection = collection_name, outputFormat = "json")
    series_df = pd.DataFrame(response.json())

    # Tag every row with the collection it came from.
    row_count = series_df.shape[0]
    series_df['Collection'] = np.repeat(collection_name, row_count)

    print('Meta data for patients in collection ', collection_name, ' loaded. \n')
    return series_df


def load_image(collection, patient_id, study_instance_uid, series_instance_uid, dest_path):
    """Download one series into ``<root>/<collection>/<patient>/<study>``.

    ``<root>`` is ``dest_path`` when it is truthy, otherwise the current
    working directory. The download itself is delegated to the module-level
    ``tcia_client``.

    Returns:
        Whatever ``tcia_client.get_image`` returns for the series.
    """
    # Fall back to the working directory when no destination was given.
    root = dest_path if dest_path else os.getcwd()
    study_dir = root +'/'+ collection +'/'+ patient_id +'/'+ study_instance_uid

    return tcia_client.get_image(seriesInstanceUid=series_instance_uid, local_path=study_dir)
    
def load_images(df, dest_path):
    """Download every series listed in ``df`` and report the outcome.

    Each row is passed to ``load_image``; all rows are attempted even when
    an earlier one fails.

    Args:
        df: DataFrame with the columns ``Collection``, ``PatientID``,
            ``StudyInstanceUID`` and ``SeriesInstanceUID``.
        dest_path: Root folder for the downloads, or ``None`` to let
            ``load_image`` pick its default.

    Returns:
        ``"Collection loaded successfully!"`` when every series loaded,
        otherwise ``None`` (after printing which series failed).
    """
    print('Loading images...')

    # Collect failures instead of discarding them so they can be reported.
    failed_series = []
    rows = zip(df['Collection'], df['PatientID'], df['StudyInstanceUID'], df['SeriesInstanceUID'])
    for collection, patient_id, study_uid, series_uid in rows:
        if not load_image(collection, patient_id, study_uid, series_uid, dest_path):
            failed_series.append(series_uid)

    if failed_series:
        print('Failed to load', len(failed_series), 'of', df.shape[0], 'series:',
              ', '.join(str(uid) for uid in failed_series))
        return None

    return "Collection loaded successfully!"

    
# ---------------------------------------- DEFINE WIDGETS --------------------------------------------------------- #       
#Instantiate TCIA Client
tcia_client = TCIAClient(baseUrl='https://services.cancerimagingarchive.net/services/v4', resource='TCIA')

# Gather list of all collections to determine which one to load and create widget
collections = sorted([obj['Collection'] for obj in tcia_client.get_collection_values().json()])

# Dropdown for choosing the collection; guard against an empty collection list
# so that building the widget does not fail with an IndexError.
collection_dropdown = widgets.Dropdown(
    options=collections,
    value=collections[0] if collections else None,
    description='Collection:',
    disabled=False,
    layout = Layout(width='200px')
)

# Text box for the download root; pre-filled with the current working directory.
destination_path = widgets.Text(
    value=os.getcwd(),
    placeholder=os.getcwd(),
    description='Destination Path:',
    disabled=False,
    layout = Layout(width='500px')
)

# Container holding both input widgets; it is displayed in a later cell.
items = [collection_dropdown, destination_path]
box = Box(children=items)

# Must stay the last expression so the cell renders the show/hide link.
hide_toggle()

## Starter Notebook on Pulling Images From TCIA API Directly Into DataLake

**The Cancer Imaging Archive (TCIA)**

> The image data in The Cancer Imaging Archive (TCIA) is organized into purpose-built collections. 
> A collection typically includes studies from several subjects (patients) - typically patients’ imaging related by a common disease (e.g. lung cancer), 
> image modality or type (MRI, CT, digital histopathology, etc) or research focus.  In some collections, there may be only one study per subject. 
> In other collections, subjects may have been followed over time, in which case there will be multiple studies per subject. 
> The subjects typically have a disease and/or particular anatomical site (lung, brain, etc.) in common.  

> TCIA submissions are organized in the following hierarchy, which is important to remember in creating your search query and reviewing search results:

> Collection > Patient (Subject) > Study > Series > Images

> In other words, a Collection is the largest organizing concept within TCIA and it includes data about Patients (also called Subjects). 
> As you continue to drill down to more granular concepts, Patients contain Studies, Studies contain (Image) Series, and Series contain individual Images.


**Directions**

To use this notebook, run the first cell, enter the desired inputs in the widgets below, and then click the *Download Images* button.
+ *Collection*: the collection that you want to download
+ *Destination Path*: the folder in which to save your images (defaults to the current working directory). The images are saved in the following folder structure: <br>
    > Collection > Patient (Subject) > Study > Series > Images

In [2]:
# Display the Box that holds the collection dropdown and the destination-path text field.
box

Box(children=(Dropdown(description='Collection:', layout=Layout(width='200px'), options=('4D-Lung', 'AAPM-RT-M…

In [3]:
# Number of series (rows of the collection metadata) downloaded per click.
# NOTE(review): presumably kept small so the starter demo stays quick - confirm.
SERIES_PREVIEW_LIMIT = 2

button = widgets.Button(description="Download Images")
output = widgets.Output()
display(button, output)


def on_button_clicked(b):
    """Download the first series of the selected collection.

    Reads the collection and destination from the input widgets, fetches
    the collection metadata and downloads the first
    ``SERIES_PREVIEW_LIMIT`` series. Everything printed, including the
    final status message, goes to the ``output`` widget.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    with output:
        collection_df = get_collection_data(collection_dropdown.value).head(SERIES_PREVIEW_LIMIT)
        # An empty or whitespace-only field means "use the default location".
        dest_path = destination_path.value.strip() or None
        status = load_images(collection_df, dest_path)
        # Show the status message, which was previously discarded.
        if status:
            print(status)

button.on_click(on_button_clicked)

Button(description='Click Me!', style=ButtonStyle())

Output()