In [25]:
import os
import requests
import io
import pandas as pd
import pydicom
from pathlib import Path
import time

from pydicom.dataset import Dataset as DcmDataset
from pydicom.tag import BaseTag as DcmTag
from pydicom.multival import MultiValue as DcmMultiValue

#see https://stackoverflow.com/questions/15746558/how-to-send-a-multipart-related-with-requests-in-python

#from fastcore.foundation import * #Patch here (but also imports fastcore.imports (which is not sufficient))
    # But parallel is NOT here... :-(

from fastai2.data.all import *
#from fastai2 import *



In [26]:

# Previously used endpoint, kept for reference.
#url = "https://dicom-server-instance.azurewebsites.net/studies"
# NOTE: this assignment is dead -- `url` is reassigned a few lines below.
url = "https://sjbdicomdemo.azurewebsites.net/studies"

# The effective endpoint is built from the server name; the upload code in this
# notebook reads the resulting global `url`.
dicom_server_name = "sjbpostman"
url = f"https://{dicom_server_name}.azurewebsites.net/studies"
url

'https://sjbpostman.azurewebsites.net/studies'

In [27]:
# Cell
@patch
def dcmread(fn:Path, force = False):
    """Open a `DICOM` file and return the parsed pydicom dataset.

    `fn` is the path of the file to read; it is converted to `str` before
    being handed to pydicom.
    `force` is forwarded to `pydicom.dcmread` by keyword. It used to be passed
    positionally, which bound it to pydicom's second positional parameter
    (`defer_size`) instead of `force`.
    """
    return pydicom.dcmread(str(fn), force=force)

# Cell
def _cast_dicom_special(x):
    cls = type(x)
    if not cls.__module__.startswith('pydicom'): return x
    if cls.__base__ == object: return x
    return cls.__base__(x)

def _split_elem(res,k,v):
    if not isinstance(v,DcmMultiValue): return
    res[f'Multi{k}'] = 1
    for i,o in enumerate(v): res[f'{k}{"" if i==0 else i}']=o

        # Cell
@patch
def as_dict(self:DcmDataset, px_summ=False): #, window=dicom_windows.brain):
    "Flatten the dataset's elements (pixel data excluded) into a plain dict keyed by element keyword"
    pixel_tag = (0x7fe0,0x0010)
    # Every element except the one stored under the pixel-data tag.
    elems = [self[tag] for tag in self.keys() if tag != pixel_tag]
    pairs = [(elem.keyword, elem.value) for elem in elems]
    res = {keyword: value for keyword, value in pairs}
    res['fname'] = self.filename
    # Multi-valued elements are additionally spread over numbered keys.
    for keyword, value in pairs:
        _split_elem(res, keyword, value)
    if px_summ:
        # Pixel statistics are currently disabled; the original code is kept for reference:
        #    stats = 'min','max','mean','std'
        #    try:
        #        pxs = self.pixel_array
        #        for f in stats: res['img_'+f] = getattr(pxs,f)()
        #        res['img_pct_window'] = self.pct_in_window(*window)
        #    except Exception as e:
        #        for f in stats: res['img_'+f] = 0
        #        print(res,e)
        # NOTE: values are cast to their base types only on this px_summ path.
        res = {key: _cast_dicom_special(value) for key, value in res.items()}
    return res

# Cell
def _dcm2dict(fn, **kwargs): return fn.dcmread().as_dict(**kwargs)

# Cell
@delegates(parallel)
def _from_dicoms(cls, fns, n_workers=0, **kwargs):
    "Build a `pd.DataFrame` with one row per file in `fns`, using `_dcm2dict` on each via `parallel`"
    records = parallel(_dcm2dict, fns, n_workers=n_workers, **kwargs)
    frame = pd.DataFrame(records)
    return frame
# Expose the constructor as `pd.DataFrame.from_dicoms(...)`.
pd.DataFrame.from_dicoms = classmethod(_from_dicoms)

In [28]:

# Validation goes beyond checking the DICOM header: it attempts to actually load every single file into a pydicom Dataset.
#    This latter step has proven effective for ensuring that the metadata can be loaded into a dataset.
def get_dicom_files_from_directory(dirpath,extension="*.dcm", validate=True):
    """Recursively collect the files under `dirpath` whose names match `extension`.

    dirpath:   directory to search (anything accepted by `Path`).
    extension: glob pattern handed to `Path.rglob`.
    validate:  when true, each candidate is read with `pydicom.dcmread` and
               files that raise while being read are left out of the result.

    Returns a list of `Path` objects.
    """
    print(str(validate))
    path = Path(dirpath)
    # pydicom 2 has an is_valid method to replace is_dicom (maybe look at pydicom2 for other reasons)
    # pydicom.misc.is_dicom(str(x)) would only check for DICM in the header (fast and loose)
    files = list(path.rglob(extension))
    if not validate:
        return files

    readable = []
    for file in files:
        try:
            # Read through pydicom directly: the notebook's `dcmread` is patched
            # onto Path and is not available as a module-level function.
            pydicom.dcmread(str(file))
        except Exception:
            # Unreadable file: skip it. (The old code tried to remove the wrong
            # object from the list here, so the validate path never worked.)
            continue
        readable.append(file)
    #Potentially look at pydicom.dataset.validate_file_meta(file_meta, enforce_standard=True) for deeper validation
    #  https://pydicom.github.io/pydicom/dev/reference/generated/pydicom.dataset.validate_file_meta.html
    return readable
    

In [29]:
# Collect the candidate files without the per-file read check (validate=False).
# The commented-out calls point at other local datasets used previously.
#files = get_dicom_files_from_directory('C:\\data\\rsna-pneumonia-detection-challenge\\stage_2_train_images', validate=False) # all succeed
files = get_dicom_files_from_directory(r'C:\data\fakedicom\files', validate=False) # all succeed

#files = get_dicom_files_from_directory('C:\\data\\rsna-pneumonia-detection-challenge\\train_subset', validate=False) # all succeed
# Report how many files were found.
print(len(files))
                               

False
3


In [30]:
# Drop every path whose string form contains 'pydicom'
# (presumably to exclude pydicom's bundled test files -- confirm).
files = [x for x in files if not 'pydicom' in str(x)]
print(len(files))

3


In [31]:

# Can just use this to load all the files in a directory, instead of calling above
#dirname = 'C:\\githealth\\dicom-samples\\visus.com'
#files = [x for x in Path(dirname).rglob('*.dcm')] # if not 'pydicom' in str(x)] # exclude pydicom test files



In [32]:
# Stupid simple function to get a Dataframe, but can be extended later to add logic
def get_dicom_metadata_as_dataframe(list_of_files,columns=[],save_as_filename=''):
    """Load the metadata of `list_of_files` into a DataFrame via `pd.DataFrame.from_dicoms`.

    `columns` is accepted but not used by the current implementation.
    When `save_as_filename` is not the empty string, the frame is also
    written to that path as CSV. The DataFrame is returned either way.
    """
    metadata = pd.DataFrame.from_dicoms(list_of_files)
    wants_csv = save_as_filename != ''
    if wants_csv:
        metadata.to_csv(save_as_filename)
    return metadata
        
    



In [33]:
## The following goes and gets metadata for 4d-lung


In [34]:
# Sanity check: number of files currently queued in `files`.
len(files)

3

In [35]:
#df = pd.DataFrame.from_dicoms(files)
#df.to_csv('TestDicom2.csv')

In [36]:
from urllib3.filepost import encode_multipart_formdata, choose_boundary

def encode_multipart_related(fields, boundary=None):
    """Encode `fields` as a multipart body labelled `multipart/related`.

    The body is produced by urllib3's `encode_multipart_formdata`; the content
    type that helper returns is discarded and replaced by a
    `multipart/related` one carrying the same boundary. A boundary is
    generated with `choose_boundary()` when none is supplied.

    Returns a `(body, content_type)` tuple.
    """
    marker = choose_boundary() if boundary is None else boundary
    payload, _formdata_type = encode_multipart_formdata(fields, marker)
    related_type = str('multipart/related; boundary=%s' % marker)
    return payload, related_type

In [37]:
def upload_single_dcm_file(server_url,filepath):
    """POST one DICOM file to `server_url` as a `multipart/related` request.

    server_url: endpoint to post to (the notebook builds a `.../studies` URL;
                presumably a DICOMweb store endpoint -- confirm).
    filepath:   path of the file to upload; it is read fully into memory.

    Returns the `requests` response object so the caller can inspect
    status, headers and body.

    Example:
        r = upload_single_dcm_file(url,'C:\\\\githealth\\\\dicom-samples\\\\visus.com\\\\case4\\\\case4a_002.dcm')
        print(r.status_code)
        print(r.request.headers)
    """
    with open(filepath,'rb') as reader:
        rawfile = reader.read()
    files = {'file': ('dicomfile', rawfile, 'application/dicom')}

    # Encode as multipart/related rather than the default form-data type.
    body, content_type = encode_multipart_related(fields = files)

    headers = {'Accept':'application/dicom+json', "Content-Type":content_type}

    # Post to the endpoint that was passed in. The previous version ignored
    # `server_url` and always used the notebook-level global `url`.
    response = requests.post(server_url, body, headers=headers) #, verify=False)

    return response

In [38]:
#r = upload_single_dcm_file(url,'C:\\githealth\\dicom-samples\\visus.com\\case4\\case4a_002.dcm')
#print(r.status_code)
#print(r.request.headers)

In [39]:
from collections import OrderedDict
def store_files_to_dicomweb_with_logging_to_dataframe(filepaths, df = None):
    """Upload every file in `filepaths` and return a DataFrame logging each request/response.

    filepaths: sized collection of file paths; each is uploaded individually
               with `upload_single_dcm_file` against the notebook-level `url`.
    df:        accepted for backward compatibility but not used -- the result
               is always a freshly built frame. (The old code built a scaffold
               frame here and then discarded it.)

    Returns a DataFrame with one row per file and the columns fname, method,
    url, path_url, request_headers, request_body_trimmed, ok, status_code,
    reason, response_headers, response_text, elapsed_time, apparent_encoding
    and encoding.
    """
    numfiles = len(filepaths)

    # Fastest to collect plain dicts and build the DataFrame once at the end.
    files_info = []
    for current_count, file in enumerate(filepaths, start=1):
        # Upload a single file at a time; the response carries its own timing.
        r = upload_single_dcm_file(url, file)

        files_info.append({
            # Request side
            'fname': str(file),
            'method': r.request.method,
            'url': r.url,
            'path_url': r.request.path_url,
            'request_headers': str(r.request.headers),
            # Only the first 150 bytes of the body are kept to keep the log small.
            'request_body_trimmed': r.request.body[0:150],
            # Response side
            'ok': r.ok,
            'status_code': r.status_code,
            'reason': r.reason,
            'response_headers': r.headers,
            'response_text': r.text,
            'elapsed_time': r.elapsed,
            'apparent_encoding': r.apparent_encoding,
            'encoding': r.encoding,
        })

        # Lightweight progress report for long runs.
        if current_count % 250 == 0:
            print(f"{current_count} of {numfiles} uploaded")

    # Create a dataframe from the uploads
    return pd.DataFrame(files_info)
        
    
    

In [40]:
# Cap the working set at the first 1000 files.
files = files[:1000]
len(files)

3

In [41]:
# This runs all the files
#df_meta = get_dicom_metadata_as_dataframe(files)
#df_meta.to_csv('c:\\!mlhack\\metadata-mlhack_train_subset.csv')

In [42]:
# This runs all the files
#df_upload = store_files_to_dicomweb_with_logging_to_dataframe(files)
#df_upload.to_csv('c:\\!mlhack\\upload-data-mlhack_train_subset.csv')

In [43]:
def process_file_subset(file_subset):
    """Extract metadata for `file_subset`, then upload the same files.

    Returns a `(metadata_frame, upload_log_frame)` tuple; nothing is written
    to disk here.
    """
    metadata_frame = get_dicom_metadata_as_dataframe(file_subset)
    upload_log_frame = store_files_to_dicomweb_with_logging_to_dataframe(file_subset)
    return metadata_frame, upload_log_frame

In [44]:
def split_list(a_list, size):
    """Split `a_list` into consecutive chunks of at most `size` items.

    a_list: any sliceable, sized sequence.
    size:   maximum chunk length; must be positive.

    Returns a list of chunks in the original order. An empty input yields a
    single empty chunk (`[[]]` for a list), matching the historical behaviour.

    Raises ValueError when `size` is not positive. The previous loop never
    terminated in that case because the remaining list never shrank.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Slice by stepping through start offsets instead of repeatedly copying the tail.
    chunks = [a_list[start:start + size] for start in range(0, len(a_list), size)]
    if not chunks:
        chunks.append(a_list[:])
    return chunks

In [45]:
# Try the chunking helper with groups of up to 100 files and show how many groups result.
groups_of_files = split_list(files,100)
len(groups_of_files)

1

In [46]:
# NOTE: index 3 only exists when split_list produced at least four groups. In the
# recorded run there was a single group, so this cell raised IndexError.
len(groups_of_files[3])

IndexError: list index out of range

In [47]:
# Process all the files in chunks of `batch_size`, accumulating the metadata and
# upload-log frames and rewriting both "sofar" CSVs after every chunk so that
# partial results are on disk while the run is still in progress.
batch_size = 50
df_meta = pd.DataFrame()
df_upload = pd.DataFrame()

groups_of_files = split_list(files,batch_size)

# NOTE: `i` is zero-based, so the first progress line reads "Processing 0 of N".
for i,group in enumerate(groups_of_files):
    print(f'Processing {i} of {len(groups_of_files)} groups with {len(group)} files each')
    df_meta_subset, df_upload_subset = process_file_subset(group)
    # Append this chunk's rows to the running totals.
    df_meta = pd.concat([df_meta,df_meta_subset])
    df_upload = pd.concat([df_upload,df_upload_subset])
    # Checkpoint: overwrite the CSVs with everything processed so far.
    df_meta.to_csv('c:\\!mlhack\\metadata-ab_train_sofar.csv')
    df_upload.to_csv('c:\\!mlhack\\upload-data-ab_train_sofar.csv')

Processing 0 of 1 groups with 3 files each


In [48]:
# Write the final metadata and upload-log frames to CSV.
# The commented-out lines are earlier output locations.
#df_meta.to_csv('c:\\!mlhack\\metadata-ab_train.csv')
#df_upload.to_csv('c:\\!mlhack\\upload-data-ab_train.csv')
df_meta.to_csv(r'C:\data\fakedicom\files\metadata.csv')
df_upload.to_csv(r'C:\data\fakedicom\files\upload.csv')

In [27]:
# Row count of the upload log.
# NOTE(review): this cell's execution count is lower than that of the cells before it,
# so the recorded output likely comes from an earlier run -- confirm.
len(df_upload)

1000

In [29]:
# Preview the first rows of the upload log.
# NOTE(review): the recorded output shows a different server and dataset than the
# cells above configure, so it appears to be from an earlier run -- confirm.
df_upload.head()

Unnamed: 0,fname,method,url,path_url,request_headers,request_body_trimmed,ok,status_code,reason,response_headers,response_text,elapsed_time,apparent_encoding,encoding
0,C:\data\rsna-pneumonia-detection-challenge\stage_2_train_images\0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm,POST,https://ahscrdicom.azurewebsites.net/studies,/studies,"{'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'application/dicom+json', 'Connection': 'keep-alive', 'Content-Type': 'multipart/related; boundary=1499ee517614918065f30e0366724c00', 'Content-Length': '143226'}","b'--1499ee517614918065f30e0366724c00\r\nContent-Disposition: form-data; name=""file""; filename=""dicomfile""\r\nContent-Type: application/dicom\r\n\r\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'",False,409,Conflict,"[Content-Length, Content-Type, Server, Request-Context, X-Content-Type-Options, X-Powered-By, Date]","{""00081198"":{""vr"":""SQ"",""Value"":[{""00081150"":{""vr"":""UI"",""Value"":[""1.2.840.10008.5.1.4.1.1.7""]},""00081155"":{""vr"":""UI"",""Value"":[""1.2.276.0.7230010.3.1.4.8323329.28530.1517874485.775526""]},""00081197"":{""vr"":""US"",""Value"":[45070]}}]}}",00:00:02.064703,ascii,utf-8
1,C:\data\rsna-pneumonia-detection-challenge\stage_2_train_images\000924cf-0f8d-42bd-9158-1af53881a557.dcm,POST,https://ahscrdicom.azurewebsites.net/studies,/studies,"{'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'application/dicom+json', 'Connection': 'keep-alive', 'Content-Type': 'multipart/related; boundary=c7d489f95d7bcab55f9d552bc1245ac5', 'Content-Length': '131538'}","b'--c7d489f95d7bcab55f9d552bc1245ac5\r\nContent-Disposition: form-data; name=""file""; filename=""dicomfile""\r\nContent-Type: application/dicom\r\n\r\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'",True,200,OK,"[Content-Length, Content-Type, Server, Request-Context, X-Content-Type-Options, X-Powered-By, Date]","{""00081199"":{""vr"":""SQ"",""Value"":[{""00081150"":{""vr"":""UI"",""Value"":[""1.2.840.10008.5.1.4.1.1.7""]},""00081155"":{""vr"":""UI"",""Value"":[""1.2.276.0.7230010.3.1.4.8323329.20023.1517874421.277234""]},""00081190"":{""vr"":""UR"",""Value"":[""https://ahscrdicom.azurewebsites.net/studies/1.2.276.0.7230010.3.1.2.8323329.20023.1517874421.277233/series/1.2.276.0.7230010.3.1.3.8323329.20023.1517874421.277232/instances/1.2.276.0.7230010.3.1.4.8323329.20023.1517874421.277234""]}}]}}",00:00:01.869311,ascii,utf-8
2,C:\data\rsna-pneumonia-detection-challenge\stage_2_train_images\000db696-cf54-4385-b10b-6b16fbb3f985.dcm,POST,https://ahscrdicom.azurewebsites.net/studies,/studies,"{'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'application/dicom+json', 'Connection': 'keep-alive', 'Content-Type': 'multipart/related; boundary=6b733823d71842a42655caf0a343e947', 'Content-Length': '101644'}","b'--6b733823d71842a42655caf0a343e947\r\nContent-Disposition: form-data; name=""file""; filename=""dicomfile""\r\nContent-Type: application/dicom\r\n\r\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'",True,200,OK,"[Content-Length, Content-Type, Server, Request-Context, X-Content-Type-Options, X-Powered-By, Date]","{""00081199"":{""vr"":""SQ"",""Value"":[{""00081150"":{""vr"":""UI"",""Value"":[""1.2.840.10008.5.1.4.1.1.7""]},""00081155"":{""vr"":""UI"",""Value"":[""1.2.276.0.7230010.3.1.4.8323329.4475.1517874307.936345""]},""00081190"":{""vr"":""UR"",""Value"":[""https://ahscrdicom.azurewebsites.net/studies/1.2.276.0.7230010.3.1.2.8323329.4475.1517874307.936344/series/1.2.276.0.7230010.3.1.3.8323329.4475.1517874307.936343/instances/1.2.276.0.7230010.3.1.4.8323329.4475.1517874307.936345""]}}]}}",00:00:01.139878,ascii,utf-8
3,C:\data\rsna-pneumonia-detection-challenge\stage_2_train_images\000fe35a-2649-43d4-b027-e67796d412e0.dcm,POST,https://ahscrdicom.azurewebsites.net/studies,/studies,"{'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'application/dicom+json', 'Connection': 'keep-alive', 'Content-Type': 'multipart/related; boundary=b389b38bfd52833a3d7589443ae81be0', 'Content-Length': '131500'}","b'--b389b38bfd52833a3d7589443ae81be0\r\nContent-Disposition: form-data; name=""file""; filename=""dicomfile""\r\nContent-Type: application/dicom\r\n\r\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'",True,200,OK,"[Content-Length, Content-Type, Server, Request-Context, X-Content-Type-Options, X-Powered-By, Date]","{""00081199"":{""vr"":""SQ"",""Value"":[{""00081150"":{""vr"":""UI"",""Value"":[""1.2.840.10008.5.1.4.1.1.7""]},""00081155"":{""vr"":""UI"",""Value"":[""1.2.276.0.7230010.3.1.4.8323329.25090.1517874463.16030""]},""00081190"":{""vr"":""UR"",""Value"":[""https://ahscrdicom.azurewebsites.net/studies/1.2.276.0.7230010.3.1.2.8323329.25090.1517874463.16029/series/1.2.276.0.7230010.3.1.3.8323329.25090.1517874463.16028/instances/1.2.276.0.7230010.3.1.4.8323329.25090.1517874463.16030""]}}]}}",00:00:00.941711,ascii,utf-8
4,C:\data\rsna-pneumonia-detection-challenge\stage_2_train_images\001031d9-f904-4a23-b3e5-2c088acd19c6.dcm,POST,https://ahscrdicom.azurewebsites.net/studies,/studies,"{'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'application/dicom+json', 'Connection': 'keep-alive', 'Content-Type': 'multipart/related; boundary=8ff273539679b81be349c00f00315363', 'Content-Length': '148730'}","b'--8ff273539679b81be349c00f00315363\r\nContent-Disposition: form-data; name=""file""; filename=""dicomfile""\r\nContent-Type: application/dicom\r\n\r\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'",True,200,OK,"[Content-Length, Content-Type, Server, Request-Context, X-Content-Type-Options, X-Powered-By, Date]","{""00081199"":{""vr"":""SQ"",""Value"":[{""00081150"":{""vr"":""UI"",""Value"":[""1.2.840.10008.5.1.4.1.1.7""]},""00081155"":{""vr"":""UI"",""Value"":[""1.2.276.0.7230010.3.1.4.8323329.9271.1517874342.104736""]},""00081190"":{""vr"":""UR"",""Value"":[""https://ahscrdicom.azurewebsites.net/studies/1.2.276.0.7230010.3.1.2.8323329.9271.1517874342.104735/series/1.2.276.0.7230010.3.1.3.8323329.9271.1517874342.104734/instances/1.2.276.0.7230010.3.1.4.8323329.9271.1517874342.104736""]}}]}}",00:00:01.334192,ascii,utf-8
