# Summary
Notebook used to produce 'annotations.json' for the iu_xray dataset

In [1]:
import os
import time
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
import json

In [2]:
img_path = os.path.join('.','iu_xray', 'images')
report_path = os.path.join('.','iu_xray', 'reports')
imgs = os.listdir(img_path)
reports = os.listdir(report_path)

In [3]:
# observe names of images
imgs[0:10]

['CXR163_IM-0410-12012.png',
 'CXR2595_IM-1086-2001.png',
 'CXR1465_IM-0302-2001.png',
 'CXR2835_IM-1251-1001.png',
 'CXR855_IM-2376-1001.png',
 'CXR444_IM-2079-2001.png',
 'CXR3059_IM-1425-2001.png',
 'CXR2504_IM-1029-2001.png',
 'CXR2395_IM-0944-1001.png',
 'CXR776_IM-2319-2001.png']

In [4]:
# observe names of xml files
reports[0:10]

['162.xml',
 '1390.xml',
 '604.xml',
 '2699.xml',
 '2841.xml',
 '3587.xml',
 '2855.xml',
 '3593.xml',
 '88.xml',
 '610.xml']

Testing on one image

In [5]:
with open(os.path.join(report_path, reports[0]), 'r') as f:
    data = f.read()
tree = ET.parse(os.path.join(report_path, reports[0]))
root = tree.getroot()
for e in root:
    print(e.tag)
root.findall(".//AbstractText[@Label='FINDINGS']")[0].text

meta
uId
pmcId
docSource
IUXRId
licenseType
licenseURL
ccLicense
articleURL
articleDate
articleType
publisher
title
note
specialty
subset
MedlineCitation
MeSH
parentImage
parentImage


'Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.'

In [6]:
start = time.time()
img_dict = {'image_path': []}
tree = ET.parse(os.path.join(report_path, reports[0]))
root = tree.getroot()
pi = root.findall('parentImage')
if len(pi)<2:
    print('cannot find enough imgs')
else:
    img_dict['id'] = pi[0].attrib['id']
    for img_id in pi:
        img_dict['image_path'].append(img_id.attrib['id']+'.png')
    img_dict['report'] = root.findall(".//AbstractText[@Label='FINDINGS']")[0].text
print('This took {} seconds'.format(time.time()-start))
img_dict

This took 0.0010399818420410156 seconds


{'image_path': ['CXR162_IM-0401-1001.png', 'CXR162_IM-0401-2001.png'],
 'id': 'CXR162_IM-0401-1001',
 'report': 'Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.'}

### Reading all reports and storing them in train, val, and test randomly

In [7]:
img_suffix = imgs[0][imgs[0].find('.'):]
num_reports = 0
report_list = []
start = time.time()
for report in reports:
    
    img_dict = {'image_path': []}
    tree = ET.parse(os.path.join(report_path, report))
    root = tree.getroot()
    pi = root.findall('parentImage')
    if len(pi)<2:
        pass
        #print('not enough images! skipping {}'.format(report))
    else:
        img_dict['id'] = pi[0].attrib['id']
        for img_id in pi:
            img_dict['image_path'].append(img_id.attrib['id']+'.png')
        img_dict['report'] = root.findall(".//AbstractText[@Label='FINDINGS']")[0].text
        if img_dict['report'] == None:
            continue
        num_reports += 1
        report_list.append(img_dict)
        
        
print('{} seconds to process {} reports'.format(time.time()-start, len(reports)))
print('Total reports in dataset: {}'.format(num_reports))
print('Reports not included due to lack of images or lack of Report Text: {}'.format(len(reports)-num_reports))

2.5169458389282227 seconds to process 3955 reports
Total reports in dataset: 2955
Reports not included due to lack of images or lack of Report Text: 1000


In [8]:
# double check number of reports
print(len(report_list))

2955


In [9]:
# use sklearn to split into train, validation, and test sets
dev, test, _, _ = train_test_split(report_list, range(len(report_list)), test_size=0.2, random_state=42)
print('Dev size: {}, test size: {}'.format(len(dev),len(test)))
train, val, _, _ = train_test_split(dev, range(len(dev)), test_size=1/8, random_state=42)
print('Train size: {}, val size: {}'.format(len(train),len(val)))

Dev size: 2364, test size: 591
Train size: 2068, val size: 296


In [10]:
# save to dictionary with train, val, and test partitions
annotations = {'train': train, 'val': val, 'test': test}

# convert dictionary to json object
annotations_json = json.dumps(annotations)

# write to json file
with open("annotation.json", "w") as f:
    f.write(annotations_json)