# Summary
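Notebook used to produce 'annotation.json' for the iu_xray dataset. Each entry pairs the FINDINGS text of a report with the image files that report references, and the entries are randomly split into train, val, and test partitions.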

In [2]:
import os
import time
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
import json

In [5]:
# Locations of the IU X-Ray images and XML reports, relative to the notebook.
img_path = os.path.join('.','iu_xray', 'images')
report_path = os.path.join('.','iu_xray', 'reports')

# os.listdir returns entries in arbitrary, platform-dependent order. Sort both
# listings so that every later cell that indexes or iterates over them
# (reports[0], the parsing loop, and therefore the seeded train/val/test
# split) sees the same order on every machine and on every re-run.
imgs = sorted(os.listdir(img_path))
reports = sorted(os.listdir(report_path))

In [6]:
# peek at the first ten image file names to see the naming scheme
imgs[:10]

['CXR1000_IM-0003-1001.png',
 'CXR1000_IM-0003-2001.png',
 'CXR1000_IM-0003-3001.png',
 'CXR1001_IM-0004-1001.png',
 'CXR1001_IM-0004-1002.png',
 'CXR1002_IM-0004-1001.png',
 'CXR1002_IM-0004-2001.png',
 'CXR1003_IM-0005-2002.png',
 'CXR1004_IM-0005-1001.png',
 'CXR1004_IM-0005-2001.png']

In [7]:
# peek at the first ten report (xml) file names to see the naming scheme
reports[:10]

['1.xml',
 '10.xml',
 '100.xml',
 '1000.xml',
 '1001.xml',
 '1002.xml',
 '1003.xml',
 '1004.xml',
 '1005.xml',
 '1006.xml']

Testing on one report

In [8]:
# Parse the first report and print the tag of each direct child of the root
# element to see how the XML is laid out.
# (The raw text of the file is not needed: ElementTree reads it straight from
# the path, so there is no separate open()/read() here.)
tree = ET.parse(os.path.join(report_path, reports[0]))
root = tree.getroot()
for e in root:
    print(e.tag)

# Display the text of the first AbstractText element labelled FINDINGS.
# find() returns None when no such element exists, so a report without a
# FINDINGS section displays nothing instead of raising.
findings = root.find(".//AbstractText[@Label='FINDINGS']")
findings.text if findings is not None else None

meta
uId
pmcId
docSource
IUXRId
licenseType
licenseURL
ccLicense
articleURL
articleDate
articleType
publisher
title
note
specialty
subset
MedlineCitation
MeSH
parentImage
parentImage


'The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.'

In [9]:
# Build the annotation entry for the first report only, and time it, as a
# dry run for the loop over all reports.
start = time.time()
tree = ET.parse(os.path.join(report_path, reports[0]))
root = tree.getroot()
pi = root.findall('parentImage')

# The entry always starts with an (initially empty) list of image file names.
img_dict = {'image_path': []}
if len(pi) >= 2:
    # The entry id is the id of the first parentImage element.
    img_dict['id'] = pi[0].attrib['id']
    # One file name per parentImage element: its id plus the png extension.
    img_dict['image_path'].extend(node.attrib['id']+'.png' for node in pi)
    img_dict['report'] = root.findall(".//AbstractText[@Label='FINDINGS']")[0].text
else:
    print('cannot find enough imgs')

print('This took {} seconds'.format(time.time()-start))
img_dict

This took 0.0044307708740234375 seconds


{'image_path': ['CXR1_1_IM-0001-3001.png', 'CXR1_1_IM-0001-4001.png'],
 'id': 'CXR1_1_IM-0001-3001',
 'report': 'The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.'}

### Reading all reports and storing them in train, val, and test randomly

In [11]:
# Extension appended to each parentImage id to form the image file name.
# The image listing shows files named '<id>.png'; keeping the extension in one
# named value replaces both the unused derived suffix and the literal that was
# repeated inside the loop.
img_suffix = '.png'

# XPath for the report body: the AbstractText element labelled FINDINGS.
findings_query = ".//AbstractText[@Label='FINDINGS']"

report_list = []
start = time.time()
for report in reports:
    tree = ET.parse(os.path.join(report_path, report))
    root = tree.getroot()

    # Skip reports that reference fewer than two images.
    pi = root.findall('parentImage')
    if len(pi) < 2:
        continue

    # Skip reports with no usable FINDINGS text. find() returns None when the
    # element is absent, and .text is None when the element is empty; both
    # cases are treated as "no report text" rather than raising.
    findings = root.find(findings_query)
    report_text = findings.text if findings is not None else None
    if report_text is None:
        continue

    # One entry per report: all image file names, the id of the first image,
    # and the FINDINGS text.
    report_list.append({
        'image_path': [img.attrib['id'] + img_suffix for img in pi],
        'id': pi[0].attrib['id'],
        'report': report_text,
    })

# Derived from the list itself so the count can never drift from its contents.
num_reports = len(report_list)

print('{} seconds to process {} reports'.format(time.time()-start, len(reports)))
print('Total reports in dataset: {}'.format(num_reports))
print('Reports not included due to lack of images or lack of Report Text: {}'.format(len(reports)-num_reports))

10.69963812828064 seconds to process 3955 reports
Total reports in dataset: 2955
Reports not included due to lack of images or lack of Report Text: 1000


In [12]:
# double check number of reports: the list must agree with the count printed
# by the parsing cell, otherwise the two cells were run out of sync
assert len(report_list) == num_reports, 'report_list and num_reports disagree'
print(len(report_list))

2955


In [13]:
# use sklearn to split into train, validation, and test sets
# Only the list itself is passed: train_test_split returns one (train, test)
# pair per input array, so no dummy second array is needed and nothing has to
# be discarded into `_` (which would shadow IPython's last-output variable).
# The shuffle depends only on the number of samples and random_state, so the
# resulting partitions are the same as with the extra array.

# First hold out 20% of all reports as the test set.
dev, test = train_test_split(report_list, test_size=0.2, random_state=42)
print('Dev size: {}, test size: {}'.format(len(dev),len(test)))

# Then take 25% of the remaining 80% as validation, i.e. 20% of all reports.
train, val = train_test_split(dev, test_size=0.25, random_state=42)
print('Train size: {}, val size: {}'.format(len(train),len(val)))

# Every report must land in exactly one partition.
assert len(train) + len(val) + len(test) == len(report_list)

Dev size: 2364, test size: 591
Train size: 1773, val size: 591


In [14]:
# Bundle the three partitions under the split names that become the
# top-level keys of the JSON document.
annotations = dict(train=train, val=val, test=test)

# Serialise the whole dictionary to a single JSON string.
annotations_json = json.dumps(annotations)

# Persist the JSON string to disk next to the notebook.
with open("annotation.json", "w") as out_file:
    out_file.write(annotations_json)