# Summary
Notebook used to produce 'annotations.json' for the iu_xray dataset

In [111]:
import os
import time
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import json

In [98]:
# Directories holding the iu_xray images and XML reports, relative to the
# current working directory, together with a listing of each directory.
img_path = os.path.join(os.curdir, 'iu_xray', 'images')
imgs = os.listdir(img_path)

report_path = os.path.join(os.curdir, 'iu_xray', 'reports')
reports = os.listdir(report_path)

In [99]:
# peek at the first ten image file names
imgs[:10]

['CXR163_IM-0410-12012.png',
 'CXR2595_IM-1086-2001.png',
 'CXR1465_IM-0302-2001.png',
 'CXR2835_IM-1251-1001.png',
 'CXR855_IM-2376-1001.png',
 'CXR444_IM-2079-2001.png',
 'CXR3059_IM-1425-2001.png',
 'CXR2504_IM-1029-2001.png',
 'CXR2395_IM-0944-1001.png',
 'CXR776_IM-2319-2001.png']

In [100]:
# peek at the first ten report (xml) file names
reports[:10]

['162.xml',
 '1390.xml',
 '604.xml',
 '2699.xml',
 '2841.xml',
 '3587.xml',
 '2855.xml',
 '3593.xml',
 '88.xml',
 '610.xml']

Testing on one report (parse a single XML file and extract its image ids and FINDINGS text)

In [108]:
# Load the raw XML text of the first listed report so the parsing steps
# can be tried out on a single file.
with open(os.path.join(report_path, reports[0]), 'r') as report_file:
    data = report_file.read()

In [109]:
# Dry run of the parsing logic on the single report held in `data`, timed so
# the cost of processing every report can be estimated.
start = time.time()
img_dict = {'image_path': []}
bs = BeautifulSoup(data, "xml")
pi = bs.find_all('parentImage')
if len(pi) < 2:
    # `pi` may be empty (report without any images); only quote an id when
    # one exists, since indexing pi[0] on an empty result raises IndexError.
    print('not enough images! skipping', pi[0]['id'] if pi else '(report has no images)')
else:
    # the first image id doubles as the identifier of the whole entry
    img_dict['id'] = pi[0]['id']
    for img_id in pi:
        img_dict['image_path'].append(img_id['id']+'.png')
    # find() returns None when no matching element exists; check before
    # reading .text so a report without FINDINGS does not raise AttributeError.
    findings = bs.find('AbstractText', {'Label': 'FINDINGS'})
    if findings is None:
        print('no FINDINGS section in this report')
    else:
        img_dict['report'] = findings.text
print('This took {} seconds'.format(time.time()-start))
img_dict

This took 0.008481979370117188 seconds


{'image_path': ['CXR162_IM-0401-1001.png', 'CXR162_IM-0401-2001.png'],
 'id': 'CXR162_IM-0401-1001',
 'report': 'Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.'}

In [103]:
# text of the FINDINGS section of the report parsed above
bs.find('AbstractText', attrs={'Label': 'FINDINGS'}).get_text()

'Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.'

### Reading all reports and randomly splitting them into train, val, and test sets

In [104]:
# Build `report_list`: one dict per report that references at least two
# images, holding the image file names, an id, and the FINDINGS text.

# File extension of the images (e.g. '.png'), taken from the first listed
# image. splitext splits at the last dot, so extra dots in a name are safe.
img_suffix = os.path.splitext(imgs[0])[1]
num_reports = 0
num_missing_findings = 0
report_list = []
start = time.time()
# os.listdir returns entries in arbitrary order. Iterating in sorted order
# makes report_list -- and therefore the seeded train/val/test split that is
# computed from it -- identical on every run and every machine.
for report in sorted(reports):
    with open(os.path.join(report_path, report), 'r') as f:
        data = f.read()
    bs = BeautifulSoup(data, "xml")
    pi = bs.find_all('parentImage')
    if len(pi) < 2:
        # only reports with at least two images are kept
        continue
    findings = bs.find('AbstractText', {'Label': 'FINDINGS'})
    if findings is None:
        # no FINDINGS element at all: skip and count it instead of crashing
        # on `None.text` part-way through the loop
        num_missing_findings += 1
        continue
    num_reports += 1
    img_dict = {
        'image_path': [img_id['id'] + img_suffix for img_id in pi],
        # the first image id doubles as the identifier of the entry
        'id': pi[0]['id'],
        'report': findings.text,
    }
    report_list.append(img_dict)


print('{} seconds to process {} reports'.format(time.time()-start, len(reports)))
print('Total reports in dataset: {}'.format(num_reports))
print('Reports not included due to lack of images: {}'.format(len(reports)-num_reports-num_missing_findings))
if num_missing_findings:
    print('Reports not included due to missing FINDINGS: {}'.format(num_missing_findings))

15.789681196212769 seconds to process 3955 reports
Total reports in dataset: 3405
Reports not included due to lack of images: 550


In [105]:
# double check number of reports: the list must hold exactly one entry per
# report counted while parsing, otherwise the cells were run out of order
assert len(report_list) == num_reports, 'report_list and num_reports disagree'
print(len(report_list))

3405


In [106]:
# use sklearn to split into train, validation, and test sets
# train_test_split accepts a single sequence, so no dummy index array (and no
# throwaway outputs) is needed. NOTE(review): the shuffle is expected to depend
# only on random_state and the number of samples, so the partitions should be
# unchanged -- confirm against the sizes printed below.
dev, test = train_test_split(report_list, test_size=0.2, random_state=42)
print('Dev size: {}, test size: {}'.format(len(dev),len(test)))
# 25% of the remaining 80% is 20% of the whole set -> 60/20/20 train/val/test
train, val = train_test_split(dev, test_size=0.25, random_state=42)
print('Train size: {}, val size: {}'.format(len(train),len(val)))

Dev size: 2724, test size: 681
Train size: 2043, val size: 681


In [116]:
# gather the three partitions under their split names
annotations = dict(train=train, val=val, test=test)

# serialise the whole mapping to a JSON string
annotations_json = json.dumps(annotations)

# persist the JSON text to disk
with open("annotations.json", "w") as out_file:
    out_file.write(annotations_json)