In [60]:
import json
import random
import re
from random import randint

from nltk.tokenize import wordpunct_tokenize
from tqdm import tqdm

In [2]:
# Load the IU X-Ray QA-adapted reports. The file is decoded explicitly as UTF-8 so the
# parsed text does not depend on the machine's locale default encoding.
with open('/mnt/workspace/iu-x-ray/dataset/reports/qa_adapted_reports.json', 'r', encoding='utf-8') as f:
    vqa_dataset_iuxray = json.load(f)

In [3]:
# Load the MIMIC-CXR QA-adapted reports. The file is decoded explicitly as UTF-8 so the
# parsed text does not depend on the machine's locale default encoding.
with open('/mnt/data/mimic-cxr/qa_adapted_reports.json', 'r', encoding='utf-8') as f:
    vqa_dataset_mimiccxr = json.load(f)

In [49]:
def get_sentences():
    """Yield every referenced report sentence, lower-cased, from both datasets.

    Reports of ``vqa_dataset_iuxray`` are walked first, then those of
    ``vqa_dataset_mimiccxr``. Within each report, the sentences indexed by
    ``report['matched']`` are yielded before those indexed by
    ``report['unmatched']``.

    Yields:
        str: ``report['sentences'][idx].lower()`` for each listed index.
    """
    def _lowercased(dataset):
        # One pass per report: 'matched' indices first, then 'unmatched'.
        for report in dataset['reports']:
            for index_key in ('matched', 'unmatched'):
                for idx in report[index_key]:
                    yield report['sentences'][idx].lower()

    # Each global is only looked up when its turn comes, as a generator would.
    yield from _lowercased(vqa_dataset_iuxray)
    yield from _lowercased(vqa_dataset_mimiccxr)

In [146]:
# Pattern for tokens that should not enter the vocabulary. A token matches when,
# in its entirety, it is one of:
#   * a run of digits, optionally followed by cm / mm / st / th / nd / rd
#   * the literal "xxxx" or "jj"
#   * anything starting with one of ( ) . - \ / + # * = > < % ? ; !
#   * a ':' or ',' followed by at least one more character (a bare ':' or ','
#     does not match)
# The ^...$ anchors make .search() behave as a whole-token test.
unknown_regex = re.compile(r'^(\d+(cm|mm|st|th|nd|rd)?|xxxx|jj|[().\-\\/+#*=><%?;!].*|[:,].+)$')

In [147]:
# Tally how often each token occurs across all sentences from get_sentences().
vocab = {}
for sentence in tqdm(get_sentences()):
    tokens = wordpunct_tokenize(sentence)
    for token in tokens:
        # Tokens matched by unknown_regex are left out of the counts entirely.
        if not unknown_regex.search(token):
            vocab[token] = vocab.get(token, 0) + 1

1149350it [00:07, 148502.68it/s]


In [148]:
# Minimum number of occurrences a token needs to be kept; rarer tokens are dropped.
MIN_TOKEN_FREQ = 4
filtered_vocab = [token for token, count in vocab.items() if count >= MIN_TOKEN_FREQ]

In [149]:
# Sort the kept tokens in place (default ascending order).
filtered_vocab.sort()

In [151]:
# Number of tokens that passed the frequency filter.
len(filtered_vocab)

5183

In [150]:
# Preview the first 1000 entries of filtered_vocab.
filtered_vocab[:1000]

['"',
 "'",
 ',',
 ':',
 'a',
 'aa',
 'aaa',
 'abandoned',
 'abdomen',
 'abdominal',
 'aberrant',
 'ability',
 'ablation',
 'able',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'about',
 'above',
 'abpa',
 'abrupt',
 'abruptly',
 'abscess',
 'abscesses',
 'absence',
 'absent',
 'absolute',
 'abundant',
 'abuse',
 'abut',
 'abuts',
 'abutting',
 'ac',
 'accelerated',
 'accentuate',
 'accentuated',
 'accentuates',
 'accentuating',
 'accentuation',
 'acceptable',
 'access',
 'accessed',
 'accessible',
 'accessory',
 'accident',
 'accompanied',
 'accompanies',
 'accompany',
 'accompanying',
 'accomplished',
 'according',
 'accordingly',
 'account',
 'accounted',
 'accounting',
 'accounts',
 'accumulate',
 'accumulated',
 'accumulating',
 'accumulation',
 'accumulations',
 'accurate',
 'accurately',
 'acdf',
 'acentuated',
 'achalasia',
 'achieve',
 'achieved',
 'acid',
 'acknowledged',
 'acromial',
 'acromio',
 'acromioclavicular',
 'acromiohumeral',
 'acromion',
 'across

In [96]:
# NOTE(review): repeats the earlier length check; the recorded 5219 differs from the 5183
# shown for the same expression earlier, so this looks like a leftover from an older run — confirm and remove.
len(filtered_vocab)

5219

In [88]:
# Count stored under the '<unknown>' key, if any. The counting loop visible in this
# notebook skips tokens matched by unknown_regex instead of tallying them under
# '<unknown>', so a plain vocab['<unknown>'] lookup raises KeyError when that key is
# absent; .get() reports 0 in that case and the stored count otherwise.
vocab.get('<unknown>', 0)

30975

In [87]:
# NOTE(review): the recorded output for this cell is (count, token) pairs ordered by count,
# which the visible code (a sorted list of plain tokens) does not produce — presumably a
# stale cell from an earlier version; confirm and remove or regenerate.
filtered_vocab[:1000]

[(1152929, '.'),
 (583778, 'the'),
 (455235, 'is'),
 (332737, 'no'),
 (248441, ','),
 (244565, 'are'),
 (237329, 'of'),
 (205917, 'and'),
 (184234, 'right'),
 (182524, 'pleural'),
 (178767, 'there'),
 (162589, 'in'),
 (149624, 'effusion'),
 (146540, 'left'),
 (145986, 'pneumothorax'),
 (133122, 'or'),
 (117990, 'normal'),
 (105757, 'pulmonary'),
 (100152, 'lung'),
 (90580, 'with'),
 (83697, 'a'),
 (79888, 'acute'),
 (75729, 'atelectasis'),
 (72672, 'unchanged'),
 (70553, 'lungs'),
 (69876, 'seen'),
 (69382, 'edema'),
 (67449, 'consolidation'),
 (66971, '-'),
 (66566, 'size'),
 (64800, 'clear'),
 (63207, 'heart'),
 (63178, 'to'),
 (61557, 'silhouette'),
 (60239, 'mild'),
 (59182, 'tube'),
 (58615, 'focal'),
 (58364, 'mediastinal'),
 (51693, 'contours'),
 (51614, 'pneumonia'),
 (50959, 'cardiac'),
 (48730, 'cardiomediastinal'),
 (48586, 'small'),
 (47351, 'stable'),
 (46287, 'evidence'),
 (46100, 'lower'),
 (45033, 'at'),
 (42710, 'hilar'),
 (42146, 'effusions'),
 (40428, 'process'),
 (3

In [18]:
# Pick one MIMIC-CXR report at random and show its joined text next to its tokenization.
# This needs the `random` module itself in scope (importing only `randint` is not enough).
sampled_report = random.choice(vqa_dataset_mimiccxr['reports'])
# `report` ends up as the space-joined sentence text, without first being bound to the dict.
report = ' '.join(sampled_report['sentences'])
report, wordpunct_tokenize(report)

('There has been interval removal of a right IJ catheter. The right lung base opacity has improved since prior study and may represent atelectasis. There are no pleural effusions. Moderate cardiomegaly is stable. There is minimal pulmonary edema. There is no pneumothorax. Right lung base opacity has improved since prior study and may represent atelectasis.',
 ['There',
  'has',
  'been',
  'interval',
  'removal',
  'of',
  'a',
  'right',
  'IJ',
  'catheter',
  '.',
  'The',
  'right',
  'lung',
  'base',
  'opacity',
  'has',
  'improved',
  'since',
  'prior',
  'study',
  'and',
  'may',
  'represent',
  'atelectasis',
  '.',
  'There',
  'are',
  'no',
  'pleural',
  'effusions',
  '.',
  'Moderate',
  'cardiomegaly',
  'is',
  'stable',
  '.',
  'There',
  'is',
  'minimal',
  'pulmonary',
  'edema',
  '.',
  'There',
  'is',
  'no',
  'pneumothorax',
  '.',
  'Right',
  'lung',
  'base',
  'opacity',
  'has',
  'improved',
  'since',
  'prior',
  'study',
  'and',
  'may',
  'r