In [1]:
import math
import re
import nltk
import random
import json
import os
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from source.data_preprocessing_utils import extract_findings_plus_impression_MIMIC_CXR, report_paths_generator_MIMIC_CXR
from pathlib import Path
from tqdm import tqdm

In [2]:
REGULAR_EXPRESSIONS_FOLDER = './regular_expressions/'

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pamessina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
english_stopwords = set(stopwords.words('english'))

In [5]:
with open('./mimic-cxr_findings+impression.txt') as f:
    mimic_reports = f.readlines()

In [6]:
with open('./ui-x-ray_findings+impression.txt') as f:
    iuxray_reports = f.readlines()

In [7]:
len(mimic_reports)

227416

In [8]:
len(iuxray_reports)

3927

In [9]:
class QuestionRule:
    def __init__(self, question, regex):
        self.question = question
        self.regex = regex
    def match(self, sentence):
        return self.regex.search(sentence)

In [10]:
class QuestionAnswerGenerator:
    def __init__(self, debug=False):
        self.reload(debug)
    
    def reload(self, debug=False):
        # load question specifications
        with open(os.path.join(REGULAR_EXPRESSIONS_FOLDER, 'questions.json')) as f:
            questions = json.load(f)
        
        if debug: print(questions)
        
        # load question rules
        self.question_rules = []
        for item in questions:
            regex = self.get_regex_from_files(item['files'])
            self.question_rules.append(QuestionRule(item['question'], regex))
            
        self.questions = [rule.question for rule in self.question_rules]
        self.questions.sort()
        assert (len(self.questions) == len(set(self.questions)))
        self.question2index = {q:i for i,q in enumerate(self.questions)}
        
        # load invalid patterns
        self.invalid_regex = self.get_regex_from_files(['invalid_sentence_patterns.txt'])
        
        # load unknown tokens patterns
        self.unknown_regex = self.get_regex_from_files(['unknown_token_patterns.txt'])
    
    def get_regex_from_files(self, files):
        pattern = ''
        for file in files:
            with open(os.path.join(REGULAR_EXPRESSIONS_FOLDER, file)) as f:
                for line in f.readlines():
                    if len(pattern) > 0:
                        pattern += '|'
                    pattern += f'({line.strip()})'
        return re.compile(pattern, re.IGNORECASE)
    
    def get_matched_questions(self, sentence):
        for rule in self.question_rules:
            if rule.match(sentence):
                yield rule.question

    def generate_qa_pairs(self, text, debug=False):        
        
        # remove duplicates while keeping original order
        sentences = sent_tokenize(text)
        sentences_uniq = set()
        tmp = []
        for s in sentences:
            if s in sentences_uniq: continue
            tmp.append(s)
            sentences_uniq.add(s)
        sentences = tmp
        assert len(sentences_uniq) == len(sentences)
        
        qa_pairs = dict()        
        for s in sentences:
            match = False
            valid = False
            if self.valid_sentence(s):
                valid = True
                for q in self.get_matched_questions(s):
                    try:
                        answers = qa_pairs[q]
                    except KeyError:
                        answers = qa_pairs[q] = []
                    answers.append(s)
                    match = True
            if debug and not match:
                if valid:
                    print(f'** not captured:', s)
                else:
                    print(f'------- invalid:', s)
        return qa_pairs
    
    def generate_qa_pairs_compact_version(self, text):        
        
        # remove duplicates while keeping original order
        sentences = sent_tokenize(text)
        sentences_uniq = set()
        tmp = []
        for s in sentences:
            if s in sentences_uniq:
                continue
            tmp.append(s)
            sentences_uniq.add(s)
        sentences = tmp
        assert len(sentences_uniq) == len(sentences)
        
        output = dict(sentences=sentences, invalid=[], unmatched=[], matched=[], qa=dict())
        for i, s in enumerate(sentences):
            match = False
            valid = False
            if self.valid_sentence(s):
                valid = True
                for q in self.get_matched_questions(s):
                    q_idx = self.question2index[q]
                    try:                        
                        answers = output['qa'][str(q_idx)]
                    except KeyError:
                        answers = output['qa'][str(q_idx)] = []
                    answers.append(i)
                    match = True
            if match:
                output['matched'].append(i)
            elif valid:
                output['unmatched'].append(i)
            else:
                output['invalid'].append(i)
        return output
    
    def get_unmatched_sentences(self, text):
        sentences = sent_tokenize(text)
        for s in sentences:            
            if self.valid_sentence(s):
                unmatched = True
                for rule in self.question_rules:
                    if rule.match(s):
                        unmatched = False
                        break
                if unmatched:
                    yield s
    
    def remove_almost_duplicate_sentences(self, qa_pairs, debug=False):
        clean = dict()
        for k, v in qa_pairs.items():
            vv = []
            for i in range(len(v)):
                dup = False
                for j in range(len(v)):
                    if i == j: continue
                    if self.almost_same_sentence(v[i], v[j], i, j):
                        dup = True
                        if debug:
                            print('dups detected:')
                            print('s1:', v[i])
                            print('s2:', v[j])
                        break
                if not dup:
                    vv.append(v[i])
            assert len(vv) > 0
            clean[k] = vv
        return clean
    
    def almost_same_sentence(self, s1, s2, i1, i2):
        if len(s1) == len(s2):
            return s1 == s2 and i1 < i2
        if len(s1) > len(s2):
            return False
        if s1 in s2:
            return True
        if  len(s1) * 10 < 7 * len(s2):
            return False
        count = 0
        found = 0
        s2 = set(w for w in re.split(r",?\s+", s2) if w not in english_stopwords)
        for w in re.split(r",?\s+", s1):
            if w in english_stopwords:
                continue
            if w in s2:
                found += 1
            count += 1
        return found * 10 > 7 * count
    
    def valid_sentence(self, s):
        if self.invalid_regex.search(s): return False
        unknown_len = sum(len(x.group()) for x in self.unknown_regex.finditer(s))
        if unknown_len * 10 >= len(s): return False
        return True
    
    def count_unmatches(self, text, min_sent_len = 15, debug=False):
        sentences = sent_tokenize(text)
        count = 0
        for s in sentences:
            if self.valid_sentence(s):
                if len(s) < min_sent_len:
                    continue                
                match = False
                for rule in self.question_rules:
                    if rule.match(s):
                        match = True
                        break
                if not match:
                    count += 1
                    if debug:
                        print(s)
        return count
    
    def search_by_unmatched(self, k, reports, n_samples):
        indices = random.sample(range(len(reports)), n_samples)
        pairs = [(self.count_unmatches(reports[idx]), idx) for idx in indices]
        pairs.sort(reverse=True)
        return pairs[k]

In [11]:
def print_qa_pairs(qa_pairs):
    for k, v in qa_pairs.items():
        print('------------')
        print('Q:', k)
        for s in v:
            print('A:',s)

In [12]:
qa_generator = QuestionAnswerGenerator()

In [13]:
len(qa_generator.questions), qa_generator.questions

(84,
 ['what about COPD?',
  'what about adenopathy?',
  'what about air collections?',
  'what about air space disease?',
  'what about air-fluid level?',
  'what about airways?',
  'what about aspiration?',
  'what about atelectasis?',
  'what about azygos lobe?',
  'what about azygos vein?',
  'what about blurring?',
  'what about bones?',
  'what about bowel obstruction and loops?',
  'what about bronchiectasis?',
  'what about bronchogram?',
  'what about bronchovascular crowding?',
  'what about bullae and blebs?',
  'what about calcification?',
  'what about cancer or tumor?',
  'what about cardiomegaly?',
  'what about cholelithiasis?',
  'what about congestive heart failure CHF?',
  'what about consolidation?',
  'what about contrast?',
  'what about densities?',
  'what about edema?',
  'what about emphysema?',
  'what about fissures?',
  'what about fluid overload?',
  'what about fractures?',
  'what about free air?',
  'what about gas distension?',
  'what about granumola?

In [62]:
tmp = qa_generator.search_by_unmatched(0, mimic_reports, 100)
x = tmp[1]
tmp

(2, 208226)

In [73]:
qa_generator.generate_qa_pairs_compact_version(mimic_reports[x])

{'sentences': ['Markedly rotated and lordotic positioning, which makes comparison to the prior film challenging.',
  'Inspiratory volumes are slightly low.',
  'An ET tube is present, the tip lies approximately 3.7 cm above the carina.',
  'An NG tube is present, the tip extends beneath the diaphragm and overlies the gastric fundus.',
  'A right IJ sheath is seen.',
  'Allowing for rotation, this probably similar to the prior study.',
  'No pneumothorax is detected.',
  'The cardiac silhouette is quite difficult to assess due to extreme differences in positioning.',
  'Likely vascular plethora and scattered parenchymal opacities.',
  'On the right, the appearance is probably similar to the prior study.',
  'No gross right effusion.',
  'On left, comparison to the prior study is quite difficult due to differences in positioning.',
  'No definite interval change on the left.',
  'No gross left effusion.',
  'Right and left hemidiaphragms remain well defined.',
  'Lines and tubes as descr

In [74]:
# _report = ""
# print(_report)
# _qa_pairs = qa_generator.generate_question_answer_pairs(_report)
print(mimic_reports[x])
_qa_pairs = qa_generator.generate_qa_pairs(mimic_reports[x], debug=True)
# _qa_pairs = qa_generator.remove_almost_duplicate_sentences(_qa_pairs)
print_qa_pairs(_qa_pairs)

Markedly rotated and lordotic positioning, which makes comparison to the prior film challenging. Inspiratory volumes are slightly low. An ET tube is present, the tip lies approximately 3.7 cm above the carina. An NG tube is present, the tip extends beneath the diaphragm and overlies the gastric fundus. A right IJ sheath is seen. Allowing for rotation, this probably similar to the prior study. No pneumothorax is detected. The cardiac silhouette is quite difficult to assess due to extreme differences in positioning. Likely vascular plethora and scattered parenchymal opacities. On the right, the appearance is probably similar to the prior study. No gross right effusion. On left, comparison to the prior study is quite difficult due to differences in positioning. No definite interval change on the left. No gross left effusion. Right and left hemidiaphragms remain well defined. Lines and tubes as described. No pneumothorax detected. Likely vascular plethora and scattered parenchymal opacitie

In [75]:
def get_informative_unmatched_sentences(corpus, qa_generator, stopwords, n_samples = 2000):
    
    indices = range(len(corpus))
    if len(corpus) > n_samples:
        indices = random.sample(indices, n_samples)
        
    unmatched_sentences = set()
    word2freq = dict()
    
    for i in tqdm(indices):
        for s in qa_generator.get_unmatched_sentences(corpus[i]):
            unmatched_sentences.add(s)
            for w in re.split(r",?\s+", s):
                word2freq[w] = word2freq.get(w, 0) + 1
    
    max_freq = max(word2freq.values())    
    unmatched_sentences = list(unmatched_sentences)
    
    def score(s):
        score_max = 0
        score_sum = 0
        cnt = 0
        for w in re.split(r",?\s+", s):
            if w not in stopwords:
                tmp = max_freq - word2freq.get(w, 0)
                score_max = max(tmp, score_max)
                score_sum += tmp
                cnt += 1
        return score_max * 0.4 + (score_sum / math.sqrt(max(cnt,1))) * 0.6
    
    unmatched_sentences.sort(key=score, reverse=True)
    return unmatched_sentences

In [15]:
unmatched_sentences = get_informative_unmatched_sentences(mimic_reports, qa_generator, english_stopwords, n_samples=1000)

100%|██████████| 1000/1000 [00:12<00:00, 82.63it/s]


In [17]:
len(unmatched_sentences)

137

In [18]:
qa_generator.reload()

In [19]:
print_qa_pairs(qa_generator.generate_question_answer_pairs(' '.join(unmatched_sentences[0:40])))

** not captured: Right lower paratacheal mediastinal buldge likely represents a combination of lymphadopathy and the azygous/right superior intercostal veins.
** not captured: It would be very useful to obtain conventional radiographs, particularly the lateral view to re-evaluate this area.
** not captured: These are of uncertain etiology, but given peripheral location, if real, raise the possibility of emboli.
** not captured: Chronic pulmonary changes are seen in both the right mid to lower and left lower zones.
** not captured: On the right, it developed between , improved later that day, and has recurred.
** not captured: On the lateral view left arm obscures the posterior chest.
** not captured: A battery pack overlies the left lower hemi thorax.
** not captured: Relative paucity of gas seen in the visualized upper portion of the abdomen.
** not captured: If further clarification is needed, oblique views would be useful.
** not captured: This is nonspecific and could be due to ent

In [14]:
def generate_VQA_dataset_from_IU_XRay(qa_generator):
    with open('/mnt/workspace/iu-x-ray/dataset/reports/reports.json') as f:
        original_reports = json.load(f)
        
    qa_adapted_reports = []
    
    for x in tqdm(original_reports.values()):
        findings = x['findings']
        impression = x['impression']
        if findings or impression:
            if findings and not impression:
                text = findings
            elif not findings and impression:
                text = impression
            else:
                if findings[-1] == '.':
                    text = findings + ' ' + impression
                else:
                    text = findings + '. ' + impression
            qa_info = qa_generator.generate_qa_pairs_compact_version(text)
            qa_info['filename'] = x['filename']
            qa_adapted_reports.append(qa_info)
    
    output = {
        'questions': qa_generator.questions,
        'reports': qa_adapted_reports,
    }
    
    return output

In [36]:
def generate_VQA_dataset_from_MIMIC_CXR(qa_generator):    
    
    qa_adapted_reports = [None] * 300000
    
    for i, filepath in tqdm(enumerate(report_paths_generator_MIMIC_CXR())):
        text = extract_findings_plus_impression_MIMIC_CXR(filepath.as_posix())
        qa_info = qa_generator.generate_qa_pairs_compact_version(text)
        qa_info['filepath'] = str(filepath)
        qa_adapted_reports[i] = qa_info
    
    qa_adapted_reports = qa_adapted_reports[:i+1]
    
    output = {
        'questions': qa_generator.questions,
        'reports': qa_adapted_reports,
    }
        
    return output

In [84]:
vqa_dataset_iuxray = generate_VQA_dataset_from_IU_XRay(qa_generator)

100%|██████████| 3955/3955 [00:58<00:00, 67.15it/s]


In [87]:
with open('/mnt/workspace/iu-x-ray/dataset/reports/qa_adapted_reports.json', 'w') as f:
    json.dump(vqa_dataset_iuxray, f)

In [37]:
vqa_dataset_mimiccxr = generate_VQA_dataset_from_MIMIC_CXR(qa_generator)

20573it [09:24, 31.59it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

94237it [46:56, 43.74it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

168336it [1:23:23, 34.58it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000

In [38]:
with open('/mnt/data/mimic-cxr/qa_adapted_reports.json', 'w') as f:
    json.dump(vqa_dataset_mimiccxr, f)

## Generate summary file

In [40]:
with open('/mnt/workspace/iu-x-ray/dataset/reports/qa_adapted_reports.json', 'r') as f:
    vqa_dataset_iuxray = json.load(f)

In [41]:
with open('/mnt/data/mimic-cxr/qa_adapted_reports.json', 'r') as f:
    vqa_dataset_mimiccxr = json.load(f)

In [43]:
len(vqa_dataset_iuxray['reports']), len(vqa_dataset_mimiccxr['reports'])

(3927, 227835)

In [45]:
vqa_dataset_iuxray['reports'][1]

{'sentences': ['The cardiomediastinal silhouette is within normal limits for size and contour.',
  'The lungs are normally inflated without evidence of focal airspace disease, pleural effusion, or pneumothorax.',
  'Stable calcified granuloma within the right upper lung.',
  'No acute bone abnormality.. No acute cardiopulmonary process.'],
 'invalid': [],
 'unmatched': [],
 'matched': [0, 1, 2, 3],
 'qa': {'65': [0],
  '73': [0],
  '72': [1, 2],
  '48': [1],
  '52': [1],
  '3': [1],
  '17': [2],
  '32': [2],
  '11': [3]},
 'filename': '10.xml'}

In [48]:
def generate_summary_file(dataset, k, kk, output_filename):
    
    q2a = [[] for _ in range(len(dataset['questions']))]
    n_invalid = 0
    unmatched_sentences = []
    for ri, report in enumerate(dataset['reports']):
        s = report['sentences']
        for q_idx, a_idxs in report['qa'].items():
            q_idx = int(q_idx)
            a_len = sum(len(s[i]) for i in a_idxs)
            q2a[q_idx].append((a_len, ri))
        n_invalid += len(report['invalid'])
        unmatched_sentences.extend(s[i] for i in report['unmatched'])
            
    sorted_questions = sorted([(len(a), i) for i,a in enumerate(q2a)], reverse=True)    
    
    with open(f'./{output_filename}', 'w') as file:
        file.write('Preguntas ordenadas por frecuencia (decreciente)\n\n')
        for f, q in sorted_questions:
            file.write('%d, %s\n' % (f, dataset['questions'][q]))
        file.write('\n')
        file.write('Ejemplos de respuestas por pregunta:')
        for i,(_,q) in enumerate(sorted_questions):
            answers = q2a[q]
            answers.sort()            
            tmp = []
            for _, a in answers:
                report = dataset['reports'][a]
                sentences = report['sentences']
                a_idxs = report['qa'][str(q)]
                answer = ' '.join(sentences[i] for i in a_idxs)
                tmp.append(answer)
            answers = tmp            
            n = len(answers)
            print("n=%d, %s" % (n, dataset['questions'][q]))
            if n > k:
                sample = []
                for j in range(k):
                    sample.append(answers[random.randint(int(n*j/k), int(n*(j+1)/k)-1)])
            else:
                sample = answers
            file.write('\n\n%d) %s:\n\n' % (i+1, dataset['questions'][q]))
            for a in sample:
                file.write('    A: %s\n' % a)
        file.write('\n\nTotal oraciones inválidas: %d\n' % n_invalid)
        file.write('Total oraciones sin match: %d\n' % len(unmatched_sentences))
        file.write('\n\nEjemplos de oraciones válidas que no hicieron match con ninguna pregunta\n\n')
        unmatched_sentences = list(set(unmatched_sentences))
        unmatched_sentences.sort(key=lambda s:len(s))
        nn = len(unmatched_sentences)
        for j in range(kk):
            file.write('    %s\n' % unmatched_sentences[random.randint(int(nn*j/kk), int(nn*(j+1)/kk)-1)])

In [49]:
generate_summary_file(vqa_dataset_iuxray, 10, 120, 'vqa_iuxray_summary_file.txt')

n=3050, what about the lungs?
n=3035, what about pleural space?
n=2683, what about pneumothorax?
n=2559, what about the heart?
n=2299, what about the mediastinum?
n=1907, what about bones?
n=1221, what about consolidation?
n=1127, what about the cardiac silhouette?
n=832, what about pulmonary vascularity?
n=694, what about opacities?
n=544, what about air space disease?
n=537, what about calcification?
n=525, what about support devices and foreign bodies?
n=439, what about granumola?
n=411, what about infiltrate?
n=364, what about atelectasis?
n=357, what about thoracic aorta?
n=345, what about edema?
n=327, what about tubes and lines?
n=323, what about nodules?
n=297, what about cardiomegaly?
n=244, what about pneumonia?
n=237, what about the diaphragm?
n=215, what about tortuosity?
n=215, what about surgery?
n=209, what about pulmonary hila?
n=204, what about scarring?
n=189, what about fractures?
n=183, what about the ribs?
n=180, what about masses?
n=150, what about interstitial lu

In [50]:
generate_summary_file(vqa_dataset_mimiccxr, 10, 120, 'vqa_mimiccxr_summary_file.txt')

n=182832, what about the lungs?
n=171794, what about pleural space?
n=138668, what about pneumothorax?
n=110147, what about the mediastinum?
n=97334, what about support devices and foreign bodies?
n=94605, what about the heart?
n=90877, what about the cardiac silhouette?
n=78024, what about tubes and lines?
n=69038, what about bones?
n=64624, what about consolidation?
n=61768, what about atelectasis?
n=58789, what about edema?
n=57477, what about opacities?
n=50929, what about pulmonary hila?
n=47488, what about pulmonary vascularity?
n=45200, what about pneumonia?
n=29366, what about cardiomegaly?
n=23595, what about the diaphragm?
n=20860, what about surgery?
n=20446, what about thoracic aorta?
n=19036, what about vena cava?
n=15162, what about the stomach?
n=13647, what about calcification?
n=12622, what about the trachea?
n=12544, what about fractures?
n=11239, what about the apical zone?
n=10516, what about tortuosity?
n=10032, what about the ribs?
n=9782, what about the carina?
n