In [3]:
from tqdm import tqdm
import pandas as pd
from medvqa.datasets.mimiccxr.preprocessing import (
    extract_report_and_patient_background,
    get_reports_txt_paths,
)
from medvqa.datasets.mimiccxr import MIMICCXR_METADATA_CSV_PATH, MIMICCXR_CACHE_DIR
from medvqa.utils.files import save_to_pickle
import os
from pprint import pprint

In [4]:
report_paths  = get_reports_txt_paths()

In [5]:
len(report_paths)

227835

In [16]:
with open(report_paths[-4]) as f:
    print(f.read())

                                 FINAL REPORT
 INDICATION:  ___ year old woman with recurrent endocarditis septic emboli  //
 PNA or abscess
 
 TECHNIQUE:  AP portable chest radiograph
 
 COMPARISON:  ___
 
 FINDINGS: 
 
 No focal consolidation, pleural effusion or pneumothorax identified.  In the
 size the cardiomediastinal silhouette is within normal limits.
 
 Interval removal of the left PICC line.
 
 IMPRESSION: 
 
 No evidence of acute cardiopulmonary disease.



In [16]:
extract_report_and_patient_background(report_paths[-31], True)

                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old woman with NEW LEFT IJ- ASSESS PLACEMENT  // NEW LEFT
 IJ- ASSESS PLACEMENT      NEW LEFT IJ- ASSESS PLACEMENT
 
 IMPRESSION: 
 
 Compared to prior chest radiographs ___ and ___:25.
 
 New left IJ line ends in the mid SVC.  No pneumothorax or significant pleural
 effusion or mediastinal widening.  Diffuse mild to moderate heterogeneous
 pulmonary abnormality has an interstitial component and the suggestion of
 alveolar opacities or pleural thickening in the upper chest.  The abnormality
 is new since ___, but precise Chronicity is indeterminate and
 differential diagnosis is broad.  This could be all pulmonary edema,
 particular if the patient is hypotensive and in Trendelenburg position.  It
 could be diffuse infection, drug reaction or less likely pulmonary hemorrhage.
 
 Heart is top-normal size, but definitely increased since ___.
 
 ET tube and left internal jugular line a

{'report': 'Compared to prior chest radiographs and :25. New left IJ line ends in the mid SVC. No pneumothorax or significant pleural effusion or mediastinal widening. Diffuse mild to moderate heterogeneous pulmonary abnormality has an interstitial component and the suggestion of alveolar opacities or pleural thickening in the upper chest. The abnormality is new since , but precise Chronicity is indeterminate and differential diagnosis is broad. This could be all pulmonary edema, particular if the patient is hypotensive and in Trendelenburg position. It could be diffuse infection, drug reaction or less likely pulmonary hemorrhage. Heart is top-normal size, but definitely increased since . ET tube and left internal jugular line are in standard placements and an esophageal drainage tube ends in the stomach.',
 'background': 'indication: year old woman with NEW LEFT IJ- ASSESS PLACEMENT // NEW LEFT IJ- ASSESS PLACEMENT NEW LEFT IJ- ASSESS PLACEMENT.'}

In [1103]:
_re_paragraph_breaks.split('''WET READ: ___ ___ ___ 6:17 PM
  Comp:  ___
  
  Interval placement of right pigtail catheter
  Persistent right costophrenic angle opacity and more medial right basilar
  opacity as well.
  Thin lucency along the lateral chest wall may represent air introduced during
  the procedure.  No large pneumothorax is present.
 ______________________________________________________________________________
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old man with chest tube  // ? ptx      ? ptx
 
 COMPARISON:  Chest radiographs most recently ___.
 
 IMPRESSION: 
 
 Since ___:55, with insertion of a right basal pigtail pleural
 drainage catheter, most of the previous moderate right pleural effusion has
 been evacuated.  Small localized pneumothorax is new.  Opacification in the
 right lower lobe could be atelectasis alone or there could be pneumonia. 
 Previous mild pulmonary edema has improved.  Left lower lobe is chronically
 atelectatic reflected in elevation of the left hemidiaphragm and leftward
 mediastinal shift.  Overall lung volumes are large, due to emphysema or
 obstructive lung disease.  Moderate cardiomegaly is underestimated as a
 result.''')

['WET READ: ___ ___ ___ 6:17 PM\n  Comp:  ___',
 '\n  \n  ',
 '  \n  ',
 None,
 'Interval placement of right pigtail catheter\n  Persistent right costophrenic angle opacity and more medial right basilar\n  opacity as well.\n  Thin lucency along the lateral chest wall may represent air introduced during\n  the procedure.  No large pneumothorax is present.',
 None,
 None,
 '\n ______________________________________________________________________________\n',
 '                                 FINAL REPORT\n EXAMINATION:  CHEST (PORTABLE AP)',
 '\n \n ',
 ' \n ',
 None,
 'INDICATION:  ___ year old man with chest tube  // ? ptx      ? ptx',
 '\n \n ',
 ' \n ',
 None,
 'COMPARISON:  Chest radiographs most recently ___.',
 '\n \n ',
 ' \n ',
 None,
 'IMPRESSION:',
 ' \n \n ',
 ' \n ',
 None,
 'Since ___:55, with insertion of a right basal pigtail pleural\n drainage catheter, most of the previous moderate right pleural effusion has\n been evacuated.  Small localized pneumothorax is new.  Opac

In [1015]:
_p = "  SA AAaa:\n RES-  - R- SE: asdfasdf sdfds  \n ASDKFD: /ASD FASDF: \nBaaa:"
[_p[i.span()[0]:i.span()[1]] for i in _re_header.finditer(_p)]

['  SA AAaa:', '\n RES-  - R- SE:', '\n ASDKFD:', '\nBaaa:']

In [1969]:
from collections import OrderedDict
import re
# A section header: a capitalised word, optionally followed by more words joined by
# spaces, dashes or ampersands, ending in a colon. It must sit at the very start of
# the text or right after a line break.
_re_header = re.compile(r'(^|\n)\s*([A-Z][a-zA-Z]*(( |-|&)+[a-zA-Z]+)*?:)')
# A paragraph separator: either a run of blank lines or a line made of underscores.
_re_paragraph_breaks = re.compile(r'(\s*\n(\s*\n\s*)+)|(\n\s*_+\s*\n)')

def split_report_into_sections(report_path, debug=False):
    """Read a report file and group its text under the section headers found in it.

    The text is cut at paragraph separators, then every chunk is scanned for
    headers. Text following a header (up to the next header in the same chunk)
    is appended to that header's list. Text that has no header of its own is
    appended under '(AFTER) <last header seen>', or under
    '(HEADERLESS PARAGRAPH)' when no header has been seen yet.

    Because the separator regex contains capture groups, the separators
    themselves come back from ``split`` as chunks and are filed the same way.

    Args:
        report_path: path of the text file to read.
        debug: when true, print the raw file content first.

    Returns:
        OrderedDict mapping each key (whitespace-normalised header, or one of the
        synthetic keys above) to the list of raw text pieces collected for it.
    """
    with open(report_path) as report_file:
        raw_text = report_file.read()
    if debug:
        print(raw_text)

    sections = OrderedDict()
    current_header = None

    def _orphan_key():
        # Key used for text that is not directly introduced by a header.
        if current_header is None:
            return '(HEADERLESS PARAGRAPH)'
        return f'(AFTER) {current_header}'

    for chunk in _re_paragraph_breaks.split(raw_text):
        # Capture groups that did not take part in a separator match show up as None.
        if chunk is None:
            continue

        header_spans = [match.span() for match in _re_header.finditer(chunk)]
        if not header_spans:
            sections.setdefault(_orphan_key(), []).append(chunk)
            continue

        # Anything in front of the first header belongs to the previous section.
        first_start = header_spans[0][0]
        if first_start > 0:
            sections.setdefault(_orphan_key(), []).append(chunk[:first_start])

        # Each header's body runs until the next header starts (or the chunk ends).
        body_ends = [start for start, _ in header_spans[1:]] + [len(chunk)]
        for (start, end), body_end in zip(header_spans, body_ends):
            header = ' '.join(chunk[start:end].split()).strip()
            sections.setdefault(header, []).append(chunk[end:body_end])
            current_header = header

    return sections

In [364]:
# split_report_into_sections(rp, True)

In [2019]:
# Section headers (keys produced by split_report_into_sections) whose text
# extract_report_and_patient_background collects as report content.
# Besides FINDINGS / IMPRESSION / CONCLUSION the list holds the misspelled
# variants and the many free-form view-description headers listed below.
# Commented-out entries are deliberately excluded from this set.
SECTION_HEADERS_FOR_REPORT = set([
    'FINDINGS:',
    'FINDNINGS:',
    'FINGDINGS:',
    'FINIDNGS:',
    'Findings:',
    'FINDINS:',
    'FINDING:',
    'FINDINDGS:',
    'FIMPRESSION:',
    'FINSINGS:',
    'FINDIGNS:',
    'FINDINGS and IMPRESSION:',
    'MPRESSION:',
    'IMPRESSION:',
    'IMPRESSON:',
    'IMPRESSIONS:',
    'IMPRESION:',
    'IMPESSION:',
    'IMPRSSION:',
    'IMPRESSOIN:',
    'Findings and Impression:',
    'FINDINGS AND IMPRESSION:',
    'Impression:',
#     'EXAMINATION:',
    'REPORT:',
    'CONCLUSION:',
    'Conclusion:',
    'THEY REPORT TEXT FOLLOWS:',
    'PORTABLE SUPINE FRONTAL VIEW OF THE CHEST:',
    'FRONTAL AND LATERAL CHEST RADIOGRAPHS:',
    'FRONTAL AND LATERAL CHEST RADIOGRAPH:',
    'FRONTAL AND LATERAL VIEWS OF THE CHEST:',
    'PA AND LATERAL CHEST RADIOGRAPH:',
    'PA AND LATERAL VIEWS OF THE CHEST:',
    'PA AND LATERAL:',
    'AP AND LATERAL:',
    'PA AND LATERAL CHEST:',
    'FRONTAL AND LATERAL CHEST:',
    'TWO VIEWS OF THE CHEST:',
    'PORTABLE AP CHEST:',
    'PORTABLE AP CHEST RADIOGRAPH:',
    'PA AND LATERAL CHEST RADIOGRAPHS:',
    'UPRIGHT PORTABLE RADIOGRAPH OF THE CHEST:',
    'PORTABLE RADIOGRAPH OF THE CHEST:',
    'FRONTAL CHEST RADIOGRAPH:',
    'FRONTAL UPRIGHT PORTABLE CHEST:',
    'PORTABLE FRONTAL CHEST RADIOGRAPH:',
    'PORTABLE UPRIGHT FRONTAL VIEW OF THE CHEST:',
    'UPRIGHT AP VIEW OF THE CHEST:',
    'PORTABLE CHEST RADIOGRAPH:',
    'SINGLE AP PORTABLE VIEW:',
    'SINGLE PORTABLE CHEST RADIOGRAPH:',
    'SINGLE PORTABLE VIEW OF THE CHEST:',
    'UPRIGHT FRONTAL CHEST RADIOGRAPHS:',
    'AP:',
    'AP UPRIGHT:',
    'AP CHEST:',
    'CHEST RADIOGRAPH:',
    'TWO VIEWS OF THE THORACIC SPINE:',
    'TWO VIEWS:',
    'AP AND LATERAL VIEWS OF THE CHEST:',
    'ERECT FRONTAL CHEST RADIOGRAPH:',
    'PORTABLE FRONTAL VIEW OF THE CHEST:',
    'PORTABLE RADIOGRAPH:',
    'CHEST AP:',
    'ONE VIEW OF THE CHEST:',
    'AP FILM:',
    'SUPINE AP VIEW OF THE CHEST:',
    'AP VIEW OF THE CHEST:',
    'AP AND LATERAL CHEST RADIOGRAPHS:',
    'AP PORTABLE CHEST:',
    'PORTABLE AP UPRIGHT CHEST RADIOGRAPH:',
    'SINGLE FRONTAL VIEW OF THE CHEST:',
    'DOUBLE CHEST RADIOGRAPH:',
    'CHEST TWO VIEWS:',
    'SUPINE PORTABLE CHEST RADIOGRAPH:',
    'SINGLE PORTABLE AP CHEST RADIOGRAPH:',
    'PORTABLE AP VIEW OF THE CHEST:',
    'PORTABLE ERECT RADIOGRAPH:',
    'PFI:',
    'PORTABLE SUPINE RADIOGRAPH OF THE CHEST:',
    'SINGLE AP VIEW:',
    'PORTABLE UPRIGHT AP VIEW OF THE CHEST:',
    'FRONTAL VIEW OF THE CHEST:',
    'SUPINE PORTABLE CHEST:',
    'PORTABLE UPRIGHT RADIOGRAPH OF THE CHEST:',
    'SINGLE AP VIEW OF THE CHEST:',
    'SINGLE AP PORTABLE CHEST RADIOGRAPH:',
    'PORTABLE AP AND LATERAL CHEST RADIOGRAPH:',
    'CHEST PORTABLE:',
    'PA AND LATERAL VIEWS:',
    'AP SUPINE:',
    'CHEST SINGLE VIEW:',
    'FOUR IMAGES:',
    'PORTABLE SUPINE CHEST RADIOGRAPH:',
    'SEMIUPRIGHT PORTABLE RADIOGRAPH OF THE CHEST:',
    'SINGLE FRONTAL CHEST RADIOGRAPHS:',
    'FRONTAL SUPINE PORTABLE CHEST:',
    'CHEST PA AND LAT RADIOGRAPH:',
    'AP UPRIGHT AND LATERAL:',
    'AP PORTABLE FRONTAL CHEST RADIOGRAPH:',
    'SINGLE PORTABLE FRONTAL VIEW OF THE CHEST:',
#     'RIGHT RIBS:',
    'VIEWS:',
    'FRONTAL AND LATERAL VIEWS THE CHEST:',
    'AP FRONTAL CHEST RADIOGRAPH:',
    'KYPHOTIC POSITIONING:',
    'PORTABLE UPRIGHT CHEST RADIOGRAPH:',
    'UPRIGHT FRONTAL AND LATERAL CHEST RADIOGRAPHS:',
    'SUPINE CHEST RADIOGRAPH:',
    'AP PORTABLE:',
    'OSSEOUS STRUCTURES:',
    'SINGLE VIEW:',
    'SINGLE PORTABLE UPRIGHT VIEW OF THE CHEST:',    
    'SINGLE FRONTAL PORTABLE VIEW OF THE CHEST:',
    'FRONTAL AND LATERAL RADIOGRAPHS:',
    'RADIOGRAPH:',
    'SINGLE AP SUPINE PORTABLE VIEW:',
    'LORDOTIC POSITIONING:',
    'UPRIGHT AP AND LATERAL VIEWS OF THE CHEST:',
    'SUPINE PORTABLE FRONTAL CHEST RADIOGRAPH:',
    'AP CHEST RADIOGRAPH:',
    'FRONTAL AND LATERAL VIEWS CHEST:',
    'FRONTAL PORTABLE CHEST:',
    'FRONTAL PORTABLE SUPINE CHEST:',
    'UPRIGHT FRONTAL VIEW OF THE CHEST:',
    'PORTABLE SUPINE AP CHEST RADIOGRAPH:',
    'PA AND LATERAL VIEWS OF CHEST:',
    'TECHNIQUE PA AND LATERAL VIEWS OF THE CHEST:',
    'AP FRONTAL AND LATERAL CHEST RADIOGRAPHS:',
    'CHEST PA:',
    'AP PORTABLE CHEST RADIOGRAPH:',
    'PORTABLE AP FRONTAL CHEST RADIOGRAPH:',
    'SINGLE AP PORTABLE VIEW WHICH INCLUDES THE UPPER ABDOMEN:',
    'LATERAL VIEWS OF THE CHEST:',
    'SUPINE FRONTAL VIEW OF THE CHEST:',
    'FRONTAL CHEST RADIOGRAPH WITH THE PATIENT IN SUPINE AND UPRIGHT POSITIONS:',
    'AP UPRIGHT VIEW OF THE CHEST:',
    'AP AND LATERAL CHEST RADIOGRAPH:',
    'PORTABLE SEMI-ERECT FRONTAL CHEST RADIOGRAPH:',
    'PORTABLE SUPINE FRONTAL CHEST RADIOGRAPH:',
    'FRONTAL SEMI-UPRIGHT PORTABLE CHEST:',
    'SEMI-ERECT PORTABLE AP CHEST RADIOGRAPH:',
    'PORTABLE SEMI-UPRIGHT AP CHEST RADIOGRAPH:',
    'PA AND LATERAL RADIOGRAPHS OF THE CHEST:',
    'FRONTAL PORTABLE UPRIGHT RADIOGRAPH:',
    'UPRIGHT PORTABLE CHEST RADIOGRAPH:',
    'AP PORTABLE UPRIGHT CHEST RADIOGRAPH:',
    'PORTABLE UPRIGHT RADIOGRAPH CHEST:',
    'SINGLE AP PORTABLE VIEW OF THE CHEST:',
    'SEMI-ERECT FRONTAL CHEST RADIOGRAPH:',
    'SEMI-UPRIGHT AP VIEW OF THE CHEST:',
    'FRONTAL SEMI-SUPINE PORTABLE CHEST:',
    'FRONTAL PORTABLE UPRIGHT CHEST:',
    'CHEST AP SUPINE:',
    'AP UPRIGHT AND LATERAL CHEST RADIOGRAPHS:',
    'PORTABLE UPRIGHT AP CHEST RADIOGRAPH:',
    'PORTABLE UPRIGHT AP VIEW OF THE ABDOMEN:',
    'PORTABLE AP SEMI-UPRIGHT CHEST RADIOGRAPH:',
    'PA AND LAT CHEST RADIOGRAPH:',
    'PORTABLE AP SEMI-ERECT RADIOGRAPH:',
    'Chest:',
    'PORTABLE SEMI-UPRIGHT FRONTAL CHEST RADIOGRAPH:',
    'PA AND LAT:',
    'FRONTAL AP AND LATERAL CHEST:',
    'SEMI-ERECT PORTABLE CHEST RADIOGRAPH:',
#     'Support Devices:',
#     'Lungs:', 
#     'Pleura:', 
#     'Heart:',
#     'Lines and Tubes:',
#     'Lines and tubes:',
#     'Mediastinum:',
#     'Mediastinum and hila:',
#     'Osseous structures:', 
#     'Other findings:',
#     'Bony thorax:',
#     'Pelvis:',
#     'Abdomen:',
    'AP UPRIGHT CHEST RADIOGRAPH:',
    'PA AND LATERAL UPRIGHT CHEST RADIOGRAPHS:',
    'TWO PORTABLE ERECT VIEWS OF THE CHEST:',
    'CHEST AND UPPER ABDOMEN:',
    'Portable chest:',
    'PA and lateral chest reviewed in the absence of prior chest imaging:',
    'CHEST AND PELVIS FILMS:',
    'Chest radiographs:',
    'FRONTAL and LATERAL VIEWS OF THE CHEST:',
    'PORTABLES SEMI-ERECT CHEST RADIOGRAPH:',
    'BEDSIDE FRONTAL CHEST RADIOGRAPH:',
    'SEMI-ERECT PORTABLE CHEST:',
    'SEMI-UPRIGHT PORTABLE RADIOGRAPH OF THE CHEST:',
    'AP UPRIGHT VIEWS OF THE CHEST DURING INSPIRATION AND EXPIRATION:',
    'SEMI-ERECT PORTABLE FRONTAL CHEST RADIOGRAPH:',
    'PORTABLE SEMI-UPRIGHT VIEW OF THE CHEST:',
    'FRONTAL SUPINE PORTABLE VIEW OF THE CHEST:',
    'PA & LATERAL VIEW OF THE CHEST:',
    'PA and lateral views of the chest:',
    'In comparison to previous radiographs:',
    'SUPINE CHEST:',
    'TWO PORTABLE AP VIEWS OF THE CHEST:',
    'PORTABLE SUPINE CHEST:',
    'SUPINE PORTABLE FRONTAL VIEW OF THE CHEST:',
    'BEDSIDE AP UPRIGHT RADIOGRAPH OF THE CHEST:',
    'Again demonstrated are:',
    'RIGHT AND LEFT FRONTAL OBLIQUE VIEWS OF THE CHEST:',
    'PORTABLE SEMI-UPRIGHT RADIOGRAPH OF THE CHEST:',
    'AP UPRIGHT AND LATERAL VIEWS OF THE CHEST:',
    'AP SUPINE CHEST RADIOGRAPH:',
    'FRONTAL VIEWS OF THE CHEST:',
    'PA AND LATERAL CHEST FILMS:',
    'BEDSIDE AP RADIOGRAPH OF THE CHEST:',
    'PORTABLE AP UPRIGHT RADIOGRAPH OF THE CHEST:',
    'CHEST RADIOGRAPHS:',
    'PA AND LATERAL VIEWS CHEST:',
    'PORTABLE SEMI-ERECT AP CHEST RADIOGRAPH:',
    'SINGLE FRONTAL CHEST RADIOGRAPH:',
    'PORTABLE AP FRONTAL VIEW OF THE CHEST:',
    'SUPINE PORTABLE CHEST RADIOGRAPHS:',
    'PORTABLE AP UPRIGHT VIEW OF THE CHEST:',
    'PORTABLE PA CHEST RADIOGRAPH:',
    'AP view of chest:',
    'UPRIGHT AP AND LATERAL VIEWS OF CHEST:',
    'PORTABLE SEMI-ERECT CHEST RADIOGRAPH:',
    'SEMI-UPRIGHT PORTABLE CHEST RADIOGRAPH:',
    'SINGLE AP ERECT PORTABLE VIEW OF THE CHEST:',
    'SINGLE UPRIGHT PORTABLE CHEST RADIOGRAPH:',
    'AP and lateral views of the chest:',
    'UPRIGHT AP AND LATERAL CHEST RADIOGRAPH:',
    'UPRIGHT FRONTAL AND LATERAL VIEWS OF THE CHEST:',
    'PORTABLE UPRIGHT FRONTAL CHEST RADIOGRAPH:',
    'FRONTAL AND LATERAL VIEWS OF CHEST:',
    'FRONTAL CHEST RADIOGRAPHS:',
    'PA and lateral views of chest:',
    'SINGLE PORTABLE UPRIGHT CHEST RADIOGRAPH:',
    'SUPINE AP PORTABLE CHEST RADIOGRAPH:',
    'Additional findings:',
    'SUPINE PORTABLE AP CHEST RADIOGRAPH:',
    'SINGLE SEMI-ERECT PORTABLE VIEW OF THE CHEST:',
    'PORTABLE SEMI-UPRIGHT CHEST RADIOGRAPH:',
    'FRONTAL LATERAL CHEST RADIOGRAPH:',
    'SEMI-ERECT PORTABLE VIEW OF THE CHEST:',
    'PA AND LATERAL FILMS OF THE CHEST:',
    'SEMI-UPRIGHT AP AND LATERAL VIEWS OF THE CHEST:',
    'CHEST PA AND LATERAL RADIOGRAPH:',
    'FRONTAL AND LATERAL UPRIGHT CHEST RADIOGRAPH:',
    'PORTABLE SEMI-ERECT FRONTAL CHEST RADIOGRAPHS:',
    'OPINION:',
    'FRONTAL PORTABLE CHEST RADIOGRAPH:',
    'SEMIERECT PORTABLE RADIOGRAPH OF THE CHEST:',
    'PORTABLE SEMI-ERECT AP AND PA CHEST RADIOGRAPH:',
    'FRONTAL SEMI UPRIGHT PORTABLE CHEST:',
    'SEMIERECT AP VIEW OF THE CHEST:',
    'AP VIEW AND LATERAL VIEW OF THE CHEST:',
    'PORTABLE SEMI-ERECT CHEST:',
    'PORTABLE SEMI-UPRIGHT FRONTAL VIEW OF THE CHEST:',
    'AP SEMI-ERECT CHEST RADIOGRAPH:',
    'SINGLE AP UPRIGHT PORTABLE CHEST RADIOGRAPH:',
    'SINGLE SUPINE PORTABLE VIEW OF THE CHEST:',
    'PA AND AP CHEST RADIOGRAPH:',
    'SINGLE PORTABLE AP VIEW OF THE CHEST:',
    'PORTABLE AP CHEST RADIOGRAPHS:',
    'PORTABLE SEMI-UPRIGHT FRONTAL CHEST RADIOGRAPHS:',
    'SEMI-UPRIGHT AP AND LATERAL CHEST RADIOGRAPHS:',
    'SINGLE PORTABLE CHEST X-RAY:',
    'PORTABLE SEMI-UPRIGHT RADIOGRAPH:',
    'AP portable view of the chest:',
    'AP chest reviewed in the absence of prior chest radiographs:',
    'PA and lateral chest reviewed in the absence of prior chest radiographs:',
])
# split_report_into_sections files text that follows a header without a header of
# its own under '(AFTER) <header>'; accept that variant for every header above.
SECTION_HEADERS_FOR_REPORT.update([
    f'(AFTER) {key}' for key in SECTION_HEADERS_FOR_REPORT
])    
# Headerless text following these non-report headers is accepted as report content too.
# NOTE(review): '(AFTER) WET READ:' is also listed in SECTION_HEADERS_FOR_PATIENT_BACKGROUND,
# which extract_report_and_patient_background checks first - confirm which one is intended.
SECTION_HEADERS_FOR_REPORT.update([
    '(AFTER) WET READ:',
    '(AFTER) COMPARISON:',
    '(AFTER) COMPARISON FILM:',
])

# Synthetic section keys (text trailing a background/technique header, or text with
# no header at all). In the visible cells this set is only merged into KNOWN_HEADERS.
SECTION_HEADERS_FOR_REPORT_BACKUP = {
    # text following a "reason / indication / history" header
    '(AFTER) REASON FOR EXAMINATION:',
    '(AFTER) REASON FOR EXAM:',
    '(AFTER) REASON FOR INDICATION:',
    '(AFTER) INDICATION:',
    '(AFTER) CLINICAL HISTORY:',
    '(AFTER) HISTORY:',
    # text following a "technique / exam type" header
    '(AFTER) TECHNIQUE:',
    '(AFTER) TYPE OF EXAMINATION:',
    '(AFTER) EXAM:',
    # text seen before any header
    '(HEADERLESS PARAGRAPH)',
}

# Section headers whose text extract_report_and_patient_background collects as
# patient background rather than report content. That function tests this set
# before SECTION_HEADERS_FOR_REPORT, so a key present in both ends up here.
# The COMPARISON-style headers are commented out, i.e. deliberately not treated
# as background.
SECTION_HEADERS_FOR_PATIENT_BACKGROUND = set([
    'REASON FOR EXAMINATION:',
    'REASON FOR EXAM:',
    'REASON FORE EXAM:',
    'REASON FOR THE EXAM:',
    'Reason for exam:',
    'REASON FOR INDICATION:',
    'INDICATION:',
    'INDCATION:',
    'IDICATION:',
    'NDICATION:',
    'INDICATIONS:',
    'Indication:',
    'ADDENDUM Indication:',
#     'Comparisons:',
#     'COMPARISON:',
#     'COMPARISONS:',
#     'Comparison:',
#     'COMPARISION:',
#     'COMPARISON STUDY:',
#     'COMPARISON TO PRIOR STUDY:',
#     'COMPARISON STUDIES:',
#     'COMPARISON EXAMS:',
#     'COMPARRISON:',
#     'COMPARISON CHEST:',
#     'Comp:',
#     'PRIOR EXAM:',
    'WET READ:',
    # Text trailing a WET READ header; the extractor keeps only the pieces of it
    # that contain at least one medical term.
    '(AFTER) WET READ:',
    'RESIDENT WET READ:',
    'PRELIMINARY RESIDENT WET READ:',
    'PRELIMINARY REPORT:',
    'CLINICAL HISTORY:',
    'CLINICAL HISTORY History:',
    'CLINICAL INDICATION:',
    'Clincal indication:',
    'CLINICAL INFORMATION:',
    'CLINIC INDICATION:',
    'ADDITIONAL CLINICAL HISTORY PROVIDED:',
    'HISTORY:',
    'History:',
    'ADDENDUM INDICATION:',
    'PATIENT HISTORY:',
    # NOTE(review): the entries below look like fragments that the header regex
    # picked up inside indication / wet-read text - confirm against the source reports.
    'COR:',
    'CHOCTAW:',
    'ICHCt abd:',
    'NCHCT:',
    'ILLNESS:',
    'Surg:',
    'Pre-op:',
    'Pre-op CXR Surg:',
    'Pre-op Surg:',
    'Pre-op chest xray Surg:',
    'Pre-operative planning Surg:',
    'Please obtain pre-op CXR Surg:',
    'CXR Surg:',
    'BKA Surg:',
    'X-ray Surg:',
    'Story:',
    'Head CT:',
    'CT Head:',
    'Please eval for acute process Surg:',
    'Please assess for any abnormalities Surg:',
    'Please evaluate Surg:',
])

# Section headers whose text extract_report_and_patient_background drops entirely:
# a key found here is added to neither the report nor the background.
IGNORABLE_HEADERS = {
    'EXAM:',
    'STUDY:',
    'TYPE OF EXAMINATION:',
    'DATE:',
    'RECOMMENDATIONS:',
    'CXR:',
    'TYPE OF THE EXAMINATION:',
    'RECOMMENDATION:',
    'CT:',
    'CTA:',
    'RUQ:',
    'QUESTIONS TO BE ANSWERED:',
    'CLINICAL INFORMATION & QUESTIONS TO BE ANSWERED:',
    'CC:',
    'Compressing:',
    'Contact name:',
    'Common:',
}

# extract_report_and_patient_background looks headers up in lower case, so add a
# lower-cased copy of every entry to the three sets it consults.
# NOTE(review): re-running this cell is harmless (sets), but the sets end up holding
# both the original and the lower-cased spelling of each header.
SECTION_HEADERS_FOR_PATIENT_BACKGROUND.update([x.lower() for x in SECTION_HEADERS_FOR_PATIENT_BACKGROUND])
SECTION_HEADERS_FOR_REPORT.update([x.lower() for x in SECTION_HEADERS_FOR_REPORT])
IGNORABLE_HEADERS.update([x.lower() for x in IGNORABLE_HEADERS])

# Every header the notebook has already classified. The validation cells assert that
# the lower-cased form of each non-'(AFTER)' section key is a member of this set.
# NOTE(review): SECTION_HEADERS_FOR_REPORT_BACKUP is merged without being lower-cased,
# unlike the other three sets - confirm that is intended.
KNOWN_HEADERS = SECTION_HEADERS_FOR_PATIENT_BACKGROUND |\
                SECTION_HEADERS_FOR_REPORT_BACKUP |\
                SECTION_HEADERS_FOR_REPORT |\
                IGNORABLE_HEADERS
# Headers that are recognised but belong to none of the sets above, added in lower
# case. Several entries are repeated in the list; the set de-duplicates them.
KNOWN_HEADERS.update(x.lower() for x in [
    'EXAMINATION:',
    'COMPARISON:',
    'COMPARISONS:',
    'COMPARISON STUDY:',
    'Comparisons:',
    'COMPARISON:',
    'COMPARISONS:',
    'Comparison:',
    'COMPARISION:',
    'COMPARISON STUDY:',
    'COMPARISON TO PRIOR STUDY:',
    'COMPARISON STUDIES:',
    'COMPARISON EXAMS:',
    'COMPARRISON:',
    'COMPARISON comparison:',
    'COMPARISON CHEST:',
    'Comp:',
    'The comparison:',
    'PRIOR EXAM:',
    'CT chest:',
    '(HEADERLESS PARAGRAPH)',
    'Correction:',
    'The:',
    'Support Devices:',
    'Lungs:', 
    'Pleura:', 
    'Heart:',
    'Lines and Tubes:',
    'Lines and tubes:',
    'Mediastinum:',
    'Mediastinum and hila:',
    'Osseous structures:', 
    'Other findings:',
    'Bony thorax:',
    'RIGHT RIBS:',
    'Multitrauma:',
    'Pelvis:',
    'Postoperative changes including:',
    'Abdomen:',
    'Tip of the new intra-aortic:',
    'Multiple pulmonary abnormalities include:',
    'The following support devices are identified:',
    'CHEST:',
    'ADDENDUM Clarification:',
    'ADDENDUM IMPRESSION:',
    'AMENDMENT:',
    'ADDENDUM:',
    'ADDENDUM Correction:',
    'ADDENDUM To clarify the above read:',
    'ADDENDUM The correct report should read as follows:',
    'ADDENDUM Please read of the IMPRESSION as following:',
    'The correct IMPRESSION heading should be inserted after:',
    'Allowing for these limitations:',
    'The impression should read:',
    'Impression should read:',
     'Right lower lung opacity has increased:',
    'Final:',
    'CTA chest:',
    'Note:',
    'Comment:',
    'NOTIFICATIONS:',
    'NOTIFICATION:',
    'MAIN REPORT:',
    'TECHNIQUE:',
    'PROCEDURE:',
    'NOTE:',
    'COMMENTS:',
    'COMMENT:',
    'PORTABLE CHEST:',
    'REFERENCE EXAM:',
    'REFERENCE EXAMINATION:',
    'COMPARISON EXAM:',
    'COMPARISON FILM:',
    'COMPARISON FILMS:',
    'CHEST COMPARISON FILM:',
])

In [2031]:
# Validation pass: for each report, every section key that is not an '(AFTER) ...'
# key must be a known header, and the extracted report must be non-empty.
# On the first failure the index and path are printed and the error is re-raised.
# NOTE(review): the range starts at `i+1`, so this cell only runs when `i` already
# exists in the kernel (it raises NameError after Restart & Run All), and re-running
# it after a failure resumes *after* the failing report, which is never re-checked.
for i in tqdm(range(i+1, len(report_paths))):
    rp = report_paths[i]
    try:
        sections = split_report_into_sections(rp)
        for k in sections.keys():
            if not k.startswith('(AFTER)'):
                # The offending header is attached as the assertion message.
                assert k.lower() in KNOWN_HEADERS, k
        assert len(sections) > 0
        
        tmp = extract_report_and_patient_background(rp)
        assert len(tmp['report']) > 0
#         assert len(tmp['background']) > 0
    except AssertionError:
        print(i, rp)
        raise

  3%|▎         | 3609/120600 [00:12<06:48, 286.35it/s]

110844 /mnt/data/mimic-cxr/files/p14/p14065514/s56763333.txt





AssertionError: 

In [1941]:
re_invalid.search('''AP single view of the chest has been obtained with patient in semi-upright
 position.  The heart size is within normal limits paying attention to AP
 technique.  No typical configurational abnormality is seen.  Thoracic aorta
 unremarkable with the exception of some calcium deposits in the wall at the
 level of the arch.  The pulmonary vasculature is not congested.  No signs of
 acute parenchymal infiltrates are present.  The right lateral pleural sinus is
 free; however, there is a small amount of pleural density blunting the left
 lateral pleural sinus, and some crowded vasculature on the left base suggests
 some degree of atelectasis.  Acute new infiltrates cannot be identified on
 this portable single view chest examination.  No pneumothorax exists in the
 apical area.  Our records do not include a previous chest examination
 available for comparison.''')

<_sre.SRE_Match object; span=(789, 796), match='records'>

In [2033]:
import random

In [2054]:
# Spot-check one randomly chosen report: every section key that is not an
# '(AFTER) ...' key must be a known header, and both the extracted report and the
# extracted background must be non-empty.
# The index is drawn explicitly so that the failure message identifies the report
# actually being checked (previously it printed a stale loop variable `i`).
rp_index = random.randrange(len(report_paths))
rp = report_paths[rp_index]
try:
    sections = split_report_into_sections(rp)
    for k in sections.keys():
        if not k.startswith('(AFTER)'):
            # The offending header is attached as the assertion message.
            assert k.lower() in KNOWN_HEADERS, k
    assert len(sections) > 0

    tmp = extract_report_and_patient_background(rp)
    assert len(tmp['report']) > 0
    assert len(tmp['background']) > 0
except AssertionError:
    print(rp_index, rp)
    raise

In [2055]:
pprint(extract_report_and_patient_background(rp, True))

                                 FINAL REPORT
 CHEST RADIOGRAPH
 
 INDICATION:  Recent pneumonia, persistent cough, evaluation.
 
 COMPARISON:  No comparison available at the time of dictation.
 
 FINDINGS:  Normal lung volumes.  Normal size of the cardiac silhouette. 
 Normal appearance of the lung parenchyma.  No evidence of pneumonia.  Small
 right pericardial fat pad.  No pleural effusions.

OrderedDict([('(HEADERLESS PARAGRAPH)',
              ['                                 FINAL REPORT\n'
               ' CHEST RADIOGRAPH',
               '\n \n ',
               ' \n ']),
             ('INDICATION:',
              ['  Recent pneumonia, persistent cough, evaluation.']),
             ('(AFTER) INDICATION:', ['\n \n ', ' \n ']),
             ('COMPARISON:',
              ['  No comparison available at the time of dictation.']),
             ('(AFTER) COMPARISON:', ['\n \n ', ' \n ']),
             ('FINDINGS:',
              ['  Normal lung volumes.  Normal size of the cardiac 

In [1964]:
re_invalid = load_invalid_patterns_regex()

In [1829]:
re_invalid.search('The impression and recommendation above was entered by Dr. on at '
             '11:12 into the Department of Radiology critical communications '
             'system for direct communication to the referring provider.')

<_sre.SRE_Match object; span=(52, 57), match='by Dr'>

In [1500]:
from medvqa.metrics.medical.med_completeness import MEDICAL_TERMS_PATH
from medvqa.utils.files import read_lines_from_txt
from nltk import wordpunct_tokenize

medical_terms = set(read_lines_from_txt(MEDICAL_TERMS_PATH))

In [1501]:
def contains_medical_terms(text, k):
    """Tell whether `text` holds at least `k` tokens found in `medical_terms`.

    The text is lower-cased and tokenised with `wordpunct_tokenize`; repeated
    occurrences of the same term each count. Scanning stops as soon as the
    quota is reached. Note that a text without any matching token yields
    False even when `k` is zero or negative.
    """
    still_needed = k
    for token in wordpunct_tokenize(text.lower()):
        if token not in medical_terms:
            continue
        still_needed -= 1
        if still_needed <= 0:
            return True
    return False

In [1817]:
import re
from medvqa.datasets.qa_pairs_extractor import REGULAR_EXPRESSIONS_FOLDER

def load_invalid_patterns_regex():
    """Build one case-insensitive regex from 'invalid_sentence_patterns.txt'.

    The file is read from REGULAR_EXPRESSIONS_FOLDER, one pattern per line. Each
    pattern is stripped, wrapped in its own group and the groups are OR-ed together.

    Blank lines are skipped: an empty alternative `()` matches the empty string,
    which would make the combined regex match every text. For the same reason a
    file without any pattern yields a regex that never matches.

    Returns:
        A compiled pattern object.
    """
    patterns_path = os.path.join(REGULAR_EXPRESSIONS_FOLDER, 'invalid_sentence_patterns.txt')
    with open(patterns_path) as f:
        stripped_lines = [line.strip() for line in f]
    patterns = [p for p in stripped_lines if p]
    if not patterns:
        # '(?!)' is an empty negative lookahead: it can never succeed.
        return re.compile(r'(?!)')
    return re.compile('|'.join(f'({p})' for p in patterns), re.IGNORECASE)

In [1844]:
def contains_no_invalid_patterns(text):
    """Return True when the module-level `re_invalid` regex matches nowhere in `text`."""
    return re_invalid.search(text) is None

In [1521]:
contains_medical_terms('in comparison to previous chest x-ray support devices radiography', 1)

True

In [1607]:
'3dAsdfasdf'[0].isalpha()

False

In [1955]:
def extract_report_and_patient_background(report_path, debug=False):
    """Parse a report file and sort its section text into report vs. background.

    Sections come from ``split_report_into_sections``; each header is looked up
    in lower case:

    * headers in SECTION_HEADERS_FOR_PATIENT_BACKGROUND feed the background
      (for '(after) wet read:' only pieces with at least one medical term);
    * headers in SECTION_HEADERS_FOR_REPORT feed the report;
    * headers in IGNORABLE_HEADERS are dropped;
    * any other header feeds the report, but only with pieces that contain at
      least two medical terms and no invalid pattern. The original header is
      prefixed to the text when it is not an '(after) ...' key and itself
      contains a medical term.

    If nothing ended up in the report, background chunks with at least three
    medical terms and no invalid pattern are moved over to the report.

    Args:
        report_path: path handed to ``split_report_into_sections``.
        debug: when true, pretty-print the parsed sections and a separator line
            (the flag is also forwarded to ``split_report_into_sections``).

    Returns:
        dict with keys ``report`` and ``background``; each value is a list of
        ``(lower-cased header, cleaned text)`` tuples.
    """
    sections = split_report_into_sections(report_path, debug=debug)
    if debug:
        pprint(sections)
        print('===================================================')

    def _scrub(fragment):
        # Blank out underscores, then trim the edges.
        return fragment.replace('_', ' ').strip()

    def _with_period(fragment):
        # Close a non-empty fragment with a period unless it already has one.
        if len(fragment) > 0 and fragment[-1] != '.':
            return fragment + '.'
        return fragment

    def _finalize(text):
        # Collapse whitespace runs; None means there is nothing worth keeping.
        text = ' '.join(text.split())
        if len(text) == 0 or text == '.':
            return None
        return text if text[-1] == '.' else text + '.'

    report_chunks = []
    background_chunks = []

    for header, content in sections.items():
        key = header.lower()
        # Sections normally arrive as lists of pieces; a bare string is handled too.
        is_piece_list = type(content) is list

        if key in SECTION_HEADERS_FOR_PATIENT_BACKGROUND:
            is_wet_read_tail = key == '(after) wet read:'
            if is_piece_list:
                pieces = [_with_period(_scrub(piece)) for piece in content]
                pieces = [piece for piece in pieces if len(piece) > 0]
                if is_wet_read_tail:
                    pieces = [piece for piece in pieces if contains_medical_terms(piece, 1)]
                merged = ' '.join(pieces)
            else:
                merged = _scrub(content)
                if is_wet_read_tail and not contains_medical_terms(merged, 1):
                    merged = ''
            merged = _finalize(merged)
            if merged is not None:
                background_chunks.append((key, merged))

        elif key in SECTION_HEADERS_FOR_REPORT:
            if is_piece_list:
                pieces = [_with_period(_scrub(piece)) for piece in content]
                merged = ' '.join(piece for piece in pieces if len(piece) > 0)
            else:
                merged = _scrub(content)
            merged = _finalize(merged)
            if merged is not None:
                report_chunks.append((key, merged))

        elif key not in IGNORABLE_HEADERS:
            if is_piece_list:
                # Unlike the branches above, no period is appended to individual pieces.
                pieces = [_scrub(piece) for piece in content]
                merged = ' '.join(
                    piece for piece in pieces
                    if len(piece) > 0
                    and contains_medical_terms(piece, 2)
                    and contains_no_invalid_patterns(piece)
                )
            elif contains_medical_terms(content, 2) and contains_no_invalid_patterns(content):
                merged = _scrub(content)
            else:
                merged = ''
            merged = _finalize(merged)
            if merged is None:
                continue
            # Keep an informative header (e.g. an anatomical one) in front of its text.
            if not key.startswith('(after)') and contains_medical_terms(key, 1):
                merged = f'{header} {merged}'
            report_chunks.append((key, merged))

    if len(report_chunks) == 0:
        # Fallback: promote background chunks that read like findings.
        kept_background = []
        for chunk in background_chunks:
            if contains_medical_terms(chunk[1], 3) and contains_no_invalid_patterns(chunk[1]):
                report_chunks.append(chunk)
            else:
                kept_background.append(chunk)
        background_chunks = kept_background

    return {'report': report_chunks, 'background': background_chunks}

In [756]:
len("""Chest tubes and right pleural catheter are in unchanged position.  There is no
 evident pneumothorax.  Bibasilar opacities have increased.  This is partially""")

157

In [23]:
len(split_report_into_sections(report_paths[0]))

6