In [60]:
# %matplotlib inline
import sys
sys.path.append("../../notebooks")

import utils
utils.jpt_autoreload()
utils.jpt_full_width()
utils.jpt_suppress_warnings()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import os, re, time
import pandas as pd
import numpy as np

from label_reports import get_chf_cohort, label_report
import negex
from regex_utils import WordMatch
from section_parser import section_text

from tabulate import tabulate

from datasets import MimicCxrLabels, MimicCxrReader, MimicCxrBase

In [62]:

def assign_label(row):
    # Assign label for a keyword's word match result
    #
    #  neg aff
    #  [0. 0.]  ->  nan
    #  [0. 1.]  ->  1
    #  [1. 0.]  ->  0
    #  [1. 1.]  ->  nan
    #
    if sum(row) == 1:
        return row[1]
    else:
        return np.nan


def filter_sections(report):
    # Remove text in INDICATION section ... 
    #     as it might contain keywords which contains keywords 
    #     but does not mean existence of keywords
    # 
    sections, section_names, section_idx = section_text(report)
    filtered = [i for i,x in enumerate(section_names) 
                if x not in ['indication', 'history', 'comparison', 'technique']]
    report = "".join(sections[i] for i in filtered)
    report = report.replace('\n','').replace('\r','')
    return report
    

def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
#     C = C.sort_values(by=[np.nan])
    return C


    

In [63]:
cxr_labels = MimicCxrLabels()
cxr_reader = MimicCxrReader()
meta_df = MimicCxrBase().get_meta_df()

In [33]:

# cohort \in [ 'all',  'chf' ]
#               224k    28k 
cohort = 'all'

# current path
current_path = '.'

# chf diagnosis information for mimic-cxr data
chf_metadata_path = os.path.join(current_path, 'mimic_cxr_heart_failure', 'mimic_cxr_metadata_hf.tsv')

# negex
negex_trigger_path = os.path.join(current_path, 'negex', 'negex_triggers.txt')

# keywords
keywords_version = 'miccai2020'
neg_kwd_path = os.path.join(
    current_path, 'keywords', keywords_version, 'keywords_negated.tsv')
aff_kwd_path = os.path.join(
    current_path, 'keywords', keywords_version, 'keywords_affirmed.tsv')

# save resulting labels
df_save_path = os.path.join(current_path, f'negex_findings_version={keywords_version}_cohort={cohort}.csv')


In [34]:
df_neg = pd.read_csv(neg_kwd_path, sep="\t")
df_aff = pd.read_csv(aff_kwd_path, sep="\t")

aff_kwd = df_aff['keyword_terms'].to_list()
neg_kwd = df_neg['keyword_terms'].to_list()

keywords = neg_kwd + aff_kwd
keywords = [ 'pulmonary edema',
             'mild pulmonary edema',
             'moderate pulmonary edema',
             'vascular congestion',
             'fluid overload',
             'acute cardiopulmonary process',
             'cephalization',
             'pulmonary vascular congestion',
             'hilar engorgement',
             'vascular plethora',
             'pulmonary vascular prominence',
             'pulmonary vascular engorgement',
             'kerley',
             'interstitial edema',
             'interstitial thickening',
             'interstitial pulmonary edema',
             'interstitial marking',
             'interstitial abnormality',
             'interstitial abnormalities',
             'interstitial process',
             'alveolar infiltrates',
             'severe pulmonary edema',
             'perihilar infiltrates',
             'hilar infiltrates',
             'interstitial opacities',
             'parenchymal opacities',
             'alveolar opacities',
             'ill defined opacities',
             'patchy opacities',
]

kwd_to_severities = df_aff.set_index('keyword_terms').to_dict()['pulmonary_edema_severity']
kwd_to_severities.update(df_neg.set_index('keyword_terms').to_dict()['pulmonary_edema_severity'])
kwd_to_severities

{'cephalization': 1,
 'pulmonary vascular congestion': 1,
 'hilar engorgement': 1,
 'vascular plethora': 1,
 'pulmonary vascular prominence': 1,
 'pulmonary vascular engorgement': 1,
 'interstitial opacities': 2,
 'kerley': 2,
 'interstitial edema': 2,
 'interstitial thickening': 2,
 'interstitial pulmonary edema': 2,
 'interstitial marking': 2,
 'interstitial abnormality': 2,
 'interstitial abnormalities': 2,
 'interstitial process': 2,
 'alveolar infiltrates': 3,
 'severe pulmonary edema': 3,
 'perihilar infiltrates': 3,
 'hilar infiltrates': 3,
 'parenchymal opacities': 3,
 'alveolar opacities': 3,
 'ill defined opacities': 3,
 'ill-defined opacities': 3,
 'patchy opacities': 3,
 'pulmonary edema': 0,
 'vascular congestion': 0,
 'fluid overload': 0,
 'acute cardiopulmonary process': 0}

In [65]:
# `meta_df` restrict to chf cohort

if cohort == 'all':
    meta_df = MimicCxrBase().get_meta_df()
    study_ids = meta_df['study_id'].unique()
elif cohort == 'chf':
    meta_df = pd.read_csv(chf_metadata_path, sep='\t')
    meta_df = meta_df[meta_df['heart_failure'] == 1]
    study_ids = meta_df['study_id'].unique()

print(f"#study_id = {len(study_ids)}")
meta_df

#study_id = 218139


Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP
5,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,AP
6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,AP
...,...,...,...,...
377104,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,PA
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,PA
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,AP
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,AP


In [None]:
labels = {}
start = time.time()

for i, study_id in enumerate(study_ids):

    try:
        report = cxr_reader.get_report(study_id)
    except:
        continue

    report = filter_sections(report)
    sentences = re.split('\.|\:', report)
    

    # (#keywords, #tags, #sentences)
    #     where tags = [negated, affirmed]
    T = np.zeros((len(keywords), 2, len(sentences)))

    for si, sentence in enumerate(sentences):
        word_match = WordMatch(sentence, keywords)
        kwd_aff = word_match.mention_positive()
        kwd_neg = word_match.mention_negative()
        for ki, k in enumerate(keywords):
            T[ki,0,si] = kwd_neg[k]
            T[ki,1,si] = kwd_aff[k]

    # aggregate results wrt sentences in reports
    T = np.amax(T, axis=2)
    label = np.apply_along_axis(assign_label, axis=1, arr=T)
    assert(label.size == len(keywords))
    
    labels[study_id] = label

    if i%1000 == 0:
        end = time.time()
        print(f'Iter={i}\tTime Elapsed={end-start:.3f}')
        start = time.time()

        
df = pd.DataFrame.from_dict(labels, orient='index', columns=keywords)
df = df.rename_axis('study_id').reset_index()
print(len(df))
    

In [None]:
df.to_csv(df_save_path, index=False)
print(f'saved to {df_save_path}')

In [73]:
# take a look at summary statistics
#
def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
    C_index = C.index
    C['severity'] = [kwd_to_severities[idx] if idx in kwd_to_severities else np.nan for idx in C_index]
    C = C.sort_values(by=['severity'])
    return C

df_save_path = './negex_findings_version=miccai2020_nopacities_cohort=all.csv'
# df_save_path = './negex_findings_version=miccai2020_nopacities_cohort=chf.csv'
df = pd.read_csv(df_save_path)
C = cols_value_counts(df.iloc[:,1:])

# C = C[(C[0]+C[1])>500]
label_names = C.index.to_list()
label_remove = [
    'cephalization',
    'hilar engorgement',
    'vascular plethora',
    'pulmonary vascular prominence',
    'interstitial process',
    'interstitial abnormalities',
    'interstitial pulmonary edema',
    'interstitial thickening',
    'kerley',
    'ill defined opacities',
    'alveolar infiltrates',
    'perihilar infiltrates',
    'hilar infiltrates',
]
df = df[list(set(label_names)-set(label_remove))]


C = cols_value_counts(df.iloc[:,1:])


print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

findings                          0.0    1.0     nan    severity
------------------------------  -----  -----  ------  ----------
fluid overload                    147   2788  212158           0
acute cardiopulmonary process   27751    121  187221           0
pulmonary edema                 26559  24334  164200           0
vascular congestion              8659  12984  193450           0
pulmonary vascular congestion    2481   8843  203769           1
pulmonary vascular engorgement    233   1418  213442           1
interstitial abnormality           39   1121  213933           2
interstitial edema                374   4000  210719           2
interstitial marking                0   2867  212226           2
parenchymal opacities            1787   2981  210325           3
alveolar opacities                  3    402  214688           3
severe pulmonary edema              8   1056  214029           3
patchy opacities                    3   1875  213215           3
moderate pulmonary edema 

In [64]:


def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
    C = C.sort_values(by=[np.nan])
    return C


def proc_findings_labels(df, thresh=500, unwanted_cols = ['pulmonary edema']):

    # thresholding entries with >thresh number of samples
    C = cols_value_counts(df.iloc[:,1:])
    C = C[(C[0]+C[1])>thresh]
    label_names = C.index.to_list()

    # remove unwanted labels
    label_names = list(set(label_names)-set(unwanted_cols))

    # filter columns
    df_f = df[['study_id']+label_names]

    # randomly sample minor class to match major class
    #    so that #samples with 0/1 are the same
    #
    for l in label_names:
        addv = 0 if (C.loc[l,0] < C.loc[l,1]) else 1
        addn =  int(np.abs(C.loc[l,0] - C.loc[l,1]))
        addIdx = np.random.choice(df_f[df_f[l].isnull()].index, size=addn, replace=False)
        df_f.iloc[addIdx, df_f.columns.get_loc(l)] = addv
        
    return df_f


df_f = proc_findings_labels(df)

C = cols_value_counts(df_f.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))


findings                         nan    1.0    0.0
-----------------------------  -----  -----  -----
vascular congestion            11849   2179   2179
pulmonary vascular congestion  13081   1563   1563
mild pulmonary edema           13463   1372   1372
acute cardiopulmonary process  14331    938    938
interstitial edema             14509    849    849
moderate pulmonary edema       14575    816    816
interstitial marking           15139    534    534


In [10]:
df_ = df[df['kerley']==0]


for i, study_id in enumerate(df_['study_id']):
    
    print('----------------------------------------')
    print(cxr_reader.get_report(study_id))
    print(df_.iloc[i,:][df_.iloc[i,:].notnull()])
    
    if i > 10:
        break

----------------------------------------
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old woman with asthma exacerbation and increasing O2
 reqauirement  // interval change      interval change
 
 COMPARISON:  Chest radiographs since ___, most recently ___.
 
 IMPRESSION: 
 
 Heart size is top-normal, comparable to the size on conventional radiographs
 ___.  Lungs are clear.  Slight it interval increase in caliber of
 mediastinal veins is not accompanied by pulmonary vascular engorgement, edema,
 or pleural effusion.  Lungs are clear.  No pneumothorax.
 
 Vascular clips denote prior neck surgery in the region of the left thyroid
 lobe.

study_id                          54471901.0
pulmonary vascular engorgement           0.0
Name: 291, dtype: float64
----------------------------------------
                                 FINAL REPORT
 INDICATION:  Chest pain, here to evaluate for acute cardiopulmonary process.
 
 COMPARISON