
+ `v1`
    ```
    # finding_labels = [
    #     'acute cardiopulmonary process',
    #     'vascular congestion',
    #     'pulmonary vascular congestion',
    #     'mild pulmonary edema',
    #     'interstitial edema',
    #     'moderate pulmonary edema',
    #     'parenchymal opacities',
    #     'interstitial marking',
    #     'fluid overload',
    #     'interstitial pulmonary edema',
    #     'patchy opacities',
    #     'pulmonary vascular engorgement',
    #     'interstitial opacities',
    #     'interstitial abnormality',
    #     'severe pulmonary edema',
    # ]
    ```
+ `v3` mostly findings that describes images
    ```
    ['vascular congestion',
     'fluid overload',
     'acute cardiopulmonary process',
     'vascular congestion',
     'vascular enlargement',
     'vascular plethora',
     'vascular engorgement',
     'vascular prominence',
     'cephalization',
     'hilar abnormalities',
     'hilar congestion',
     'hilar vessels',
     'hilar vessel',
     'hilar engorgement',
     'hilar prominence',
     'hilar infiltrates',
     'peribronchial cuffing',
     'bronchial cuffing',
     'bronchial wall thickening',
     'septal lines',
     'septal line',
     'septal thickening',
     'kerley',
     'b lines',
     'b-lines',
     'b line',
     'interstitial abnormality',
     'interstitial abnormalities',
     'interstitial marking',
     'interstitial markings',
     'interstitial infiltrates',
     'air bronchograms',
     'parenchymal opacity',
     'parenchymal opacification',
     'parenchymal opacities',
     'parenchymal infiltrates',
     'airspace opacity',
     'airspace opacfication',
     'airspace opacities',
     'nodular opacity',
     'nodular opacfication',
     'nodular opacities',
     'bibasilar opacity',
     'bibacilar opacfication',
     'bibasilar opacities',
     'bibasal infiltrates',
     'bibasal infiltrates',
     'perihilar opacity',
     'perihilar opacification',
     'perihilar opacities',
     'perihilar infiltrates',
     'alveolar opacity',
     'alveolar opacification',
     'alveolar opacities',
     'alveolar infiltrates',
     'patchy opacity',
     'patchy opacification',
     'patchy opacities',
     'ground-glass opacity',
     'ground-glass opacities',
     'ground-glass pulmonary opacities',
     'ground-glass pulmonary opacity']
    
    label_names = [
        'vascular congestion',
        'hilar congestion',
        'peribronchial cuffing',
        'septal lines',
        'interstitial abnormality',
        'air bronchograms',
        'parenchymal opacity',
        'nodular opacity',
        'bibasilar opacity',
        'perihilar opacity',
        'patchy opacity',
    ]
    ```

+ './negex_findings_cohort=all_v2.csv'
    + merged, hierearhicy corrected 16 labels, balanced dataset

+ background on findnigs
    + pretty helpful https://radiopaedia.org/articles/pulmonary-oedema?lang=us
    + https://www.radiologymasterclass.co.uk/tutorials/chest/chest_pathology/chest_pathology_page8
    + has localization info pretty useful https://www.radiologymasterclass.co.uk/tutorials/chest/chest_pathology/chest_pathology_start

In [1]:
# %matplotlib inline
import sys
sys.path.append("../../notebooks")

import utils
utils.jpt_autoreload()
utils.jpt_full_width()
utils.jpt_suppress_warnings()

In [2]:
import os, re, time
import pandas as pd
import numpy as np

from label_reports import get_chf_cohort, label_report
import negex
from regex_utils import WordMatch
from section_parser import section_text
from extract_findings import extract_findings

from tabulate import tabulate
from datasets import MimicCxrLabels, MimicCxrReader, MimicCxrBase

In [3]:

def assign_label(row):
    # Assign label for a keyword's word match result
    #
    #  neg aff
    #  [0. 0.]  ->  nan
    #  [0. 1.]  ->  1
    #  [1. 0.]  ->  0
    #  [1. 1.]  ->  nan
    
    #
    if sum(row) == 1:
        return row[1]
    else:
        return np.nan


def filter_sections(report):
    # Remove text in INDICATION section ... 
    #     as it might contain keywords which contains keywords 
    #     but does not mean existence of keywords
    # 
    sections, section_names, section_idx = section_text(report)
    filtered = [i for i,x in enumerate(section_names) 
                if x not in ['indication', 'history', 'comparison', 'technique']]
    report = "".join(sections[i] for i in filtered)
    report = report.replace('\n','').replace('\r','')
    return report
    

def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
#     C = C.sort_values(by=[np.nan])
    return C


    

In [4]:
cxr_labels = MimicCxrLabels()
cxr_reader = MimicCxrReader()
meta_df = MimicCxrBase().get_meta_df()

In [9]:

# cohort \in [ 'all',  'chf' ]
#               224k    28k 
cohort = 'chf'

# current path
current_path = '.'

# chf diagnosis information for mimic-cxr data
chf_metadata_path = os.path.join(current_path, 'mimic_cxr_heart_failure', 'mimic_cxr_metadata_hf.tsv')

# negex
negex_trigger_path = os.path.join(current_path, 'negex', 'negex_triggers.txt')

# keywords
keywords_version = 'miccai2020'
neg_kwd_path = os.path.join(
    current_path, 'keywords', keywords_version, 'keywords_negated.tsv')
aff_kwd_path = os.path.join(
    current_path, 'keywords', keywords_version, 'keywords_affirmed.tsv')

# save resulting labels
df_save_path = os.path.join(current_path, f'negex_findings_version={keywords_version}_cohort={cohort}_v2.csv')


In [10]:
df_neg = pd.read_csv(neg_kwd_path, sep="\t")
df_aff = pd.read_csv(aff_kwd_path, sep="\t")

aff_kwd = df_aff['keyword_terms'].to_list()
neg_kwd = df_neg['keyword_terms'].to_list()


kwd_to_severities = df_aff.set_index('keyword_terms').to_dict()['pulmonary_edema_severity']
kwd_to_severities.update(df_neg.set_index('keyword_terms').to_dict()['pulmonary_edema_severity'])
kwd_to_severities

keywords = neg_kwd + aff_kwd
# keywords = [ 'pulmonary edema',
#              'mild pulmonary edema',
#              'moderate pulmonary edema',
#              'vascular congestion',
#              'fluid overload',
#              'acute cardiopulmonary process',
#              'cephalization',
#              'pulmonary vascular congestion',
#              'hilar engorgement',
#              'vascular plethora',
#              'pulmonary vascular prominence',
#              'pulmonary vascular engorgement',
#              'kerley',
#              'interstitial edema',
#              'interstitial thickening',
#              'interstitial pulmonary edema',
#              'interstitial marking',
#              'interstitial abnormality',
#              'interstitial abnormalities',
#              'interstitial process',
#              'alveolar infiltrates',
#              'severe pulmonary edema',
#              'perihilar infiltrates',
#              'hilar infiltrates',
#              'interstitial opacities',
#              'parenchymal opacities',
#              'alveolar opacities',
#              'ill defined opacities',
#              'patchy opacities'
#            ]

# chf
keywords_few = [
    'peribronchial cuff', # normally just peribronchial cuffing
    'cardiophrenic angle',  # moslty costophrenic angle
    'cardiophrenic angles', # moslty costophrenic angle
    'interlobular thickening', # 0 in chf
    'basilar septal lines',
    'scattered septal lines',
    'thickening of septal lines',
    'thickened septal lines',
    'vascular shadowing',
    'vessel enlargement', # 0
    'micronoduli',        # 0
    'batwing',        # 1
    'bat wing', # 0
    'bat-wing', # 1
    'interlobular fissure', # 1
    'interlobar fissure', # 4
    'fissural thickening', # 8
    'perihilar cuffing', # 0
    'peribronchial infiltration', # 3
    'perihilar haze', # 10
    # pleural effusion has a lot of examples but not for edema
    'costophrenic angle',
    'costophrenic angles',
    'blunting of costophrenic angle',
    ## 
    'elevation of central venous pressure', # 1
    'ill-defined pulmonary vessels', # 7
    'azygos shadow', # 1
    'cephalisation', # 0
    'pulmonary venous diversion', # 0
    'antler', # 0
    'mediastinum', # lots but irrelevant to edema
    'cardiothoracic ratio', # 0
    'cardiac silhouette', # seems irrelevant to edema
    'lymphatic distension',  # 0  manifest as kerley b lines
    'interlobular', # 5
    'hilar indictinctness', # 0
    'ill-defined hila', # 0
    'airspace shadow', # 0
    'airspace shadowing', # 0
    'alveolograms', # 0
    'perihilar haze', # 10
    'perihilar shadow', # 0 same as batwing
    'perihilar shadowing', # 0
    # ask about this 
    'elevated pulmonary venous pressure', # lots, but seems not relevant to edema
    'elevation of pulmonary venous pressure', # lots
    'fissural thickening', # 8
    # too little, ~50 in chf
    'ground-glass opacity',
    'ground-glass opacities',
    'ground-glass pulmonary opacities',
    'ground-glass pulmonary opacity',
]


keywords = [
    #>>>>>>>> 0 <<<<<<<<
    'pulmonary edema',
    'fluid overload',
    'acute cardiopulmonary process',
    #>>>>>>>> 1 <<<<<<<<
    # - cephlization or upper lobe pulmonary venous diversion (stag's antler sign)
    # - increased cardiothoracic ratio/ cardiac silhouette size
    'vascular congestion',
        'pulmonary vascular congestion',
        'vascular enlargement',
        'vascular plethora',
        'vascular engorgement',
        'vascular prominence',
        'cephalization',
        # vascular = hilar {engorgement,prominence etc.} ?
        # hilar is more specific i think ..
    'hilar congestion',
        'hilar engorgement',
        'hilar prominence',
        'hilar infiltrates',
        #     prominence of the hila and perihilar vessels bilaterally suggesting some pulmonary edema
        #     mildly prominent hilar vessels
        #     Prominent right hilar vessels and central pulmonary vessels are unchanged. pulmonary edema appears unchange
        #     Prominence of the right hilar vessels is unchanged. mild pulmonary edema.
        #     increasing interstitial prominence and fullness in the perihilar vessels suggesting an element of mild interstitial edema.
        #     Increases in diameter of the perihilar vessels suggest the presence of mild pulmonary edema.
        #     Hilar vessels are newly enlarged, and vascular caliber in the lung apices is also noted. mild edema 
    'peribronchial cuffing', # haziness around wall of bronchus, doughnut sign
        'bronchial cuffing',
        'bronchial wall thickening', #  abnormal thickening of bronchial wall, cause for periboronchial cuffing
    #>>>>>>>> 2 <<<<<<<<
    # - peribronchial cuffing, perihilar haze
    # - septal (kerley) lines
    # - thickening of interlobular fissures
    'septal lines',
        'septal line',
        'septal thickening',
        'kerley',
        'b lines',
        'b-lines',
        'b line',
        # similar to b lines ... merge ?
        #     Scattered septal lines are present bilaterally suggesting the presence of interstitial edema
        #     increased interstitial opacity with septal lines compatible with mild pulmonary edema 
        #     here is minimal peribronchial cuffing and thickening of septal lines
        #     thickened septal lines which reflect mild pulmonary edema.
        #     Significant bilateral interstitial pulmonary edema is seen with associated septal lines
    ## abnormality -> marking
    'interstitial abnormality',
        'interstitial abnormalities',
        'interstitial marking',
        'interstitial markings',
        'interstitial infiltrates',
        'interstitial edema',
        'interstitial pulmonary edema',
    #>>>>>>>> 3 <<<<<<<<
    # - air space opacification, .. classically batwing distribution  (perihilar!)
    # - maybe air bronchograms
    'air bronchograms',  # airopacification of alveoli
    # differnt kinds of opacities
    #     parenchymal -> airspace/nodular/bibasilar/perihilar/alveolar
    #     alveolar -> patchy
    #
    'parenchymal opacity',
        'parenchymal opacification',
        'parenchymal opacities',
        'parenchymal infiltrates',
        'airspace opacity',
        'airspace opacification',
        'airspace opacities',
        'alveolar opacity',
        'alveolar opacification',
        'alveolar opacities',
        'alveolar infiltrates',
    'nodular opacity',
        'nodular opacification',
        'nodular opacities',
    'bibasilar opacity',
        'bibasilar opacification',
        'bibasilar opacities',
        'bibasilar infiltrates',
    'perihilar opacity', # specifies location of opacities
        'perihilar opacification', 
        'perihilar opacities',
        'perihilar infiltrates',
    'patchy opacity',
        'patchy opacification',
        'patchy opacities',
]

# keywords = keywords + ['no '+x for x in keywords]
keywords

['pulmonary edema',
 'fluid overload',
 'acute cardiopulmonary process',
 'vascular congestion',
 'pulmonary vascular congestion',
 'vascular enlargement',
 'vascular plethora',
 'vascular engorgement',
 'vascular prominence',
 'cephalization',
 'hilar congestion',
 'hilar engorgement',
 'hilar prominence',
 'hilar infiltrates',
 'peribronchial cuffing',
 'bronchial cuffing',
 'bronchial wall thickening',
 'septal lines',
 'septal line',
 'septal thickening',
 'kerley',
 'b lines',
 'b-lines',
 'b line',
 'interstitial abnormality',
 'interstitial abnormalities',
 'interstitial marking',
 'interstitial markings',
 'interstitial infiltrates',
 'interstitial edema',
 'interstitial pulmonary edema',
 'air bronchograms',
 'parenchymal opacity',
 'parenchymal opacification',
 'parenchymal opacities',
 'parenchymal infiltrates',
 'airspace opacity',
 'airspace opacification',
 'airspace opacities',
 'alveolar opacity',
 'alveolar opacification',
 'alveolar opacities',
 'alveolar infiltrates'

In [25]:
# `meta_df` restrict to chf cohort
cohort = 'all'
df_save_path = os.path.join(current_path, f'negex_findings_cohort={cohort}_v4.csv')

if cohort == 'all':
    meta_df = MimicCxrBase().get_meta_df()
    study_ids = meta_df['study_id'].unique()
elif cohort == 'chf':
    meta_df = pd.read_csv(chf_metadata_path, sep='\t')
    meta_df = meta_df[meta_df['heart_failure'] == 1]
    study_ids = meta_df['study_id'].unique()

print(f"#study_id = {len(study_ids)}")

print(cxr_reader.get_report(study_ids[10]))

#study_id = 218139
                                 FINAL REPORT
 HISTORY:  Recurrent vomiting, subjective fever and cough.
 
 TECHNIQUE:  Upright AP and lateral views of the chest.
 
 COMPARISON:  ___.
 
 FINDINGS:
 
 Lung volumes are low.  The heart size is normal.  The mediastinal and hilar
 contours are unremarkable.  New nodular opacities are clustered within the
 left upper lobe, and to a lesser extent, within the right upper lobe.  There
 is no pneumothorax or left-sided pleural effusion.  Pulmonary vascularity is
 within normal limits.  Postsurgical changes are noted in the right chest with
 partial resection of the right 6th rib, lateral right pleural thickening and
 chronic blunting of the costophrenic sulcus.  
 
 IMPRESSION:
 
 New nodular opacities within both upper lobes, left greater than right.
 Findings are compatible with metastases, as was noted in the lung bases on the
 subsequent CT of the abdomen and pelvis performed later the same day.



In [26]:
labels = {}
start = time.time()

for i, study_id in enumerate(study_ids):
    
    try:
        report = cxr_reader.get_report(study_id)
    except:
        continue

    label = extract_findings(report, keywords)
    labels[study_id] = label

    if i%1000 == 0:
        end = time.time()
        print(f'Iter={i}\tTime Elapsed={end-start:.3f}')
        start = time.time()
        
        
df = pd.DataFrame.from_dict(labels, orient='index', columns=keywords)
df = df.rename_axis('study_id').reset_index()
print(len(df))

Iter=0	Time Elapsed=0.025
Iter=1000	Time Elapsed=11.008
Iter=2000	Time Elapsed=7.739
Iter=3000	Time Elapsed=8.036
Iter=4000	Time Elapsed=9.594
Iter=5000	Time Elapsed=11.340
Iter=6000	Time Elapsed=10.919
Iter=7000	Time Elapsed=10.093
Iter=8000	Time Elapsed=9.383
Iter=9000	Time Elapsed=8.947
Iter=10000	Time Elapsed=11.272
Iter=11000	Time Elapsed=11.992
Iter=12000	Time Elapsed=11.342
Iter=13000	Time Elapsed=12.269
Iter=14000	Time Elapsed=12.301
Iter=15000	Time Elapsed=11.826
Iter=16000	Time Elapsed=10.791
Iter=17000	Time Elapsed=12.737
Iter=18000	Time Elapsed=13.899
Iter=19000	Time Elapsed=13.206
Iter=20000	Time Elapsed=12.649
Iter=21000	Time Elapsed=13.283
Iter=22000	Time Elapsed=9.984
Iter=23000	Time Elapsed=10.181
Iter=24000	Time Elapsed=10.680
Iter=25000	Time Elapsed=14.081
Iter=26000	Time Elapsed=12.908
Iter=27000	Time Elapsed=13.297
Iter=28000	Time Elapsed=13.619
Iter=29000	Time Elapsed=10.671
Iter=30000	Time Elapsed=10.410
Iter=31000	Time Elapsed=10.087
Iter=32000	Time Elapsed=13.0

In [28]:
df.to_csv(df_save_path, index=False)
print(f'saved to {df_save_path}')

saved to ./negex_findings_cohort=all_v4.csv


In [None]:
## See discrepancy with findings' to severity label and consensus image GT label

# 'miccai2020_nopacities'
affirmed_keywords_path = os.path.join('./', 'keywords', 'wpq', 'keywords_affirmed.tsv')
dfaff = pd.read_csv(affirmed_keywords_path, sep="\t")
FtoS = dfaff.set_index('keyword_terms')['pulmonary_edema_severity'].to_dict()
FtoS

def finding_to_severity_fn(row):
    Y = [FtoS[k] for k, v in row.items()
         if k in FtoS and not np.isnan(v)]
    if len(Y) == 0:
        return np.nan
    else:
        Ymax = np.nanmax(Y)
        return Ymax

df['EdemaSeverityFromFindings'] = df.apply(finding_to_severity_fn, axis=1)
print('finding->label')
print(df['EdemaSeverityFromFindings'].value_counts())


dfc = cxr_labels.df
dfc = dfc[['dicom_id', 'study_id', 'split', 'EdemaSeverity']]
print('finding gt')
print(dfc['EdemaSeverity'].value_counts())

dfm = pd.merge(dfc, df, how='right', on=['study_id'])

from sklearn.metrics import confusion_matrix

dfm = dfm[dfm['split']=='test_consensus_image'] # test_expert_report
print(dfm['EdemaSeverity'].value_counts())

dfm = dfm[~np.isnan(dfm['EdemaSeverity'])]
dfm = dfm[~np.isnan(dfm['EdemaSeverityFromFindings'])]


y_true = dfm['EdemaSeverity'].to_numpy() # no nan
y_pred = dfm['EdemaSeverityFromFindings'].to_numpy() # possibly nan

# (true, pred)
confusion_matrix(y_pred, y_true)


# miccai2020
# array([[ 0,  0,  0,  0],
#        [17, 13,  5,  3],
#        [ 9, 12, 11,  2],
#        [ 3,  0,  4,  0]])

# miccai no opacities 
# array([[ 0,  0,  0,  0],
#        [22, 14,  5,  3],
#        [ 9, 12, 11,  2],
#        [ 3,  0,  4,  0]])
# wpq
# array([[ 0,  0,  0,  0],
#        [23, 16,  6,  4],
#        [ 9, 12, 11,  2],
#        [ 4,  1,  4,  0]])

In [21]:
def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
    C_index = C.index
#     C['severity'] = [kwd_to_severities[idx] if idx in kwd_to_severities else np.nan for idx in C_index]
#     C = C.sort_values(by=['severity'])
    return C

# df_save_path = './negex_findings_version=miccai2020_nopacities_cohort=all.csv'

df = pd.read_csv(df_save_path)
print(len(df))

C = cols_value_counts(df.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

16207
findings                         0.0    1.0    nan
-----------------------------  -----  -----  -----
pulmonary edema                 2152   4305   9750
fluid overload                    18    391  15798
acute cardiopulmonary process    938      3  15266
vascular congestion              475   2179  13553
pulmonary vascular congestion    205   1563  14439
vascular enlargement               1     12  16194
vascular plethora                  0     57  16150
vascular engorgement              31    339  15837
vascular prominence                0     45  16162
cephalization                      2    115  16090
hilar congestion                   2    116  16089
hilar engorgement                  1     49  16157
hilar prominence                   0     32  16175
hilar infiltrates                  0      2  16205
peribronchial cuffing              1     64  16142
bronchial cuffing                  0     78  16129
bronchial wall thickening          0     18  16189
septal lines             

In [29]:
# Goal
# step 1: merge and drop findings with too many samples
# step 2: add labels from hierarchy
# step 3: assign all others negatives


def combine_label_synonyms(x, y):
    """ Combine label, 
        x    y    output
        0    0    0
        1    0    nan
        0    1    nan
        1    1    1
        nan  0/1  0/1
        0/1  nan  0/1
        
        ```
            df = pd.DataFrame({'x': [0,1,0,1,np.nan,1],'y': [0,0,1,1,0,np.nan]})
            df.agg(lambda row: combine_label_synonyms(row['x'], row['y']), axis=1)
            # 0    0.0
            # 1    NaN
            # 2    NaN
            # 3    1.0
            # 4    0.0
            # 5    1.0
        ```
    """
    x_notnull = not np.isnan(x)
    y_notnull = not np.isnan(y)
    if x_notnull and y_notnull:
        if x == y:
            return x
        else:
            return np.nan
    else:
        if x_notnull:
            return x
        else:
            return y
        
def merge_2cols(df, col1, col2):
    # merge col1 to col2
    df[col2] = df.agg(lambda row: combine_label_synonyms(row[col1], row[col2]), axis=1)
    return df


cols_merge = [('vascular congestion', 'pulmonary vascular congestion', 'vascular enlargement', 'vascular plethora', 'vascular engorgement', 'vascular prominence', 'cephalization'),
              ('hilar congestion', 'hilar engorgement', 'hilar prominence', 'hilar infiltrates'),
              ('peribronchial cuffing', 'bronchial cuffing', 'bronchial wall thickening'),
              ('septal lines', 'septal line', 'septal thickening', 'kerley', 'b lines', 'b-lines', 'b line'),
              ('interstitial abnormality', 'interstitial abnormalities', 'interstitial marking', 'interstitial markings', 'interstitial infiltrates',
                  'interstitial pulmonary edema', 'interstitial edema'),
              ('parenchymal opacity', 'parenchymal opacification', 'parenchymal opacities', 'parenchymal infiltrates',
                                      'airspace opacity', 'airspace opacification', 'airspace opacities',
                                      'alveolar opacity', 'alveolar opacification', 'alveolar opacities', 'alveolar infiltrates'),
              ('nodular opacity', 'nodular opacification', 'nodular opacities'),
              ('bibasilar opacity', 'bibasilar opacification', 'bibasilar opacities', 'bibasilar infiltrates'),
              ('perihilar opacity', 'perihilar opacification', 'perihilar opacities', 'perihilar infiltrates'),
              ('patchy opacity', 'patchy opacification', 'patchy opacities')]
    
df = df.copy()
for cols in cols_merge:
    assert(len(cols)>=2)
    col_mergeto = cols[0]
    for col_mergefrom in cols[1:]:
        df = merge_2cols(df, col_mergefrom, col_mergeto)

import itertools
cols_drop = list(itertools.chain(*[x[1:] for x in cols_merge]))
cols_drop

for col in cols_drop:
    df = df.drop(col, 1)
    
df


# def combine_label_hierarchy(row, colp, colc):
#     """ enforce hierarchy p -> c
        
#         p    c    p'  c'
#         nan  1    1   
#         0    nan      0
#         1/0  0/1  should not happen
#         1/0  1/0  nothing changes
#         nan  nan  nothing changes
        
#     ```
#         df = pd.DataFrame({'p': [np.nan,0,1,0,1,0,np.nan],'c': [1,np.nan,0,1,1,0,np.nan]})
#              p    c
#             0  NaN  1.0
#             1  0.0  NaN
#             2  1.0  0.0
#             3  0.0  1.0
#             4  1.0  1.0
#             5  0.0  0.0
#             6  NaN  NaN
#         df[['p','c']].apply(lambda row: combine_label_hierarchy(row, 'p', 'c'), axis=1)
#         	p	c
#             0	1.0	1.0
#             1	0.0	0.0
#             2	1.0	0.0
#             3	0.0	1.0
#             4	1.0	1.0
#             5	0.0	0.0
#             6	NaN	NaN
#     ```
#     """
#     if np.isnan(row[colp]) and row[colc] == 1:
#         row[colp] = 1
#     elif row[colp] == 0 and np.isnan(row[colc]):
#         row[colc] = 0
#     elif (not np.isnan(row[colp])) and (not np.isnan(row[colc])) and row[colp] != row[colc]:
#         print(f'parent/child inconsistent {row[colp]} vs. {row[colc]}')
#     else:
#         pass
#     return row
    
# col_parentchild = [
#     ('interstitial abnormality', 'interstitial marking'),
#     ('interstitial marking', 'interstitial edema'),
#     ('parenchymal opacities', 'alveolar opacities'),
#     ('alveolar opacities', 'patchy opacities'),]

# for colp, colc in col_parentchild:
#     print(colp, colc)
#     df[[colp,colc]] = df[[colp,colc]].apply(
#         lambda row: combine_label_hierarchy(row, colp, colc), axis=1)
    

    
C = cols_value_counts(df.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

findings                         0.0    1.0     nan
-----------------------------  -----  -----  ------
pulmonary edema                26559  24334  164200
fluid overload                   147   2788  212158
acute cardiopulmonary process  27751    121  187221
vascular congestion             9045  16350  189698
hilar congestion                  16    912  214165
peribronchial cuffing              4    829  214260
septal lines                      10    410  214673
interstitial abnormality         496   9145  205452
air bronchograms                  42    891  214160
parenchymal opacity             2838   7417  204838
nodular opacity                   17   2206  212870
bibasilar opacity                 34   5289  209770
perihilar opacity                 11   1579  213503
patchy opacity                     7   3850  211236


In [60]:

    
C = cols_value_counts(df.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

findings                         0.0    1.0     nan
-----------------------------  -----  -----  ------
pulmonary edema                26559  24334  164200
fluid overload                   147   2788  212158
acute cardiopulmonary process  27751    121  187221
vascular congestion             9049  16352  189692
hilar congestion                  16    912  214165
peribronchial cuffing              4    841  214248
septal lines                      10    414  214669
interstitial abnormality         498   9146  205449
air bronchograms                  42    891  214160
parenchymal opacity             2896   7424  204773
nodular opacity                   17   2206  212870
bibasilar opacity                 35   5289  209769
perihilar opacity                 11   1579  213503
patchy opacity                     7   3850  211236


In [61]:
dfc = df.copy()

# for chf cohort
# for severity=0, assign all keywords corresponding to higher level findings to 0

levels_findings = {
    1: ["vascular congestion", "hilar congestion", "peribronchial cuffing"],
    2: ["septal lines", "interstitial abnormality"],
    3: ["air bronchograms", "parenchymal opacity"]
}

edema_df_path = os.path.join('../../', 'notebooks', 'data', 'MimicCxrDataset', f'regex_results_wpq_negprec=False_opacitiesdeconfound=False.tsv')
edema_df = pd.read_csv(edema_df_path, sep="\t")

for c in [2,1,0]:
    study_ids_withc = edema_df[edema_df['regex_label']==c]['study_id']
    print(f'c={c} (#studyid={len(study_ids_withc)})')

    if c != 0:
        keywords_gt_lvlc = sum([v for k,v in levels_findings.items() if k>c], [])
    else:
        keywords_gt_lvlc = sum([v for k,v in levels_findings.items() if k>c], [])
        keywords_gt_lvlc = list(set(keywords_gt_lvlc)-set(['septal lines', 'air bronchograms']))
        
    print(keywords_gt_lvlc)

    def assign_zeros(row):
        for col in keywords_gt_lvlc:
            if np.isnan(row[col]):
                row[col] = 0
        return row
    dfc[dfc['study_id'].isin(study_ids_withc)] = \
        dfc[dfc['study_id'].isin(study_ids_withc)].apply(assign_zeros, axis=1)
    dfc[dfc['study_id'].isin(study_ids_withc)]

C = cols_value_counts(dfc.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))



c=2 (#studyid=1971)
['air bronchograms', 'parenchymal opacity']
c=1 (#studyid=2319)
['septal lines', 'interstitial abnormality', 'air bronchograms', 'parenchymal opacity']
c=0 (#studyid=2948)
['interstitial abnormality', 'parenchymal opacity', 'hilar congestion', 'peribronchial cuffing', 'vascular congestion']
findings                         0.0    1.0     nan
-----------------------------  -----  -----  ------
pulmonary edema                26559  24334  164200
fluid overload                   147   2788  212158
acute cardiopulmonary process  27751    121  187221
vascular congestion            11396  16352  187345
hilar congestion                2768    912  211413
peribronchial cuffing           2756    841  211496
septal lines                    2274    414  212405
interstitial abnormality        5465   9146  200482
air bronchograms                4201    891  210001
parenchymal opacity             9734   7424  197935
nodular opacity                   17   2206  212870
bibasilar op

In [63]:

label_names = [
    'pulmonary edema',
    'fluid overload',
    'acute cardiopulmonary process',
    'vascular congestion',
    'hilar congestion',
    'peribronchial cuffing',
    'septal lines',
    'interstitial abnormality',
    'air bronchograms',
    'parenchymal opacity',
    'nodular opacity',
    'bibasilar opacity',
    'perihilar opacity',
    'patchy opacity',
]

dfa = dfc.copy()
C = cols_value_counts(dfa.iloc[:,1:])

# v1,v2: randomly sample minor class to match major class
#    so that #samples with 0/1 are the same
# v3: sample negatives for all other findings from images with acute-pulmonary-proc=0
# v4: modify a little bit include edema in interstitial abnormality.
# v5: sample negatives by first assign label=0 for higher level findings if lower level findings are mentioned
#
for l in label_names:
    if l not in dfa.columns.to_list():
        continue
        
    # skipping thees since have engouth negatives
    if l in sum([v for k,v in levels_findings.items()], []) and C.loc[l,0]>C.loc[l,1]:
        print('skipping ', l)
        continue
        
        
    addv = 0 if (C.loc[l,0] < C.loc[l,1]) else 1
    addn =  int(np.abs(C.loc[l,0] - C.loc[l,1]))
    if l in ['pulmonary edema', 'acute cardiopulmonary process']:
        indices = df[df[l].isnull()].index
    else:
        indices = df[df[l].isnull() & (df['acute cardiopulmonary process']==0)].index
    addIdx = np.random.choice(indices, size=addn, replace=False)
    dfa.iloc[addIdx, dfa.columns.get_loc(l)] = addv
    
    
## remove 

for col in ['pulmonary edema', 'fluid overload', 'acute cardiopulmonary process']:
    if col in dfa.columns.to_list():
        dfa = dfa.drop(col, 1)

C = cols_value_counts(dfa.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

df_save_path = f'../../notebooks/data/MimicCxrDataset/negex_findings_cohort={cohort}_v5.csv'
dfa.to_csv(df_save_path, index=False)
print(f'saved to {df_save_path}')

# dfa

skipping  hilar congestion
skipping  peribronchial cuffing
skipping  septal lines
skipping  air bronchograms
skipping  parenchymal opacity
findings                    0.0    1.0     nan
------------------------  -----  -----  ------
vascular congestion       16210  16352  182531
hilar congestion           2768    912  211413
peribronchial cuffing      2756    841  211496
septal lines               2274    414  212405
interstitial abnormality   9023   9146  196924
air bronchograms           4201    891  210001
parenchymal opacity        9734   7424  197935
nodular opacity            2206   2206  210681
bibasilar opacity          5289   5289  204515
perihilar opacity          1579   1579  211935
patchy opacity             3850   3850  207393
saved to ../../notebooks/data/MimicCxrDataset/negex_findings_cohort=all_v5.csv


In [53]:

C = cols_value_counts(dfa.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

findings                     nan    1.0    0.0
------------------------  ------  -----  -----
vascular congestion       189519  16340   9234
hilar congestion          214096    981     16
peribronchial cuffing     214259    830      4
septal lines              214673    410     10
interstitial abnormality  211027   4013     53
air bronchograms          214160    891     42
parenchymal opacity       204826   7417   2850
nodular opacity           212850   2226     17
bibasilar opacity         209720   5339     34
perihilar opacity         213502   1580     11
patchy opacity            211231   3855      7


In [75]:
# take a look at summary statistics
#
def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
    C_index = C.index
    C['severity'] = [kwd_to_severities[idx] if idx in kwd_to_severities else np.nan for idx in C_index]
    C = C.sort_values(by=['severity'])
    return C

df_save_path = './negex_findings_version=miccai2020_nopacities_cohort=all.csv'
# df_save_path = './negex_findings_version=miccai2020_nopacities_cohort=chf.csv'
df = pd.read_csv(df_save_path)
C = cols_value_counts(df.iloc[:,1:])

# C = C[(C[0]+C[1])>500]
label_names = C.index.to_list()
label_remove = [
    'cephalization',
    'hilar engorgement',
    'vascular plethora',
    'pulmonary vascular prominence',
    'interstitial process',
    'interstitial abnormalities',
    'interstitial pulmonary edema',
    'interstitial thickening',
    'kerley',
    'ill defined opacities',
    'alveolar infiltrates',
    'perihilar infiltrates',
    'hilar infiltrates',
]
df = df[list(set(label_names)-set(label_remove))]


C = cols_value_counts(df.iloc[:,1:])


print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))

findings                          0.0    1.0     nan    severity
------------------------------  -----  -----  ------  ----------
acute cardiopulmonary process   27751    121  187221           0
vascular congestion              8659  12984  193450           0
pulmonary edema                 26559  24334  164200           0
fluid overload                    147   2788  212158           0
pulmonary vascular congestion    2481   8843  203769           1
pulmonary vascular engorgement    233   1418  213442           1
interstitial opacities             15   1280  213798           2
interstitial marking                0   2867  212226           2
interstitial abnormality           39   1121  213933           2
interstitial edema                374   4000  210719           2
alveolar opacities                  3    402  214688           3
patchy opacities                    3   1875  213215           3
parenchymal opacities            1787   2981  210325           3
mild pulmonary edema     

In [64]:


def cols_value_counts(df):
    C = df.apply(lambda col: pd.Series.value_counts(col, dropna=False))
    C = C.transpose()
    C = C.replace({np.nan: 0})
    C = C.sort_values(by=[np.nan])
    return C


def proc_findings_labels(df, thresh=500, unwanted_cols = ['pulmonary edema']):

    # thresholding entries with >thresh number of samples
    C = cols_value_counts(df.iloc[:,1:])
    C = C[(C[0]+C[1])>thresh]
    label_names = C.index.to_list()

    # remove unwanted labels
    label_names = list(set(label_names)-set(unwanted_cols))

    # filter columns
    df_f = df[['study_id']+label_names]

    # randomly sample minor class to match major class
    #    so that #samples with 0/1 are the same
    #
    for l in label_names:
        addv = 0 if (C.loc[l,0] < C.loc[l,1]) else 1
        addn =  int(np.abs(C.loc[l,0] - C.loc[l,1]))
        addIdx = np.random.choice(df_f[df_f[l].isnull()].index, size=addn, replace=False)
        df_f.iloc[addIdx, df_f.columns.get_loc(l)] = addv
        
    return df_f


df_f = proc_findings_labels(df)

C = cols_value_counts(df_f.iloc[:,1:])
print(tabulate(C, headers=['findings'] + [str(x) for x in list(C.columns)]))


findings                         nan    1.0    0.0
-----------------------------  -----  -----  -----
vascular congestion            11849   2179   2179
pulmonary vascular congestion  13081   1563   1563
mild pulmonary edema           13463   1372   1372
acute cardiopulmonary process  14331    938    938
interstitial edema             14509    849    849
moderate pulmonary edema       14575    816    816
interstitial marking           15139    534    534


In [10]:
df_ = df[df['kerley']==0]


for i, study_id in enumerate(df_['study_id']):
    
    print('----------------------------------------')
    print(cxr_reader.get_report(study_id))
    print(df_.iloc[i,:][df_.iloc[i,:].notnull()])
    
    if i > 10:
        break

----------------------------------------
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old woman with asthma exacerbation and increasing O2
 reqauirement  // interval change      interval change
 
 COMPARISON:  Chest radiographs since ___, most recently ___.
 
 IMPRESSION: 
 
 Heart size is top-normal, comparable to the size on conventional radiographs
 ___.  Lungs are clear.  Slight it interval increase in caliber of
 mediastinal veins is not accompanied by pulmonary vascular engorgement, edema,
 or pleural effusion.  Lungs are clear.  No pneumothorax.
 
 Vascular clips denote prior neck surgery in the region of the left thyroid
 lobe.

study_id                          54471901.0
pulmonary vascular engorgement           0.0
Name: 291, dtype: float64
----------------------------------------
                                 FINAL REPORT
 INDICATION:  Chest pain, here to evaluate for acute cardiopulmonary process.
 
 COMPARISON