
- harder negatives
    - assign harder negatives: set the higher-level severity keywords to 0 once a lower-level severity label has been decided

In [1]:
# %matplotlib inline
import sys
# Extend the import search path with the shared notebooks directory before
# importing `utils` (presumably that is where `utils` lives — TODO confirm).
sys.path.append("../../notebooks")

import utils
# Project-specific Jupyter helpers; their behaviour is not visible in this
# notebook. Judging by the names they enable autoreload, widen the notebook
# cells and silence warnings — verify against `utils`.
utils.jpt_autoreload()
utils.jpt_full_width()
utils.jpt_suppress_warnings()

In [2]:
import os
import pandas as pd
import numpy as np

from label_reports import get_chf_cohort, label_report
from regex_utils import WordMatch

from datasets import MimicCxrLabels, MimicCxrReader, MimicCxrBase

In [3]:
# Dataset accessors used by the cells below.
# `cxr_labels` exposes a `.df` table (with `study_id` / `dicom_id` columns) and
# `get_chexpert_labels(...)`; `cxr_reader` exposes `get_report(study_id, ...)`.
cxr_labels = MimicCxrLabels()
cxr_reader = MimicCxrReader()
# NOTE(review): `meta_df` is not referenced by any cell visible in this notebook.
meta_df = MimicCxrBase().get_meta_df()

In [4]:
# ---------------------------------------------------------------------------
# Run configuration for regex-based pulmonary edema severity labeling.
# ---------------------------------------------------------------------------
current_path = '.'

# Which keyword set to use; the other known option is 'miccai2020'.
keywords_version = 'wpq'
# When True, opacity keywords are dropped from the affirmed table for studies
# whose CheXpert labels flag a confounding finding (see the labeling loop).
opacitiesdeconfound = False
# When True, a negated match with severity 0 overrides every other match.
negprec = False

# All keyword tables of one version sit in the same directory.
keywords_dir = os.path.join(current_path, 'keywords', keywords_version)

# Keyword terms that label severity when they appear NEGATED in the report.
# File layout (tab separated):
#   pulmonary_edema_severity	keyword_terms
#   0	pulmonary edema
#   0	vascular congestion
#   0	fluid overload
#   0	acute cardiopulmonary process
negated_keywords_path = os.path.join(keywords_dir, 'keywords_negated.tsv')

# Keyword terms that label severity when they appear AFFIRMED in the report.
#   pulmonary_edema_severity	keyword_terms
#   1	cephalization
#   1	pulmonary vascular congestion
#   1	hilar engorgement
#   etc.
affirmed_keywords_path = os.path.join(keywords_dir, 'keywords_affirmed.tsv')

# Keyword terms that label severity when they are merely MENTIONED.
#   pulmonary_edema_severity	keyword_terms
#   0	no pulmonary edema
#   0	no vascular congestion
#   0	no fluid overload
#   0	no acute cardiopulmonary process
mentioned_keywords_path = os.path.join(keywords_dir, 'keywords_mentioned.tsv')

# Affirmed keywords removed from the search when `opacitiesdeconfound` applies.
opacity_keywords = [
    'interstitial opacities', 'parenchymal opacities', 'alveolar opacities',
    'ill defined opacities', 'ill-defined opacities', 'patchy opacities',
]

# Directory holding the example reports for regex labeling.
report_dir = os.path.join(current_path, 'example_data')
# CHF diagnosis information for the MIMIC-CXR studies.
chf_metadata_path = os.path.join(
    current_path, 'mimic_cxr_heart_failure', 'mimic_cxr_metadata_hf.tsv')
# Restrict the cohort to studies with a congestive heart failure diagnosis.
limit_to_chf = True


# Read the negated / affirmed / mentioned keyword tables (tab separated),
# in that order.
df_n, df_a, df_m = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        negated_keywords_path,
        affirmed_keywords_path,
        mentioned_keywords_path,
    )
)

def keywords_label_to_list(df):
    """Split a keyword table into two parallel lists.

    Args:
        df: table with the columns ``pulmonary_edema_severity`` and
            ``keyword_terms``.

    Returns:
        A ``(severities, keywords)`` tuple of plain Python lists, where the
        i-th severity belongs to the i-th keyword term.
    """
    severity_levels = df['pulmonary_edema_severity'].to_list()
    keyword_terms = df['keyword_terms'].to_list()
    return severity_levels, keyword_terms

# Load the per-study CHF metadata and, if requested, keep only the studies
# flagged with heart failure.
df_chf = pd.read_csv(chf_metadata_path, sep='\t')
if limit_to_chf:
    is_heart_failure = df_chf['heart_failure'] == 1
    df_chf = df_chf[is_heart_failure]
chf_study_ids = df_chf['study_id'].unique()
print(len(chf_study_ids))


# Show the affirmed keyword table as (severity, keyword) pairs.
affirmed_severities, affirmed_terms = keywords_label_to_list(df_a)
list(zip(affirmed_severities, affirmed_terms))

17162


[(1, 'vascular congestion'),
 (1, 'vascular enlargement'),
 (1, 'vascular plethora'),
 (1, 'vascular engorgement'),
 (1, 'vascular prominence'),
 (1, 'cephalization'),
 (1, 'hilar congestion'),
 (1, 'hilar engorgement'),
 (1, 'hilar prominence'),
 (1, 'peribronchial cuffing'),
 (1, 'bronchial cuffing'),
 (1, 'bronchial wall thickening'),
 (2, 'septal lines'),
 (2, 'septal line'),
 (2, 'septal thickening'),
 (2, 'kerley'),
 (2, 'b lines'),
 (2, 'b-lines'),
 (2, 'b line'),
 (2, 'interstitial abnormality'),
 (2, 'interstitial abnormalities'),
 (2, 'interstitial marking'),
 (2, 'interstitial markings'),
 (2, 'interstitial infiltrates'),
 (2, 'interstitial opacities'),
 (2, 'interstitial thickening'),
 (2, 'interstitial process'),
 (2, 'interstitial edema'),
 (2, 'interstitial pulmonary edema'),
 (3, 'severe pulmonary edema'),
 (3, 'parenchymal opacity'),
 (3, 'parenchymal opacification'),
 (3, 'parenchymal opacities'),
 (3, 'parenchymal infiltrates'),
 (3, 'airspace opacity'),
 (3, 'airspa

In [12]:
import time

# Per-report results, keyed by the 1-based running counter `c` so that the
# three dicts line up row-for-row when assembled into `regex_df` below.
labeled_study_ids = {}
regex_labels = {}
relevant_keywords = {}
c = 0                     # reports that received a severity label
c_regex = 0               # study ids visited so far (labeled or not)
c_labels = [0, 0, 0, 0]   # number of labeled reports per severity level

start = time.time()

# for i, (study_id, l) in enumerate(labels_prev.items()):
# for i, (study_id, l) in enumerate(labels_prev):
for study_id in df_chf['study_id'].unique():
    study_id = int(study_id)

    c_regex += 1
    if c_regex % 1000 == 0:
        print("{} reports have been processed!".format(c_regex))

    try:
        report = cxr_reader.get_report(study_id, remove_nextline=True)
    except Exception:
        # Best effort: skip studies whose report cannot be loaded.
        # `Exception` (rather than a bare `except`) keeps Ctrl-C working.
        continue

    # Affirmed keyword table for this report. Start from the full table on
    # every iteration so a table filtered for a previous report can never
    # leak into this one (e.g. when the study has no dicom row).
    df_a_ = df_a
    if opacitiesdeconfound:
        # If the study has atelectasis/pneumonia, do not use the opacity
        # keywords for the search.
        dicom_id = cxr_labels.df[cxr_labels.df['study_id'] == study_id]['dicom_id'].to_list()
        if len(dicom_id) != 0:
            chex_labels = cxr_labels.get_chexpert_labels([dicom_id[0]])
            # NOTE(review): columns 0 and 11 are presumably the
            # atelectasis / pneumonia CheXpert labels — confirm.
            has_confonding_variables = np.any(chex_labels[0, [0, 11]] == 1)
            if has_confonding_variables:
                df_a_ = df_a[~df_a['keyword_terms'].isin(opacity_keywords)]

    # Run the three keyword tables over the report. `label_report` yields a
    # label (-1 is treated as "no label" below) and a per-severity collection
    # of the matched keywords.
    severities, keywords = keywords_label_to_list(df_a_)
    label_a, severity_keywords_a = label_report(
        report, severities, keywords, tag='affirmed')

    severities, keywords = keywords_label_to_list(df_n)
    label_n, severity_keywords_n = label_report(
        report, severities, keywords, tag='negated')

    severities, keywords = keywords_label_to_list(df_m)
    label_m, severity_keywords_m = label_report(
        report, severities, keywords, tag='mentioned')

    # With `negprec`, a negated match of severity 0 takes precedence.
    # Otherwise the most severe of the three labels wins.
    if negprec and label_n == 0:
        label = 0
    else:
        label = max([label_a, label_n, label_m])

    if label != -1:
        c += 1
        relevant_keywords[c] = (
            severity_keywords_a[label]
            + severity_keywords_n[label]
            + severity_keywords_m[label]
        )
        labeled_study_ids[c] = study_id
        regex_labels[c] = label
        c_labels[label] += 1


end = time.time()
# `.2f` prints plain seconds; a bare `.2` switches to scientific notation
# (e.g. "6.8e+01s") for runs of ten seconds or more.
print(f'took {end-start:.2f}s')


regex_df = pd.DataFrame(
    {'study_id': labeled_study_ids,
     'regex_label': regex_labels,
     'relevant_keywords': relevant_keywords})
output_csv_path = f'regex_results_{keywords_version}_negprec={negprec}_opacitiesdeconfound={opacitiesdeconfound}.tsv'
regex_df.to_csv(output_csv_path, sep="\t")


regex_df

# 
# 1	mild pulmonary edema ?
# 2 moderate pulmonary edema ?


1000 reports have been processed!
2000 reports have been processed!
3000 reports have been processed!
4000 reports have been processed!
5000 reports have been processed!
6000 reports have been processed!
7000 reports have been processed!
8000 reports have been processed!
9000 reports have been processed!
10000 reports have been processed!
11000 reports have been processed!
12000 reports have been processed!
13000 reports have been processed!
14000 reports have been processed!
15000 reports have been processed!
16000 reports have been processed!
17000 reports have been processed!
took 6.8e+01s


Unnamed: 0,study_id,regex_label,relevant_keywords
1,54577367,0,[pulmonary edema]
2,54980801,0,"[acute cardiopulmonary process, no acute cardi..."
3,59988438,0,[vascular congestion]
4,50109051,1,[vascular engorgement]
5,51895247,0,[pulmonary edema]
...,...,...,...
8163,59159686,0,[pulmonary edema]
8164,54878259,3,[airspace opacities]
8165,59281793,1,[vascular congestion]
8166,59694089,1,[vascular congestion]


In [10]:
# Show the raw report text for `study_id`. No value is assigned in this cell,
# so it uses whatever `study_id` an earlier executed cell left behind.
print(cxr_reader.get_report(study_id))

                                 FINAL REPORT
 REASON FOR EXAMINATION:  Evaluation of the patient with COPD and diastolic
 congestive heart failure with pulmonary edema.
 
 Ap chest radiograph.
 
 Since the prior study, there is progression of pre-existing pulmonary vascular
 congestion and upper zone re-distribution with currently added interstitial
 opacities, bronchial wall thickening and thickening of the minor fissure.  No
 interval increase in pleural effusion demonstrated, and no pneumothorax is
 seen.



In [8]:

# Write a second copy of the regex labels, under the same file name, into the
# MimicCxrDataset data directory of the notebooks tree.
shared_output_path = os.path.join(
    '../../', 'notebooks', 'data', 'MimicCxrDataset', output_csv_path)
regex_df.to_csv(shared_output_path, sep="\t")

In [49]:
from tabulate import tabulate

# edema_df = cxr_labels.get_edema_df()
# edema_df = edema_df[['dicom_id', 'EdemaSeverity']]
# dfm = pd.merge(edema_df, regex_df, how='right', on=['dicom_id'], suffixes=('_regex', '_ci'))

# Compare the regex labels against the consensus image labels.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory.
consensus_df = pd.read_csv(
    '/data/vision/polina/scratch/wpq/github/interpretability/notebooks/data/MimicCxrDataset/consensus_image_edema_severity.csv')
dfm = pd.merge(consensus_df, regex_df, how='right', on=['study_id'])

# The right merge keeps every regex-labeled study, so studies missing from the
# consensus file carry NaN in 'edema_severity'. NaN never equals a label, so
# those rows would be counted as "changed"; restrict the statistic to studies
# that actually have a consensus label.
dfm_compared = dfm[dfm['edema_severity'].notnull()]
n_changed = int((dfm_compared['edema_severity'] != dfm_compared['regex_label']).sum())
print(f"{n_changed}/{len(dfm_compared)} changed labels from regex to consensus image")

# Table: rows are regex labels, columns are consensus image labels.
a = np.zeros((4, 4))
for x in range(4):
    for y in range(4):
        dfs = dfm[(dfm['regex_label'] == x) & (dfm['edema_severity'] == y)]
        a[x, y] = len(dfs)

        # Print one example study where regex says 0 but consensus says 2,
        # for manual inspection.
        if x == 0 and y == 2 and len(dfs) > 0:
            print(dfs['study_id'].to_list()[0])

print(output_csv_path)
print(tabulate(np.hstack((np.arange(4).reshape(-1, 1), a)), headers=['regex->ci', 0, 1, 2, 3]))


# regex_results_miccai2020
# 76/132 changed labels from regex to consensus image
#   regex->ci    0    1    2    3
# -----------  ---  ---  ---  ---
#           0   25    8    0    1
#           1   11   15    5    4
#           2    8   10    7    2
#           3   11    9    7    9
#
# 74/132 changed labels from regex to consensus image
# regex_results_miccai2020_negprec=True_opacitiesdeconfound=False.tsv
#   regex->ci    0    1    2    3
# -----------  ---  ---  ---  ---
#           0   28   10    3    2
#           1   10   14    4    3
#           2    7   10    7    2
#           3   10    8    5    9
#
# 65/121 changed labels from regex to consensus image
# regex_results_miccai2020_negprec=True_opacitiesdeconfound=True.tsv
#   regex->ci    0    1    2    3
# -----------  ---  ---  ---  ---
#           0   28   10    3    2
#           1   10   14    4    3
#           2    7   10    8    2
#           3    7    5    2    6
#
# 67/121 changed labels from regex to consensus image
# regex_results_miccai2020_negprec=False_opacitiesdeconfound=True.tsv
#   regex->ci    0    1    2    3
# -----------  ---  ---  ---  ---
#           0   25    9    0    1
#           1   11   15    5    4
#           2    8   10    8    2
#           3    8    5    4    6

# # remove opacities keyword
# 92/141 changed labels from regex to consensus image
#   regex->ci    0    1    2    3
# -----------  ---  ---  ---  ---
#           0   26    9    2    1
#           1   11   15    5    4
#           2    9   10    8    2
#           3    0    0    0    0   


67/121 changed labels from regex to consensus image
regex_results_miccai2020_negprec=False_opacitiesdeconfound=True.tsv
  regex->ci    0    1    2    3
-----------  ---  ---  ---  ---
          0   25    9    0    1
          1   11   15    5    4
          2    8   10    8    2
          3    8    5    4    6


In [11]:
# df = cxr_labels.df
# dft = df[(df['split']=='train')&(df['EdemaSeverity'].notnull())]
# # dft = df[df['EdemaSeverity'].notnull()]
# SId, studlabels_prev = dft[['study_id', 'EdemaSeverity']]
# counts = 0
# for i, (study_id, l) in enumerate(dft[['study_id', 'EdemaSeverity']].set_index('study_id')['EdemaSeverity'].to_dict().items()):

#     if i%1000 == 0:
#         print("{} reports have been processed!".format(i))

#     report = cxr_reader.get_report(study_id, remove_nextline=True)
# #     if 'severe pulmonary edema' in report:
#     if 'patchy opacities' in report:
#         counts += 1
        
# print(counts)
        
        
# # 35 reports in training has alveolar opacities

# # parenchymal opacities
# # train: 223, all: 278
# # patchy opacities
# # train: 74, all: 100

In [63]:
# negprec ... some makes sense, some not ... will only hurt pulmonary vascular congestion but otherwise helps reduce opacities FP
# 58988106   Right upper lobe parenchymal opacities are grossly unchanged from ___.  No superimposed acute cardiopulmonary process.
# 51397090
# 50145470   Unchanged pulmonary vascular congestion without overt pulmonary edema. ... 
# 53919055   Pulmonary vascular congestion without overt pulmonary edema.
# 50762469   pulmonary vascular congestion
# 52853233   
# 54393504   Chronic changes in the lungs without definite superimposed acute  cardiopulmonary process.
# 54594082   Patchy opacities in the lung bases may reflect atelectasis though infection, no overt pulmonary edema. 
# 52549668   No definitive evidence of acute cardiopulmonary process. 
study_id = 58430521
print(cxr_reader.get_report(study_id))

# Label the report of THIS study. Without this assignment `report` would be
# whatever an earlier cell left behind, and the labels printed below would not
# belong to the report printed above. `remove_nextline=True` matches how the
# main labeling loop fetches reports.
report = cxr_reader.get_report(study_id, remove_nextline=True)

# Start from the full affirmed table; drop the opacity keywords only when the
# de-confounding option is on and the study's CheXpert labels flag a
# confounder (same rule as the main labeling loop).
df_a_ = df_a
if opacitiesdeconfound:
    dicom_id = cxr_labels.df[cxr_labels.df['study_id'] == study_id]['dicom_id'].to_list()
    # A study without a dicom row keeps the full table instead of raising
    # IndexError.
    if len(dicom_id) != 0:
        chex_labels = cxr_labels.get_chexpert_labels([dicom_id[0]])
        # NOTE(review): columns 0 and 11 are presumably the
        # atelectasis / pneumonia CheXpert labels — confirm.
        has_confonding_variables = np.any(chex_labels[0, [0, 11]] == 1)
        # Not used below; kept so it can be inspected interactively.
        edema_binary = chex_labels[0, 3]
        if has_confonding_variables:
            df_a_ = df_a[~df_a['keyword_terms'].isin(opacity_keywords)]

severities, keywords = keywords_label_to_list(df_a_)
label_a, severity_keywords_a = label_report(
    report, severities, keywords, tag='affirmed')

severities, keywords = keywords_label_to_list(df_n)
label_n, severity_keywords_n = label_report(
    report, severities, keywords, tag='negated')

severities, keywords = keywords_label_to_list(df_m)
label_m, severity_keywords_m = label_report(
    report, severities, keywords, tag='mentioned')

# Affirmed / negated / mentioned labels for the study printed above.
print(label_a, label_n, label_m)

                                 FINAL REPORT
 EXAMINATION:  CHEST (PA AND LAT)
 
 INDICATION:  ___ year old woman with presyncope, congestive heart failure,
 end-stage renal disease.
 
 TECHNIQUE:  Chest PA and lateral
 
 COMPARISON:  Chest radiograph ___
 
 FINDINGS: 
 
 Right-sided dual lumen central venous catheter tip terminates in the right
 atrium, unchanged.  Mild to moderate enlargement of the cardiac silhouette is
 re- demonstrated.  The mediastinal contour is unchanged.  Moderate pulmonary
 edema is present, similar to that seen on the prior exam, with a new small
 left pleural effusion.  Patchy opacities in the lung bases likely reflect
 areas of atelectasis.  No pneumothorax is present.  Clips project over the
 left axilla.  There are no acute osseous abnormalities.
 
 IMPRESSION: 
 
 Moderate pulmonary edema, similar to the previous study, with new small left
 pleural effusion.  Bibasilar atelectasis.

-1 0 -1
