

- Get the most frequent/interesting phrases (high-PMI bigrams and trigrams) from the findings sections of reports in the CHF cohort.

In [1]:

import sys
# Extend the import path so modules under ../../notebooks (presumably the
# `utils` module imported below) can be found. The path is relative, so it is
# resolved against the directory the kernel was started in.
sys.path.append("../../notebooks")

import utils
# Notebook setup helpers from the local `utils` module. Their implementation is
# not visible here; judging by the names they enable autoreload, widen the
# notebook page and silence warnings — TODO confirm.
utils.jpt_autoreload()
utils.jpt_full_width()
utils.jpt_suppress_warnings()

In [2]:
import os, re, time
import pandas as pd
import numpy as np

from label_reports import get_chf_cohort, label_report
import negex
from regex_utils import WordMatch
from section_parser import section_text
from extract_findings import extract_findings

from tabulate import tabulate
from pprint import pprint

from datasets import MimicCxrLabels, MimicCxrReader, MimicCxrBase

In [3]:
# Accessor objects from the project `datasets` module, created once and reused
# by later cells: `cxr_reader.get_report(study_id)` supplies report text and
# `cxr_base.get_edema_df()` supplies the edema label table.
cxr_reader = MimicCxrReader()
cxr_labels = MimicCxrLabels()  # NOTE(review): not referenced in the cells visible here
cxr_base = MimicCxrBase()

In [21]:

# Report phrases of interest.
# NOTE(review): no visible cell reads `keywords` — confirm it is still needed.
keywords = [
    "acute cardiopulmonary process",
    "focal consolidation",
    "pleural effusion",
    "pneumothorax",
    "nodular opacities",
    "pneumonia",
]

[('acute cardiopulmonary process', 0.0),
 ('focal consolidation', 0.0),
 ('pleural effusion', 0.0),
 ('pneumothorax', 0.0),
 ('nodular opacities', 1.0),
 ('pneumonia', nan)]


In [4]:
# Which studies to analyse: 'all' uses the full metadata table, anything else
# uses the heart-failure metadata file restricted to rows flagged heart_failure == 1.
cohort = 'chf'
current_path = '.'
chf_metadata_path = os.path.join(current_path, 'mimic_cxr_heart_failure', 'mimic_cxr_metadata_hf.tsv')

if cohort != 'all':
    meta_df = pd.read_csv(chf_metadata_path, sep='\t')
    meta_df = meta_df.loc[meta_df['heart_failure'] == 1]
else:
    meta_df = MimicCxrBase().get_meta_df()

# Distinct study ids of the selected cohort; iterated by the extraction cell.
study_ids = meta_df['study_id'].unique()
    

In [90]:
# NOTE(review): this whole cell is commented out, yet `section_finding` is
# called by the findings-extraction cell later in the notebook. As written
# below, the `else` branch evaluates "" without returning it, so the function
# would return None for reports that have no 'findings' section.

# study_id = 54577367
# report = cxr_reader.get_report(study_id)

# def section_finding(report):
#     sections, section_names, section_idx = section_text(report)
#     if 'findings' in section_names:
#         ind = section_names.index('findings')
#         return sections[ind]
#     else:
#         ""
# section = section_finding(report)

# data = {}
# data[study_id] = section

# df = pd.DataFrame.from_dict(data, columns=['findings'],orient='index')
# df

In [117]:
def section_finding(report):
    """Return the text of the 'findings' section of ``report``.

    The report is split with ``section_text``; if none of the returned section
    names is 'findings', an empty string is returned.
    """
    sections, section_names, _section_idx = section_text(report)
    if 'findings' in section_names:
        return sections[section_names.index('findings')]
    return ""


# True: re-extract the findings text of every study in `study_ids` and rewrite
# the cache file. False: load the cache file written by an earlier run.
gen_findings_text = False
findings_df_path = f'note_findings_wordfreq_findings_{cohort}_df'

if gen_findings_text:

    data = {}
    n_failed = 0

    start = time.time()
    for i, study_id in enumerate(study_ids):

        try:
            report = cxr_reader.get_report(study_id)
            section = section_finding(report)
            data[study_id] = section
        except Exception as err:
            # Best effort: a study whose report cannot be read or parsed is
            # skipped. `Exception` (not a bare except) keeps Ctrl-C working,
            # and the first few failures are shown so that a systematic error
            # cannot silently produce an empty cache.
            n_failed += 1
            if n_failed <= 5:
                print(f'skip study_id={study_id}: {err!r}')
            continue

        if i%1000 == 0:
            end = time.time()
            print(f'Iter={i}\tTime Elapsed={end-start:.3f}')
            start = time.time()

    print(f'skipped {n_failed} of {len(study_ids)} studies')

    # One row per study, indexed by study_id, with a single 'findings' column.
    df = pd.DataFrame.from_dict(data, columns=['findings'],orient='index')
    df.to_csv(findings_df_path, index=True)
    print(len(df))
    print(f'write to {findings_df_path}')
else:
    df = pd.read_csv(findings_df_path, index_col=0)

# Move study_id from the index into a regular column so it can be merged on.
df['study_id'] = df.index
df = df.reset_index(drop=True)

In [121]:
# Edema label table; the displayed frame has study_id, dicom_id, subject_id,
# EdemaSeverity and split columns.
edema_df = cxr_base.get_edema_df()
edema_df

Unnamed: 0,study_id,dicom_id,subject_id,EdemaSeverity,split
0,54577367,cfb03587-782edf6c-1bf392e1-98196cd5-365d69e8,10000980.0,0,train
1,54980801,a75a1fbe-802065ad-717eb7c1-e2ce3552-646276a6,10000980.0,0,train
2,59988438,925b9496-a956d7b2-05185e52-bb33313b-c06ee522,10000980.0,0,train
3,50109051,57dbb610-aad7676f-da4741ea-db34ef0e-773492f3,10011938.0,1,train
4,51895247,bf724128-9131f33a-6fd065d5-19041750-9e7f8707,10011938.0,0,train
...,...,...,...,...,...
7519,58763598,7bdf562a-8945bd11-382bb25a-36306d5b-4c1f31f5,19997367.0,0,train
7521,59159686,a06c18fa-0be7ccf1-5b99ff5c-429949f2-86361e99,19997367.0,0,train
7522,54878259,a0312c46-db145040-b7bc5dc0-7bf8a213-393a3ab3,19998330.0,1,train
7523,59281793,b7fea537-87829311-389dc1c3-0947f202-bcf5f743,19998330.0,1,train


In [145]:
# Right join on study_id: every row of `edema_df` is kept, and `findings` is
# NaN for studies that have no row in `df`.
dfm = pd.merge(df, edema_df, how='right', on=['study_id'])


Unnamed: 0,findings,study_id,dicom_id,subject_id,EdemaSeverity,split
0,\n \n Mild to moderate enlargement of the card...,54577367,cfb03587-782edf6c-1bf392e1-98196cd5-365d69e8,10000980.0,0,train
1,"\n \n The lungs are clear of consolidation, ef...",54980801,a75a1fbe-802065ad-717eb7c1-e2ce3552-646276a6,10000980.0,0,train
2,PA and lateral views of the chest demonstrate ...,59988438,925b9496-a956d7b2-05185e52-bb33313b-c06ee522,10000980.0,0,train
4,,51895247,bf724128-9131f33a-6fd065d5-19041750-9e7f8707,10011938.0,0,train
9,"As compared to the previous radiograph, there...",50515796,906744d3-04cbdaa9-9b97b8ff-e89b52d6-be2d0f35,10018081.0,0,train
...,...,...,...,...,...,...
7215,,57557006,00cf50b3-4a8454a1-9117821e-e77b0df1-af78dcfd,19997367.0,0,train
7216,\n \n PA and lateral views of the chest provid...,57894530,ed9f946b-d194e14a-8091f301-4bb11421-dcab0a36,19997367.0,0,train
7217,\n The patient is status post median sternoto...,58549312,f72c9efb-07136e22-c4749961-b5e6c98a-a0804543,19997367.0,0,train
7218,Comparison is made to previous radiographs fr...,58763598,7bdf562a-8945bd11-382bb25a-36306d5b-4c1f31f5,19997367.0,0,train


In [174]:
import nltk

from nltk.collocations import *

def top_ngrams(text, n=2, freq_threshold=50, topn=100):
    """Return the highest-PMI bigrams or trigrams of ``text`` with their counts.

    Args:
        text: Corpus as one string. It is split with ``nltk.wordpunct_tokenize``,
            so punctuation marks become tokens of their own.
        n: 2 for bigrams, 3 for trigrams.
        freq_threshold: n-grams occurring fewer than this many times are
            discarded before ranking.
        topn: Maximum number of n-grams to return.

    Returns:
        A list of ``(ngram, count)`` pairs ordered by decreasing PMI, where
        ``ngram`` is a tuple of ``n`` tokens and ``count`` its frequency.

    Raises:
        ValueError: If ``n`` is neither 2 nor 3.
    """
    # Validate first: any other n used to fall through to the trigram branch
    # and return trigrams without warning.
    if n == 2:
        measures_cls = nltk.collocations.BigramAssocMeasures
        finder_cls = nltk.collocations.BigramCollocationFinder
    elif n == 3:
        measures_cls = nltk.collocations.TrigramAssocMeasures
        finder_cls = nltk.collocations.TrigramCollocationFinder
    else:
        raise ValueError(f'n must be 2 or 3, got {n!r}')

    tokens = nltk.wordpunct_tokenize(text)
    finder = finder_cls.from_words(tokens)

    # Drop n-grams seen fewer than `freq_threshold` times; PMI is unreliable
    # for rare n-grams.
    finder.apply_freq_filter(freq_threshold)

    # Rank the survivors by pointwise mutual information.
    ngrams = finder.nbest(measures_cls().pmi, topn)

    return [(ngram, finder.ngram_fd[ngram]) for ngram in ngrams]

# Top bigrams of the findings text, computed separately for each
# EdemaSeverity value 0..3 and collected as {severity: [(bigram, count), ...]}.
data = {}
for i in range(4):
    print(f'Severity={i}')
    severity_rows = dfm[dfm['EdemaSeverity'] == i]
    # Float entries (presumably NaN for missing findings) are left out.
    text = '\n'.join(
        finding
        for finding in severity_rows['findings'].to_list()
        if not isinstance(finding, float)
    )
    ngrams = top_ngrams(text, n=2)
    data[i] = ngrams


data

    

    
    

Severity=0
Severity=1
Severity=2
Severity=3


{0: [(("'", 's'), 52),
  (('costophrenic', 'angle'), 66),
  (('PICC', 'line'), 50),
  (('central', 'venous'), 52),
  (('status', 'post'), 136),
  (('In', 'comparison'), 131),
  (('rib', 'fractures'), 67),
  (('relevant', 'change'), 64),
  (('Atherosclerotic', 'calcifications'), 52),
  (('projecting', 'over'), 92),
  (('pacing', 'device'), 57),
  (('were', 'obtained'), 96),
  (('aortic', 'arch'), 82),
  (('aortic', 'knob'), 60),
  (('post', 'median'), 85),
  (('median', 'sternotomy'), 135),
  (('Median', 'sternotomy'), 95),
  (('concerning', 'for'), 80),
  (('well', 'expanded'), 94),
  (('degenerative', 'changes'), 113),
  (('osseous', 'abnormalities'), 190),
  (('cannot', 'be'), 60),
  (('Lung', 'volumes'), 125),
  (('has', 'been'), 177),
  (('As', 'compared'), 195),
  (('vascular', 'congestion'), 347),
  (('sternotomy', 'wires'), 157),
  (('acute', 'osseous'), 258),
  (('leads', 'terminating'), 60),
  (('upper', 'abdomen'), 67),
  (('osseous', 'abnormality'), 70),
  (('thoracic', 'spi

In [175]:
# Re-display the per-severity bigram table built in the previous cell.
data

{0: [(("'", 's'), 52),
  (('costophrenic', 'angle'), 66),
  (('PICC', 'line'), 50),
  (('central', 'venous'), 52),
  (('status', 'post'), 136),
  (('In', 'comparison'), 131),
  (('rib', 'fractures'), 67),
  (('relevant', 'change'), 64),
  (('Atherosclerotic', 'calcifications'), 52),
  (('projecting', 'over'), 92),
  (('pacing', 'device'), 57),
  (('were', 'obtained'), 96),
  (('aortic', 'arch'), 82),
  (('aortic', 'knob'), 60),
  (('post', 'median'), 85),
  (('median', 'sternotomy'), 135),
  (('Median', 'sternotomy'), 95),
  (('concerning', 'for'), 80),
  (('well', 'expanded'), 94),
  (('degenerative', 'changes'), 113),
  (('osseous', 'abnormalities'), 190),
  (('cannot', 'be'), 60),
  (('Lung', 'volumes'), 125),
  (('has', 'been'), 177),
  (('As', 'compared'), 195),
  (('vascular', 'congestion'), 347),
  (('sternotomy', 'wires'), 157),
  (('acute', 'osseous'), 258),
  (('leads', 'terminating'), 60),
  (('upper', 'abdomen'), 67),
  (('osseous', 'abnormality'), 70),
  (('thoracic', 'spi

In [170]:
from nltk.tokenize import RegexpTokenizer


# Concatenate the findings text of every row in `dfm` into one corpus; float
# entries (presumably NaN for missing findings) are skipped.
text = '\n'.join([x for x in dfm['findings'].to_list() if not isinstance(x, float)])
print(len(text))

## tokenization
# Alternative tokenizer, kept for reference:
# tokens = nltk.wordpunct_tokenize(text)
# print(len(tokens))
# Tokens are maximal runs of word characters (\w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)


bigram_measures = nltk.collocations.BigramAssocMeasures()

# Collect bigram candidates from the token list.
finder = BigramCollocationFinder.from_words(tokens)

# Keep only bigrams that appear at least 100 times.
finder.apply_freq_filter(100)

# Up to 1000 of the remaining bigrams, ranked by PMI, paired with their counts.
bigrams = finder.nbest(bigram_measures.pmi, 1000)
fd_counts = [finder.ngram_fd[x] for x in bigrams]
list(zip(bigrams, fd_counts))


2222280


[(('clinical', 'setting'), 101),
 (('significantly', 'changed'), 109),
 (('valve', 'replacement'), 101),
 (('costophrenic', 'angles'), 131),
 (('accompanied', 'by'), 133),
 (('greater', 'than'), 145),
 (('rib', 'fractures'), 138),
 (('costophrenic', 'angle'), 163),
 (('re', 'demonstrated'), 168),
 (('In', 'comparison'), 265),
 (('projecting', 'over'), 187),
 (('status', 'post'), 448),
 (('Bony', 'structures'), 203),
 (('were', 'obtained'), 270),
 (('aortic', 'knob'), 168),
 (('concerning', 'for'), 189),
 (('aortic', 'arch'), 162),
 (('central', 'venous'), 139),
 (('well', 'expanded'), 164),
 (('degenerative', 'changes'), 272),
 (('venous', 'catheter'), 111),
 (('post', 'median'), 290),
 (('not', 'significantly'), 110),
 (('median', 'sternotomy'), 405),
 (('Median', 'sternotomy'), 176),
 (('osseous', 'abnormalities'), 412),
 (('AP', 'upright'), 137),
 (('single', 'lead'), 107),
 (('Lung', 'volumes'), 279),
 (('dual', 'lead'), 119),
 (('acute', 'osseous'), 573),
 (('patient', 's'), 109),

In [99]:

# Trigram scoring functions, defined here so the cell does not depend on a
# `trigram_measures` left over in the kernel from an earlier session.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# `tokens` is the token list produced by the bigram cell above.
finder = TrigramCollocationFinder.from_words(tokens)

# Keep only trigrams that appear at least 500 times.
finder.apply_freq_filter(500)

# Up to 100 of the remaining trigrams, ranked by PMI.
trigrams = finder.nbest(trigram_measures.pmi, 100)
trigrams


[('acute', 'osseous', 'abnormalities'),
 ('within', 'normal', 'limits'),
 ('In', 'comparison', 'with'),
 ('No', 'acute', 'osseous'),
 ('As', 'compared', 'to'),
 ('pulmonary', 'vascular', 'congestion'),
 ('is', 'status', 'post'),
 ('PA', 'and', 'lateral'),
 ('Frontal', 'and', 'lateral'),
 ('and', 'lateral', 'views'),
 ('lateral', 'views', 'of'),
 ('bilateral', 'pleural', 'effusions'),
 ('and', 'hilar', 'contours'),
 ('previous', 'radiograph', ','),
 ('hilar', 'contours', 'are'),
 ('.', 'In', 'comparison'),
 ('patient', 'is', 'status'),
 ('The', 'cardiomediastinal', 'silhouette'),
 ('.', 'Bony', 'structures'),
 ('lungs', 'are', 'clear'),
 ('No', 'focal', 'consolidation'),
 ('mediastinal', 'and', 'hilar'),
 ('left', '-', 'sided'),
 ('the', 'previous', 'radiograph'),
 ('.', 'Lung', 'volumes'),
 ('no', 'focal', 'consolidation'),
 ('effusion', 'or', 'pneumothorax'),
 ('The', 'cardiac', 'silhouette'),
 ('Heart', 'size', 'is'),
 ('right', '-', 'sided'),
 ('.', 'As', 'compared'),
 ('the', 'ches

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_contingency',
 '_expected_values',
 '_marginals',
 '_n',
 'chi_sq',
 'dice',
 'fisher',
 'jaccard',
 'likelihood_ratio',
 'mi_like',
 'phi_sq',
 'pmi',
 'poisson_stirling',
 'raw_freq',
 'student_t']