In [44]:
from medvqa.utils.files import load_jsonl, load_pickle
from collections import Counter

In [4]:
import os
import datetime

def list_files_with_prefix_and_timestamps(prefix, directory='.'):
    """Recursively list files whose name starts with ``prefix``, newest first.

    Args:
        prefix: Prefix matched against each file's basename (not its full
            path). An empty string matches every file.
        directory: Root directory that is walked recursively. Defaults to the
            current working directory.

    Returns:
        A list of ``(full_path, timestamp)`` tuples ordered from the most
        recent to the oldest ctime, where ``timestamp`` is that ctime rendered
        in local time as ``'%Y-%m-%d %H:%M:%S'``.

    Note:
        ``os.path.getctime`` reports the creation time on Windows but the time
        of the last metadata change on Unix, so on Linux the value is not
        necessarily the moment the file was first written.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    found = []  # (ctime in seconds since the epoch, full path)

    for root, _, files in os.walk(directory):
        for filename in files:
            if not filename.startswith(prefix):
                continue
            full_path = os.path.join(root, filename)
            try:
                ctime = os.path.getctime(full_path)
            except OSError:
                # e.g. a dangling symlink, or a file removed while walking:
                # skip it rather than abort the whole listing.
                continue
            found.append((ctime, full_path))

    # Order on the raw timestamp, not on the formatted string: the string only
    # has one-second resolution and local time is not monotonic across DST
    # changes, so sorting on it can misplace files.
    found.sort(key=lambda item: item[0], reverse=True)
    return [
        (full_path, datetime.datetime.fromtimestamp(ctime).strftime(timestamp_format))
        for ctime, full_path in found
    ]

# Example usage (kept commented out; uncomment to run):
# prefix = "example"  # Replace with your desired prefix
# directory_path = "/path/to/your/directory"  # Replace with the directory you want to search in

# result = list_files_with_prefix_and_timestamps(prefix, directory_path)

# if not result:
#     print(f"No files found with the prefix '{prefix}' in the directory '{directory_path}'.")
# else:
#     print(f"Files with the prefix '{prefix}' and their creation timestamps:")
#     for file_path, timestamp in result:
#         print(f"File: {file_path}\nTimestamp: {timestamp}\n")

In [5]:
list_files_with_prefix_and_timestamps(
    "integrated_fact_metadata(",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(595880,60579117).improved_comparison(6741113).jsonl',
  '2023-09-10 16:30:58'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(595880,60579117).jsonl',
  '2023-09-09 21:42:49'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).improved_comparison(6526297).jsonl',
  '2023-07-18 11:46:52'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).improved_comparison(6595181).jsonl',
  '2023-07-18 00:18:14'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).jsonl',
  '2023-07-12 14:51:08'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628252).jsonl',
  '2023-07-12 14:14:39'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578718,58626719).jsonl',
  '2023-07-12 12:55:34'),
 ('/home/pamessina/medvq

# Facts

In [6]:
list_files_with_prefix_and_timestamps(
    "integrated_sentence_facts(",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,50135857).jsonl',
  '2023-09-09 21:42:41'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,49840052).jsonl',
  '2023-07-12 14:51:01'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,49840759).jsonl',
  '2023-07-12 14:14:27'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,49843061).jsonl',
  '2023-07-10 10:56:53'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,46441815).jsonl',
  '2023-07-08 14:14:29'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(68493868,63882717).jsonl',
  '2023-06-28 11:17:07'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(68493868,63886474).jsonl',
  '2023-06-26 17:31:31')]

In [9]:
tmp = load_jsonl("/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_sentence_facts(58655550,50135857).jsonl")

In [10]:
len(tmp)

677694

In [12]:
Counter(x['extraction_method'] for x in tmp)

Counter({'gpt-4-0613': 14999,
         'gpt-3.5-turbo-0613': 69936,
         't5-small-finetuned': 592759})

In [14]:
total_facts = sum(len(x['facts']) for x in tmp)
total_facts

1323687

In [15]:
# Distinct facts across every record in `tmp` (each record carries a 'facts' collection).
unique_facts = {fact for record in tmp for fact in record['facts']}
len(unique_facts)

583202

# Metadata per fact

In [17]:
list_files_with_prefix_and_timestamps(
    "integrated_fact",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(595880,60579117).improved_comparison(6741113).jsonl',
  '2023-09-10 16:30:58'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(595880,60579117).jsonl',
  '2023-09-09 21:42:49'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).improved_comparison(6526297).jsonl',
  '2023-07-18 11:46:52'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).improved_comparison(6595181).jsonl',
  '2023-07-18 00:18:14'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628071).jsonl',
  '2023-07-12 14:51:08'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578733,58628252).jsonl',
  '2023-07-12 14:14:39'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(578718,58626719).jsonl',
  '2023-07-12 12:55:34'),
 ('/home/pamessina/medvq

In [18]:
tmp2 = load_jsonl('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_fact_metadata(595880,60579117).improved_comparison(6741113).jsonl')

In [19]:
len(tmp2)

595880

In [31]:
tmp2[1009]

{'fact': 'infiltrated/extravasated contrast from contrast-enhanced CT in the right supraclavicular soft tissue',
 'metadata': {'anatomical location': 'right supraclavicular soft tissue',
  'detailed observation': 'infiltrated/extravasated contrast from contrast-enhanced CT',
  'short observation': 'infiltrated/extravasated contrast',
  'category': 'technical assessment',
  'health status': 'abnormal',
  'prev_study_comparison?': 'no',
  'comparison status': ''},
 'extraction_method': 'gpt-3.5-turbo-0613'}

In [38]:
import json

In [41]:
idx = 10001
print(tmp2[idx]['fact'])
print(json.dumps(tmp2[idx]['metadata'],indent=1))

bilateral effusions equivocally getting worse
{
 "anatomical location": "bilateral",
 "detailed observation": "effusions equivocally getting worse",
 "short observation": "effusions getting worse",
 "category": "anatomical finding",
 "health status": "abnormal",
 "prev_study_comparison?": "yes",
 "comparison status": "worsened"
}


In [21]:
Counter(x['extraction_method'] for x in tmp2)

Counter({'gpt-3.5-turbo-0613': 59921, 't5-small-finetuned': 535959})

# Observations per fact

In [43]:
list_files_with_prefix_and_timestamps(
    "integrated_chest_imagenome_observations",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_observations(9).pkl',
  '2023-09-13 22:09:44')]

In [45]:
tmp3 = load_pickle('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_observations(9).pkl')

In [47]:
tmp3.keys()

dict_keys(['label_names', 'groups'])

In [48]:
len(tmp3['groups'])

9

In [49]:
tmp3['groups'][0].keys()

dict_keys(['sentences', 'labels', 'extraction_method'])

In [53]:
', '.join(tmp3['label_names'])

'airspace opacity, atelectasis, bone lesion, bronchiectasis, calcified nodule, clavicle fracture, consolidation, costophrenic angle blunting, cyst/bullae, diaphragmatic eventration (benign), elevated hemidiaphragm, enlarged cardiac silhouette, enlarged hilum, hernia, hydropneumothorax, hyperaeration, increased reticular markings/ild pattern, infiltration, linear/patchy atelectasis, lobar/segmental collapse, lung lesion, lung opacity, mass/nodule (not otherwise specified), mediastinal displacement, mediastinal widening, multiple masses/nodules, pleural effusion, pleural/parenchymal scarring, pneumomediastinum, pneumothorax, pulmonary edema/hazy opacity, rib fracture, scoliosis, shoulder osteoarthritis, spinal degenerative changes, spinal fracture, sub-diaphragmatic air, subcutaneous air, superior mediastinal mass/enlargement, tortuous aorta, vascular calcification, vascular congestion, vascular redistribution, aortic graft/repair, cabg grafts, cardiac pacer and wires, prosthetic valve, 

In [110]:
# Inspect one sentence of the last group: print its extraction method, the
# sentence itself, and the label names whose entry in that sentence's label
# row is truthy.
g = tmp3['groups'][-1]
idx = 101
print(g['extraction_method'], '', g['sentences'][idx], '', sep='\n')
obs = [
    tmp3['label_names'][pos]
    for pos, flag in enumerate(g['labels'][idx])
    if flag
]
print(json.dumps(obs, indent=1))

gpt-3.5-turbo-0613

Overwhelming growth in the vicinity of the hilum of the right lung

[
 "enlarged hilum",
 "lung lesion"
]


In [51]:
# One entry per group: extraction method, then the shape of its label matrix,
# then a blank separator line.
for g in tmp3['groups']:
    print(g['extraction_method'], g['labels'].shape, '', sep='\n')

chest-imagenome
(556111, 74)

t5-small-finetuned
(2816982, 74)

gpt-4-0613
(5000, 74)

gpt-3.5-turbo-0613
(14859, 74)

gpt-3.5-turbo-0613
(19952, 74)

gpt-3.5-turbo-0613
(19959, 74)

gpt-3.5-turbo-0613
(9977, 74)

gpt-3.5-turbo-0613
(9974, 74)

gpt-3.5-turbo-0613
(9987, 74)



# Anatomical locations per fact

In [57]:
list_files_with_prefix_and_timestamps(
    "integrated_chest_im",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_observations(9).pkl',
  '2023-09-13 22:09:44'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_labels(9).pkl',
  '2023-09-13 05:23:48'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_anatomical_locations(5).pkl',
  '2023-09-11 19:38:10'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_labels(6).pkl',
  '2023-09-11 16:19:55')]

In [58]:
tmp4 = load_pickle('/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_chest_imagenome_anatomical_locations(5).pkl')

In [59]:
tmp4.keys()

dict_keys(['label_names', 'groups'])

In [61]:
', '.join(tmp4['label_names'])

'abdomen, aortic arch, cardiac silhouette, carina, cavoatrial junction, left apical zone, left arm, left breast, left chest wall, left clavicle, left costophrenic angle, left hemidiaphragm, left hilar structures, left lower lung zone, left lung, left mid lung zone, left shoulder, left upper lung zone, mediastinum, neck, right apical zone, right arm, right atrium, right breast, right chest wall, right clavicle, right costophrenic angle, right hemidiaphragm, right hilar structures, right lower lung zone, right lung, right mid lung zone, right shoulder, right upper lung zone, spine, svc, trachea, upper mediastinum'

In [115]:
# Inspect one sentence of the last group: print its extraction method, the
# sentence itself, and the anatomical-location label names whose entry in that
# sentence's label row is truthy.
g = tmp4['groups'][-1]
idx = 11
print(g['extraction_method'], '', g['sentences'][idx], '', sep='\n')
anat = [
    tmp4['label_names'][pos]
    for pos, flag in enumerate(g['labels'][idx])
    if flag
]
print(json.dumps(anat, indent=1))

gpt-3.5-turbo-0613

impacted foreign body within the upper esophagus

[
 "neck"
]


In [62]:
# One entry per group: extraction method, then the shape of its label matrix,
# then a blank separator line.
for g in tmp4['groups']:
    print(g['extraction_method'], g['labels'].shape, '', sep='\n')

chest-imagenome
(556111, 38)

t5-small-finetuned
(2598778, 38)

gpt-3.5-turbo-0613
(24068, 38)

gpt-3.5-turbo-0613
(23895, 38)

gpt-3.5-turbo-0613
(24437, 38)



# Paraphrases

In [84]:
list_files_with_prefix_and_timestamps(
    "",
    "/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/",
)

[('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_contradiction_examples_part1.jsonl',
  '2023-09-25 01:26:54'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_contradiction_examples_part2.jsonl',
  '2023-09-25 01:20:49'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_contradiction_examples_part3.jsonl',
  '2023-09-25 01:16:40'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_queries_from_clusters_improved_queries_part2.jsonl',
  '2023-09-24 20:04:11'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_queries_from_clusters_improved_queries_part1.jsonl',
  '2023-09-24 20:02:24'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_nli_queries_from_clusters_improved_queries_part3.jsonl',
  '2023-09-24 19:59:59'),
 ('/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_radnli_queries.jsonl',
  '2023-09-24 19:27:51'),
 ('/home/pamessina

In [95]:
# JSONL files under the mimiccxr/openai cache directory that are loaded together
# in the next cell. Grouping below is by file name only.
# NOTE(review): the cluster-balanced gpt-3.5 series lists 1of5..4of5 but no 5of5 —
# confirm whether that omission is intentional.
paraphrases_filepaths = [
# Paraphrased anatomical locations (parts 1-3)
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_anatomical_locations__part1.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_anatomical_locations__part2.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-16k-0613_paraphrased_anatomical_locations__part3.jsonl",
# Paraphrased observations: single words, then two-or-more words (parts 1-4)
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__single-words.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words__part1.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-16k-0613_paraphrased_observations__two-or-more-words__part2.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words__part3.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-16k-0613_paraphrased_observations__two-or-more-words__part4.jsonl",
# Paraphrased observations, cluster-balanced (gpt-4, then gpt-3.5 parts 1of5-4of5)
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-4-0613_paraphrased_observations__two-or-more-words_cluster-balanced.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words_cluster-balanced_1of5.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words_cluster-balanced_2of5.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words_cluster-balanced_3of5.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_paraphrased_observations__two-or-more-words_cluster-balanced_4of5.jsonl",
# Hard triplets from facts (parts 1-3)
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_hard_triplets_from_facts(cluster-balanced,hardest)-part1.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_hard_triplets_from_facts(cluster-balanced,hardest)-part2.jsonl",
"/home/pamessina/medvqa-workspace/cache/mimiccxr/openai/gpt-3.5-turbo-0613_hard_triplets_from_facts(cluster-balanced,hardest)-part3.jsonl",
]

In [96]:
paraphrases_list = [load_jsonl(x) for x in paraphrases_filepaths]

In [97]:
for x in paraphrases_list:
    print(len(x))

9891
9890
8511
1864
14993
14971
14972
14965
10000
9999
9997
9994
9996
19937
19937
19943


In [98]:
print(sum(len(x) for x in paraphrases_list))

199860


In [99]:
for x in paraphrases_list:
    print(x[-1])
    print('-' * 50)

{'metadata': {'anatomical location': 'adjacent right basilar'}, 'parsed_response': ['right adjacent basilar', 'adjacent basilar on the right side', 'right basilar region next to', 'right basilar area adjacent to', 'right basilar region in close proximity to', 'right basilar region neighboring', 'right basilar region bordering', 'right basilar region nearby', 'right basilar region alongside', 'right basilar region beside']}
--------------------------------------------------
{'metadata': {'anatomical location': 'perihilar right-sided lung areas'}, 'parsed_response': ['right perihilar lung regions', 'right perihilar lung zones', 'right perihilar lung fields', 'right perihilar lung segments', 'right perihilar lung lobes', 'right perihilar pulmonary areas', 'right perihilar pulmonary regions', 'right perihilar pulmonary zones', 'right perihilar pulmonary fields', 'right perihilar pulmonary segments', 'right perihilar pulmonary lobes', 'right peri-hilum lung areas', 'right peri-hilum lung re

In [102]:
# Total number of generated items across all loaded files.
# The record layout is decided per file from its first record:
#   - 'parsed_response' is a list            -> count its elements
#   - otherwise it is indexed by 'positives' -> count that entry's elements
count = 0
for x in paraphrases_list:
    if not x:
        # An empty file contributes nothing; x[0] below would raise IndexError.
        continue
    if isinstance(x[0]['parsed_response'], list):
        for y in x:
            # Every record of a list-layout file is expected to use the same layout.
            assert isinstance(y['parsed_response'], list)
            count += len(y['parsed_response'])
    else:
        for y in x:
            count += len(y['parsed_response']['positives'])
count

2093173

In [128]:
a = -4
b = 100
print(paraphrases_list[a][b]['metadata']['query'])
print()
print(json.dumps(paraphrases_list[a][b]['parsed_response'], indent=1))

ET tube removed

[
 "The endotracheal tube was taken out",
 "The ET tube was extracted",
 "The patient's endotracheal tube was removed",
 "The breathing tube was removed",
 "The tube used for intubation was taken out",
 "The patient's airway tube was removed",
 "The endotracheal tube was disconnected",
 "The ET tube was removed from the patient's airway",
 "The breathing tube was withdrawn",
 "The endotracheal tube was extracted from the patient's trachea",
 "The patient's intubation tube was removed",
 "The tube used for ventilation was taken out",
 "The endotracheal tube was removed from the patient's throat",
 "The ET tube was pulled out"
]


# MedNLI

In [9]:
from medvqa.datasets.nli import MEDNLI_DEV_JSONL_PATH, MEDNLI_TEST_JSONL_PATH, MEDNLI_TRAIN_JSONL_PATH
from medvqa.utils.files import load_jsonl
from collections import Counter

In [3]:
mednli_dev = load_jsonl(MEDNLI_DEV_JSONL_PATH)
mednli_test = load_jsonl(MEDNLI_TEST_JSONL_PATH)
mednli_train = load_jsonl(MEDNLI_TRAIN_JSONL_PATH)

In [5]:
len(mednli_dev), len(mednli_test), len(mednli_train)

(1395, 1422, 11232)

In [7]:
rows = mednli_dev + mednli_test + mednli_train

In [8]:
len(rows)

14049

In [10]:
rows[0]

{'sentence1': 'No history of blood clots or DVTs, has never had chest pain prior to one week ago.',
 'pairID': '1892c19a-66c7-11e7-be8f-f45c89b91419',
 'sentence1_parse': '(ROOT (S (NP (NP (DT No) (NN history)) (PP (IN of) (NP (NP (NN blood) (NNS clots)) (CC or) (NP (NNS DVTs))))) (, ,) (VP (VBZ has) (ADVP (RB never)) (VP (VBN had) (NP (NN chest) (NN pain)) (ADVP (RB prior) (S (VP (TO to) (VP (ADVP (NP (CD one) (NN week)) (RB ago)))))))) (. .)))',
 'sentence1_binary_parse': '( ( ( No history ) ( of ( ( ( blood clots ) or ) DVTs ) ) ) ( , ( ( ( has never ) ( ( had ( chest pain ) ) ( prior ( to ( ( one week ) ago ) ) ) ) ) . ) ) )',
 'sentence2': ' Patient has angina',
 'sentence2_parse': '(ROOT (S (NP (NN Patient)) (VP (VBZ has) (NP (NN angina)))))',
 'sentence2_binary_parse': '( Patient ( has angina ) )',
 'gold_label': 'entailment'}

In [11]:
Counter(x['gold_label'] for x in rows)

Counter({'entailment': 4683, 'contradiction': 4683, 'neutral': 4683})

# RadNLI

In [19]:
from medvqa.datasets.nli import RADNLI_DEV_JSONL_PATH
import json

In [13]:
radnli_dev = load_jsonl(RADNLI_DEV_JSONL_PATH)

In [21]:
x = radnli_dev[-1]
json.dumps({'P': x['sentence1'], 'H': x['sentence2'], 'L': x['gold_label']})

'{"P": "Lungs are well-expanded and clear without focal consolidation concerning for pneumonia.", "H": "Lungs are hyperinflated but clear.", "L": "contradiction"}'

In [None]:
# NOTE(review): this cell was never executed and `p` / `h` are not assigned in
# any visible cell, so it raised NameError on Run All. Binding them from the
# RadNLI example `x` used in the previous cell is an assumption — confirm intent.
p, h, l = x['sentence1'], x['sentence2'], x['gold_label']
{'P': p, 'H': h, 'L': l}

# Integrated NLI examples

In [22]:
integrated_nli = load_jsonl("/home/pamessina/medvqa-workspace/cache/mimiccxr/integrated_nli_examples(162036,21465751).jsonl")

In [24]:
Counter(x['source'] for x in integrated_nli)

Counter({'gpt-4-0613': 147509,
         'mednli_train': 11231,
         'mednli_dev': 1395,
         'mednli_test': 1421,
         'radnli_dev': 480})

In [25]:
len(integrated_nli)

162036

In [27]:
Counter(x['label'] for x in integrated_nli)

Counter({'entailment': 26442, 'neutral': 39817, 'contradiction': 95777})

# Post-processed reports

In [9]:
from medvqa.utils.files import load_json, load_pickle

In [1]:
bfi =  load_json("/mnt/data/pamessina/workspaces/medvqa-workspace/cache/mimiccxr/background_findings_and_impression_20230612_174143.json")

In [4]:
# Positions in `bfi` of reports whose 'findings' and 'impression' are both empty (falsy).
empty_idxs = [
    pos
    for pos, report in enumerate(bfi)
    if not (report['findings'] or report['impression'])
]

In [5]:
len(empty_idxs)

51

In [7]:
bfi[empty_idxs[0]]

{'background': 'history: Pancreatitis, pleural effusion followup.',
 'findings': '',
 'impression': '',
 'path': '/mnt/data/mimic-cxr/files/p10/p10060142/s52115374.txt'}

In [10]:
tmp = load_pickle("/mnt/workspace/pamessina/medvqa-workspace/cache/mimiccxr/integrated_report_annotations(hash=1005,3660907086036749586).pkl")

In [11]:
tmp.keys()

dict_keys(['reports', 'cluster_based_label_names', 'sentences', 'sentence_embeddings', 'sentence_cluster_ids', 'sentence_fact_idxs', 'facts', 'fact_embeddings', 'fact_cluster_ids', 'fact_metadata', 'anatomical_locations', 'anatomical_location_embeddings', 'anatomical_location_cluster_ids'])

In [19]:
tmp['reports'][empty_idxs[-1]]

{'path': 'mimic-cxr/files/p19/p19349312/s52035334.txt',
 'part_id': 'p19',
 'subject_id': 'p19349312',
 'study_id': 's52035334',
 'original_report': '                                 FINAL REPORT\n EXAMINATION:  CHEST (PA AND LAT)\n \n INDICATION:  ___ year old man s/p CABG  // eval for effusion      eval for\n effusion\n \n COMPARISON:  Chest radiographs ___ and ___.\n',
 'background': 'indication: year old man s/p CABG // eval for effusion eval for effusion.',
 'findings': '',
 'impression': '',
 'findings_sentence_idxs': [],
 'findings_fact_idxs': [],
 'impression_sentence_idxs': [],
 'impression_fact_idxs': [],
 'cluster_based_labels': []}

In [37]:
# Positions in tmp['reports'] of reports whose 'impression' is empty (falsy).
no_impression_idxs = [
    pos
    for pos, report in enumerate(tmp['reports'])
    if not report['impression']
]

In [20]:
# Positions in tmp['reports'] of reports with no cluster-based labels (empty/falsy).
empty_v2 = [
    pos
    for pos, report in enumerate(tmp['reports'])
    if not report['cluster_based_labels']
]

In [21]:
len(empty_v2)

62

In [26]:
delta = list(set(empty_v2) - set(empty_idxs))

In [28]:
len(delta)

11

In [40]:
tmp['reports'][no_impression_idxs[2]]

{'path': 'mimic-cxr/files/p10/p10083833/s51949859.txt',
 'part_id': 'p10',
 'subject_id': 'p10083833',
 'study_id': 's51949859',
 'original_report': '                                 FINAL REPORT\n CHEST TWO VIEWS ON ___\n \n HISTORY:  Pulmonary fibrosis, check lead placement.\n \n FINDINGS:  There is a new dual-lead pacemaker with leads projecting over the\n expected location of the heart.  The diffuse increased interstitial markings,\n lower lobe greater than upper lobe are similar compared to the study from the\n prior week.  There is no new infiltrate.\n',
 'background': 'history: Pulmonary fibrosis, check lead placement.',
 'findings': 'There is a new dual-lead pacemaker with leads projecting over the expected location of the heart. The diffuse increased interstitial markings, lower lobe greater than upper lobe are similar compared to the study from the prior week. There is no new infiltrate.',
 'impression': '',
 'findings_sentence_idxs': [101, 102, 103],
 'findings_fact_idxs': [