In [1]:
import pandas as pd
import re

In [2]:
from pathlib import Path
import json
import pickle
from typing import NamedTuple
# Root directory under which the JPG images are looked up; report-relative
# sub-paths are appended to it when the image/text pairs are built.
image_root = Path('/nfs/datasets/google_time/mimic/images/')
# NOTE: despite the plural name this is a single JSON file. It is loaded into a
# mapping whose keys are report paths and whose values are iterables of texts.
text_paths = Path('/nfs/datasets/google_time/mimic/data/parsed_texts.json')

class ImageText(NamedTuple):
    """A single (image, text) pairing: one image file and one text snippet."""
    # Path of the image file on disk.
    image_path: Path
    # One text snippet paired with that image.
    text: str

# Read the parsed-report JSON into memory: report path -> texts of that report.
with text_paths.open(mode='r') as f:
    text_paths_to_texts = json.loads(f.read())

# Build the flat list of (image, text) pairs: every JPG found for a report is
# paired with every text snippet of that report.
image_text_pairs = []
for text_path, texts in text_paths_to_texts.items():
    # The images of a report are looked up in a directory named like the
    # report path with its ".txt" removed.
    image_dir = image_root / str(text_path).replace(".txt", "")
    if not image_dir.is_dir():
        continue  # nothing to pair for this report
    # glob() yields entries in arbitrary filesystem order; sorting makes the
    # pair order (and the row order of the CSVs written later) reproducible.
    for image_path in sorted(image_dir.glob("*jpg")):
        for text in texts:
            image_text_pairs.append(ImageText(image_path=image_path, text=text))

In [3]:
# Folder-based split: pairs whose image path contains '/p13/' are held out as
# test data, every other pair goes to the training set.
train_data, test_data = [], []
for sample in image_text_pairs:
    held_out = '/p13/' in str(sample.image_path)
    (test_data if held_out else train_data).append(sample)

In [4]:
# Label column names, in the order used for the caption prompts below.
labels_list = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]

# One caption per label, index-aligned with labels_list; 'No Finding' gets the
# dedicated "healthy" wording instead of the generic template.
captions_list = [
    f'the patient has {label}' if label != 'No Finding' else 'the patient is healthy'
    for label in labels_list
]

# Ground-truth label table; it is joined onto the image/text pairs via its
# 'study_id' column in data_to_df.
# NOTE(review): presumably the CheXpert-labeler output for MIMIC-CXR with one
# row per study — confirm there are no duplicate study_id rows before merging.
gt_df = pd.read_csv('/nfs/datasets/google_time/mimic/data/mimic-cxr-2.0.0-chexpert.csv')

In [12]:
def _study_id_from_path(image_path):
    """Return the integer study id embedded in an image path.

    The id is the 8-digit number starting with 5 that directly follows an
    ``s`` in the path (``.../s55566145/...`` -> ``55566145``). The first such
    occurrence is used.

    Raises:
        ValueError: if the path contains no such component.
    """
    match = re.search('s(5[0-9]{7})', str(image_path))
    if match is None:
        raise ValueError(f'no study id of the form s5####### in image path: {image_path}')
    return int(match.group(1))


def data_to_df(data, gt=None, labels=None):
    """Turn image/text pairs into a DataFrame with binarised label columns.

    Args:
        data: iterable of objects exposing ``image_path`` and ``text``
            attributes (e.g. ``ImageText`` tuples).
        gt: label table with a ``study_id`` column plus one column per label.
            Defaults to the notebook-level ``gt_df``.
        labels: names of the label columns to binarise. Defaults to the
            notebook-level ``labels_list``.

    Returns:
        A DataFrame with ``image_path``, ``text`` and ``study_id`` followed by
        the remaining columns of ``gt``. Pairs whose study id does not occur in
        ``gt`` are dropped (inner join). Every label column holds 1 where the
        source value equals 1 and 0 otherwise (so NaN, 0 and -1 all become 0).

    Raises:
        ValueError: if an image path contains no study id.
        KeyError: if a requested label column is missing after the merge.
    """
    if gt is None:
        gt = gt_df
    if labels is None:
        labels = labels_list

    df = pd.DataFrame({
        'image_path': [str(d.image_path) for d in data],
        'text': [d.text for d in data],
        'study_id': [_study_id_from_path(d.image_path) for d in data],
    })
    # An empty list would give an object-dtype key column; force int64 so the
    # merge key matches the integer study ids of the label table.
    df = df.astype({'study_id': 'int64'})

    df = df.merge(gt, on='study_id')
    for label in labels:
        # Vectorised comparison instead of a per-element Python lambda.
        df[label] = df[label].eq(1).astype(int)
    return df

In [13]:
# Training frame: the pairs outside the '/p13/' folder, joined with their
# binarised ground-truth labels.
train_df = data_to_df(train_data)

In [15]:
# Display the training frame (the rendered output shows only its first and last rows).
train_df

Unnamed: 0,image_path,text,study_id,subject_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,/nfs/datasets/google_time/mimic/images/p10/p10...,In comparison with the study of there has been...,55566145,10218444,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,/nfs/datasets/google_time/mimic/images/p10/p10...,No new areas of pneumonia or vascular congestion.,55566145,10218444,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,/nfs/datasets/google_time/mimic/images/p10/p10...,In comparison with the study of there has been...,55566145,10218444,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,/nfs/datasets/google_time/mimic/images/p10/p10...,No new areas of pneumonia or vascular congestion.,55566145,10218444,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,/nfs/datasets/google_time/mimic/images/p10/p10...,PA and lateral views of the chest provided,59439105,10218444,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681158,/nfs/datasets/google_time/mimic/images/p11/p11...,There is no pleural effusion or pneumothorax,56262028,11560506,0,0,0,0,0,0,0,0,0,0,0,0,0,0
681159,/nfs/datasets/google_time/mimic/images/p11/p11...,No evidence of rib fracture,56262028,11560506,0,0,0,0,0,0,0,0,0,0,0,0,0,0
681160,/nfs/datasets/google_time/mimic/images/p11/p11...,No evidence of acute cardiopulmonary process,56262028,11560506,0,0,0,0,0,0,0,0,0,0,0,0,0,0
681161,/nfs/datasets/google_time/mimic/images/p11/p11...,Although no rib fracture is identifiedthis stu...,56262028,11560506,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# Same conversion for the held-out '/p13/' pairs, followed by a display of the result.
test_df = data_to_df(test_data)
test_df

Unnamed: 0,image_path,text,study_id,subject_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,/nfs/datasets/google_time/mimic/images/p13/p13...,Two PA and one lateral radiographs of the ches...,57882808,13367279,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,/nfs/datasets/google_time/mimic/images/p13/p13...,The lungs are well expanded and clear,57882808,13367279,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,/nfs/datasets/google_time/mimic/images/p13/p13...,There is no focal consolidationeffusionor pneu...,57882808,13367279,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,/nfs/datasets/google_time/mimic/images/p13/p13...,Cardiac contours are normal,57882808,13367279,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,/nfs/datasets/google_time/mimic/images/p13/p13...,No acute cardiopulmonary process.,57882808,13367279,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154312,/nfs/datasets/google_time/mimic/images/p13/p13...,The cardiomediastinal and hilar contours are w...,51857500,13283994,0,0,0,0,0,0,0,0,1,0,0,0,0,0
154313,/nfs/datasets/google_time/mimic/images/p13/p13...,The lungs are mildly hyperinflated but clear,51857500,13283994,0,0,0,0,0,0,0,0,1,0,0,0,0,0
154314,/nfs/datasets/google_time/mimic/images/p13/p13...,There is no evidence of pulmonary vascular con...,51857500,13283994,0,0,0,0,0,0,0,0,1,0,0,0,0,0
154315,/nfs/datasets/google_time/mimic/images/p13/p13...,There is no consolidationeffusion or pneumothorax,51857500,13283994,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [17]:
# Persist the training frame. With to_csv's default index=True the row index is
# written as an unnamed first column, so read it back with index_col=0.
train_df.to_csv('/nfs/datasets/google_time/mimic/data/train_df_clip.csv')

In [18]:
# Persist the test frame. With to_csv's default index=True the row index is
# written as an unnamed first column, so read it back with index_col=0.
test_df.to_csv('/nfs/datasets/google_time/mimic/data/test_df_clip.csv')