In [1]:
# Load (or reload) IPython's autoreload extension and select mode 2, which
# re-imports modules automatically before executing each cell, so edits to
# imported source files take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai import *
from fastai.vision import *
from pathlib import Path
import os

In [3]:
# Directory that holds the MIMIC-CXR-JPG 2.0.0 label files. Both label tables
# live in the same directory, so it is defined once here; edit this single line
# to point the notebook at a different copy of the dataset.
mimic_cxr_dir = Path('../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org')

# Gzip-compressed CSV label tables (pandas infers the compression from '.gz').
path_chexpert = mimic_cxr_dir / 'mimic-cxr-2.0.0-chexpert.csv.gz'
path_negbio = mimic_cxr_dir / 'mimic-cxr-2.0.0-negbio.csv.gz'

df_chexpert = pd.read_csv(path_chexpert)
df_negbio = pd.read_csv(path_negbio)

# Preview the NegBio table.
df_negbio.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [4]:
# Adapted from https://github.com/MIT-LCP/mimic-cxr
# Left-join the CheXpert table onto the NegBio table, one row per
# (subject_id, study_id). NegBio columns keep their names; the CheXpert
# columns that collide with them receive a '_cx' suffix.
merge_keys = ['subject_id', 'study_id']
df = pd.merge(
    df_negbio,
    df_chexpert,
    on=merge_keys,
    how='left',
    suffixes=('', '_cx'),
)
df.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,Enlarged Cardiomediastinum_cx,Fracture_cx,Lung Lesion_cx,Lung Opacity_cx,No Finding_cx,Pleural Effusion_cx,Pleural Other_cx,Pneumonia_cx,Pneumothorax_cx,Support Devices_cx
0,10000032,50414267,,,,,,,,,...,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,...,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,...,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,...,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,...,,,,,,,,-1.0,,


In [5]:
# Preprocess the merged NegBio/CheXpert label table (`df` from the merge cell):
#   1. Only a value of 1.0 counts as a positive label; every other value
#      (including NaN) is treated as "not positive" and becomes NaN.
#   2. Keep only the rows where both labelers agree on positivity for
#      every label column.
#   3. Remove every row with a positive 'Pleural Other' finding.
#   4. Drop the CheXpert ('_cx') columns and the 'Pleural Other' column.
# Original Length was 227827, New Length: 209253
#
# The row filter is built once as a single boolean mask and applied with one
# `.loc[...].copy()`. This avoids repeatedly assigning columns into a filtered
# slice of the frame (which triggers SettingWithCopyWarning) and avoids
# copying the whole frame once per label column.
id_cols = ('subject_id', 'study_id')
cx_suffix = '_cx'
excluded_label = 'Pleural Other'

# NegBio label columns: everything that is neither a key nor a CheXpert column.
label_cols = [
    col for col in df.columns
    if col not in id_cols and not col.endswith(cx_suffix)
]
cx_cols = [col + cx_suffix for col in label_cols]

# This cell consumes the '_cx' columns, so it cannot be run twice in a row.
# Fail with an actionable message instead of an opaque pandas KeyError.
missing_cols = [col for col in cx_cols + [excluded_label] if col not in df.columns]
if missing_cols:
    raise KeyError(
        'Missing columns {}; re-run the merge cell before this cell.'.format(missing_cols)
    )

# Positivity per labeler as plain boolean arrays (NaN and non-1 values -> False).
negbio_positive = df[label_cols].eq(1).values
chexpert_positive = df[cx_cols].eq(1).values

# A row is kept only if the two labelers agree on every label ...
labelers_agree = (negbio_positive == chexpert_positive).all(axis=1)
# ... and it has no positive 'Pleural Other' finding.
no_pleural_other = df[excluded_label].ne(1).values
keep_rows = labelers_agree & no_pleural_other

# Surviving columns, in their original order.
keep_cols = [
    col for col in df.columns
    if not col.endswith(cx_suffix) and col != excluded_label
]

# Explicit copy so the column assignments below write to an independent frame.
df = df.loc[keep_rows, keep_cols].copy()

# Remove data that is not a '1.0' from the remaining label columns.
for col in label_cols:
    if col != excluded_label:
        df[col] = df[col].where(df[col].eq(1))

df.head()


Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,
4,10000764,57375967,,,1.0,,,,,,,,,,


In [6]:
# Number of rows (studies) left in `df` at this point.
df.shape[0]

209253