In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.vision import *
from pathlib import Path

In [3]:
# Load in data
# All three label/metadata tables live in the same dataset directory, so the
# root is spelled once and the individual files are derived from it.
data_dir = Path('../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org')
path_chexpert = data_dir / 'mimic-cxr-2.0.0-chexpert.csv.gz'
path_negbio = data_dir / 'mimic-cxr-2.0.0-negbio.csv.gz'
path_metadata = data_dir / 'mimic-cxr-2.0.0-metadata.csv.gz'

df_chexpert = pd.read_csv(path_chexpert)
df_negbio = pd.read_csv(path_negbio)
df_metadata = pd.read_csv(path_metadata)

# Few-shot configuration used by the split cell further down:
# k_shot is the number of training samples drawn for each novel class,
# novel_labels are the classes treated as novel (all others are base classes).
k_shot = 50
novel_labels = ['Lung Lesion', 'Enlarged Cardiomediastinum', 'Pleural Effusion']

In [4]:
# Merge relevant metadata, NegBio labels and Chexpert labels
merge_keys = ['subject_id', 'study_id']

# One row per study: NegBio labels keep their column names, the CheXpert
# labels for the same study get a '_cx' suffix.
df_study_labels = df_negbio.merge(
    df_chexpert,
    how='left',
    on=merge_keys,
    suffixes=('', '_cx')
)

# Metadata columns that are not needed downstream.
unused_metadata_cols = [
    'PerformedProcedureStepDescription',
    'Rows',
    'Columns',
    'StudyDate',
    'StudyTime',
    'ProcedureCodeSequence_CodeMeaning',
    'ViewCodeSequence_CodeMeaning',
    'PatientOrientationCodeSequence_CodeMeaning'
]

# Drop without inplace=True: df_metadata was loaded in another cell, and
# mutating it here made this cell raise a KeyError when run a second time.
df = df_metadata.drop(columns=unused_metadata_cols).merge(
    df_study_labels,
    how='left',
    on=merge_keys,
)
df.head()

Unnamed: 0,dicom_id,subject_id,study_id,ViewPosition,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,...,Enlarged Cardiomediastinum_cx,Fracture_cx,Lung Lesion_cx,Lung Opacity_cx,No Finding_cx,Pleural Effusion_cx,Pleural Other_cx,Pneumonia_cx,Pneumothorax_cx,Support Devices_cx
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,PA,,,,,,,...,,,,,1.0,,,,,
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,LATERAL,,,,,,,...,,,,,1.0,,,,,
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,PA,,,,,,,...,,,,,1.0,,,,,
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,LATERAL,,,,,,,...,,,,,1.0,,,,,
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,AP,,,,,,,...,,,,,1.0,,,,,


In [5]:
# Preprocess data:
# Only use data that is a '1.0'
# Remove all disagreeing '1.0' data
# Remove all Pleural Other findings
# Remove all non antero-posterior (AP) data
#
# NOTE: this cell rebinds `df` (and drops the '_cx' columns it reads), so the
# merge cell has to be re-run before this cell can be run again.
meta_cols = ('dicom_id', 'subject_id', 'study_id', 'ViewPosition')
label_cols = [key for key in df.columns if key not in meta_cols and not key.endswith('_cx')]

# Build ONE row mask instead of filtering `df` inside the loop: assigning
# columns on an already-filtered slice raises SettingWithCopyWarning, and the
# row-wise conditions are independent of each other anyway.
keep = df['ViewPosition'] == 'AP'  # Remove all non antero-posterior (AP) data
for key in label_cols:
    # A row agrees on `key` when both labelers say 1.0 or both say "not 1.0"
    keep &= (df[key] == 1) == (df[key + '_cx'] == 1)

# Remove all Pleural Other Data (both labelers agree at this point)
keep &= df['Pleural Other'] != 1

# Remove Columns: CheXpert duplicates, the view position and Pleural Other
out_cols = [key for key in df.columns
            if not key.endswith('_cx') and key not in ('ViewPosition', 'Pleural Other')]
df = df.loc[keep, out_cols].copy()

# Remove data that is not a '1.0': a positive becomes the class name itself,
# everything else becomes NaN (the next cell joins the non-null names).
for key in label_cols:
    if key in out_cols:
        df[key] = df[key].map({1: key})

df.head()


Unnamed: 0,dicom_id,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pneumonia,Pneumothorax,Support Devices
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,,,,,,,,,No Finding,,,,
5,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,,,,,,,,,No Finding,,,,
6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,,,,,,,,,No Finding,,,,
7,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,10000764,57375967,,,Consolidation,,,,,,,,,,
15,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738,10000935,50578979,,,,,,,,,,Pleural Effusion,Pneumonia,,


In [6]:
# Separate columns into path and labels
cols_path = [key for key in df.columns if key in ('dicom_id', 'subject_id', 'study_id')]
cols_labels = [key for key in df.columns if key not in ('dicom_id', 'subject_id', 'study_id')]

# Combine columns into a file path and labels.
# The path components are addressed by column NAME; the previous positional
# x.values[i] lookup silently depended on the column order of `df`.
# Layout: p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
subject = df['subject_id'].astype(str)
study = df['study_id'].astype(str)
df_labels = pd.DataFrame({
    'file_path': 'p' + subject.str[:2] + '/p' + subject + '/s' + study + '/' + df['dicom_id'] + '.jpg',
    # Each label column holds either its own class name or NaN, so joining the
    # non-null entries yields a comma-separated list of positive classes.
    'labels': df[cols_labels].apply(lambda row: ','.join(row.dropna()), axis=1),
})

# Remove all data that does not have a label
df_labels = df_labels[df_labels['labels'] != '']
df_labels

Unnamed: 0,file_path,labels
4,p10/p10000032/s53911762/68b5c4b1-227d0485-9cc3...,No Finding
5,p10/p10000032/s53911762/fffabebf-74fd3a1f-673b...,No Finding
6,p10/p10000032/s56699142/ea030e7a-2e3b1346-bc51...,No Finding
7,p10/p10000764/s57375967/096052b7-d256dc40-453a...,Consolidation
15,p10/p10000935/s50578979/d0b71acc-b5a62046-bbb5...,"Pleural Effusion,Pneumonia"
...,...,...
377102,p19/p19999442/s58497551/ee9155f3-944c056b-c76c...,"Atelectasis,Lung Opacity"
377103,p19/p19999442/s58708861/16b6c70f-6d36bd77-89d2...,"No Finding,Support Devices"
377107,p19/p19999987/s55368167/58766883-376a15ce-3b32...,Atelectasis
377108,p19/p19999987/s58621812/7ba273af-3d290f8d-e28d...,"Atelectasis,Support Devices"


In [7]:
# Remove all multi label data: a comma in the joined label string means that
# more than one class was positive for the image.
keep = df_labels['labels'].map(lambda joined: ',' not in joined)
df_single_labels = df_labels[keep]

# Get the resulting samples per class to aid in deciding the size of the sets
dict_count = {}
for label in df_single_labels['labels']:
    dict_count[label] = dict_count.get(label, 0) + 1

dict_count

{'Atelectasis': 3240,
 'Cardiomegaly': 3562,
 'Consolidation': 485,
 'Edema': 2407,
 'Enlarged Cardiomediastinum': 428,
 'Fracture': 410,
 'Lung Lesion': 462,
 'Lung Opacity': 5409,
 'No Finding': 21110,
 'Pleural Effusion': 2242,
 'Pneumonia': 952,
 'Pneumothorax': 1029,
 'Support Devices': 2325}

In [8]:
df_splits = df_single_labels.copy()

# Rows that are never sampled into a split keep the empty designation ''.
df_splits['split'] = ''

# Create splits: 80% Training and 20% Validation per base class
#                k_shot training and 300 validation samples per novel class
for label in cols_labels:
    df_unsplit = df_splits[df_splits['labels'] == label]

    # Base Classes
    if label not in novel_labels:
        df_train = df_unsplit.sample(frac=0.8, random_state=1)
        validate_index = df_unsplit.index.difference(df_train.index)

        # Write the designation straight into the main dataframe by index label
        df_splits.loc[df_train.index, 'split'] = 'base_train'
        df_splits.loc[validate_index, 'split'] = 'base_validate'

    # Novel Classes
    else:
        # Raises ValueError if the class has fewer than k_shot + 300 samples
        df_sampled = df_unsplit.sample(n=k_shot + 300, random_state=1)

        # The first k_shot sampled rows train, the remaining 300 validate.
        # Assigning through .loc replaces the chained
        # `df_unsplit['split'][:k_shot] = ...`, which warns on older pandas and
        # does not modify the frame at all under Copy-on-Write.
        df_splits.loc[df_sampled.index[:k_shot], 'split'] = 'novel_train'
        df_splits.loc[df_sampled.index[k_shot:], 'split'] = 'novel_validate'


In [9]:
# Show Number of Samples for Debugging
# Tally the rows per class, separately for the training designations
# (base_train / novel_train) and the validation designations
# (base_validate / novel_validate). Rows with any other designation are skipped.
train_designations = ('base_train', 'novel_train')
validate_designations = ('base_validate', 'novel_validate')

dict_train = {}
dict_validate = {}
for designation, class_name in zip(df_splits['split'], df_splits['labels']):
    if designation in train_designations:
        dict_train[class_name] = dict_train.get(class_name, 0) + 1
    elif designation in validate_designations:
        dict_validate[class_name] = dict_validate.get(class_name, 0) + 1

In [10]:
# Per-class number of rows designated 'base_train' or 'novel_train'
dict_train

{'Atelectasis': 2592,
 'Cardiomegaly': 2850,
 'Consolidation': 388,
 'Edema': 1926,
 'Enlarged Cardiomediastinum': 50,
 'Fracture': 328,
 'Lung Lesion': 50,
 'Lung Opacity': 4327,
 'No Finding': 16888,
 'Pleural Effusion': 50,
 'Pneumonia': 762,
 'Pneumothorax': 823,
 'Support Devices': 1860}

In [11]:
# Per-class number of rows designated 'base_validate' or 'novel_validate'
dict_validate

{'Atelectasis': 648,
 'Cardiomegaly': 712,
 'Consolidation': 97,
 'Edema': 481,
 'Enlarged Cardiomediastinum': 300,
 'Fracture': 82,
 'Lung Lesion': 300,
 'Lung Opacity': 1082,
 'No Finding': 4222,
 'Pleural Effusion': 300,
 'Pneumonia': 190,
 'Pneumothorax': 206,
 'Support Devices': 465}

In [6]:
import os
import pandas as pd
from pathlib import Path

def create_splits(k_shot, path_splits, data_dir=None, n_novel_validate=300, n_no_finding=5000):
    """
    Create training and validation splits for the MIMIC-CXR-JPG Database. This function also performs the following:
        Keeps only affirmative ('1.0') data,
        Merges the two sets of structured labels (NegBio and CheXpert)
        Removes samples on which the two labelers disagree and multi-class samples
        Removes the Pleural Other and Support Devices class
        Keeps only Antero-posterior (AP) oriented samples
        Undersamples the No Finding class to n_no_finding samples
        Exports the splits into the csv file '<path_splits>/<k_shot>_shot.csv'
        with the columns file_path, labels and split

    Base classes are split 80% / 20% into 'base_train' / 'base_validate'.
    Novel classes get k_shot 'novel_train' and n_novel_validate 'novel_validate'
    samples. Rows that are not sampled into any split keep an empty designation.

    NOTE(review): samples are drawn per image row, so images of the same
    subject or study can presumably end up on both sides of a split - confirm
    whether patient-level separation is required.

    Input:
            k_shot: The number of training samples per class for the novel classes
            path_splits: Directory the csv file is written to (created if missing)
            data_dir: Directory holding the three mimic-cxr-2.0.0-*.csv.gz tables;
                      defaults to the original scratch location
            n_novel_validate: The number of validation samples per novel class
            n_no_finding: Maximum number of 'No Finding' samples that are kept
    Output: None (the splits are written to disk)
    Raises: ValueError if a novel class has fewer than k_shot + n_novel_validate samples
    """
    novel_labels = ['Lung Lesion', 'Enlarged Cardiomediastinum', 'Pleural Effusion']
    excluded_labels = ['Pleural Other', 'Support Devices']
    id_cols = ['dicom_id', 'subject_id', 'study_id']
    merge_keys = ['subject_id', 'study_id']

    # Load in data
    if data_dir is None:
        data_dir = '../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org'
    data_dir = Path(data_dir)

    df_chexpert = pd.read_csv(data_dir / 'mimic-cxr-2.0.0-chexpert.csv.gz')
    df_negbio = pd.read_csv(data_dir / 'mimic-cxr-2.0.0-negbio.csv.gz')
    # Only the identifiers and the view position are used from the metadata;
    # reading just those keeps stray metadata columns from being mistaken for labels.
    df_metadata = pd.read_csv(data_dir / 'mimic-cxr-2.0.0-metadata.csv.gz',
                              usecols=id_cols + ['ViewPosition'])

    # Merge relevant metadata, NegBio labels and Chexpert labels
    # (CheXpert label columns carry the '_cx' suffix)
    df = df_metadata.merge(
        df_negbio.merge(df_chexpert, how='left', on=merge_keys, suffixes=('', '_cx')),
        how='left',
        on=merge_keys,
    )

    label_cols = [key for key in df.columns
                  if key not in id_cols and key != 'ViewPosition' and not key.endswith('_cx')]

    # Only use data that is a '1.0': boolean "is positive" tables per labeler
    positive = df[label_cols] == 1
    positive_cx = df[[key + '_cx' for key in label_cols]] == 1
    positive_cx.columns = label_cols

    # Remove all disagreeing '1.0' data and all non antero-posterior (AP) data
    keep = (positive == positive_cx).all(axis=1) & (df['ViewPosition'] == 'AP')

    # Remove all Pleural Other and Support Devices data
    for label in excluded_labels:
        keep &= ~positive[label]

    cols_labels = [key for key in label_cols if key not in excluded_labels]
    positive = positive.loc[keep, cols_labels]

    # Remove all data without a label and all multi label data
    positive = positive[positive.sum(axis=1) == 1]
    df = df.loc[positive.index]

    # Combine columns into a file path and labels
    # Layout: p<first two characters of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
    subject = df['subject_id'].astype(str)
    study = df['study_id'].astype(str)
    df_splits = pd.DataFrame(index=positive.index)
    df_splits['file_path'] = 'p' + subject.str[:2] + '/p' + subject + '/s' + study + '/' + df['dicom_id'] + '.jpg'
    # Every remaining row has exactly one positive column; argmax finds it
    df_splits['labels'] = positive.columns.take(positive.to_numpy().argmax(axis=1)).to_numpy()
    df_splits['split'] = ''

    # Create splits: 80% Training and 20% Validation per base class
    #                k_shot training and n_novel_validate validation samples per novel class
    for label in cols_labels:
        df_unsplit = df_splits[df_splits['labels'] == label]

        # Base Classes
        if label not in novel_labels:

            # Undersample the 'No Finding' Class (skipped when the class is
            # already smaller than the cap instead of raising)
            if label == 'No Finding' and len(df_unsplit) >= n_no_finding:
                df_unsplit = df_unsplit.sample(n_no_finding, random_state=1)

            train_index = df_unsplit.sample(frac=0.8, random_state=1).index
            validate_index = df_unsplit.index.difference(train_index)

            df_splits.loc[train_index, 'split'] = 'base_train'
            df_splits.loc[validate_index, 'split'] = 'base_validate'

        # Novel Classes
        else:
            n_needed = k_shot + n_novel_validate
            if len(df_unsplit) < n_needed:
                raise ValueError(
                    f"Novel class '{label}' has {len(df_unsplit)} samples, "
                    f"but {n_needed} are required ({k_shot} train + {n_novel_validate} validate)")

            sampled_index = df_unsplit.sample(n=n_needed, random_state=1).index

            # .loc assignment instead of chained `df['split'][:k_shot] = ...`,
            # which is a no-op under pandas Copy-on-Write
            df_splits.loc[sampled_index[:k_shot], 'split'] = 'novel_train'
            df_splits.loc[sampled_index[k_shot:], 'split'] = 'novel_validate'

    os.makedirs(path_splits, exist_ok=True)
    df_splits.to_csv(os.path.join(path_splits, f'{k_shot}_shot.csv'), index=False)


def check_splits(df_csv):
    """
    Sums up the number of training and validation samples per class

    Rows designated 'base_train' or 'novel_train' count as training samples,
    rows designated 'base_validate' or 'novel_validate' count as validation
    samples. Rows with any other (or a missing) designation are ignored.

    Input:
            df_csv: A dataframe containing training validation split data,
                    with a 'labels' column (class name) and a 'split' column
    Output: An array containing two dictionaries stating the amount of training and validation samples
    """
    from collections import Counter

    designation_groups = (
        ['base_train', 'novel_train'],
        ['base_validate', 'novel_validate'],
    )
    # One boolean mask per group replaces the row-by-row iterrows() loop;
    # Counter keeps the classes in order of first appearance.
    return [
        dict(Counter(df_csv.loc[df_csv['split'].isin(group), 'labels']))
        for group in designation_groups
    ]

if __name__ == '__main__':
    # Write one '<k_shot>_shot.csv' split file per shot setting
    splits_dir = '../splits'
    shot_list = [20, 10, 5, 3, 1]
    for k_shot in shot_list:
        create_splits(k_shot, splits_dir)

In [7]:
# create_splits requires the output directory as its second argument;
# '../splits' is the directory the other cells write to and read from.
create_splits(20, '../splits')

In [3]:
import os
import pandas as pd
from pathlib import Path


def check_splits(df_csv):
    """
    Sums up the number of training and validation samples per class

    Rows designated 'base_train' or 'novel_train' count as training samples,
    rows designated 'base_validate' or 'novel_validate' count as validation
    samples. Rows with any other (or a missing) designation are ignored.

    Input:
            df_csv: A dataframe containing training validation split data,
                    with a 'labels' column (class name) and a 'split' column
    Output: An array containing two dictionaries stating the amount of training and validation samples
    """
    from collections import Counter

    designation_groups = (
        ['base_train', 'novel_train'],
        ['base_validate', 'novel_validate'],
    )
    # One boolean mask per group replaces the row-by-row iterrows() loop;
    # Counter keeps the classes in order of first appearance.
    return [
        dict(Counter(df_csv.loc[df_csv['split'].isin(group), 'labels']))
        for group in designation_groups
    ]

# Print the per-class training / validation counts of every exported split file
for k_shot in [20, 10, 5, 3, 1]:
    split_csv = f'../splits/{k_shot}_shot.csv'
    df = pd.read_csv(split_csv)
    dict_train, dict_validate = check_splits(df)
    print(k_shot, dict_train, dict_validate, sep='\n')

20
{'Consolidation': 388, 'Lung Opacity': 4327, 'No Finding': 4000, 'Edema': 1926, 'Pneumonia': 762, 'Cardiomegaly': 2850, 'Pneumothorax': 823, 'Atelectasis': 2592, 'Support Devices': 1860, 'Lung Lesion': 20, 'Fracture': 328, 'Enlarged Cardiomediastinum': 20, 'Pleural Effusion': 20}
{'Lung Opacity': 1082, 'No Finding': 1000, 'Enlarged Cardiomediastinum': 300, 'Pleural Effusion': 300, 'Consolidation': 97, 'Edema': 481, 'Support Devices': 465, 'Lung Lesion': 300, 'Cardiomegaly': 712, 'Pneumonia': 190, 'Atelectasis': 648, 'Fracture': 82, 'Pneumothorax': 206}
10
{'Consolidation': 388, 'Lung Opacity': 4327, 'No Finding': 4000, 'Edema': 1926, 'Pneumonia': 762, 'Cardiomegaly': 2850, 'Pneumothorax': 823, 'Atelectasis': 2592, 'Support Devices': 1860, 'Fracture': 328, 'Enlarged Cardiomediastinum': 10, 'Pleural Effusion': 10, 'Lung Lesion': 10}
{'Lung Opacity': 1082, 'No Finding': 1000, 'Enlarged Cardiomediastinum': 300, 'Pleural Effusion': 300, 'Consolidation': 97, 'Edema': 481, 'Support Devices

In [1]:
# Scratch cell - presumably a quick check that the kernel runs; TODO confirm and remove
print(1, 2, 3)

1 2 3
