# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%reload_ext autoreload

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
import time
from itertools import islice
from dataclasses import dataclass
import torchvision
from torchvision.models import densenet161, DenseNet161_Weights, vit_b_16, ViT_B_16_Weights, densenet121, DenseNet121_Weights
import os
import sys
from pathlib import Path
from torchinfo import summary

In [4]:
# Widen pandas' display limits so the result tables below are rendered without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [5]:
from CheXpert.race_prediction.dataset import CheXpertRaceDataset
from CheXpert.disease_prediction.dataset import CheXpertDiseaseDataset
from shared_utils import vprint, to_gpu, add_mean_to_list
import shared_utils
from CheXpert.disease_prediction.utils import Configs as disease_configs
from CheXpert.race_prediction.utils import Configs as race_configs
from MIMIC_CXR.dataset import CXRDataset
from MIMIC_CXR.utils import Mode as cxr_mode, Configs as cxr_configs

# Configs

In [6]:
@dataclass
class Configs:
    """Notebook-wide constants: data locations, checkpoint directories, label schemas and run settings.

    None of the attributes below carry a type annotation, so ``@dataclass`` registers no
    fields for them; they are plain class attributes and are read as ``Configs.NAME``.
    """
    # --- MIMIC-CXR-JPG ---
    CXR_DATA_DIR = os.path.join("data", "MIMIC-CXR-JPG")
    CXR_VALID_LABELS_FILENAME = "valid.csv"
    # Re-exported from the MIMIC_CXR package; unpacked as keyword arguments into
    # CXRDataset.download_dataset further down.
    CXR_FILENAMES = cxr_configs.CXR_FILENAMES
    # --- CheXpert ---
    CHEXPERT_DATA_DIR = os.path.join("data", "CheXpert", "CheXpert-v1.0-small")
    # Directories whose files are listed as trained checkpoints for each task.
    CHEXPERT_DISEASE_TRAINED_MODELS_DIR = os.path.join("CheXpert", "disease_prediction", "trained_models")
    CHEXPERT_RACE_TRAINED_MODELS_DIR = os.path.join("CheXpert", "race_prediction", "trained_models")
    CHEXPERT_VALID_LABELS_FILENAME = "valid.csv"
    CHEXPERT_DEMO_FILENAME = "CHEXPERT DEMO.csv"
    # --- Label schemas (taken from the per-task config modules) ---
    # The *_ANNOTATIONS_COLUMNS values are used as the columns of the AUC result tables.
    DISEASE_ANNOTATIONS_COLUMNS = disease_configs.ANNOTATIONS_COLUMNS
    RACE_ANNOTATIONS_COLUMNS = race_configs.ANNOTATIONS_COLUMNS
    # Output width of the final linear layer of the disease / race classifiers.
    NUM_DISEASE_CLASSES = disease_configs.NUM_CLASSES
    NUM_RACE_CLASSES = race_configs.NUM_CLASSES
    # --- Run settings ---
    BATCH_SIZE = 4
    SEED = 123
    # Configs is handed to vprint and other shared_utils helpers; presumably VERBOSE
    # gates their logging -- confirm in shared_utils.
    VERBOSE = 1

In [7]:
# Seed the run through the project helper; which libraries it seeds is decided
# inside shared_utils.set_seed (not visible from this notebook).
shared_utils.set_seed(Configs.SEED)

In [8]:
# Log how much GPU memory is free before any model is loaded.
if not torch.cuda.is_available():
    vprint("No GPU Memory.", Configs)
else:
    # mem_get_info() returns (free, total); the first entry is divided by 1e9 to print GB.
    free_gpu_bytes = torch.cuda.mem_get_info()[0]
    vprint(f"Memory info: {free_gpu_bytes/1e9:.1f} GB free GPU.", Configs)

2022-07-23 15:41: Memory info: 8.5 GB free GPU.


In [9]:
# Deterministic evaluation pipeline shared by every validation dataset in this notebook:
# resize to a fixed 320x320, convert to a tensor, then normalise each channel.
# The mean/std values match the ImageNet statistics documented for torchvision's
# pretrained models.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]
_valid_steps = [
    transforms.Resize((320, 320)),
    transforms.ToTensor(),
    transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD),
]
valid_transform = transforms.Compose(_valid_steps)

# Disease Prediction 

## Validation Dataloaders

In [10]:
# CheXpert validation split for the disease task, preprocessed with valid_transform.
cxp_disease_valid_dataset = CheXpertDiseaseDataset(data_dir=Configs.CHEXPERT_DATA_DIR, 
                                                   labels_filename=Configs.CHEXPERT_VALID_LABELS_FILENAME,
                                                   transform=valid_transform)
# shuffle=False keeps batches in dataset order, so predictions can later be lined up
# row-by-row with the dataset's label frame.
cxp_disease_valid_dataloader = DataLoader(cxp_disease_valid_dataset, batch_size=Configs.BATCH_SIZE, shuffle=False)
# Displayed as the cell output: number of validation samples.
len(cxp_disease_valid_dataset)

234

In [None]:
# Build the MIMIC-CXR validation dataset by downloading images from PhysioNet.
# The first argument (100) is presumably the number of samples to fetch -- confirm
# against CXRDataset.download_dataset.
# NOTE(review): the mode is cxr_mode.Race although this dataset feeds the *disease*
# evaluation, and the commented-out alternative below uses cxr_mode.Disease. The
# per-race analysis further down reads `.df_labels.race` from this dataset, so Race
# mode may be deliberate -- confirm that disease labels are still returned.
# NOTE(review): the download asks for PhysioNet credentials at run time and the
# typed values are echoed into the cell output; clear the output before sharing.
cxr_disease_valid_dataset = CXRDataset.download_dataset(100, cxr_mode.Race, Configs.CXR_DATA_DIR,
                                                        Configs.CXR_VALID_LABELS_FILENAME, **Configs.CXR_FILENAMES,
                                                        transform=valid_transform, target_transform=None)
# Alternative that builds the dataset from files already on disk (no download):
# cxr_disease_valid_dataset = CXRDataset(cxr_mode.Disease, Configs.CXR_DATA_DIR, Configs.CXR_VALID_LABELS_FILENAME,
#                                        transform=valid_transform)
# shuffle=False keeps batches in dataset order.
cxr_disease_valid_dataloader = DataLoader(cxr_disease_valid_dataset, batch_size=Configs.BATCH_SIZE, shuffle=False)
# Displayed as the cell output: number of samples in the dataset.
len(cxr_disease_valid_dataset)

2022-07-23 15:41: Enter username
[REDACTED]
2022-07-23 15:41: Enter password
[REDACTED]


--2022-07-23 15:41:27--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p10/p19859532/s50453930/a5369d17-994f6cf1-dee57320-252febcc-6906f561.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:28 ERROR 404: Not Found.

--2022-07-23 15:41:28--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p11/p19859532/s50453930/a5369d17-994f6cf1-dee57320-252febcc-6906f561.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.or

HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:31 ERROR 404: Not Found.

--2022-07-23 15:41:31--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p15/p17719678/s54809507/5d2b5624-28073582-14636856-8a3597b6-4cbc4bf3.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:31 ERROR 404: Not Found.

--2022-07-23 15:41:31--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p16/p17719678/s54809507/5d2b5624-28073582-14636856-8a3597b6-4cbc4bf3.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authenticati

--2022-07-23 15:41:33--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p11/p16067111/s58762653/1e45e0b2-8bbf479c-e8f925c2-b6300acb-1219f4ca.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:34 ERROR 404: Not Found.

--2022-07-23 15:41:34--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p12/p16067111/s58762653/1e45e0b2-8bbf479c-e8f925c2-b6300acb-1219f4ca.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.or

HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:36 ERROR 404: Not Found.

--2022-07-23 15:41:36--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p11/p16965055/s54269203/844df259-b7263b7a-6f5d6153-9d77a784-116c34c8.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:36 ERROR 404: Not Found.

--2022-07-23 15:41:36--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p12/p16965055/s54269203/844df259-b7263b7a-6f5d6153-9d77a784-116c34c8.jpg
Resolving physionet.org (p

Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:39 ERROR 404: Not Found.

--2022-07-23 15:41:39--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p11/p13007002/s52057348/b9a8c5db-b9a35e91-448aa20a-557196cb-86d2e6b1.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:39 ERROR 404: Not Found.

--2022-07-23 15:41:39--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p12/p13007002/s52057348/

--2022-07-23 15:41:41--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p17/p18617561/s56080116/11d501d0-8b61cb98-726c4169-0870d52b-1c2b93ec.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 404 Not Found
2022-07-23 15:41:41 ERROR 404: Not Found.

--2022-07-23 15:41:41--  https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p18/p18617561/s56080116/11d501d0-8b61cb98-726c4169-0870d52b-1c2b93ec.jpg
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.or

HTTP request sent, awaiting response... 200 OK
Length: 1762598 (1.7M) [image/jpeg]
Saving to: ‘data/MIMIC-CXR-JPG/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p13/p13639861/s59541913/7a474e51-fcd2afe7-a5adfdce-b07ff918-0522b0e3.jpg’

     0K .......... .......... .......... .......... ..........  2% 2.85M 1s
    50K .......... .......... .......... .......... ..........  5% 4.13M 0s
   100K .......... .......... .......... .......... ..........  8% 4.99M 0s
   150K .......... .......... .......... .......... .......... 11% 4.93M 0s
   200K .......... .......... .......... .......... .......... 14% 5.05M 0s
   250K .......... .......... .......... .......... .......... 17% 5.04M 0s
   300K .......... .......... .......... .......... .......... 20% 4.93M 0s
   350K .......... .......... .......... .......... .......... 23% 3.89M 0s
   400K .......... .......... .......... .......... .......... 26% 4.88M 0s
   450K .......... .......... .......... .......... .......... 29% 5.11M 0s
   50

## Pretrained Models 

In [None]:
# List the checkpoint files that sit directly inside the disease trained-models directory.
# The first item yielded by os.walk is (dirpath, dirnames, filenames) for that directory itself.
_, _, files = next(os.walk(Configs.CHEXPERT_DISEASE_TRAINED_MODELS_DIR))
# Directory listings come back in arbitrary, platform-dependent order. Sort the names so
# that disease_trained_models[0] (the checkpoint that gets loaded) is the same on every run.
files = sorted(files)
disease_trained_models = [os.path.join(Configs.CHEXPERT_DISEASE_TRAINED_MODELS_DIR, file) for file in files]
# Displayed as the cell output: number of checkpoints found.
len(disease_trained_models)

In [None]:
# DenseNet-121 backbone created without pretrained weights; the trained parameters
# are loaded from a checkpoint in a later cell.
disease_model = densenet121()
num_features = disease_model.classifier.in_features

# Swap the stock classifier for a two-layer head:
# Linear -> ReLU -> Dropout -> Linear(NUM_DISEASE_CLASSES).
_disease_head_layers = [
    nn.Linear(in_features=num_features, out_features=num_features, bias=True),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(in_features=num_features, out_features=Configs.NUM_DISEASE_CLASSES, bias=True),
]
disease_model.classifier = nn.Sequential(*_disease_head_layers)

# Inference mode (dropout off, batch-norm uses running stats); the final expression
# displays True when the switch took effect.
disease_model.eval()
not disease_model.training

In [None]:
# Load the first listed checkpoint into disease_model. load_statedict returns a 4-tuple:
# the first item is used as the model, the second is kept as `results` (not read again in
# the cells shown here) and the last two are discarded.
disease_model, results, _, _ = shared_utils.load_statedict(disease_model, disease_trained_models[0], Configs)
# Presumably moves the model to the GPU when one is available -- confirm in shared_utils.to_gpu.
disease_model = to_gpu(disease_model)

## Predictions

In [None]:
# Empty results table: one column per disease annotation plus a trailing 'mean' column.
# Rows are added later, one per evaluated dataset.
df_res_disease = pd.DataFrame(columns=Configs.DISEASE_ANNOTATIONS_COLUMNS + ['mean'])
df_res_disease

In [None]:
# Run the disease model over the CheXpert validation loader. get_metric_tensors returns a
# (labels, outputs) pair; a sigmoid is applied to the model outputs.
# The exact meaning of by_study / challenge_ann_only is defined in shared_utils (not shown here).
cxp_disease_labels, cxp_disease_outputs = shared_utils.get_metric_tensors(disease_model, cxp_disease_valid_dataloader, Configs,
                                                  apply_on_outputs=lambda x: torch.sigmoid(x),
                                                  by_study=False, challenge_ann_only=None)
# Store the per-class AUCs as the 'CXP' row; add_mean_to_list presumably appends their
# mean to fill the 'mean' column -- confirm in shared_utils.
df_res_disease.loc['CXP'] = add_mean_to_list(shared_utils.auc_score(cxp_disease_labels, cxp_disease_outputs, per_class=True))

In [None]:
# Same evaluation as for CheXpert, but on the MIMIC-CXR validation loader: the model was
# loaded from the CheXpert trained-models directory, so this row measures transfer to MIMIC-CXR.
cxr_disease_labels, cxr_disease_outputs = shared_utils.get_metric_tensors(disease_model, cxr_disease_valid_dataloader, Configs,
                                                  apply_on_outputs=lambda x: torch.sigmoid(x),
                                                  by_study=False, challenge_ann_only=None)
# Store the per-class AUCs as the 'CXR' row; add_mean_to_list presumably appends their
# mean to fill the 'mean' column -- confirm in shared_utils.
df_res_disease.loc['CXR'] = add_mean_to_list(shared_utils.auc_score(cxr_disease_labels, cxr_disease_outputs, per_class=True))

In [None]:
# Rank the rows by mean AUC (best first) and round to two decimals for display.
df_res_disease = df_res_disease.sort_values(by="mean", ascending=False).round(2)

In [None]:
df_res_disease

# Race Prediction

## Validation Dataloaders

In [None]:
# CheXpert validation split for the race task; race information comes from the separate
# demographics file named by CHEXPERT_DEMO_FILENAME.
cxp_race_valid_dataset = CheXpertRaceDataset(data_dir=Configs.CHEXPERT_DATA_DIR, demo_filename=Configs.CHEXPERT_DEMO_FILENAME, 
                                             labels_filename=Configs.CHEXPERT_VALID_LABELS_FILENAME, transform=valid_transform)
# shuffle=False keeps batches in dataset order.
cxp_race_valid_dataloader = DataLoader(cxp_race_valid_dataset, batch_size=Configs.BATCH_SIZE, shuffle=False)
# Displayed as the cell output: number of validation samples.
len(cxp_race_valid_dataset)

In [None]:
# Disabled alternative that would (re-)download samples instead of reading them from disk.
# NOTE(review): it assigns to cxr_disease_valid_dataset, not the race dataset -- fix the
# name or delete the snippet.
# cxr_disease_valid_dataset = CXRDataset.download_dataset(10, cxr_mode.Race, Configs.CXR_DATA_DIR,
#                                                         Configs.CXR_VALID_LABELS_FILENAME, **Configs.CXR_FILENAMES,
#                                                         transform=valid_transform, target_transform=None)
# MIMIC-CXR validation data in Race mode, constructed from Configs.CXR_DATA_DIR.
cxr_race_valid_dataset = CXRDataset(cxr_mode.Race, Configs.CXR_DATA_DIR, Configs.CXR_VALID_LABELS_FILENAME,
                                    transform=valid_transform)
# shuffle=False keeps batches in dataset order.
cxr_race_valid_dataloader = DataLoader(cxr_race_valid_dataset, batch_size=Configs.BATCH_SIZE, shuffle=False)
# Displayed as the cell output: number of samples in the dataset.
len(cxr_race_valid_dataset)

## Pretrained Models 

In [None]:
# List the checkpoint files that sit directly inside the race trained-models directory.
# The first item yielded by os.walk is (dirpath, dirnames, filenames) for that directory itself.
_, _, files = next(os.walk(Configs.CHEXPERT_RACE_TRAINED_MODELS_DIR))
# Directory listings come back in arbitrary, platform-dependent order; sort so that the
# order in which models are loaded and evaluated is the same on every run.
files = sorted(files)
race_trained_models = [os.path.join(Configs.CHEXPERT_RACE_TRAINED_MODELS_DIR, file) for file in files]
# The model version is the text between the first and second '__' of the file name.
# It is parsed from the file name rather than the joined path, so a '__' in the
# directory path cannot shift the result.
race_model_versions = [file.split('__')[1] for file in files]
# Displayed as the cell output: number of checkpoints found.
len(race_trained_models)

In [None]:
# Rebuild the architecture of every race checkpoint and load its weights on the CPU.
# Result: models_dict maps model version -> loaded model in eval mode.
models_dict = {}
for model_version, model_path in zip(race_model_versions, race_trained_models):
    model = densenet121()
    if "shallow" in model_version:
        # Third '_'-separated token of the version with its first 10 characters stripped,
        # parsed as an int. Presumably the token looks like "denseblock<N>" (a 10-character
        # prefix) -- confirm against the checkpoint file names.
        shallow_denseblock = int(model_version.split('_')[2][10:])
        # NOTE(review): with torchvision's DenseNet `features` ordering (conv0, norm0, relu0,
        # pool0, denseblock1, transition1, ...), index 3 + 2*N looks like the transition layer
        # that follows dense block N. Its `.norm.num_features` is read as the channel count,
        # and the slice below keeps every layer before it. N = 4 would index a layer that
        # may not have a `.norm` attribute -- confirm.
        layer_offset = 3 + 2 * shallow_denseblock
        num_features = model.features[layer_offset].norm.num_features
        # From here on `model` is the truncated feature extractor (a slice of model.features),
        # not the full DenseNet.
        model = model.features[:layer_offset]
        # Head appended to the truncated backbone: normalise, pool to 1x1, flatten, classify.
        # NOTE(review): unlike the full-model head in the else-branch, there is no ReLU
        # between the two Linear layers -- confirm this matches how the shallow models were trained.
        classifier_module = nn.Sequential(
            nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),   
            nn.Flatten(start_dim=1),
            nn.Linear(in_features=num_features, out_features=num_features, bias=True),
            nn.Dropout(p=0.1),
            nn.Linear(in_features=num_features, out_features=Configs.NUM_RACE_CLASSES, bias=True))
        model.add_module('classifier', classifier_module)
    else:
        # Full DenseNet-121: only the classifier is replaced, with the same layer layout
        # as the disease model's head but NUM_RACE_CLASSES outputs.
        num_features = model.classifier.in_features
        model.classifier = nn.Sequential(
            nn.Linear(num_features, num_features, bias=True),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(in_features=num_features, out_features=Configs.NUM_RACE_CLASSES, bias=True))
    model.eval()
    # Loaded with device='cpu'; the prediction loop applies to_gpu to one model at a time.
    model, results, _, _ = shared_utils.load_statedict(model, model_path, Configs, device='cpu')
    # A repeated model_version would overwrite the earlier entry.
    models_dict[model_version] = model

## Predictions 

In [None]:
# Empty results table: one column per race annotation plus a trailing 'mean' column.
# Rows are added later, one per (dataset, model version) pair.
df_res_race = pd.DataFrame(columns=Configs.RACE_ANNOTATIONS_COLUMNS + ['mean'])
df_res_race

In [None]:
# Evaluate every loaded race model on both validation sets and record per-class AUCs.
# An unconditional `break` used to end the loop after the first model, so the results
# table only ever held one model; it has been removed so all entries of models_dict
# are scored.
for model_version, model in tqdm(models_dict.items()):
    # Move this model to the GPU only for the duration of its evaluation.
    model = to_gpu(model)
    # Softmax over dim=1 is applied to the model outputs before scoring.
    cxp_race_labels, cxp_race_outputs = shared_utils.get_metric_tensors(model, cxp_race_valid_dataloader, Configs,
                                                                        apply_on_outputs=lambda x: torch.softmax(x, dim=1),
                                                                        by_study=False, challenge_ann_only=None)
    cxr_race_labels, cxr_race_outputs = shared_utils.get_metric_tensors(model, cxr_race_valid_dataloader, Configs,
                                                                        apply_on_outputs=lambda x: torch.softmax(x, dim=1),
                                                                        by_study=False, challenge_ann_only=None)
    # One row per (dataset, model version).
    df_res_race.loc[f"CXP_{model_version}"] = add_mean_to_list(shared_utils.auc_score(cxp_race_labels, cxp_race_outputs, per_class=True))
    df_res_race.loc[f"CXR_{model_version}"] = add_mean_to_list(shared_utils.auc_score(cxr_race_labels, cxr_race_outputs, per_class=True))
    # Move the model back to the CPU before the next iteration.
    model.cpu()

In [None]:
# Rank the rows by mean AUC (best first) and round to two decimals for display.
df_res_race = df_res_race.sort_values(by="mean", ascending=False).round(2)

In [None]:
df_res_race

# Disease Per Race Performance

In [None]:
# Attach each patient's race to the CheXpert disease labels.
# One row per distinct (PATIENT, race, race-annotation columns) combination.
cxp_race_df = cxp_race_valid_dataset.df_labels[['PATIENT', 'race'] + race_configs.ANNOTATIONS_COLUMNS].drop_duplicates()
# Left merge keeps every disease row; patients without a race record get NaN.
# NOTE(review): if a patient has more than one distinct row in cxp_race_df, this merge
# duplicates disease rows, and the per-race boolean masks built from this frame would
# no longer line up with the prediction tensors -- verify PATIENT is unique.
cxp_disease_df_labels = cxp_disease_valid_dataset.df_labels.merge(cxp_race_df, how='left',
                                                                  left_on='patient_id', right_on='PATIENT')
# Assign the filled column back explicitly. The former
# `cxp_disease_df_labels.race.fillna('Other', inplace=True)` is a chained in-place
# update, which does not modify the DataFrame under pandas Copy-on-Write (and warns in
# pandas 2.2), leaving NaN races in place.
cxp_disease_df_labels['race'] = cxp_disease_df_labels['race'].fillna('Other')
cxp_disease_df_labels.head(2)

In [None]:
# Start the per-race table from a copy of the overall disease results, so the per-race
# rows added next do not alter df_res_disease.
df_res_disease_per_race = df_res_disease.copy()

In [None]:
# Per-race disease AUCs: for every race value found in the CheXpert labels, score the
# matching rows of each dataset's predictions and store them as "CXP_<race>" / "CXR_<race>".
# NOTE(review): the race values come from CheXpert only; whether each one also occurs in
# the MIMIC-CXR labels (and how auc_score handles an empty selection) is not checked here.
for race in cxp_disease_df_labels.race.unique():
    # CheXpert rows belonging to this race.
    cxp_race_mask = cxp_disease_df_labels.race == race
    disease_race_labels = cxp_disease_labels[cxp_race_mask]
    disease_race_outputs = cxp_disease_outputs[cxp_disease_df_labels.race == race]
    cxp_race_auc = shared_utils.auc_score(disease_race_labels, disease_race_outputs, per_class=True)
    df_res_disease_per_race.loc[f"CXP_{race}"] = add_mean_to_list(cxp_race_auc)

    # MIMIC-CXR rows belonging to this race.
    cxr_race_mask = cxr_disease_valid_dataset.df_labels.race == race
    disease_race_labels = cxr_disease_labels[cxr_race_mask]
    disease_race_outputs = cxr_disease_outputs[cxr_disease_valid_dataset.df_labels.race == race]
    cxr_race_auc = shared_utils.auc_score(disease_race_labels, disease_race_outputs, per_class=True)
    df_res_disease_per_race.loc[f"CXR_{race}"] = add_mean_to_list(cxr_race_auc)

In [None]:
# Rank the rows by mean AUC (best first) and round to two decimals for display.
df_res_disease_per_race = df_res_disease_per_race.sort_values(by="mean", ascending=False).round(2)

In [None]:
df_res_disease_per_race

In [None]:
def columns_to_multi_index(df, new_cols):
    """Return a copy of ``df`` re-indexed by the '_'-separated parts of its row labels.

    Each row label is split on "_"; the i-th part becomes the value of ``new_cols[i]``.
    Labels with fewer parts than ``new_cols`` get NaN for the missing levels (for example
    "CXP" with new_cols=['Dataset', 'Race'] yields ('CXP', NaN)).

    Args:
        df: DataFrame whose index holds string labels such as "CXP_Asian".
        new_cols: names of the index levels to create, in split order.

    Returns:
        A new DataFrame indexed by ``new_cols`` and sorted by that index. ``df`` itself
        is left unchanged.
    """
    df = df.copy()
    # Split the labels of the frame that was passed in. The previous version read the
    # notebook global `df_res_disease_per_race`, so it failed or mislabelled rows for
    # any other frame.
    label_parts = [label.split("_") for label in df.index]
    for i, col in enumerate(new_cols):
        level_values = [parts[i] if len(parts) > i else np.nan for parts in label_parts]
        df.insert(i, col, level_values)
    return df.set_index(new_cols).sort_index()

columns_to_multi_index(df_res_disease_per_race, new_cols=['Dataset', 'Race'])

# MIMIC CXR JPG

## 