# Testing

This notebook evaluates the trained models on the test sets and appends the resulting metrics to a shared results workbook.

In [1]:
import pandas as pd
import torch
import os
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

from model.one_model.one_stage_models import ResNet50OneStage, ResNet18OneStage
from model.multi_stage_model.multi_stage_model import ThreeStageModelFrontalLateralAPPA, TwoStageModelAPPA, TwoStageModelFrontalLateral
from data.dataset import CheXpertDataset

In [2]:
# Automatically reload imported modules before each cell execution, so edits to
# the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots


# Workaround kept from the original author (presumably for duplicate OpenMP
# runtimes being loaded -- TODO confirm it is still required).
os.environ['KMP_DUPLICATE_LIB_OK']='True' # To prevent the kernel from dying.

# Load the results workbook (results.xlsx)

In [56]:
# Location of the Excel workbook that collects the metrics of every test run.
# Set the RESULTS_XLSX environment variable to point at your own copy; the
# fallback is the original author's local path, kept for backward compatibility.
result_xls = os.environ.get(
    "RESULTS_XLSX",
    "C:/Users/flobr/OneDrive/Uni/Informatik_Master/ADLM/results.xlsx",
)

# Previously recorded results; the test cells below append to this frame and
# write it back to the same sheet.
result = pd.read_excel(result_xls, sheet_name='Sheet1')

# Test Dataset

In [57]:
# Preprocessing settings; only the target image size is configured here.
params_transform = {
    "resize": (256, 256),
}

# Resize every image to the configured size, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize(params_transform["resize"]),
    transforms.ToTensor(),
])

# Labels to evaluate, as name -> integer handed to CheXpertDataset
# (presumably the column index in the CSV file -- TODO confirm).
# Uncomment an entry to include that label.
targets = {
            # "sex": 1,
            # "age": 2,
            # "frontal/lateral": 3,
            # "ap/pa": 4,
            # "no_finding": 5,
            # "enlarged_cardiomediastinum": 6,
            "cardiomegaly": 7,
            # "lung_opacity": 8,
            # "lung_lesion": 9,
            "edema": 10,
            "consolidation": 11,
            # "pneumonia": 12,
            "atelectasis": 13,
            # "pneumothorax": 14,
            "pleural_effusion": 15,
            # "pleural_other": 16,
            # "fracture": 17,
            # "support_devices": 18,
            # "fronal_lateral_map": 21,
            # "ap/pa map": 22,
        }

# CSV describing the test split; also recorded in the "test_set" result column.
csv_file = "data/90_5_5/test_balanced.csv"

# If you want to test the model on a subset of the labels you can specify the labels here.
# The entries are positions relative to the labels the model was trained with.
# None is passed through to model.test unchanged (presumably "evaluate all outputs" -- TODO confirm).
test_labels = None

# Template for models trained on all labels.
# NOTE(review): these positions depend on the label order used during training -- verify before enabling.
#test_labels = [
#    2, # Cardiomegaly is the third element in all labels testing
#    5, # Edema is the sixth element in all labels testing
#    7, # Consolidation is the eighth element in all labels testing
#    9, # Atelectasis is the tenth element in all labels testing
#    11, # Pleural Effusion is the twelfth element in all labels testing
#]

# Template for models trained on the 5 labels selected in `targets` above.
#test_labels = [
#  0, # Cardiomegaly is the first of the five labels
#  1, # Edema is the second of the five labels
#  2, # Consolidation is the third of the five labels
#  3, # Atelectasis is the fourth of the five labels
#  4, # Pleural Effusion is the fifth of the five labels
#]

# Build the test dataset from the CSV; image paths are resolved against root_dir.
test_dataset = CheXpertDataset(
    csv_file=csv_file,
    root_dir="../image_data/",
    targets=targets,
    transform=transform,
)
print(f"Test dataset size: {len(test_dataset)}")




Class weights: [np.float64(3.0794415416798357), np.float64(2.9201511490782153), np.float64(3.6933892622193722), np.float64(5.873849311314257), np.float64(1.674130760975404)]
Test dataset size: 4482


# One stage model

Testing of the one stage model.

## Define model

In [58]:
# Configuration handed to the model. Several entries (lr, num_epochs, optimizer,
# save_epoch) look like training settings; whether they are read during testing
# is not visible here.
# NOTE(review): the key "train_transfrom" is misspelled but is a runtime key --
# rename only together with the code that reads it.
params = {
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": ["accuracy",
            "precision",
            "recall",
            "confusion_matrix",
            "auc",
            "auroc",
            "multilabel_accuracy",
            "multilabel_auprc",
            "multilabel_precision_recall_curve",
            "mcc"],
    # Also written to the "confidence_threshold" column of the results sheet.
    "confidence_threshold": 0.5,
}

# `family` is the checkpoint sub-folder and the value of the "family" result
# column; `name` is the checkpoint file stem and is passed to model.test.
family = "Resnet50_Baseline"
name = "best_model_71"

weights = f"final_models/final_90_5_5/{family}/{name}.pth"

# NOTE(review): the family/checkpoint folder says "Resnet50" but a
# ResNet18OneStage is instantiated here (ResNet50OneStage is imported as well).
# Confirm which architecture the checkpoint belongs to.
model = ResNet18OneStage(params=params, targets=targets, input_channels=params['input_channels'])
model.load_model(weights)

## Testing one stage model

In [59]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three provenance columns: the decision threshold,
# the test CSV path and the model family.
provenance = {
    "confidence_threshold": params["confidence_threshold"],
    "test_set": csv_file,
    "family": family,
}
for column, value in provenance.items():
    res[column] = value

# Append to the previously loaded results and overwrite "Sheet1" of the workbook.
# NOTE: re-running this cell appends the same rows again.
result = pd.concat([result, res], ignore_index=True)
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    result.to_excel(writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:14<00:00,  1.22it/s, test_loss=0.116757]


Test loss: 0.11675725394844588
Test cardiomegaly accuracy: 0.8728246092796326
Test cardiomegaly precision: 0.38957056403160095
Test cardiomegaly recall: 0.2550200819969177
Test cardiomegaly auroc: 0.769158209141788
Test cardiomegaly auc: 0.22163620591163635
Test cardiomegaly confusion_matrix: tensor([[3785.,  199.],
        [ 371.,  127.]])
Test edema accuracy: 0.680499792098999
Test edema precision: 0.8826073408126831
Test edema recall: 0.7308774590492249
Test edema auroc: 0.5696341328933052
Test edema auc: 0.02819528803229332
Test edema confusion_matrix: tensor([[ 193.,  380.],
        [1052., 2857.]])
Test consolidation accuracy: 0.9277108311653137
Test consolidation precision: 0.1551724076271057
Test consolidation recall: 0.03169013932347298
Test consolidation auroc: 0.6524476779687175
Test consolidation auc: 0.10357023775577545
Test consolidation confusion_matrix: tensor([[4149.,   49.],
        [ 275.,    9.]])
Test atelectasis accuracy: 0.9863899946212769
Test atelectasis precis

# Two stage model - AP/PA Split

Testing of the two stage model with ap/pa split.

## Define model

In [6]:
# Preprocessing settings; only the target image size is configured here.
params_transform = {
    "resize": (256, 256),
}

# Resize every image to the configured size, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize(params_transform["resize"]),
    transforms.ToTensor(),
])

# Labels to evaluate, as name -> integer handed to CheXpertDataset
# (presumably the column index in the CSV file -- TODO confirm).
# Uncomment an entry to include that label.
targets = {
            # "sex": 1,
            # "age": 2,
            # "frontal/lateral": 3,
            # "ap/pa": 4,
            # "no_finding": 5,
            # "enlarged_cardiomediastinum": 6,
            "cardiomegaly": 7,
            # "lung_opacity": 8,
            # "lung_lesion": 9,
            "edema": 10,
            "consolidation": 11,
            # "pneumonia": 12,
            "atelectasis": 13,
            # "pneumothorax": 14,
            "pleural_effusion": 15,
            # "pleural_other": 16,
            # "fracture": 17,
            # "support_devices": 18,
            # "fronal_lateral_map": 21,
            # "ap/pa map": 22,
        }

# If you want to test the model on a subset of the labels you can specify the labels here.
# The entries are positions relative to the labels the model was trained with.
# None is passed through to model.test unchanged (presumably "evaluate all outputs" -- TODO confirm).
test_labels = None

# CSV describing the test split for this section (presumably frontal images only,
# judging by the file name -- TODO confirm); also recorded in the "test_set" result column.
csv_file = "data/90_5_5/fr_test.csv"

# Build the test dataset from the CSV; image paths are resolved against root_dir.
test_dataset = CheXpertDataset(
    csv_file=csv_file,
    root_dir="../image_data/",
    targets=targets,
    transform=transform,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(2.9794206294995904), np.float64(3.3538481571785983), np.float64(3.603309595859006), np.float64(6.078663481969815), np.float64(1.355350237982541)]
Test dataset size: 9532


In [5]:
# Configuration handed to the model. Several entries (lr, num_epochs, optimizer,
# save_epoch) look like training settings; whether they are read during testing
# is not visible here.
# NOTE(review): the key "train_transfrom" is misspelled but is a runtime key --
# rename only together with the code that reads it.
params = {
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": ["accuracy",
            "precision",
            "recall",
            "confusion_matrix",
            "auc",
            "auroc",
            "multilabel_accuracy",
            "multilabel_auprc",
            "multilabel_precision_recall_curve",
            "mcc"],
    # Also written to the "confidence_threshold" column of the results sheet.
    "confidence_threshold": 0.5,
}

# Run name passed to model.test.
name = "best_model_mcc_pretrained"

# Value written to the "family" column of the results sheet.
family = "Two_Stage_AP/PA"

# Checkpoints: the first stage is the AP/PA classifier, the second stage has one
# model per view.
weights_first_stage = "final_models/final_90_5_5/ap-pa_split.pth"
weights_second_stage_ap = "final_models/final_90_5_5/AP_Pretrained/best_model_mcc_49.pth"
weights_second_stage_pa = "final_models/final_90_5_5/PA_Pretrained/best_model_mcc_30.pth"

# Human-readable summary of the checkpoints; it is not passed to the model in this cell.
weights = f"First: {weights_first_stage}, Second AP: {weights_second_stage_ap}, Second PA: {weights_second_stage_pa}"

# The checkpoint paths are handed to the constructor directly; no separate
# load_model call is made in this cell.
model = TwoStageModelAPPA(
    params=params, 
    model_ap_pa_classification=weights_first_stage, 
    model_ap=weights_second_stage_ap, 
    model_pa=weights_second_stage_pa,
    targets=targets,
)

## Testing two stage model

In [7]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three provenance columns: the decision threshold,
# the test CSV path and the model family.
provenance = {
    "confidence_threshold": params["confidence_threshold"],
    "test_set": csv_file,
    "family": family,
}
for column, value in provenance.items():
    res[column] = value

# Append to the previously loaded results and overwrite "Sheet1" of the workbook.
# NOTE: re-running this cell appends the same rows again.
result = pd.concat([result, res], ignore_index=True)
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    result.to_excel(writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:27<00:00,  1.37it/s, test_loss=0.100768]


Test loss: 0.10076798897900856
Test cardiomegaly accuracy: 0.8692824244499207
Test cardiomegaly precision: 0.4438839852809906
Test cardiomegaly recall: 0.3042351007461548
Test cardiomegaly auroc: 0.7795436989641249
Test cardiomegaly auc: 0.2575719952583313
Test cardiomegaly confusion_matrix: tensor([[7934.,  441.],
        [ 805.,  352.]])
Test edema accuracy: 0.7880822420120239
Test edema precision: 0.9243366718292236
Test edema recall: 0.8364158272743225
Test edema auroc: 0.6307223815414149
Test edema auc: 0.030435457825660706
Test edema confusion_matrix: tensor([[ 231.,  596.],
        [1424., 7281.]])
Test consolidation accuracy: 0.881766676902771
Test consolidation precision: 0.14501510560512543
Test consolidation recall: 0.14611871540546417
Test consolidation auroc: 0.6623789739961841
Test consolidation auc: 0.051567938178777695
Test consolidation confusion_matrix: tensor([[8309.,  566.],
        [ 561.,   96.]])
Test atelectasis accuracy: 0.949328601360321
Test atelectasis preci

# Two stage model - Frontal/Lateral Split

Testing of the two stage model with frontal/lateral split.

## Define model

In [8]:
# Preprocessing settings; only the target image size is configured here.
params_transform = {
    "resize": (256, 256),
}

# Resize every image to the configured size, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize(params_transform["resize"]),
    transforms.ToTensor(),
])

# Labels to evaluate, as name -> integer handed to CheXpertDataset
# (presumably the column index in the CSV file -- TODO confirm).
# Uncomment an entry to include that label.
targets = {
            # "sex": 1,
            # "age": 2,
            # "frontal/lateral": 3,
            # "ap/pa": 4,
            # "no_finding": 5,
            # "enlarged_cardiomediastinum": 6,
            "cardiomegaly": 7,
            # "lung_opacity": 8,
            # "lung_lesion": 9,
            "edema": 10,
            "consolidation": 11,
            # "pneumonia": 12,
            "atelectasis": 13,
            # "pneumothorax": 14,
            "pleural_effusion": 15,
            # "pleural_other": 16,
            # "fracture": 17,
            # "support_devices": 18,
            # "fronal_lateral_map": 21,
            # "ap/pa map": 22,
        }

# If you want to test the model on a subset of the labels you can specify the labels here.
# The entries are positions relative to the labels the model was trained with.
# None is passed through to model.test unchanged (presumably "evaluate all outputs" -- TODO confirm).
test_labels = None

# CSV describing the test split for this section; also recorded in the "test_set" result column.
csv_file = "data/90_5_5/test.csv"

# Build the test dataset from the CSV; image paths are resolved against root_dir.
test_dataset = CheXpertDataset(
    csv_file=csv_file,
    root_dir="../image_data/",
    targets=targets,
    transform=transform,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(2.988169226925719), np.float64(3.256541154492639), np.float64(3.6312276324022994), np.float64(6.081854349242607), np.float64(1.4358135019244247)]
Test dataset size: 11183


In [12]:
# Configuration handed to the model. Several entries (lr, num_epochs, optimizer,
# save_epoch) look like training settings; whether they are read during testing
# is not visible here.
# NOTE(review): the key "train_transfrom" is misspelled but is a runtime key --
# rename only together with the code that reads it.
params = {
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    # BCE with Sigmoid activation function
    # NOTE(review): the other sections of this notebook use "multilabel_focal_loss";
    # the reported test loss is presumably not comparable across sections -- confirm
    # this difference is intended.
    "loss_fn": "torch.nn.BCEWithLogitsLoss()",
    # Alternative noted by the original author for multilabel: MultiLabelSoftMarginLoss
    "metrics": ["accuracy",
            "precision",
            "recall",
            "confusion_matrix",
            "auc",
            "auroc",
            "multilabel_accuracy",
            "multilabel_auprc",
            "multilabel_precision_recall_curve",
            "mcc"],
    # Also written to the "confidence_threshold" column of the results sheet.
    "confidence_threshold": 0.5,
}

# Run name passed to model.test.
name = "best_model_mcc_pretrained"

# Value written to the "family" column of the results sheet.
family = "Two_Stage_Frontal/Lateral"

# Checkpoints: the first stage is the frontal/lateral classifier, the second stage
# has one model per view.
weights_first_stage = "final_models/final_90_5_5/fr-lat_split.pth"
weights_second_stage_frontal = "final_models/final_90_5_5/Fr_Pretrained/best_model_mcc_50.pth"
weights_second_stage_lateral = "final_models/final_90_5_5/Lat_Pretrained/best_model_mcc_45.pth"

# Human-readable summary of the checkpoints; it is not passed to the model in this cell.
weights = f"First: {weights_first_stage}, Second Frontal: {weights_second_stage_frontal}, Second Lateral: {weights_second_stage_lateral}"

# The checkpoint paths are handed to the constructor directly; no separate
# load_model call is made in this cell.
model = TwoStageModelFrontalLateral(
    params=params, 
    model_frontal_lateral_classification=weights_first_stage,
    model_frontal=weights_second_stage_frontal,
    model_lateral=weights_second_stage_lateral,
    targets=targets,
)

## Testing two stage model

In [13]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three provenance columns: the decision threshold,
# the test CSV path and the model family.
provenance = {
    "confidence_threshold": params["confidence_threshold"],
    "test_set": csv_file,
    "family": family,
}
for column, value in provenance.items():
    res[column] = value

# Append to the previously loaded results and overwrite "Sheet1" of the workbook.
# NOTE: re-running this cell appends the same rows again.
result = pd.concat([result, res], ignore_index=True)
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    result.to_excel(writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:30<00:00,  1.42it/s, test_loss=0.420495]


Test loss: 0.4204948039883262
Test cardiomegaly accuracy: 0.8810694813728333
Test cardiomegaly precision: 0.513731837272644
Test cardiomegaly recall: 0.23608018457889557
Test cardiomegaly auroc: 0.7749722773454966
Test cardiomegaly auc: 0.3155770003795624
Test cardiomegaly confusion_matrix: tensor([[9535.,  301.],
        [1029.,  318.]])
Test edema accuracy: 0.5477957725524902
Test edema precision: 0.9297590851783752
Test edema recall: 0.5413414835929871
Test edema auroc: 0.6176577623532438
Test edema auc: 0.03343889117240906
Test edema confusion_matrix: tensor([[ 646.,  414.],
        [4643., 5480.]])
Test consolidation accuracy: 0.9146919250488281
Test consolidation precision: 0.16721311211585999
Test consolidation recall: 0.06790945678949356
Test consolidation auroc: 0.6582017688072346
Test consolidation auc: 0.06264714896678925
Test consolidation confusion_matrix: tensor([[10178.,   254.],
        [  700.,    51.]])
Test atelectasis accuracy: 0.9712957143783569
Test atelectasis pr

# Three stage model - Lateral/AP/PA Split

Testing of the three stage model. First stage is the frontal/lateral split, second stage is the ap/pa split and third stage is the multilabel classification of the images.

## Define model

In [51]:
# Preprocessing settings; only the target image size is configured here.
params_transform = {
    "resize": (256, 256),
}

# Resize every image to the configured size, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize(params_transform["resize"]),
    transforms.ToTensor(),
])

# Labels to evaluate, as name -> integer handed to CheXpertDataset
# (presumably the column index in the CSV file -- TODO confirm).
# Uncomment an entry to include that label.
targets = {
            # "sex": 1,
            # "age": 2,
            # "frontal/lateral": 3,
            # "ap/pa": 4,
            # "no_finding": 5,
            # "enlarged_cardiomediastinum": 6,
            "cardiomegaly": 7,
            # "lung_opacity": 8,
            # "lung_lesion": 9,
            "edema": 10,
            "consolidation": 11,
            # "pneumonia": 12,
            "atelectasis": 13,
            # "pneumothorax": 14,
            "pleural_effusion": 15,
            # "pleural_other": 16,
            # "fracture": 17,
            # "support_devices": 18,
            # "fronal_lateral_map": 21,
            # "ap/pa map": 22,
        }

# If you want to test the model on a subset of the labels you can specify the labels here.
# The entries are positions relative to the labels the model was trained with.
# None is passed through to model.test unchanged (presumably "evaluate all outputs" -- TODO confirm).
test_labels = None

# CSV describing the test split for this section; also recorded in the "test_set" result column.
csv_file = "data/90_5_5/test_balanced.csv"

# Build the test dataset from the CSV; image paths are resolved against root_dir.
test_dataset = CheXpertDataset(
    csv_file=csv_file,
    root_dir="../image_data/",
    targets=targets,
    transform=transform,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(3.0794415416798357), np.float64(2.9201511490782153), np.float64(3.6933892622193722), np.float64(5.873849311314257), np.float64(1.674130760975404)]
Test dataset size: 4482


In [None]:
# Configuration handed to the model. Several entries (lr, num_epochs, optimizer,
# save_epoch) look like training settings; whether they are read during testing
# is not visible here.
# NOTE(review): the key "train_transfrom" is misspelled but is a runtime key --
# rename only together with the code that reads it.
params = {
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": ["accuracy",
            "precision",
            "recall",
            "confusion_matrix",
            "auc",
            "auroc",
            "multilabel_accuracy",
            "multilabel_auprc",
            "multilabel_precision_recall_curve",
            "mcc"],
    # Also written to the "confidence_threshold" column of the results sheet.
    "confidence_threshold": 0.5,
}

# Run name passed to model.test.
name = "best_model_pretrained"

# Value written to the "family" column of the results sheet.
family = "Three_Stage"


# Checkpoints: two routing classifiers (frontal/lateral, then AP/PA) followed by
# one label classifier per view (frontal AP, frontal PA, lateral).
weights_fr_lat_classification = "final_models/final_90_5_5/fr-lat_split.pth"
weights_ap_pa_classification = "final_models/final_90_5_5/ap-pa_split.pth"
weights_frontal_ap = "final_models/final_90_5_5/AP_Pretrained/best_model_38.pth"
weights_frontal_pa = "final_models/final_90_5_5/PA_Pretrained/best_model_26.pth"
weights_lateral = "final_models/final_90_5_5/Lat_Pretrained/best_model_45.pth"

# Human-readable summary of the checkpoints; it is not passed to the model in this cell.
weights = f"Fr/Lat Classification: {weights_fr_lat_classification}, AP/PA Classification: {weights_ap_pa_classification}, Frontal AP: {weights_frontal_ap}, Frontal PA: {weights_frontal_pa}, Lateral: {weights_lateral}"


# The checkpoint paths are handed to the constructor directly; no separate
# load_model call is made in this cell.
model = ThreeStageModelFrontalLateralAPPA(
    params=params, 
    model_frontal_lateral_classification = weights_fr_lat_classification,
    model_frontal_ap_pa_classification = weights_ap_pa_classification,
    model_frontal_ap = weights_frontal_ap,
    model_frontal_pa = weights_frontal_pa,
    model_lateral = weights_lateral,
    targets=targets,
)

## Testing three stage model

In [55]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three provenance columns: the decision threshold,
# the test CSV path and the model family.
provenance = {
    "confidence_threshold": params["confidence_threshold"],
    "test_set": csv_file,
    "family": family,
}
for column, value in provenance.items():
    res[column] = value

# Append to the previously loaded results and overwrite "Sheet1" of the workbook.
# NOTE: re-running this cell appends the same rows again.
result = pd.concat([result, res], ignore_index=True)
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    result.to_excel(writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:13<00:00,  1.30it/s, test_loss=0.396220]


Test loss: 0.39622024474067274
Test cardiomegaly accuracy: 0.8846496939659119
Test cardiomegaly precision: 0.4599156081676483
Test cardiomegaly recall: 0.21887549757957458
Test cardiomegaly auroc: 0.760410618377123
Test cardiomegaly auc: 0.28462865948677063
Test cardiomegaly confusion_matrix: tensor([[3856.,  128.],
        [ 389.,  109.]])
Test edema accuracy: 0.7634984254837036
Test edema precision: 0.8811880946159363
Test edema recall: 0.842414915561676
Test edema auroc: 0.5880612021213855
Test edema auc: 0.04004054144024849
Test edema confusion_matrix: tensor([[ 129.,  444.],
        [ 616., 3293.]])
Test consolidation accuracy: 0.9051762819290161
Test consolidation precision: 0.1492537260055542
Test consolidation recall: 0.10563380271196365
Test consolidation auroc: 0.6499586489877809
Test consolidation auc: 0.08941151201725006
Test consolidation confusion_matrix: tensor([[4027.,  171.],
        [ 254.,   30.]])
Test atelectasis accuracy: 0.9437751173973083
Test atelectasis precis