# Testing

This notebook evaluates the trained models on the test sets and appends the resulting metrics to a results spreadsheet.

In [1]:
import pandas as pd
import torch
import os
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

from model.one_model.one_stage_models import ResNet50OneStage, ResNet18OneStage
from model.multi_stage_model.multi_stage_model import ThreeStageModelFrontalLateralAPPA, TwoStageModelAPPA, TwoStageModelFrontalLateral
from data.dataset import CheXpertDataset

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots


os.environ['KMP_DUPLICATE_LIB_OK']='True' # To prevent the kernel from dying.

# Load results.xlsx file

In [95]:
# Excel workbook that accumulates the metrics of every test run in this notebook.
# The location can be overridden with the RESULT_XLS environment variable so the
# notebook is not tied to one user's machine; the original path stays the default.
result_xls = os.environ.get(
    "RESULT_XLS",
    "C:/Users/flobr/OneDrive/Uni/Informatik_Master/ADLM/results.xlsx",
)

# Previously stored results; the test cells below append to this frame and write
# it back to the same sheet.
result = pd.read_excel(result_xls, sheet_name='Sheet1')

# Test Dataset

In [78]:
# Preprocessing settings: target image size handed to the resize step.
params_transform = {"resize": (256, 256)}

# Image pipeline: resize first, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(params_transform["resize"]),
        transforms.ToTensor(),
    ]
)

# Labels to evaluate, mapped to an integer per label (presumably the column
# position in the dataset CSV — TODO confirm). Commented-out entries are the
# labels that are available but not evaluated here.
targets = {
    # "sex": 1,
    # "age": 2,
    # "frontal/lateral": 3,
    # "ap/pa": 4,
    # "no_finding": 5,
    # "enlarged_cardiomediastinum": 6,
    "cardiomegaly": 7,
    # "lung_opacity": 8,
    # "lung_lesion": 9,
    "edema": 10,
    "consolidation": 11,
    # "pneumonia": 12,
    "atelectasis": 13,
    # "pneumothorax": 14,
    "pleural_effusion": 15,
    # "pleural_other": 16,
    # "fracture": 17,
    # "support_devices": 18,
    # "fronal_lateral_map": 21,
    # "ap/pa map": 22,
}

# CSV describing the samples of the test split used in this section.
csv_file = "data/90_5_5/fr_lat_test_balanced.csv"

# Optional restriction of the evaluation to a subset of the trained labels.
# Entries are positions relative to the label list used during training.
# None is forwarded unchanged to model.test (presumably "evaluate every label" — TODO confirm).
test_labels = None

# Alternative for models trained on all labels.
# NOTE(review): the positions depend on the label order of that training — confirm before enabling.
#test_labels = [
#    2,  # Cardiomegaly (third element of the all-label set)
#    5,  # Edema (sixth element of the all-label set)
#    7,  # Consolidation (eighth element of the all-label set)
#    9,  # Atelectasis (tenth element of the all-label set)
#    11, # Pleural Effusion (twelfth element of the all-label set)
#]

# Alternative for models trained on the five labels only.
#test_labels = [
#  0, # Cardiomegaly (first of the five trained labels)
#  1, # Edema (second of the five trained labels)
#  2, # Consolidation (third of the five trained labels)
#  3, # Atelectasis (fourth of the five trained labels)
#  4, # Pleural Effusion (fifth of the five trained labels)
#]

# Build the test dataset from the CSV, the selected targets and the image pipeline.
test_dataset = CheXpertDataset(
    root_dir="../image_data/",
    csv_file=csv_file,
    transform=transform,
    targets=targets,
)
print(f"Test dataset size: {len(test_dataset)}")




Class weights: [np.float64(3.1172684010519984), np.float64(2.86429275487946), np.float64(3.807984255408364), np.float64(5.859138095502676), np.float64(1.7632622291880837)]
Test dataset size: 2988


# One stage model

Testing of the one stage model.

## Define model

In [89]:
# Settings handed to the model wrapper. Most entries look like training
# hyperparameters; which of them are read during testing is not visible here.
params = {
    # NOTE(review): the key is spelled "train_transfrom"; kept as-is because
    # consumers may look it up by this exact name — verify before renaming.
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": [
        "accuracy",
        "precision",
        "recall",
        "confusion_matrix",
        "auc",
        "auroc",
        "multilabel_accuracy",
        "multilabel_auprc",
        "multilabel_precision_recall_curve",
        "mcc",
    ],
    "confidence_threshold": 0.5,
}

# The checkpoint is addressed by family folder and checkpoint name; both are
# also stored with the results of the test run.
family = "Resnet50_Baseline_Frontal"
name = "best_model_82"
weights = f"final_models/final_90_5_5/{family}/{name}.pth"

# NOTE(review): the family/folder is named "Resnet50" but a ResNet18OneStage is
# instantiated — confirm which architecture this checkpoint belongs to.
model = ResNet18OneStage(
    params=params,
    targets=targets,
    input_channels=params["input_channels"],
)
model.load_model(weights)

## Testing one stage model

In [90]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three run-metadata columns: threshold, test CSV and model family.
res["confidence_threshold"] = params["confidence_threshold"]
res["test_set"] = csv_file
res["family"] = family

# Append the run to the accumulated results.
result = pd.concat([result, res], ignore_index=True)

# Write the accumulated results back, replacing the existing sheet in the workbook.
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as xlsx_writer:
    result.to_excel(xlsx_writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:09<00:00,  1.22it/s, test_loss=0.125585]


Test loss: 0.1255854035694672
Test cardiomegaly accuracy: 0.8872155547142029
Test cardiomegaly precision: 0.41489362716674805
Test cardiomegaly recall: 0.1214953288435936
Test cardiomegaly auroc: 0.7036456891486695
Test cardiomegaly auc: 0.16043971478939056
Test cardiomegaly confusion_matrix: tensor([[2612.,   55.],
        [ 282.,   39.]])
Test edema accuracy: 0.5327978730201721
Test edema precision: 0.8874430656433105
Test edema recall: 0.527251660823822
Test edema auroc: 0.5570139205523108
Test edema auc: 0.02513801120221615
Test edema confusion_matrix: tensor([[ 228.,  173.],
        [1223., 1364.]])
Test consolidation accuracy: 0.9377509951591492
Test consolidation precision: 0.19230769574642181
Test consolidation recall: 0.029411764815449715
Test consolidation auroc: 0.5718594748048261
Test consolidation auc: 0.02827896922826767
Test consolidation confusion_matrix: tensor([[2797.,   21.],
        [ 165.,    5.]])
Test atelectasis accuracy: 0.9364123344421387
Test atelectasis prec

# Two stage model - AP/PA Split

Testing of the two stage model with ap/pa split.

## Define model

In [47]:
# Preprocessing settings: target image size handed to the resize step.
params_transform = {"resize": (256, 256)}

# Image pipeline: resize first, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(params_transform["resize"]),
        transforms.ToTensor(),
    ]
)

# Labels to evaluate, mapped to an integer per label (presumably the column
# position in the dataset CSV — TODO confirm). Commented-out entries are the
# labels that are available but not evaluated here.
targets = {
    # "sex": 1,
    # "age": 2,
    # "frontal/lateral": 3,
    # "ap/pa": 4,
    # "no_finding": 5,
    # "enlarged_cardiomediastinum": 6,
    "cardiomegaly": 7,
    # "lung_opacity": 8,
    # "lung_lesion": 9,
    "edema": 10,
    "consolidation": 11,
    # "pneumonia": 12,
    "atelectasis": 13,
    # "pneumothorax": 14,
    "pleural_effusion": 15,
    # "pleural_other": 16,
    # "fracture": 17,
    # "support_devices": 18,
    # "fronal_lateral_map": 21,
    # "ap/pa map": 22,
}

# Optional restriction of the evaluation to a subset of the trained labels.
# Entries are positions relative to the label list used during training.
# None is forwarded unchanged to model.test (presumably "evaluate every label" — TODO confirm).
test_labels = None

# CSV describing the samples of the test split used in this section.
csv_file = "data/90_5_5/fr_test_balanced.csv"

# Build the test dataset from the CSV, the selected targets and the image pipeline.
test_dataset = CheXpertDataset(
    root_dir="../image_data/",
    csv_file=csv_file,
    transform=transform,
    targets=targets,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(3.0794415416798357), np.float64(2.9784333407608736), np.float64(3.6347624053323777), np.float64(5.775081722705364), np.float64(1.5531789841019434)]
Test dataset size: 2988


In [57]:
# Settings handed to the two-stage model. Most entries look like training
# hyperparameters; which of them are read during testing is not visible here.
params = {
    # NOTE(review): the key is spelled "train_transfrom"; kept as-is because
    # consumers may look it up by this exact name — verify before renaming.
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": [
        "accuracy",
        "precision",
        "recall",
        "confusion_matrix",
        "auc",
        "auroc",
        "multilabel_accuracy",
        "multilabel_auprc",
        "multilabel_precision_recall_curve",
        "mcc",
    ],
    "confidence_threshold": 0.5,
}

# Run name passed to model.test and family label stored with the results.
name = "best_model_not_pretrained"
family = "Two_Stage_AP/PA"

# Checkpoints: the first stage is the AP/PA classifier, the second stage has one
# model for AP images and one for PA images.
weights_first_stage = "final_models/final_90_5_5/ap-pa_split.pth"
weights_second_stage_ap = "final_models/final_90_5_5/AP_Not_Pretrained/best_model_mcc_36.pth"
weights_second_stage_pa = "final_models/final_90_5_5/PA_Not_Pretrained/best_model_mcc_33.pth"

# Human-readable summary of the checkpoints; not referenced again in the visible cells.
weights = f"First: {weights_first_stage}, Second AP: {weights_second_stage_ap}, Second PA: {weights_second_stage_pa}"

model = TwoStageModelAPPA(
    params=params,
    targets=targets,
    model_ap_pa_classification=weights_first_stage,
    model_ap=weights_second_stage_ap,
    model_pa=weights_second_stage_pa,
)

## Testing two stage model

In [58]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three run-metadata columns: threshold, test CSV and model family.
res["confidence_threshold"] = params["confidence_threshold"]
res["test_set"] = csv_file
res["family"] = family

# Append the run to the accumulated results.
result = pd.concat([result, res], ignore_index=True)

# Write the accumulated results back, replacing the existing sheet in the workbook.
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as xlsx_writer:
    result.to_excel(xlsx_writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:08<00:00,  1.42it/s, test_loss=0.113272]


Test loss: 0.1132721285932163
Test cardiomegaly accuracy: 0.8785140514373779
Test cardiomegaly precision: 0.41040462255477905
Test cardiomegaly recall: 0.21385541558265686
Test cardiomegaly auroc: 0.7739455563216723
Test cardiomegaly auc: 0.27219656109809875
Test cardiomegaly confusion_matrix: tensor([[2554.,  102.],
        [ 261.,   71.]])
Test edema accuracy: 0.6957831382751465
Test edema precision: 0.8928571343421936
Test edema recall: 0.7428571581840515
Test edema auroc: 0.5920377804014167
Test edema auc: 0.03415004909038544
Test edema confusion_matrix: tensor([[ 129.,  234.],
        [ 675., 1950.]])
Test consolidation accuracy: 0.9206827282905579
Test consolidation precision: 0.08888889104127884
Test consolidation recall: 0.019999999552965164
Test consolidation auroc: 0.6075609756097561
Test consolidation auc: 0.04190339148044586
Test consolidation confusion_matrix: tensor([[2747.,   41.],
        [ 196.,    4.]])
Test atelectasis accuracy: 0.9099732041358948
Test atelectasis pr

# Two stage model - Frontal/Lateral Split

Testing of the two stage model with frontal/lateral split.

## Define model

In [62]:
# Preprocessing settings: target image size handed to the resize step.
params_transform = {"resize": (256, 256)}

# Image pipeline: resize first, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(params_transform["resize"]),
        transforms.ToTensor(),
    ]
)

# Labels to evaluate, mapped to an integer per label (presumably the column
# position in the dataset CSV — TODO confirm). Commented-out entries are the
# labels that are available but not evaluated here.
targets = {
    # "sex": 1,
    # "age": 2,
    # "frontal/lateral": 3,
    # "ap/pa": 4,
    # "no_finding": 5,
    # "enlarged_cardiomediastinum": 6,
    "cardiomegaly": 7,
    # "lung_opacity": 8,
    # "lung_lesion": 9,
    "edema": 10,
    "consolidation": 11,
    # "pneumonia": 12,
    "atelectasis": 13,
    # "pneumothorax": 14,
    "pleural_effusion": 15,
    # "pleural_other": 16,
    # "fracture": 17,
    # "support_devices": 18,
    # "fronal_lateral_map": 21,
    # "ap/pa map": 22,
}

# Optional restriction of the evaluation to a subset of the trained labels.
# Entries are positions relative to the label list used during training.
# None is forwarded unchanged to model.test (presumably "evaluate every label" — TODO confirm).
test_labels = None

# CSV describing the samples of the test split used in this section.
csv_file = "data/90_5_5/fr_lat_test_balanced.csv"

# Build the test dataset from the CSV, the selected targets and the image pipeline.
test_dataset = CheXpertDataset(
    root_dir="../image_data/",
    csv_file=csv_file,
    transform=transform,
    targets=targets,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(3.1172684010519984), np.float64(2.86429275487946), np.float64(3.807984255408364), np.float64(5.859138095502676), np.float64(1.7632622291880837)]
Test dataset size: 2988


In [76]:
# Settings handed to the two-stage model. Most entries look like training
# hyperparameters; which of them are read during testing is not visible here.
params = {
    # NOTE(review): the key is spelled "train_transfrom"; kept as-is because
    # consumers may look it up by this exact name — verify before renaming.
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    # BCE with sigmoid activation; for multilabel, MultiLabelSoftMarginLoss is an alternative.
    # NOTE(review): the other sections of this notebook use "multilabel_focal_loss",
    # so the reported test loss is likely not comparable across sections — confirm.
    "loss_fn": "torch.nn.BCEWithLogitsLoss()",
    "metrics": [
        "accuracy",
        "precision",
        "recall",
        "confusion_matrix",
        "auc",
        "auroc",
        "multilabel_accuracy",
        "multilabel_auprc",
        "multilabel_precision_recall_curve",
        "mcc",
    ],
    "confidence_threshold": 0.5,
}

# Run name passed to model.test and family label stored with the results.
# NOTE(review): the run name mentions "auprc" while the checkpoint file names
# below do not — confirm the intended checkpoints.
name = "best_model_auprc_not_pretrained"
family = "Two_Stage_Frontal/Lateral"

# Checkpoints: the first stage is the frontal/lateral classifier, the second
# stage has one model for frontal images and one for lateral images.
weights_first_stage = "final_models/final_90_5_5/fr-lat_split.pth"
weights_second_stage_frontal = "final_models/final_90_5_5/Fr_Not_Pretrained/best_model_39.pth"
weights_second_stage_lateral = "final_models/final_90_5_5/Lat_Not_Pretrained/best_model_26.pth"

# Human-readable summary of the checkpoints; not referenced again in the visible cells.
weights = f"First: {weights_first_stage}, Second Frontal: {weights_second_stage_frontal}, Second Lateral: {weights_second_stage_lateral}"

model = TwoStageModelFrontalLateral(
    params=params,
    targets=targets,
    model_frontal_lateral_classification=weights_first_stage,
    model_frontal=weights_second_stage_frontal,
    model_lateral=weights_second_stage_lateral,
)

## Testing two stage model

In [77]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three run-metadata columns: threshold, test CSV and model family.
res["confidence_threshold"] = params["confidence_threshold"]
res["test_set"] = csv_file
res["family"] = family

# Append the run to the accumulated results.
result = pd.concat([result, res], ignore_index=True)

# Write the accumulated results back, replacing the existing sheet in the workbook.
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as xlsx_writer:
    result.to_excel(xlsx_writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:08<00:00,  1.34it/s, test_loss=0.428395]


Test loss: 0.4283947436541263
Test cardiomegaly accuracy: 0.8942436575889587
Test cardiomegaly precision: 0.5242718458175659
Test cardiomegaly recall: 0.1682243049144745
Test cardiomegaly auroc: 0.7706127855513388
Test cardiomegaly auc: 0.21317696571350098
Test cardiomegaly confusion_matrix: tensor([[2618.,   49.],
        [ 267.,   54.]])
Test edema accuracy: 0.45582330226898193
Test edema precision: 0.9027661085128784
Test edema recall: 0.4163123369216919
Test edema auroc: 0.58931912584214
Test edema auc: 0.04035171866416931
Test edema confusion_matrix: tensor([[ 285.,  116.],
        [1510., 1077.]])
Test consolidation accuracy: 0.9390897154808044
Test consolidation precision: 0.0714285746216774
Test consolidation recall: 0.0058823530562222
Test consolidation auroc: 0.632334154385672
Test consolidation auc: 0.07967347651720047
Test consolidation confusion_matrix: tensor([[2.8050e+03, 1.3000e+01],
        [1.6900e+02, 1.0000e+00]])
Test atelectasis accuracy: 0.9538152813911438
Test a

# Three stage model - Lateral/AP/PA Split

Testing of the three stage model. First stage is the frontal/lateral split, second stage is the ap/pa split and third stage is the multilabel classification of the images.

## Define model

In [94]:
# Preprocessing settings: target image size handed to the resize step.
params_transform = {"resize": (256, 256)}

# Image pipeline: resize first, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(params_transform["resize"]),
        transforms.ToTensor(),
    ]
)

# Labels to evaluate, mapped to an integer per label (presumably the column
# position in the dataset CSV — TODO confirm). Commented-out entries are the
# labels that are available but not evaluated here.
targets = {
    # "sex": 1,
    # "age": 2,
    # "frontal/lateral": 3,
    # "ap/pa": 4,
    # "no_finding": 5,
    # "enlarged_cardiomediastinum": 6,
    "cardiomegaly": 7,
    # "lung_opacity": 8,
    # "lung_lesion": 9,
    "edema": 10,
    "consolidation": 11,
    # "pneumonia": 12,
    "atelectasis": 13,
    # "pneumothorax": 14,
    "pleural_effusion": 15,
    # "pleural_other": 16,
    # "fracture": 17,
    # "support_devices": 18,
    # "fronal_lateral_map": 21,
    # "ap/pa map": 22,
}

# Optional restriction of the evaluation to a subset of the trained labels.
# Entries are positions relative to the label list used during training.
# None is forwarded unchanged to model.test (presumably "evaluate every label" — TODO confirm).
test_labels = None

# CSV describing the samples of the test split used in this section.
csv_file = "data/90_5_5/test_balanced.csv"

# Build the test dataset from the CSV, the selected targets and the image pipeline.
test_dataset = CheXpertDataset(
    root_dir="../image_data/",
    csv_file=csv_file,
    transform=transform,
    targets=targets,
)
print(f"Test dataset size: {len(test_dataset)}")

Class weights: [np.float64(3.0794415416798357), np.float64(2.9201511490782153), np.float64(3.6933892622193722), np.float64(5.873849311314257), np.float64(1.674130760975404)]
Test dataset size: 4482


In [98]:
# Settings handed to the three-stage model. Most entries look like training
# hyperparameters; which of them are read during testing is not visible here.
params = {
    # NOTE(review): the key is spelled "train_transfrom"; kept as-is because
    # consumers may look it up by this exact name — verify before renaming.
    "train_transfrom": params_transform,
    "lr": 0.001,
    "save_epoch": 5,
    "batch_size": 256,
    "num_epochs": 100,
    "input_channels": 1,
    "optimizer": "adam",
    "num_workers": 0,
    "loss_fn": "multilabel_focal_loss",
    "metrics": [
        "accuracy",
        "precision",
        "recall",
        "confusion_matrix",
        "auc",
        "auroc",
        "multilabel_accuracy",
        "multilabel_auprc",
        "multilabel_precision_recall_curve",
        "mcc",
    ],
    "confidence_threshold": 0.5,
}

# Run name passed to model.test and family label stored with the results.
name = "best_model_mcc_not_pretrained"
family = "Three_Stage"

# Checkpoints of the two routing classifiers (frontal/lateral, then AP/PA) ...
weights_fr_lat_classification = "final_models/final_90_5_5/fr-lat_split.pth"
weights_ap_pa_classification = "final_models/final_90_5_5/ap-pa_split.pth"
# ... and of the three final models (frontal AP, frontal PA, lateral).
weights_frontal_ap = "final_models/final_90_5_5/AP_Not_Pretrained/best_model_mcc_36.pth"
weights_frontal_pa = "final_models/final_90_5_5/PA_Not_Pretrained/best_model_mcc_33.pth"
weights_lateral = "final_models/final_90_5_5/Lat_Not_Pretrained/best_model_mcc_37.pth"

# Human-readable summary of the checkpoints; not referenced again in the visible cells.
weights = f"Fr/Lat Classification: {weights_fr_lat_classification}, AP/PA Classification: {weights_ap_pa_classification}, Frontal AP: {weights_frontal_ap}, Frontal PA: {weights_frontal_pa}, Lateral: {weights_lateral}"

model = ThreeStageModelFrontalLateralAPPA(
    params=params,
    targets=targets,
    model_frontal_lateral_classification=weights_fr_lat_classification,
    model_frontal_ap_pa_classification=weights_ap_pa_classification,
    model_frontal_ap=weights_frontal_ap,
    model_frontal_pa=weights_frontal_pa,
    model_lateral=weights_lateral,
)

## Testing three stage model

In [99]:
# Evaluate the model on the test dataset.
res = model.test(test_dataset=test_dataset, name=name, test_labels=test_labels)

# Tag the metrics with three run-metadata columns: threshold, test CSV and model family.
res["confidence_threshold"] = params["confidence_threshold"]
res["test_set"] = csv_file
res["family"] = family

# Append the run to the accumulated results.
result = pd.concat([result, res], ignore_index=True)

# Write the accumulated results back, replacing the existing sheet in the workbook.
with pd.ExcelWriter(
    result_xls,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as xlsx_writer:
    result.to_excel(xlsx_writer, sheet_name="Sheet1", index=False)

Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:13<00:00,  1.31it/s, test_loss=0.106047]


Test loss: 0.10604689982626013
Test cardiomegaly accuracy: 0.8603302240371704
Test cardiomegaly precision: 0.3483412265777588
Test cardiomegaly recall: 0.29518070816993713
Test cardiomegaly auroc: 0.7535034717181981
Test cardiomegaly auc: 0.2508052587509155
Test cardiomegaly confusion_matrix: tensor([[3709.,  275.],
        [ 351.,  147.]])
Test edema accuracy: 0.7295849919319153
Test edema precision: 0.8829877972602844
Test edema recall: 0.7953440546989441
Test edema auroc: 0.5855516669144504
Test edema auc: 0.03464272618293762
Test edema confusion_matrix: tensor([[ 161.,  412.],
        [ 800., 3109.]])
Test consolidation accuracy: 0.9107541441917419
Test consolidation precision: 0.08571428805589676
Test consolidation recall: 0.0422535203397274
Test consolidation auroc: 0.6089527877124586
Test consolidation auc: 0.026301827281713486
Test consolidation confusion_matrix: tensor([[4070.,  128.],
        [ 272.,   12.]])
Test atelectasis accuracy: 0.9355198740959167
Test atelectasis prec