### Evaluate Models


##### Imports

In [1]:
import torch
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sklearn import metrics

  from .autonotebook import tqdm as notebook_tqdm


##### Evaluation Parameters

In [2]:
# Score threshold.
# TODO: the threshold is currently fixed rather than chosen by maximizing
# validation F1; tuning it would require collecting scores for all the val sets.
threshold = 0.5

# Bootstrap settings: how many resamples to draw, and the standard-deviation
# multiplier used for the interval printed around each bootstrapped metric.
num_bootstrap = 1000
num_std = 1.96

# Presumably figure-styling parameters (not used by the cells visible here) — confirm.
line_width = 2
alpha = 0.2
font_size = 16
legend_size = 10
x_size = 10
y_size = 10

##### Initialize Score, Model, and Color Arrays

In [3]:
# Master lists that the loading cells append to, one entry per model.
all_y_trues = []      # labels
all_y_scores = []     # scores
all_model_names = []  # names
all_colors = []       # colors

##### Load Fine-Tuned Torch LM Results

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing; only point this at result files you produced or trust.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# NOTE(review): none of the active models below append to `all_colors` (only the
# disabled Llama-3 block does), so it stays empty after this cell. Any later
# cell that zips over `all_colors` would iterate zero times — confirm intent.

# Disabled: ls llama 3 8b
# with open("ls-Meta-Llama-3-8B-msp-v2-mdace-20_raw_labels.pkl", "rb") as f:
#     ls_llama_8b_last_labels = pickle.load(f)
# with open("ls-Meta-Llama-3-8B-msp-v2-mdace-20_scores.pkl", "rb") as f:
#     ls_llama_8b_last_scores = pickle.load(f)

# ls_llama_8b_last_scores_transformed = torch.sigmoid(torch.tensor(ls_llama_8b_last_scores))
# all_model_names.append("LS Llama-3 8B (Last)")
# all_y_trues.append(ls_llama_8b_last_labels)
# all_y_scores.append(ls_llama_8b_last_scores_transformed)
# all_colors.append('#ab20fd')

# ls unllama 3 8b
ls_unllama_8b_max_labels = _load_pickle("ls-unllama-Meta-Llama-3-8B-msp-v2-mdace-20_raw_labels.pkl")
ls_unllama_8b_max_scores = _load_pickle("ls-unllama-Meta-Llama-3-8B-msp-v2-mdace-20_raw_scores.pkl")

# The raw scores are squashed through a sigmoid and converted to a NumPy array
# (presumably logits -> probabilities — confirm against the training code).
ls_unllama_8b_max_scores_transformed = torch.sigmoid(torch.tensor(ls_unllama_8b_max_scores)).numpy()
all_model_names.append("LS UnLlama-3 8B (Max)")
all_y_trues.append(ls_unllama_8b_max_labels)
all_y_scores.append(ls_unllama_8b_max_scores_transformed)

# BELT Max 5 segments (scores are used as stored; no sigmoid is applied here)
belt_5_max_labels = _load_pickle("./BELT-BASELINE/bioclinicalroberta_belt_mdace20_510_step_128_max_5_labels.pkl")
belt_5_max_scores = _load_pickle("./BELT-BASELINE/bioclinicalroberta_belt_mdace20_510_step_128_max_5_scores.pkl")

all_model_names.append("BELT 128 step 5 seg (Max)")
all_y_trues.append(belt_5_max_labels)
all_y_scores.append(belt_5_max_scores)

# BELT Max 128 segments (scores are used as stored; no sigmoid is applied here)
belt_128_max_labels = _load_pickle("./BELT-BASELINE/bioclinicalroberta_belt_mdace20_510_step_448_max_128_labels.pkl")
belt_128_max_scores = _load_pickle("./BELT-BASELINE/bioclinicalroberta_belt_mdace20_510_step_448_max_128_scores.pkl")

all_model_names.append("BELT 448 step 128 seg (Max)")
all_y_trues.append(belt_128_max_labels)
all_y_scores.append(belt_128_max_scores)

##### Print Performance for all Metrics for all Models

In [5]:
def print_mean_ci_of_metric_list(metric_list, metric_name, num_std):
    """Print the mean of a metric sample with a normal-approximation interval.

    The interval is ``mean +/- num_std * std`` (``np.std`` with its default
    settings), clipped to the range ``[0, 1]``. The confidence level shown in
    the label is derived from ``num_std`` instead of being fixed at 95%, so the
    printed label stays truthful for any multiplier (1.96 still prints "95%").

    Args:
        metric_list: Sequence of metric values, e.g. one per bootstrap resample.
        metric_name: Label printed in front of the summary.
        num_std: Number of standard deviations used as the interval half-width.

    Returns:
        None. The summary line is written to stdout.
    """
    import math  # local import: only needed to label the interval

    mean_metric = np.mean(metric_list)
    std_metric = np.std(metric_list)
    half_width = std_metric * num_std

    # Clip to [0, 1]; the lower bound is floored at 0 and the upper capped at 1.
    metric_low = np.maximum(mean_metric - half_width, 0)
    metric_high = np.minimum(mean_metric + half_width, 1)

    # Two-sided coverage of a normal distribution within +/- num_std sigma.
    coverage_pct = 100 * math.erf(num_std / math.sqrt(2))

    print(
        f"{metric_name}: {round(mean_metric, 3)} ([{round(metric_low, 3)} - {round(metric_high, 3)}] {coverage_pct:.0f}% CI)"
    )

In [6]:
# Bootstrap each model's records to estimate the spread of the micro-averaged
# metrics. The generator is seeded so that Restart & Run All reproduces the
# printed intervals exactly.
bootstrap_seed = 0
bootstrap_rng = np.random.default_rng(bootstrap_seed)

# Maps model name -> DataFrame with one row per bootstrap resample and the
# columns "micro_aps" and "micro_roc_aucs".
model2metric_df = {}
for y_trues, y_scores, name in zip(
    all_y_trues, all_y_scores, all_model_names
):
    # Convert once per model rather than once per bootstrap iteration.
    labels = np.asarray(y_trues)
    scores = np.asarray(y_scores)
    num_records = len(labels)

    # Macro-averaged AP / ROC AUC were commented out in the original cell and
    # are not computed here.
    micro_aps, micro_roc_aucs = [], []
    for _ in range(num_bootstrap):

        # Sample N records with replacement where N is the total number of records
        sample_indices = bootstrap_rng.integers(0, num_records, size=num_records)
        sample_labels = labels[sample_indices]
        sample_scores = scores[sample_indices]

        micro_aps.append(
            metrics.average_precision_score(y_true=sample_labels, y_score=sample_scores, average='micro')
        )
        micro_roc_aucs.append(
            metrics.roc_auc_score(y_true=sample_labels, y_score=sample_scores, average='micro')
        )

    model2metric_df[name] = pd.DataFrame({
        "micro_aps": micro_aps,
        "micro_roc_aucs": micro_roc_aucs,
    })

    print(f"\nResults for {name}\n")
    print_mean_ci_of_metric_list(micro_aps, metric_name="Micro Average Precision", num_std=num_std)
    print_mean_ci_of_metric_list(micro_roc_aucs, metric_name="Micro ROC AUC", num_std=num_std)


Results for LS UnLlama-3 8B (Max)

Micro Average Precision: 0.277 ([0.256 - 0.299] 95% CI)
Micro ROC AUC: 0.828 ([0.818 - 0.839] 95% CI)

Results for BELT 128 step 5 seg (Max)

Micro Average Precision: 0.707 ([0.698 - 0.716] 95% CI)
Micro ROC AUC: 0.942 ([0.94 - 0.944] 95% CI)

Results for BELT 448 step 128 seg (Max)

Micro Average Precision: 0.804 ([0.797 - 0.812] 95% CI)
Micro ROC AUC: 0.971 ([0.969 - 0.972] 95% CI)
