In [11]:
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, roc_curve, auc
import itertools
import json

# Setup

In [12]:
pred_prefix = "../ss/iWildCam_"
target_prefix = "../ss/target_iWildCam_"
unc_out_prefix = "ss/unc_"
pred_out_prefix = "ss/pred_"
num_classes = 182
test_sets=["ind","ood"]

Resnet18_ls = ["Resnet18ft0", "Resnet18ft2", "Resnet18ft3"] #, "Resnet18ft3", "Resnet18ft4"
Resnet34_ls = ["Resnet34ft0", "Resnet34ft1"]
Resnet50_ls = ["Resnet50ft0","Resnet50ft2"]
Resnet101_ls = ["Resnet101ft0","Resnet101ft1"]
Resnet152_ls = ["Resnet152ft0"]

M_lp_ls = ["Resnet50lp0","Resnet50lp2"] # deprecated
L_lp_ls = ["Resnet101lp0","Resnet101lp1"] # deprecated

Include_Ensembles = False
Include_Duos = True
Include_LPFT = False

S_model_ls = Resnet18_ls
M_model_ls = Resnet50_ls
L_model_ls = Resnet101_ls

In [13]:
# Initialize predictor_categories dictionary
predictor_categories = {
    "S": S_model_ls,
    "M": M_model_ls,
    "L": L_model_ls
}
if Include_LPFT:
    predictor_categories.update({
        "Mlp": M_lp_ls,
        "Llp": L_lp_ls
    })
if Include_Ensembles:
    predictor_categories.update({f"EnsM={m}": [] for m in range(2, len(S_model_ls) + 1)})
if Include_Duos:
    predictor_categories.update({
        "SM_duo": [],
        "SL_duo": [],
        "ML_duo": []
    })
if Include_LPFT:
    predictor_categories.update({
        "lpft_M": [],
        "lpft_L": []
    })
    
print(f"predictor_categories available based on inclusion settings: {predictor_categories.keys()}")

predictor_categories available based on inclusion settings: dict_keys(['S', 'M', 'L', 'SM_duo', 'SL_duo', 'ML_duo'])


## Create Ensembles / Duos / LPFTs:

In [14]:
for testtype in test_sets:
    print(f"Working on {testtype}:")
    
    if Include_Ensembles:
        print("Generating ensembles from S models")
        for m in range(2, len(S_model_ls)+1):
            print(f"{m=}...")
            for comb in itertools.combinations(S_model_ls, m):
                ensemble_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv") for model in comb])
                ensemble_name = utils.generate_ensemble_name(f"EnsS_{m}", comb)
                utils.save_ensemble_predictions(pred_prefix, testtype, ensemble_name, ensemble_pred)
                if testtype==test_sets[0]: 
                    predictor_categories[f"EnsM={m}"].append(ensemble_name)

    if Include_Duos:
        print("Generating SM Duos")
        for s_model in S_model_ls:
            for m_model in M_model_ls:
                sm_duo_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{s_model}.csv"), pd.read_csv(f"{pred_prefix}{testtype}_{m_model}.csv")])
                sm_duo_name = utils.generate_ensemble_name("SM_duo", [s_model, m_model])
                utils.save_ensemble_predictions(pred_prefix, testtype, sm_duo_name, sm_duo_pred)
                if testtype == test_sets[0]:
                    predictor_categories["SM_duo"].append(sm_duo_name)
        print("Generating SL Duos") 
        for s_model in S_model_ls:
            for l_model in L_model_ls:
                sl_duo_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{s_model}.csv"), pd.read_csv(f"{pred_prefix}{testtype}_{l_model}.csv")])
                sl_duo_name = utils.generate_ensemble_name("SL_duo", [s_model, l_model])
                utils.save_ensemble_predictions(pred_prefix, testtype, sl_duo_name, sl_duo_pred)
                if testtype == test_sets[0]:
                    predictor_categories["SL_duo"].append(sl_duo_name)
        print("Generating ML Duos") 
        for m_model in M_model_ls:
            for l_model in L_model_ls:
                ml_duo_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{m_model}.csv"), pd.read_csv(f"{pred_prefix}{testtype}_{l_model}.csv")])
                ml_duo_name = utils.generate_ensemble_name("ML_duo", [m_model, l_model])
                utils.save_ensemble_predictions(pred_prefix, testtype, ml_duo_name, ml_duo_pred)
                if testtype == test_sets[0]:
                    predictor_categories["ML_duo"].append(ml_duo_name)

    if Include_LPFT:
        print("Generating lpft-M duos")
        for m_ft, m_lp in zip(M_model_ls, M_lp_ls):
            lpft_m_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{m_lp}.csv"), pd.read_csv(f"{pred_prefix}{testtype}_{m_ft}.csv")])
            lpft_m_name = utils.generate_ensemble_name("lpft_M", [m_lp,m_ft])
            utils.save_ensemble_predictions(pred_prefix, testtype, lpft_m_name, lpft_m_pred)
            if testtype == test_sets[0]:
                predictor_categories["lpft_M"].append(lpft_m_name)
        print("Generating lpft-L duos")
        for l_ft, l_lp in zip(L_model_ls, L_lp_ls):
            lpft_l_pred = utils.softvote([pd.read_csv(f"{pred_prefix}{testtype}_{l_lp}.csv"), pd.read_csv(f"{pred_prefix}{testtype}_{l_ft}.csv")])
            lpft_l_name = utils.generate_ensemble_name("lpft_L", [l_lp,l_ft])
            utils.save_ensemble_predictions(pred_prefix, testtype, lpft_l_name, lpft_l_pred)
            if testtype == test_sets[0]:
                predictor_categories["lpft_L"].append(lpft_l_name)

# Save the predictor_categories dictionary for use in the evaluation script
with open(f"{pred_out_prefix}predictor_categories.json", "w") as file:
    json.dump(predictor_categories, file)
print(json.dumps(predictor_categories, indent=2))

Working on ind:
Generating SM Duos
Generating SL Duos
Generating ML Duos
Working on ood:
Generating SM Duos
Generating SL Duos
Generating ML Duos
{
  "S": [
    "Resnet50ft0",
    "Resnet50ft2"
  ],
  "M": [
    "Resnet101ft0",
    "Resnet101ft1"
  ],
  "L": [
    "Resnet152ft0"
  ],
  "SM_duo": [
    "SM_duo_Resnet50ft0_Resnet101ft0",
    "SM_duo_Resnet50ft0_Resnet101ft1",
    "SM_duo_Resnet50ft2_Resnet101ft0",
    "SM_duo_Resnet50ft2_Resnet101ft1"
  ],
  "SL_duo": [
    "SL_duo_Resnet50ft0_Resnet152ft0",
    "SL_duo_Resnet50ft2_Resnet152ft0"
  ],
  "ML_duo": [
    "ML_duo_Resnet101ft0_Resnet152ft0",
    "ML_duo_Resnet101ft1_Resnet152ft0"
  ]
}


## {F1, Acc, Brier, Ece, Mce} of raw models 

In [15]:

# Load the predictor_categories dictionary
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)

metrics_dict = {
    "Model": [],
    "Test Set": [],
    "Acc": [],
    "F1": [],
    "Brier": [],
    "ECE": [],
    "MCE": [],
    "Predictor Category": []
}

for testtype in test_sets:
    print(f"{testtype=}")
    label = pd.read_csv(f"{target_prefix}{testtype}.csv")
    
    for category, models in predictor_categories.items():
        print(f"Evaluating {category}")
        metrics_dict = utils.evaluate_models(models, label, pred_prefix, testtype, metrics_dict, category, num_classes)

metrics_df = pd.DataFrame(metrics_dict)
display(metrics_df)
metrics_df.to_csv(f"{pred_out_prefix}metrics.csv", index=False)


testtype='ind'
Evaluating S
Evaluating M
Evaluating L
Evaluating SM_duo
Evaluating SL_duo
Evaluating ML_duo
testtype='ood'
Evaluating S
Evaluating M
Evaluating L
Evaluating SM_duo
Evaluating SL_duo
Evaluating ML_duo


Unnamed: 0,Model,Test Set,Acc,F1,Brier,ECE,MCE,Predictor Category
0,Resnet50ft0,ind,0.784155,0.453848,0.001948,0.160803,0.402448,S
1,Resnet50ft2,ind,0.774467,0.429172,0.002056,0.17575,0.486047,S
2,Resnet101ft0,ind,0.769438,0.439821,0.002136,0.193279,0.459607,M
3,Resnet101ft1,ind,0.788938,0.434513,0.001927,0.156467,0.34454,M
4,Resnet152ft0,ind,0.767844,0.440999,0.002178,0.217192,0.489624,L
5,SM_duo_Resnet50ft0_Resnet101ft0,ind,0.783419,0.459042,0.001989,0.174338,0.450863,SM_duo
6,SM_duo_Resnet50ft0_Resnet101ft1,ind,0.792494,0.456093,0.001877,0.156989,0.338561,SM_duo
7,SM_duo_Resnet50ft2_Resnet101ft0,ind,0.780231,0.440576,0.002019,0.175225,0.450724,SM_duo
8,SM_duo_Resnet50ft2_Resnet101ft1,ind,0.786608,0.437025,0.001918,0.162329,0.369944,SM_duo
9,SL_duo_Resnet50ft0_Resnet152ft0,ind,0.784155,0.459447,0.001999,0.183429,0.433786,SL_duo


In [16]:
# Load the predictor_categories dictionary
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)

# Generate the uncertainty DataFrame
for testtype in test_sets:
    print(f"Generating uncertainty DataFrame for {testtype}...")
    unc = pd.DataFrame()
    
    # Calculate entropy for all models
    print(f"Calculating entropy for all models")
    for category, models in predictor_categories.items():
        for model in models:
            predictions = pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv")
            unc[f"entr_{model}"] = utils.calc_entr_torch(predictions)
            unc[f"softmax_res_{model}"] = utils.softmax_response_unc(predictions)
    
    # Calculate cross-entropy for specific pairs
    if Include_Duos:
        print(f"Calculating cross-entropy(M||S)")
        for s_model in S_model_ls:
            for m_model in M_model_ls:
                m_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{m_model}.csv")
                s_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{s_model}.csv")
                unc[f"ce_{m_model}_{s_model}"] = utils.calc_cross_entr_torch(m_predictions, s_predictions)
                unc[f"entr_{m_model}+ce_{m_model}_{s_model}"] = unc[f"entr_{m_model}"] + unc[f"ce_{m_model}_{s_model}"]
                sm_duo_name = f"SM_duo_{s_model}_{m_model}"
                unc[f"entr_{sm_duo_name}+ce_{m_model}_{s_model}"] = unc[f"entr_{sm_duo_name}"] + unc[f"ce_{m_model}_{s_model}"]

        print(f"Calculating cross-entropy(L||S)")
        for s_model in S_model_ls:
            for l_model in L_model_ls:
                l_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{l_model}.csv")
                s_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{s_model}.csv")
                unc[f"ce_{l_model}_{s_model}"] = utils.calc_cross_entr_torch(l_predictions, s_predictions)
                unc[f"entr_{l_model}+ce_{l_model}_{s_model}"] = unc[f"entr_{l_model}"] + unc[f"ce_{l_model}_{s_model}"]
                sl_duo_name = f"SL_duo_{s_model}_{l_model}"
                unc[f"entr_{sl_duo_name}+ce_{l_model}_{s_model}"] = unc[f"entr_{sl_duo_name}"] + unc[f"ce_{l_model}_{s_model}"]

        print(f"Calculating cross-entropy(L||M)")
        for m_model in M_model_ls:
            for l_model in L_model_ls:
                l_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{l_model}.csv")
                m_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{m_model}.csv")
                unc[f"ce_{l_model}_{m_model}"] = utils.calc_cross_entr_torch(l_predictions, m_predictions)
                unc[f"entr_{l_model}+ce_{l_model}_{m_model}"] = unc[f"entr_{l_model}"] + unc[f"ce_{l_model}_{m_model}"]
                ml_duo_name = f"ML_duo_{m_model}_{l_model}"
                unc[f"entr_{ml_duo_name}+ce_{l_model}_{m_model}"] = unc[f"entr_{ml_duo_name}"] + unc[f"ce_{l_model}_{m_model}"]

    if Include_LPFT:
        print(f"Calculating cross-entropy(Mft||Mlp)")
        for m_ft, m_lp in zip(M_model_ls, M_lp_ls):
            m_ft_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{m_ft}.csv")
            m_lp_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{m_lp}.csv")
            unc[f"ce_{m_ft}_{m_lp}"] = utils.calc_cross_entr_torch(m_ft_predictions, m_lp_predictions)
            unc[f"entr_{m_ft}+ce_{m_ft}_{m_lp}"] = unc[f"entr_{m_ft}"] + unc[f"ce_{m_ft}_{m_lp}"]

        print(f"Calculating cross-entropyLft|Llp)")
        for l_ft, l_lp in zip(L_model_ls, L_lp_ls):
            l_ft_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{l_ft}.csv")
            l_lp_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{l_lp}.csv")
            unc[f"ce_{l_ft}_{l_lp}"] = utils.calc_cross_entr_torch(l_ft_predictions, l_lp_predictions)
            unc[f"entr_{l_ft}+ce_{l_ft}_{l_lp}"] = unc[f"entr_{l_ft}"] + unc[f"ce_{l_ft}_{l_lp}"]

    # Save the uncertainty DataFrame
    unc.to_csv(f"{unc_out_prefix}{testtype}.csv", index=False)
    print(f"Saved uncertainty DataFrame for {testtype}.")

Generating uncertainty DataFrame for ind...
Calculating entropy for all models
Calculating cross-entropy(M||S)
Calculating cross-entropy(L||S)
Calculating cross-entropy(L||M)
Saved uncertainty DataFrame for ind.
Generating uncertainty DataFrame for ood...
Calculating entropy for all models
Calculating cross-entropy(M||S)
Calculating cross-entropy(L||S)
Calculating cross-entropy(L||M)
Saved uncertainty DataFrame for ood.


In [17]:
# Load the predictor_categories dictionary
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)

# Generate prediction DataFrame
for testtype in test_sets:
    print(f"Generating prediction DataFrame for {testtype}...")
    pred_df = pd.read_csv(f"{target_prefix}{testtype}.csv")

    for category, models in predictor_categories.items():
        for model in models:
            predictions = pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv")
            pred_df[f"pred_{model}"] = predictions.idxmax(axis=1).str.extract('(\d+)')

    # Save the prediction DataFrame
    pred_df.to_csv(f"{pred_out_prefix}{testtype}.csv", index=False)
    print(f"Saved prediction DataFrame for {testtype}.")
    display(pred_df.head(3))

Generating prediction DataFrame for ind...
Saved prediction DataFrame for ind.


Unnamed: 0,target,pred_Resnet50ft0,pred_Resnet50ft2,pred_Resnet101ft0,pred_Resnet101ft1,pred_Resnet152ft0,pred_SM_duo_Resnet50ft0_Resnet101ft0,pred_SM_duo_Resnet50ft0_Resnet101ft1,pred_SM_duo_Resnet50ft2_Resnet101ft0,pred_SM_duo_Resnet50ft2_Resnet101ft1,pred_SL_duo_Resnet50ft0_Resnet152ft0,pred_SL_duo_Resnet50ft2_Resnet152ft0,pred_ML_duo_Resnet101ft0_Resnet152ft0,pred_ML_duo_Resnet101ft1_Resnet152ft0
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Generating prediction DataFrame for ood...
Saved prediction DataFrame for ood.


Unnamed: 0,target,pred_Resnet50ft0,pred_Resnet50ft2,pred_Resnet101ft0,pred_Resnet101ft1,pred_Resnet152ft0,pred_SM_duo_Resnet50ft0_Resnet101ft0,pred_SM_duo_Resnet50ft0_Resnet101ft1,pred_SM_duo_Resnet50ft2_Resnet101ft0,pred_SM_duo_Resnet50ft2_Resnet101ft1,pred_SL_duo_Resnet50ft0_Resnet152ft0,pred_SL_duo_Resnet50ft2_Resnet152ft0,pred_ML_duo_Resnet101ft0_Resnet152ft0,pred_ML_duo_Resnet101ft1_Resnet152ft0
0,113,68,113,113,0,0,113,68,113,113,0,0,0,0
1,113,68,68,113,68,0,113,68,113,68,68,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Correctness Prediction AUROC

In [18]:
# Load predictor categories
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)
    
# Initialize the AUROC results dictionary
auroc_results = {
    "Predictor Category": [],
    "Model": [],
    "Test Set": [],
    "Uncertainty Measure": [],
    "AUROC": []
}

# Main evaluation loop
for testtype in test_sets:
    print(f"Evaluating AUROC for {testtype}...")
    pred_df = pd.read_csv(f"{pred_out_prefix}{testtype}.csv")
    unc_df = pd.read_csv(f"{unc_out_prefix}{testtype}.csv")

    for category, models in predictor_categories.items():
        for model in models:
            pred_column = f"pred_{model}"
            unc_measures = [f"entr_{model}", f"softmax_res_{model}"]

            if category in ["S", "M", "L", "SM_duo", "SL_duo", "ML_duo", "lpft_M", "lpft_L"]:
                for unc_measure in unc_measures:
                    auroc_value = utils.calculate_auroc(pred_df, unc_df, pred_column, "target", unc_measure)
                    auroc_results = utils.update_auroc_results(auroc_results, model, testtype, category, unc_measure.split("_")[0], auroc_value)

            if category in ["SM_duo", "SL_duo", "ML_duo"]:
                model_ls = model.split('_')[2:]
                unc_larger_as_predictor = f"entr_{model_ls[1]}+ce_{model_ls[1]}_{model_ls[0]}"
                auroc_value = utils.calculate_auroc(pred_df, unc_df, pred_column, "target", unc_larger_as_predictor)
                auroc_results = utils.update_auroc_results(auroc_results, model, testtype, category.split("_")[0][1], "Entr(self)+CE(self||smaller)", auroc_value)
                
                unc_ens_as_predictor = f"entr_{model}+ce_{model_ls[1]}_{model_ls[0]}"
                auroc_value = utils.calculate_auroc(pred_df, unc_df, pred_column, "target", unc_ens_as_predictor)
                auroc_results = utils.update_auroc_results(auroc_results, model, testtype, category, "Entr(larger)+CE(self||smaller)", auroc_value)

# Convert the results dictionary to a DataFrame and display it
auroc_df = pd.DataFrame(auroc_results).sort_values(["Test Set","Predictor Category","Model","Uncertainty Measure"], ignore_index=True)
display(auroc_df)

# Save the AUROC results to a CSV file
auroc_df.to_csv(f"{unc_out_prefix}auroc_results.csv", index=False)

Evaluating AUROC for ind...
Evaluating AUROC for ood...


Unnamed: 0,Predictor Category,Model,Test Set,Uncertainty Measure,AUROC
0,L,ML_duo_Resnet101ft0_Resnet152ft0,ind,Entr(self)+CE(self||smaller),0.862590
1,L,ML_duo_Resnet101ft1_Resnet152ft0,ind,Entr(self)+CE(self||smaller),0.862810
2,L,Resnet152ft0,ind,entr,0.862642
3,L,Resnet152ft0,ind,softmax,0.860148
4,L,SL_duo_Resnet50ft0_Resnet152ft0,ind,Entr(self)+CE(self||smaller),0.858866
...,...,...,...,...,...
79,SM_duo,SM_duo_Resnet50ft2_Resnet101ft0,ood,entr,0.903631
80,SM_duo,SM_duo_Resnet50ft2_Resnet101ft0,ood,softmax,0.902074
81,SM_duo,SM_duo_Resnet50ft2_Resnet101ft1,ood,Entr(larger)+CE(self||smaller),0.887975
82,SM_duo,SM_duo_Resnet50ft2_Resnet101ft1,ood,entr,0.895630


## F1-Cov AUROC

In [19]:
# Load predictor categories
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)
    
# Initialize the F1-Coverage tradeoff results dictionary
f1_cov_results = {
    "Predictor Category": [],
    "Model": [],
    "Test Set": [],
    "Uncertainty Measure": [],
    "F1-Cov AUC": []
}

cov_range = np.arange(1, 100)


# Main evaluation loop
for testtype in test_sets:
    print(f"Evaluating F1-Coverage tradeoff AUC for {testtype}...")
    pred_df = pd.read_csv(f"{pred_out_prefix}{testtype}.csv")
    unc_df = pd.read_csv(f"{unc_out_prefix}{testtype}.csv")

    for category, models in predictor_categories.items():
        for model in models:
            pred_column = f"pred_{model}"
            unc_measures = [f"entr_{model}", f"softmax_res_{model}"]

            # Calculate and update F1-Coverage AUC values for each uncertainty measure
            for unc_measure in unc_measures:
                f1_cov_auc_value = utils.calculate_f1_cov_auc(pred_df, unc_df, pred_column, "target", unc_measure, cov_range)
                f1_cov_results = utils.update_f1_cov_results(f1_cov_results, model, testtype, category, unc_measure.split("_")[0], f1_cov_auc_value)

            if category in ["SM_duo", "SL_duo", "ML_duo"]:
                model_ls = model.split('_')[2:]
                combined_unc_measure = f"entr_{model}+ce_{model_ls[1]}_{model_ls[0]}"
                f1_cov_auc_value_combined = utils.calculate_f1_cov_auc(pred_df, unc_df, pred_column, "target", combined_unc_measure, cov_range)
                f1_cov_results = utils.update_f1_cov_results(f1_cov_results, model, testtype, category, "Entr(self)+CE(larger||smaller)", f1_cov_auc_value_combined)

                larger_unc_measure = f"entr_{model_ls[1]}+ce_{model_ls[1]}_{model_ls[0]}"
                f1_cov_auc_value_larger = utils.calculate_f1_cov_auc(pred_df, unc_df, f"pred_{model_ls[1]}", "target", larger_unc_measure, cov_range)
                f1_cov_results = utils.update_f1_cov_results(f1_cov_results, model, testtype, category.split("_")[0][1], "Entr(larger)+CE(larger||smaller)", f1_cov_auc_value_larger)

# Convert the results dictionary to a DataFrame and display it
f1_cov_df = pd.DataFrame(f1_cov_results).sort_values(["Test Set","Predictor Category","Model","Uncertainty Measure"], ignore_index=True)
display(f1_cov_df)

# Save the F1-Coverage AUC results to a CSV file
f1_cov_df.to_csv(f"{unc_out_prefix}f1_cov_auc_results.csv", index=False)

Evaluating F1-Coverage tradeoff AUC for ind...
Evaluating F1-Coverage tradeoff AUC for ood...


Unnamed: 0,Predictor Category,Model,Test Set,Uncertainty Measure,F1-Cov AUC
0,L,ML_duo_Resnet101ft0_Resnet152ft0,ind,Entr(larger)+CE(larger||smaller),73.341543
1,L,ML_duo_Resnet101ft1_Resnet152ft0,ind,Entr(larger)+CE(larger||smaller),74.213089
2,L,Resnet152ft0,ind,entr,71.565166
3,L,Resnet152ft0,ind,softmax,63.546453
4,L,SL_duo_Resnet50ft0_Resnet152ft0,ind,Entr(larger)+CE(larger||smaller),73.772689
...,...,...,...,...,...
79,SM_duo,SM_duo_Resnet50ft2_Resnet101ft0,ood,entr,67.512915
80,SM_duo,SM_duo_Resnet50ft2_Resnet101ft0,ood,softmax,67.091710
81,SM_duo,SM_duo_Resnet50ft2_Resnet101ft1,ood,Entr(self)+CE(larger||smaller),68.219377
82,SM_duo,SM_duo_Resnet50ft2_Resnet101ft1,ood,entr,69.335904


## OOD Detection AUROC

In [20]:
# Load predictor categories
with open(f"{pred_out_prefix}predictor_categories.json", "r") as file:
    predictor_categories = json.load(file)
    
# Initialize the AUROC OOD results dictionary
auroc_ood_results = {
    "Predictor Category": [],
    "Model": [],
    "Test Set": [],
    "Uncertainty Measure": [],
    "AUROC OOD": []
}

# Combine in-distribution and out-of-distribution data
pred_df_ind = pd.read_csv(f"{pred_out_prefix}ind.csv")
unc_df_ind = pd.read_csv(f"{unc_out_prefix}ind.csv")
pred_df_ood = pd.read_csv(f"{pred_out_prefix}ood.csv")
unc_df_ood = pd.read_csv(f"{unc_out_prefix}ood.csv")

pred_df_ind["test_type"] = "ind"
pred_df_ood["test_type"] = "ood"

pred_df_combined = pd.concat([pred_df_ood, pred_df_ind], ignore_index=True)
unc_df_combined = pd.concat([unc_df_ood, unc_df_ind], ignore_index=True)

# Calculate AUROC for OOD detection
for category, models in predictor_categories.items():
    for model in models:
        pred_column = f"pred_{model}"
        unc_measures = [f"entr_{model}", f"softmax_res_{model}"]

        # Calculate and update AUROC OOD values for each uncertainty measure
        for unc_measure in unc_measures:
            auroc_ood_value = utils.calculate_auroc_ood(pred_df_combined, unc_df_combined, pred_column, unc_measure)
            auroc_ood_results = utils.update_auroc_ood_results(auroc_ood_results, model, category, unc_measure.split("_")[0], auroc_ood_value)

        if category in ["SM_duo", "SL_duo", "ML_duo", "lpft_M", "lpft_L"]:
            model_ls = model.split('_')[2:]
            combined_unc_measure = f"entr_{model}+ce_{model_ls[1]}_{model_ls[0]}"
            auroc_ood_value_combined = utils.calculate_auroc_ood(pred_df_combined, unc_df_combined, pred_column, combined_unc_measure)
            auroc_ood_results = utils.update_auroc_ood_results(auroc_ood_results, model, category, "Entr(self)+CE(larger||smaller)", auroc_ood_value_combined)

            larger_unc_measure = f"entr_{model_ls[1]}+ce_{model_ls[1]}_{model_ls[0]}"
            auroc_ood_value_larger = utils.calculate_auroc_ood(pred_df_combined, unc_df_combined, f"pred_{model_ls[1]}", larger_unc_measure)
            auroc_ood_results = utils.update_auroc_ood_results(auroc_ood_results, model, category.split("_")[0][1], "Entr(larger)+CE(larger||smaller)", auroc_ood_value_larger)

# Convert the results dictionary to a DataFrame and display it
auroc_ood_df = pd.DataFrame(auroc_ood_results).sort_values(["Test Set","Predictor Category","Model","Uncertainty Measure"], ignore_index=True)
display(auroc_ood_df)

# Save the AUROC OOD results to a CSV file
auroc_ood_df.to_csv(f"{unc_out_prefix}auroc_ood_results.csv", index=False)

Unnamed: 0,Predictor Category,Model,Test Set,Uncertainty Measure,AUROC OOD
0,L,ML_duo_Resnet101ft0_Resnet152ft0,combined,Entr(larger)+CE(larger||smaller),0.617089
1,L,ML_duo_Resnet101ft1_Resnet152ft0,combined,Entr(larger)+CE(larger||smaller),0.624123
2,L,Resnet152ft0,combined,entr,0.620199
3,L,Resnet152ft0,combined,softmax,0.613903
4,L,SL_duo_Resnet50ft0_Resnet152ft0,combined,Entr(larger)+CE(larger||smaller),0.631984
5,L,SL_duo_Resnet50ft2_Resnet152ft0,combined,Entr(larger)+CE(larger||smaller),0.622232
6,M,Resnet101ft0,combined,entr,0.628289
7,M,Resnet101ft0,combined,softmax,0.62167
8,M,Resnet101ft1,combined,entr,0.625223
9,M,Resnet101ft1,combined,softmax,0.617597
