In [1]:
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, roc_curve, auc
import itertools
import json

# Setup

In [2]:
# --- File locations ---------------------------------------------------------
# Later cells build paths as f"{pred_prefix}{testtype}_{model}.csv" and
# f"{target_prefix}{testtype}.csv"; the *_out_prefix values prefix everything
# this notebook writes.
pred_prefix = "../ss/iWildCam_"
target_prefix = "../ss/target_iWildCam_"
unc_out_prefix = "ss/unc_"
pred_out_prefix = "ss/pred_"

# Passed through to utils.evaluate_models.
num_classes = 182
# Test-set tags; the first entry is the one used to register ensemble names.
test_sets = ["ind", "ood"]

# --- Model rosters (name = architecture + training mode + run index) --------
S_model_ls = [f"Resnet18ft{run}" for run in range(5)]
M_model_ls = [f"Resnet50ft{run}" for run in (0, 2)]
L_model_ls = [f"Resnet101ft{run}" for run in (0, 1)]
M_lp_ls = [f"Resnet50lp{run}" for run in (0, 2)]
L_lp_ls = [f"Resnet101lp{run}" for run in (0, 1)]

# --- Feature switches: which derived model families to build and evaluate ---
Include_Ensembles, Include_Duos, Include_LPFT = False, True, False

In [3]:
# Map each category name to its list of model names. The raw model families
# are always present; the derived families start empty and get their names
# appended when the ensembles are generated. Insertion order matters because
# later cells iterate this dict to build their result tables.
categories = dict(S=S_model_ls, M=M_model_ls, L=L_model_ls)

if Include_LPFT:
    categories.update(Mlp=M_lp_ls, Llp=L_lp_ls)
if Include_Ensembles:
    # One (initially empty) category per ensemble size, from 2 up to all S models.
    categories.update((f"EnsM={m}", []) for m in range(2, len(S_model_ls) + 1))
if Include_Duos:
    categories.update(SM_duo=[], SL_duo=[])
if Include_LPFT:
    categories.update(lpft_M=[], lpft_L=[])

print(f"Categories available based on inclusion settings: {categories.keys()}")

Categories available based on inclusion settings: dict_keys(['S', 'M', 'L', 'SM_duo', 'SL_duo'])


## Create ensembles as baselines

In [4]:
def _build_ensemble(testtype, category, name_prefix, members):
    """Soft-vote the members' predictions for one test set, save them, and register the name.

    Args:
        testtype: Test-set tag used in the prediction file names.
        category: Key of ``categories`` under which the ensemble name is recorded.
        name_prefix: Prefix handed to ``utils.generate_ensemble_name``.
        members: Model names whose prediction CSVs are combined, in this order.
    """
    member_preds = [pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv") for model in members]
    ensemble_pred = utils.softvote(member_preds)
    ensemble_name = utils.generate_ensemble_name(name_prefix, members)
    utils.save_ensemble_predictions(pred_prefix, testtype, ensemble_name, ensemble_pred)
    # The name does not depend on the test set, so it is recorded only while
    # processing the first one. The membership check keeps the list free of
    # duplicates when this cell is re-run against an already populated dict.
    if testtype == test_sets[0] and ensemble_name not in categories[category]:
        categories[category].append(ensemble_name)


for testtype in test_sets:
    print(f"Working on {testtype}:")

    if Include_Ensembles:
        print("Generating ensembles from S models")
        for m in range(2, len(S_model_ls) + 1):
            print(f"{m=}...")
            # Every size-m subset of the S models becomes one ensemble.
            for comb in itertools.combinations(S_model_ls, m):
                _build_ensemble(testtype, f"EnsM={m}", f"EnsS_{m}", comb)

    if Include_Duos:
        # Duos pair every S model with every M (resp. L) model; the smaller
        # model comes first in the name.
        print("Generating SM Duos")
        for s_model in S_model_ls:
            for m_model in M_model_ls:
                _build_ensemble(testtype, "SM_duo", "SM_duo", [s_model, m_model])
        print("Generating SL Duos")
        for s_model in S_model_ls:
            for l_model in L_model_ls:
                _build_ensemble(testtype, "SL_duo", "SL_duo", [s_model, l_model])

    if Include_LPFT:
        # LP/FT duos pair the i-th fine-tuned model with the i-th linear-probe
        # model (zip, not a cross product); the lp model comes first in the name.
        print("Generating lpft-M duos")
        for m_ft, m_lp in zip(M_model_ls, M_lp_ls):
            _build_ensemble(testtype, "lpft_M", "lpft_M", [m_lp, m_ft])
        print("Generating lpft-L duos")
        for l_ft, l_lp in zip(L_model_ls, L_lp_ls):
            _build_ensemble(testtype, "lpft_L", "lpft_L", [l_lp, l_ft])

# Save the categories dictionary for use in the evaluation script
with open(f"{pred_out_prefix}categories.json", "w") as file:
    json.dump(categories, file)

Working on ind:
Generating SM Duos
Generating SL Duos
Working on ood:
Generating SM Duos
Generating SL Duos


## {F1, Acc, Brier, ECE, MCE} of raw models and ensembles

In [5]:

# Reload the category -> model-name mapping from disk so this cell does not
# depend on the in-memory dict.
with open(f"{pred_out_prefix}categories.json", "r") as fh:
    categories = json.load(fh)

# Column-oriented accumulator: one list per output column. It is handed to
# utils.evaluate_models and replaced by that call's return value.
metric_columns = ("Model", "Test Set", "Acc", "F1", "Brier", "ECE", "MCE", "Category")
metrics_dict = {column: [] for column in metric_columns}

for testtype in test_sets:
    print(f"Evaluating on {testtype}:")
    label = pd.read_csv(f"{target_prefix}{testtype}.csv")

    for category in categories:
        print(f"Evaluating {category}")
        metrics_dict = utils.evaluate_models(
            categories[category],
            label,
            pred_prefix,
            testtype,
            metrics_dict,
            category,
            num_classes,
        )

metrics_df = pd.DataFrame(metrics_dict)
display(metrics_df)
metrics_df.to_csv(f"{pred_out_prefix}metrics.csv", index=False)


Evaluating on ind:
Evaluating S
Evaluating M
Evaluating L
Evaluating SM_duo
Evaluating SL_duo
Evaluating on ood:
Evaluating S
Evaluating M
Evaluating L
Evaluating SM_duo
Evaluating SL_duo


Unnamed: 0,Model,Test Set,Acc,F1,Brier,ECE,MCE,Category
0,Resnet18ft0,ind,0.750307,0.404493,0.002178,0.155458,0.803209,S
1,Resnet18ft1,ind,0.749448,0.38929,0.002121,0.149154,0.335268,S
2,Resnet18ft2,ind,0.76024,0.413737,0.002062,0.139432,0.317058,S
3,Resnet18ft3,ind,0.753863,0.399845,0.00213,0.150373,0.31938,S
4,Resnet18ft4,ind,0.761835,0.419727,0.002055,0.144228,0.321982,S
5,Resnet50ft0,ind,0.784155,0.453848,0.001948,0.160803,0.402448,M
6,Resnet50ft2,ind,0.774467,0.429172,0.002056,0.17575,0.486047,M
7,Resnet101ft0,ind,0.769438,0.439821,0.002136,0.193279,0.459607,L
8,Resnet101ft1,ind,0.788938,0.434513,0.001927,0.156467,0.34454,L
9,SM_duo_Resnet18ft0_Resnet50ft0,ind,0.782929,0.44386,0.001954,0.147357,0.400335,SM_duo


In [7]:
# Load the categories dictionary
with open(f"{pred_out_prefix}categories.json", "r") as file:
    categories = json.load(file)


def _add_cross_entropy_columns(unc, testtype, larger, smaller, ensemble_name):
    """Add the cross-entropy based columns for one model pair to ``unc`` in place.

    Three columns are written, in this order:
      * ``ce_<larger>_<smaller>``: ``utils.calc_cross_entr_torch(larger, smaller)``
      * ``entr_<larger>+ce_<larger>_<smaller>``
      * ``entr_<ensemble_name>+ce_<larger>_<smaller>``

    Args:
        unc: Uncertainty DataFrame; must already hold ``entr_<larger>`` and
            ``entr_<ensemble_name>``.
        testtype: Test-set tag used in the prediction file names.
        larger: Name of the model passed first to the cross-entropy helper.
        smaller: Name of the model passed second to the cross-entropy helper.
        ensemble_name: Name of the saved ensemble built from the two models.

    Raises:
        KeyError: If one of the required entropy columns is missing.
    """
    larger_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{larger}.csv")
    smaller_predictions = pd.read_csv(f"{pred_prefix}{testtype}_{smaller}.csv")
    ce_column = f"ce_{larger}_{smaller}"
    unc[ce_column] = utils.calc_cross_entr_torch(larger_predictions, smaller_predictions)
    unc[f"entr_{larger}+{ce_column}"] = unc[f"entr_{larger}"] + unc[ce_column]
    unc[f"entr_{ensemble_name}+{ce_column}"] = unc[f"entr_{ensemble_name}"] + unc[ce_column]


# Generate the uncertainty DataFrame
for testtype in test_sets:
    print(f"Generating uncertainty DataFrame for {testtype}...")
    unc = pd.DataFrame()

    # Calculate entropy for all models (raw models and saved ensembles alike)
    print("Calculating entropy for all models")
    for category, models in categories.items():
        for model in models:
            predictions = pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv")
            unc[f"entr_{model}"] = utils.calc_entr_torch(predictions)

    # Calculate cross-entropy for specific pairs.
    # Ensemble names are rebuilt here as "<prefix>_<first member>_<second member>";
    # this assumes utils.generate_ensemble_name uses that layout (the evaluation
    # cells rely on the same layout when they split the name on "_").
    if Include_Duos:
        print("Calculating cross-entropy(M||S)")
        for s_model in S_model_ls:
            for m_model in M_model_ls:
                _add_cross_entropy_columns(unc, testtype, m_model, s_model, f"SM_duo_{s_model}_{m_model}")

        print("Calculating cross-entropy(L||S)")
        for s_model in S_model_ls:
            for l_model in L_model_ls:
                _add_cross_entropy_columns(unc, testtype, l_model, s_model, f"SL_duo_{s_model}_{l_model}")

    if Include_LPFT:
        # The evaluation cells look up "entr_<ensemble>+ce_<ft>_<lp>" for the
        # lpft categories as well, so that column is produced here too.
        print("Calculating cross-entropy(Mft||Mlp)")
        for m_ft, m_lp in zip(M_model_ls, M_lp_ls):
            _add_cross_entropy_columns(unc, testtype, m_ft, m_lp, f"lpft_M_{m_lp}_{m_ft}")

        print("Calculating cross-entropy(Lft||Llp)")
        for l_ft, l_lp in zip(L_model_ls, L_lp_ls):
            _add_cross_entropy_columns(unc, testtype, l_ft, l_lp, f"lpft_L_{l_lp}_{l_ft}")

    # Save the uncertainty DataFrame
    unc.to_csv(f"{unc_out_prefix}{testtype}.csv", index=False)
    print(f"Saved uncertainty DataFrame for {testtype}.")

Generating uncertainty DataFrame for ind...
Calculating entropy for all models
Calculating cross-entropy(M||S)
Calculating cross-entropy(L||S)
Saved uncertainty DataFrame for ind.
Generating uncertainty DataFrame for ood...
Calculating entropy for all models
Calculating cross-entropy(M||S)
Calculating cross-entropy(L||S)
Saved uncertainty DataFrame for ood.


In [8]:
# Load the categories dictionary
with open(f"{pred_out_prefix}categories.json", "r") as file:
    categories = json.load(file)

# Generate prediction DataFrame: the target file's columns plus one
# "pred_<model>" column per model.
for testtype in test_sets:
    print(f"Generating prediction DataFrame for {testtype}...")
    pred_df = pd.read_csv(f"{target_prefix}{testtype}.csv")

    for category, models in categories.items():
        for model in models:
            predictions = pd.read_csv(f"{pred_prefix}{testtype}_{model}.csv")
            # idxmax(axis=1) yields, per row, the label of the highest-valued
            # column; the predicted class is the first run of digits in that label.
            # Raw string: "\d" is an invalid escape in a normal string literal.
            # expand=False: return a Series rather than a one-column DataFrame.
            top_column = predictions.idxmax(axis=1)
            pred_df[f"pred_{model}"] = top_column.str.extract(r'(\d+)', expand=False)

    # Save the prediction DataFrame
    pred_df.to_csv(f"{pred_out_prefix}{testtype}.csv", index=False)
    print(f"Saved prediction DataFrame for {testtype}.")
    display(pred_df.head(3))

Generating prediction DataFrame for ind...
Saved prediction DataFrame for ind.


Unnamed: 0,target,pred_Resnet18ft0,pred_Resnet18ft1,pred_Resnet18ft2,pred_Resnet18ft3,pred_Resnet18ft4,pred_Resnet50ft0,pred_Resnet50ft2,pred_Resnet101ft0,pred_Resnet101ft1,...,pred_SL_duo_Resnet18ft0_Resnet101ft0,pred_SL_duo_Resnet18ft0_Resnet101ft1,pred_SL_duo_Resnet18ft1_Resnet101ft0,pred_SL_duo_Resnet18ft1_Resnet101ft1,pred_SL_duo_Resnet18ft2_Resnet101ft0,pred_SL_duo_Resnet18ft2_Resnet101ft1,pred_SL_duo_Resnet18ft3_Resnet101ft0,pred_SL_duo_Resnet18ft3_Resnet101ft1,pred_SL_duo_Resnet18ft4_Resnet101ft0,pred_SL_duo_Resnet18ft4_Resnet101ft1
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


Generating prediction DataFrame for ood...
Saved prediction DataFrame for ood.


Unnamed: 0,target,pred_Resnet18ft0,pred_Resnet18ft1,pred_Resnet18ft2,pred_Resnet18ft3,pred_Resnet18ft4,pred_Resnet50ft0,pred_Resnet50ft2,pred_Resnet101ft0,pred_Resnet101ft1,...,pred_SL_duo_Resnet18ft0_Resnet101ft0,pred_SL_duo_Resnet18ft0_Resnet101ft1,pred_SL_duo_Resnet18ft1_Resnet101ft0,pred_SL_duo_Resnet18ft1_Resnet101ft1,pred_SL_duo_Resnet18ft2_Resnet101ft0,pred_SL_duo_Resnet18ft2_Resnet101ft1,pred_SL_duo_Resnet18ft3_Resnet101ft0,pred_SL_duo_Resnet18ft3_Resnet101ft1,pred_SL_duo_Resnet18ft4_Resnet101ft0,pred_SL_duo_Resnet18ft4_Resnet101ft1
0,113,0,0,0,0,0,68,113,113,0,...,113,0,0,0,0,0,0,0,0,0
1,113,0,0,0,0,0,68,68,113,68,...,113,0,0,0,0,68,0,68,0,68
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Accuracy Prediction AUROC

In [13]:
# Reload the category -> model-name mapping from disk.
with open(f"{pred_out_prefix}categories.json", "r") as fh:
    categories = json.load(fh)

# Column-oriented accumulator for the AUROC table: one list per output column,
# all appended to in lockstep so they stay the same length.
auroc_results = {
    column: []
    for column in (
        "Category",
        "Model",
        "Test Set",
        "AUROC Entr(self)",
        "AUROC Entr(self)+CE(larger||smaller)",
        "AUROC Entr(larger)+CE(larger||smaller)",
    )
}

def calculate_auroc(pred_df, unc_df, pred_vec, target_vec, unc_vec):
    """Return the AUROC of an uncertainty score as a detector of wrong predictions.

    The positive class is "prediction does not match target", so a higher
    AUROC means the score in ``unc_df[unc_vec]`` is larger on the mismatches.

    Args:
        pred_df: DataFrame holding the prediction and target columns.
        unc_df: DataFrame holding the uncertainty column, row-aligned with ``pred_df``.
        pred_vec: Name of the prediction column in ``pred_df``.
        target_vec: Name of the target column in ``pred_df``.
        unc_vec: Name of the uncertainty column in ``unc_df``.

    Returns:
        The area under the ROC curve.
    """
    predicted = pred_df[pred_vec]
    if not isinstance(predicted.iloc[0], str):
        is_error = predicted != pred_df[target_vec]
    else:
        # String predictions: a row counts as a match when str(target) occurs
        # anywhere inside the prediction string.
        # NOTE(review): this is a substring test ("1" matches "113") - confirm intended.
        is_error = pred_df.apply(lambda row: str(row[target_vec]) not in row[pred_vec], axis=1)
    fpr, tpr, _ = roc_curve(is_error, unc_df[unc_vec])
    return auc(fpr, tpr)

for testtype in test_sets:
    print(f"Evaluating AUROC for {testtype}...")
    pred_df = pd.read_csv(f"{pred_out_prefix}{testtype}.csv")
    unc_df = pd.read_csv(f"{unc_out_prefix}{testtype}.csv")

    for category, models in categories.items():
        for model in models:
            # Every model is scored with its own entropy.
            self_entropy_auroc = calculate_auroc(pred_df, unc_df, f"pred_{model}", "target", f"entr_{model}")

            # The two cross-entropy based scores exist only for pair ensembles.
            larger_ce_auroc = None
            self_ce_auroc = None
            if category in ["SM_duo", "SL_duo", "lpft_M", "lpft_L"]:
                # Pair names split on "_" into two prefix tokens followed by the members.
                members = model.split('_')[2:]
                first_member, second_member = members[0], members[1]
                ce_suffix = f"ce_{second_member}_{first_member}"
                larger_ce_auroc = calculate_auroc(
                    pred_df, unc_df, f"pred_{second_member}", "target", f"entr_{second_member}+{ce_suffix}"
                )
                self_ce_auroc = calculate_auroc(
                    pred_df, unc_df, f"pred_{model}", "target", f"entr_{model}+{ce_suffix}"
                )

            # Record one row across all column lists.
            auroc_results["Category"].append(category)
            auroc_results["Model"].append(model)
            auroc_results["Test Set"].append(testtype)
            auroc_results["AUROC Entr(self)"].append(self_entropy_auroc)
            auroc_results["AUROC Entr(self)+CE(larger||smaller)"].append(self_ce_auroc)
            auroc_results["AUROC Entr(larger)+CE(larger||smaller)"].append(larger_ce_auroc)

# Tabulate, show, and persist the results.
auroc_df = pd.DataFrame(auroc_results)
display(auroc_df)

auroc_df.to_csv(f"{unc_out_prefix}auroc_results.csv", index=False)

Evaluating AUROC for ind...
Evaluating AUROC for ood...


Unnamed: 0,Category,Model,Test Set,AUROC Entr(self),AUROC Entr(self)+CE(larger||smaller),AUROC Entr(larger)+CE(larger||smaller)
0,S,Resnet18ft0,ind,0.851489,,
1,S,Resnet18ft1,ind,0.883082,,
2,S,Resnet18ft2,ind,0.874906,,
3,S,Resnet18ft3,ind,0.862662,,
4,S,Resnet18ft4,ind,0.873154,,
5,M,Resnet50ft0,ind,0.881775,,
6,M,Resnet50ft2,ind,0.864856,,
7,L,Resnet101ft0,ind,0.859352,,
8,L,Resnet101ft1,ind,0.872691,,
9,SM_duo,SM_duo_Resnet18ft0_Resnet50ft0,ind,0.874653,0.867665,0.865839


In [16]:
# Reload the category -> model-name mapping from disk.
with open(f"{pred_out_prefix}categories.json", "r") as fh:
    categories = json.load(fh)

# Column-oriented accumulator for the F1-coverage table: one list per output
# column, all appended to in lockstep so they stay the same length.
f1_cov_results = {
    column: []
    for column in (
        "Category",
        "Model",
        "Test Set",
        "F1-Cov Entr(self)",
        "F1-Cov Entr(self)+CE(larger||smaller)",
        "F1-Cov Entr(larger)+CE(larger||smaller)",
    )
}

def calculate_f1_cov_auc(pred_df, unc_df, pred_vec, target_vec, unc_vec, cov_range):
    """Sum the macro-F1 obtained at each coverage level of an uncertainty score.

    For every value ``c`` in ``cov_range`` the rows whose rank (from
    ``utils.get_rank(unc_df)``, column ``unc_vec``) is below
    ``(c + 1) / 100 * len(pred_df)`` are kept and their macro-F1 is computed.
    Presumably a lower rank means lower uncertainty - confirm against
    ``utils.get_rank``.

    Args:
        pred_df: DataFrame holding the prediction and target columns.
        unc_df: DataFrame of uncertainty scores, row-aligned with ``pred_df``.
        pred_vec: Name of the prediction column in ``pred_df``.
        target_vec: Name of the target column in ``pred_df``.
        unc_vec: Name of the uncertainty column to rank by.
        cov_range: NumPy array of coverage levels.

    Returns:
        The plain (unnormalised) sum of the per-level macro-F1 scores.
    """
    ranks = utils.get_rank(unc_df)
    curve = pd.DataFrame()
    curve["coverage"] = cov_range
    cutoffs = (cov_range + 1) / 100 * pred_df.shape[0]
    for row, cutoff in enumerate(cutoffs):
        keep = ranks[unc_vec] < cutoff
        retained = pred_df.loc[keep]
        curve.loc[row, "f1"] = f1_score(retained[target_vec], retained[pred_vec], average='macro')
    return np.sum(curve["f1"])

# Coverage levels to sweep. calculate_f1_cov_auc turns each value c into a rank
# cutoff of (c + 1) / 100 * number of rows, so 19..99 covers 20%..100%.
cov_range = np.arange(19, 100)

for testtype in test_sets:
    print(f"Evaluating F1-Coverage tradeoff AUC for {testtype}...")
    pred_df = pd.read_csv(f"{pred_out_prefix}{testtype}.csv")
    unc_df = pd.read_csv(f"{unc_out_prefix}{testtype}.csv")

    for category, models in categories.items():
        print(f"{category=}")
        for model in models:
            # Every model is scored with its own entropy.
            pred_column = f"pred_{model}"
            entr_column = f"entr_{model}"
            f1_cov_auc_value = calculate_f1_cov_auc(pred_df, unc_df, pred_column, "target", entr_column, cov_range)

            # Add F1-Coverage AUC values to the results dictionary. The two
            # cross-entropy columns default to None and are overwritten below
            # for pair ensembles only.
            f1_cov_results["Category"].append(category)
            f1_cov_results["Model"].append(model)
            f1_cov_results["Test Set"].append(testtype)
            f1_cov_results["F1-Cov Entr(self)"].append(f1_cov_auc_value)
            f1_cov_results["F1-Cov Entr(self)+CE(larger||smaller)"].append(None)
            f1_cov_results["F1-Cov Entr(larger)+CE(larger||smaller)"].append(None)

            if category in ["SM_duo", "SL_duo", "lpft_M", "lpft_L"]:
                # Pair names split on "_" into two prefix tokens followed by
                # the two member names; index 1 is used as the "larger" model.
                model_ls = model.split('_')[2:]

                # Larger model's predictions, ranked by its entropy plus the cross-entropy.
                entr_l_ce_l_s_column = f"entr_{model_ls[1]}+ce_{model_ls[1]}_{model_ls[0]}"
                combined_f1_cov_auc_value = calculate_f1_cov_auc(pred_df, unc_df, f"pred_{model_ls[1]}", "target", entr_l_ce_l_s_column, cov_range)
                # Ensemble's own predictions, ranked by its entropy plus the same cross-entropy.
                entr_self_ce_l_s_column = f"entr_{model}+ce_{model_ls[1]}_{model_ls[0]}"
                combined_f1_cov_auc_value_self = calculate_f1_cov_auc(pred_df, unc_df, f"pred_{model}", "target", entr_self_ce_l_s_column, cov_range)
                f1_cov_results["F1-Cov Entr(larger)+CE(larger||smaller)"][-1] = combined_f1_cov_auc_value
                f1_cov_results["F1-Cov Entr(self)+CE(larger||smaller)"][-1] = combined_f1_cov_auc_value_self


# Convert the results dictionary to a DataFrame
f1_cov_df = pd.DataFrame(f1_cov_results)
display(f1_cov_df)

# Save the F1-Coverage AUC results to a CSV file
f1_cov_df.to_csv(f"{unc_out_prefix}f1_cov_auc_results.csv", index=False)

Evaluating F1-Coverage tradeoff AUC for ind...
category='S'
category='M'
category='L'
category='SM_duo'
category='SL_duo'
Evaluating F1-Coverage tradeoff AUC for ood...
category='S'
category='M'
category='L'
category='SM_duo'
category='SL_duo'


Unnamed: 0,Category,Model,Test Set,F1-Cov Entr(self),F1-Cov Entr(self)+CE(larger||smaller),F1-Cov Entr(larger)+CE(larger||smaller)
0,S,Resnet18ft0,ind,55.29063,,
1,S,Resnet18ft1,ind,55.322073,,
2,S,Resnet18ft2,ind,55.357192,,
3,S,Resnet18ft3,ind,54.065846,,
4,S,Resnet18ft4,ind,56.240043,,
5,M,Resnet50ft0,ind,58.030419,,
6,M,Resnet50ft2,ind,55.395392,,
7,L,Resnet101ft0,ind,56.365964,,
8,L,Resnet101ft1,ind,59.679414,,
9,SM_duo,SM_duo_Resnet18ft0_Resnet50ft0,ind,59.155365,57.961449,57.817824


In [18]:
# Reload the category -> model-name mapping from disk.
with open(f"{pred_out_prefix}categories.json", "r") as fh:
    categories = json.load(fh)

# Column-oriented accumulator for the OOD-detection AUROC table: one list per
# output column, all appended to in lockstep so they stay the same length.
auroc_ood_results = {
    column: []
    for column in (
        "Category",
        "Model",
        "Test Set",
        "AUROC OOD Entr(self)",
        "AUROC OOD Entr(self)+CE(larger||smaller)",
        "AUROC OOD Entr(larger)+CE(larger||smaller)",
    )
}

def calculate_auroc_ood(pred_df, unc_df, pred_vec, unc_vec):
    """Return the AUROC of an uncertainty score as a detector of OOD rows.

    The positive class is ``pred_df["test_type"] == "ood"``.

    Args:
        pred_df: DataFrame with a ``test_type`` column marking each row's origin.
        unc_df: DataFrame holding the uncertainty column, row-aligned with ``pred_df``.
        pred_vec: Accepted for signature parity with the other scorers; not used.
        unc_vec: Name of the uncertainty column in ``unc_df``.

    Returns:
        The area under the ROC curve.
    """
    ood_mask = pred_df["test_type"].isin(["ood"]).to_numpy()
    fpr, tpr, _ = roc_curve(ood_mask, unc_df[unc_vec])
    return auc(fpr, tpr)

# Load both test sets, tag every prediction row with its origin, and stack them
# (OOD rows first) so a single ROC curve can separate the two.
pred_df_ind = pd.read_csv(f"{pred_out_prefix}ind.csv").assign(test_type="ind")
unc_df_ind = pd.read_csv(f"{unc_out_prefix}ind.csv")
pred_df_ood = pd.read_csv(f"{pred_out_prefix}ood.csv").assign(test_type="ood")
unc_df_ood = pd.read_csv(f"{unc_out_prefix}ood.csv")

# Both frames are concatenated in the same order so their rows stay aligned.
pred_df_combined = pd.concat([pred_df_ood, pred_df_ind], ignore_index=True)
unc_df_combined = pd.concat([unc_df_ood, unc_df_ind], ignore_index=True)

# Score every model's uncertainty as an OOD detector.
for category, models in categories.items():
    for model in models:
        # Every model is scored with its own entropy.
        self_entropy_auroc = calculate_auroc_ood(
            pred_df_combined, unc_df_combined, f"pred_{model}", f"entr_{model}"
        )

        # The two cross-entropy based scores exist only for pair ensembles.
        larger_ce_auroc = None
        self_ce_auroc = None
        if category in ["SM_duo", "SL_duo", "lpft_M", "lpft_L"]:
            # Pair names split on "_" into two prefix tokens followed by the members.
            members = model.split('_')[2:]
            first_member, second_member = members[0], members[1]
            ce_suffix = f"ce_{second_member}_{first_member}"
            larger_ce_auroc = calculate_auroc_ood(
                pred_df_combined, unc_df_combined, f"pred_{second_member}", f"entr_{second_member}+{ce_suffix}"
            )
            self_ce_auroc = calculate_auroc_ood(
                pred_df_combined, unc_df_combined, f"pred_{model}", f"entr_{model}+{ce_suffix}"
            )

        # Record one row across all column lists.
        auroc_ood_results["Category"].append(category)
        auroc_ood_results["Model"].append(model)
        auroc_ood_results["Test Set"].append("combined")
        auroc_ood_results["AUROC OOD Entr(self)"].append(self_entropy_auroc)
        auroc_ood_results["AUROC OOD Entr(self)+CE(larger||smaller)"].append(self_ce_auroc)
        auroc_ood_results["AUROC OOD Entr(larger)+CE(larger||smaller)"].append(larger_ce_auroc)

# Tabulate, show, and persist the results.
auroc_ood_df = pd.DataFrame(auroc_ood_results)
display(auroc_ood_df)

auroc_ood_df.to_csv(f"{unc_out_prefix}auroc_ood_results.csv", index=False)

Unnamed: 0,Category,Model,Test Set,AUROC OOD Entr(self),AUROC OOD Entr(self)+CE(larger||smaller),AUROC OOD Entr(larger)+CE(larger||smaller)
0,S,Resnet18ft0,combined,0.596179,,
1,S,Resnet18ft1,combined,0.612208,,
2,S,Resnet18ft2,combined,0.596868,,
3,S,Resnet18ft3,combined,0.588367,,
4,S,Resnet18ft4,combined,0.608873,,
5,M,Resnet50ft0,combined,0.625165,,
6,M,Resnet50ft2,combined,0.620984,,
7,L,Resnet101ft0,combined,0.628289,,
8,L,Resnet101ft1,combined,0.625223,,
9,SM_duo,SM_duo_Resnet18ft0_Resnet50ft0,combined,0.606102,0.627237,0.633579
