In [None]:
import sys

# Make the local project checkouts and the shared library folder importable.
# Entries are appended (not prepended), in the same order as before.
sys.path.extend([
    '/storage/vbutoi/projects',
    '/storage/vbutoi/libraries',
    '/storage/vbutoi/projects/ESE',
    '/storage/vbutoi/projects/UniverSeg',
])

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's dark-grid look for every figure in this notebook.
sns.set_style(style="darkgrid")

import os

# DATAPATH is written as a ':'-joined list of dataset roots; only one root is configured.
os.environ['DATAPATH'] = ':'.join([
    '/storage/vbutoi/datasets',
])

# ionpy's ResultsLoader is instantiated once here and kept in `rs`.
from ionpy.analysis import ResultsLoader
from pathlib import Path

root = Path("/storage/vbutoi/scratch", "ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 
# For using yaml configs.
%load_ext yamlmagic

In [None]:
# Base name of the experiment (presumably the folder under the inference root — TODO confirm).
# NOTE(review): the active `inference_paths` in the %%yaml results_cfg cell point at
# "01_14_24_EnsembleAnalysis", not at this value — confirm which experiment is intended.
exp_base = "01_10_24_wLabAmounts"
# exp_base = "01_14_24_EnsembleAnalysis"

In [None]:
%%yaml results_cfg 

# Which inference runs to load and how to load them.
log:
    root: /storage/vbutoi/scratch/ESE/inference
    load_pixel_meters: True 
    remove_shared_columns: True
    # Runs from the earlier 01_10_24_wLabAmounts experiment, currently disabled:
    # inference_paths:
    #     - "01_10_24_wLabAmounts/WMH_Individual_Uncalibrated"
    #     - "01_10_24_wLabAmounts/WMH_Individual_TempScaling"
    #     - "01_10_24_wLabAmounts/WMH_Individual_VectorScaling"
    #     - "01_10_24_wLabAmounts/WMH_Individual_DirichletScaling"
    #     - "01_10_24_wLabAmounts/WMH_Individual_LTS"
    #     - "01_10_24_wLabAmounts/WMH_Ensemble_Uncalibrated"
    #     - "01_10_24_wLabAmounts/WMH_Ensemble_TempScaling"
    #     - "01_10_24_wLabAmounts/WMH_Ensemble_VectorScaling"
    #     - "01_10_24_wLabAmounts/WMH_Ensemble_DirichletScaling"
    #     - "01_10_24_wLabAmounts/WMH_Ensemble_LTS"
    # Active runs: individual and ensemble models, uncalibrated plus four calibrators.
    # Paths are presumably resolved relative to `root` — TODO confirm.
    inference_paths:
        - "01_14_24_EnsembleAnalysis/WMH_Individual_Uncalibrated"
        - "01_14_24_EnsembleAnalysis/WMH_Individual_TempScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Individual_VectorScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Individual_DirichletScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Individual_LTS"
        - "01_14_24_EnsembleAnalysis/WMH_Ensemble_Uncalibrated"
        - "01_14_24_EnsembleAnalysis/WMH_Ensemble_TempScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Ensemble_VectorScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Ensemble_DirichletScaling"
        - "01_14_24_EnsembleAnalysis/WMH_Ensemble_LTS"
    
# Settings for the calibration statistics (their exact meaning is defined by the
# loader code, which is not visible in this notebook).
calibration:
    conf_interval:
        - 0.5
        - 1.
    num_bins: 10
    square_diff: False 
    neighborhood_width: 3

# Calibration metrics: each entry maps a display name to the dotted path of the
# function in `_fn`. The Foreground_* entries reuse the same functions with
# `ignore_index: 0` (presumably to exclude label 0 / background — TODO confirm).
cal_metrics:
    - ECE:
        _fn: ese.experiment.metrics.ece.ece_loss
    - CW_ECE:
        _fn: ese.experiment.metrics.ece.cw_ece_loss
    - Edge_ECE:
        _fn: ese.experiment.metrics.ece.edge_ece_loss
    - ELM:
        _fn: ese.experiment.metrics.elm.elm_loss
    - Foreground_ECE:
        _fn: ese.experiment.metrics.ece.ece_loss
        ignore_index: 0
    - Foreground_CW_ECE:
        _fn: ese.experiment.metrics.ece.cw_ece_loss
        ignore_index: 0
    - Foreground_Edge_ECE:
        _fn: ese.experiment.metrics.ece.edge_ece_loss
        ignore_index: 0       
    - Foreground_ELM:
        _fn: ese.experiment.metrics.elm.elm_loss
        ignore_index: 0

In [None]:
from ese.experiment.analysis.inference import load_cal_inference_stats

# Load the inference statistics described by `results_cfg`, the object created
# by the %%yaml cell above.
inference_df = load_cal_inference_stats(results_cfg=results_cfg)

## We are going to do the same standardization for our df. 

In [None]:
# Get rid of unusable / NaN rows.
####################################################################

# Keep only the rows where 'num_lab_1_pixels' is greater than the threshold.
pix_threshold = 100
thresholded_pix_df = inference_df.loc[
    inference_df['num_lab_1_pixels'] > pix_threshold
].reset_index(drop=True)
# Drop the rows that have no value for 'metric_score'.
de_naned_image_info_df = thresholded_pix_df.dropna(subset=['metric_score'])
# Replace every remaining NaN with the string 'None' (a sentinel, not Python's None).
image_info_df = de_naned_image_info_df.fillna(value='None')

# Add shorter aliases for the dotted config columns.
####################################################################

image_info_df = image_info_df.assign(**{
    "ensemble": image_info_df["model.ensemble"],
    "pre_softmax": image_info_df["model.ensemble_pre_softmax"],
    "combine_fn": image_info_df["model.ensemble_combine_fn"],
    "pretrained_seed": image_info_df["experiment.pretrained_seed"],
    "model_class": image_info_df["model._class"],
    "pretrained_model_class": image_info_df["model._pretrained_class"],
})

def method_name(model_class, pretrained_model_class, pretrained_seed, ensemble, pre_softmax, combine_fn):
    """Build the human-readable method label for one row.

    Parameter names must match the DataFrame column names, because the function
    is handed to `image_info_df.augment(...)`.

    Args:
        model_class: Dotted class path of the model.
        pretrained_model_class: Dotted class path of the pretrained model, or
            the string "None" when the row has none.
        pretrained_seed: Seed shown in the label of non-ensemble rows.
        ensemble: Whether the row is an ensemble. May be the string "None".
        pre_softmax: Whether the ensemble combines pre-softmax outputs. May be
            the string "None".
        combine_fn: Name of the ensemble combine function.

    Returns:
        "Ensemble (<combine_fn>, pre|post)" for ensemble rows, otherwise
        "<ClassName> (seed=<pretrained_seed>)", where ClassName is the last
        dotted component of the pretrained class if set, else of the model class.
    """
    def _is_set(flag):
        # Missing values were replaced upstream by the string 'None', which is
        # truthy; treat that sentinel as "not set" instead of as True.
        if isinstance(flag, str) and flag == "None":
            return False
        return bool(flag)

    if _is_set(ensemble):
        softmax_modifier = "pre" if _is_set(pre_softmax) else "post"
        return f"Ensemble ({combine_fn}, {softmax_modifier})"

    # Prefer the pretrained backbone's class name when there is one.
    source_class = model_class if pretrained_model_class == "None" else pretrained_model_class
    return f"{source_class.split('.')[-1]} (seed={pretrained_seed})"

def model_class(model_class):
    """Map a dotted model class path to the calibrator label used in plots.

    Any path containing "UNet" is labelled "Uncalibrated"; every other path is
    reduced to its last dotted component.
    """
    return "Uncalibrated" if "UNet" in model_class else model_class.rsplit('.', 1)[-1]

def configuration(method_name, model_class):
    """Join the method label and the calibrator label with an underscore."""
    return "{}_{}".format(method_name, model_class)

# Add one derived column per function. Order matters: `configuration` reads the
# 'method_name' and 'model_class' values produced by the first two calls.
for _augment_fn in (method_name, model_class, configuration):
    image_info_df.augment(_augment_fn)

# Sorting for plotting.
####################################################################

# Sort the rows by method name, then calibrator name, so everything appears nicely.
image_info_df = image_info_df.sort_values(by=['method_name', 'model_class'])
# Make 'model_class' categorical with 'Uncalibrated' first.
# `reorder_categories` returns a new Series, so the result is assigned back; its
# `inplace` argument was deprecated in pandas 1.3 and removed in pandas 2.0.
# It raises ValueError unless the list holds exactly the categories present in the data.
image_info_df['model_class'] = (
    image_info_df['model_class']
    .astype('category')
    .cat.reorder_categories(['Uncalibrated', 'Temperature_Scaling', 'Vector_Scaling', 'Dirichlet_Scaling', 'LTS'])
)

## Let's see whether better ECE/ELM leads to better ensembles. Note that this is not a conclusive result, because the number of samples per image used to calculate ECE/ELM is not sufficient to estimate the actual statistical quantities.

### First, for each slice and each model configuration, we calculate the delta between that configuration's performance on the slice and the average uncalibrated UNet performance on the same slice.

In [None]:
# 

## Now we want to consider these averaged within subjects.

In [None]:
# Average 'qual_score' over all rows that share the same method, calibrator,
# metric, ensemble flag and 'data_id' (presumably one 'data_id' per subject — TODO confirm).
# NOTE(review): this cell reads 'qual_metric' / 'qual_score', while the cleaning
# cell above filters on 'metric_score' — confirm these columns exist in image_info_df.
subj_image_info_df = (
    image_info_df
    .groupby(["method_name", "model_class", "qual_metric", "ensemble", "data_id"])['qual_score']
    .mean()
    .reset_index()
)