In [None]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import os 
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

# Results loader object does everything
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

# Parsed into the `results_cfg` variable, which the next cell hands to
# `load_cal_inference_stats`.
log:
    # Base directory of the inference runs; the `inference_paths` entries below are
    # presumably resolved relative to it — TODO confirm in the loader.
    root: /storage/vbutoi/scratch/ESE/inference
    load_pixel_meters: True 
    remove_shared_columns: True
    drop_nan_metric_rows: False
    min_fg_pixels: 100
    # Path names suggest one run per calibrator, for individual models and for ensembles.
    inference_paths:
        - "01_17_24_Ignore_None/WMH_Individual_Uncalibrated"
        - "01_17_24_Ignore_None/WMH_Individual_TempScaling"
        - "01_17_24_Ignore_None/WMH_Individual_VectorScaling"
        - "01_17_24_Ignore_None/WMH_Individual_DirichletScaling"
        - "01_17_24_Ignore_None/WMH_Individual_LTS"
        - "01_17_24_Ignore_None/WMH_Ensemble_Uncalibrated"
        - "01_17_24_Ignore_None/WMH_Ensemble_TempScaling"
        - "01_17_24_Ignore_None/WMH_Ensemble_VectorScaling"
        - "01_17_24_Ignore_None/WMH_Ensemble_DirichletScaling"
        - "01_17_24_Ignore_None/WMH_Ensemble_LTS"
    
# Calibration settings; `num_bins`, `neighborhood_width` and `square_diff` also appear
# as column names in the grouping keys used later in this notebook.
calibration:
    conf_interval:
        - 0.5
        - 1.
    num_bins: 10
    square_diff: False 
    neighborhood_width: 3

# cal_metrics:
#     - ECE:
#         _fn: ese.experiment.metrics.ece.ece_loss
#     - CW_ECE:
#         _fn: ese.experiment.metrics.ece.cw_ece_loss
#     - Edge_ECE:
#         _fn: ese.experiment.metrics.ece.edge_ece_loss
#     - ELM:
#         _fn: ese.experiment.metrics.elm.elm_loss
#     - Foreground_ECE:
#         _fn: ese.experiment.metrics.ece.ece_loss
#         ignore_index: 0
#     - Foreground_CW_ECE:
#         _fn: ese.experiment.metrics.ece.cw_ece_loss
#         ignore_index: 0
#     - Foreground_Edge_ECE:
#         _fn: ese.experiment.metrics.ece.edge_ece_loss
#         ignore_index: 0       
#     - Foreground_ELM:
#         _fn: ese.experiment.metrics.elm.elm_loss
#         ignore_index: 0

In [None]:
from ese.experiment.analysis.inference import load_cal_inference_stats

# Build `image_info_df` from the `results_cfg` defined in the YAML cell.
image_info_df = load_cal_inference_stats(results_cfg=results_cfg)

## Let's see whether there is any hope that better ECE/ELM makes for better ensembles. Note that this isn't a conclusive result, because the number of samples per image used to calculate ECE/ELM is not sufficient to get actual statistical quantities.

### First, for each slice and each model configuration, we calculate the delta in performance between that configuration's performance on the slice and the average uncalibrated UNet performance on that slice.

In [None]:
# Keep only the rows whose 'ensemble' flag is False, and renumber the index from 0.
unet_info_df = (
    image_info_df
    .loc[image_info_df['ensemble'].eq(False)]
    .reset_index(drop=True)
)

In [None]:
# Columns used to group `unet_info_df` before averaging over seeds. No seed column is
# listed, so the rows that share one key combination differ only by seed.
unet_group_keys = [
    'data_id', 'slice_idx',
    'num_lab_0_pixels', 'num_lab_1_pixels',
    'num_bins', 'neighborhood_width', 'square_diff',
    'image_metric',
    'model._class', 'model.checkpoint', 'model._pretrained_class',
    'groupavg_image_metric',
    'model_class', 'pretrained_model_class',
    'metric_type', 'model_type',
    'calibrator',
]

In [None]:
# Sanity check on the grouping keys: every key combination must collect the same,
# expected number of rows. If it does not, the key list needs to be adjusted.
num_rows_per_group = unet_info_df.groupby(unet_group_keys).size()
# Four seeds were run, so both the smallest and the largest group must hold 4 rows.
assert num_rows_per_group.min() == 4 and num_rows_per_group.max() == 4, (
    f"Grouping by these keys does not give the required number of rows per seed (4), Got: {num_rows_per_group}."
)

In [None]:
# Average both score columns within each key combination (i.e. across seeds), then tag
# the resulting rows so they can be told apart from the per-seed rows.
average_seed_unet = (
    unet_info_df
    .groupby(unet_group_keys)
    .agg({'metric_score': 'mean', 'groupavg_metric_score': 'mean'})
    .reset_index()
    .assign(**{
        'experiment.pretrained_seed': 'Average',
        'pretrained_seed': 'Average',
        # These rows now describe a group of results rather than a single run.
        'model_type': 'group',
    })
)

def method_name(pretrained_model_class, model_class):
    """Return the display name of a seed-averaged model.

    The name is the last dot-separated component of `pretrained_model_class`,
    unless that argument is the string "None", in which case the last component
    of `model_class` is used instead. " (seed=Average)" is always appended.
    """
    source_class = model_class if pretrained_model_class == "None" else pretrained_model_class
    short_name = source_class.rsplit('.', 1)[-1]
    return f"{short_name} (seed=Average)"

def configuration(method_name, calibrator):
    """Return "<method_name>_<calibrator>" as a single configuration label."""
    return "{}_{}".format(method_name, calibrator)

# `augment` is not a built-in pandas method — presumably a DataFrame extension from one
# of the project libraries that adds a column named after the function, computed from
# the columns matching its parameter names (TODO confirm against the library).
# `configuration` takes a `method_name` argument, so it presumably has to run second.
average_seed_unet.augment(method_name)
average_seed_unet.augment(configuration)

In [None]:
# Stack the seed-averaged UNet rows under the original rows and renumber the index.
info_df_w_baselines = pd.concat(
    (image_info_df, average_seed_unet),
    ignore_index=True,
)

### Now we have to add to each row a column holding the difference between the row's metric_score and the mean uncalibrated UNet metric_score for the same image metric.

In [None]:
# Step 1: select the baseline rows, i.e. the seed-averaged rows with no calibration.
average_unet_row = info_df_w_baselines.loc[
    info_df_w_baselines['pretrained_seed'].eq('Average')
    & info_df_w_baselines['calibrator'].eq('Uncalibrated')
]

In [None]:
# Columns that identify one datapoint/metric combination. They are used for the
# baseline uniqueness check and as the join keys when baseline scores are merged in.
unique_datapoint_cols = [
    'data_id',
    'slice_idx',
    'image_metric',
    'groupavg_image_metric',
]

In [None]:
# Each datapoint/metric combination may have at most one baseline row; otherwise the
# merge that follows would duplicate rows.
num_avg_unets_per_datapoint = average_unet_row.groupby(by=unique_datapoint_cols).size()
assert num_avg_unets_per_datapoint.max() == 1, (
    f"There should be only one row for each data_id, slice_idx, image_metric, and groupavg image metric combination, got {num_avg_unets_per_datapoint}."
)

In [None]:
# The baseline rows must carry a value in both score columns, otherwise every delta
# computed against them would be NaN.
assert average_unet_row['metric_score'].notna().all(), "There should be no NaNs in metric_score of UNet rows."
assert average_unet_row['groupavg_metric_score'].notna().all(), "There should be no NaNs in groupavg_metric_score of UNet rows."

In [None]:
# Step 2: left-join the baseline scores onto every row, matching on
# `unique_datapoint_cols` ('data_id', 'slice_idx', 'image_metric',
# 'groupavg_image_metric'). The baseline copies of the two score columns receive the
# '_average_unet' suffix; rows without a matching baseline get NaN there.
info_df_w_avg_unet_cols = info_df_w_baselines.merge(
    average_unet_row[[*unique_datapoint_cols, 'metric_score', 'groupavg_metric_score']],
    how='left',
    on=unique_datapoint_cols,
    suffixes=('', '_average_unet'),
)

In [None]:
# Step 3: delta = current score - baseline score, for both score columns.
info_df_w_avg_unet_cols['metric_delta'] = (
    info_df_w_avg_unet_cols['metric_score']
    .sub(info_df_w_avg_unet_cols['metric_score_average_unet'])
)
info_df_w_avg_unet_cols['groupavg_metric_delta'] = (
    info_df_w_avg_unet_cols['groupavg_metric_score']
    .sub(info_df_w_avg_unet_cols['groupavg_metric_score_average_unet'])
)
# The baseline columns have served their purpose; leave them out of the result frame.
info_df_w_delta = info_df_w_avg_unet_cols.drop(
    columns=['metric_score_average_unet', 'groupavg_metric_score_average_unet'],
)

In [None]:
# Keep only the rows for which both deltas could be computed, i.e. rows that have a
# baseline to compare against. These are the rows that get plotted below.
info_df_w_delta = info_df_w_delta.dropna(
    axis=0,
    how='any',
    subset=['metric_delta', 'groupavg_metric_delta'],
)

## Now we can look at trends! We want to make some scatterplots to look at relationships between calibration scores and their relative improvement over the baseline.   

In [None]:
# From here on only the 'group' rows are of interest; per-seed rows are left behind.
grouped_models_df = (
    info_df_w_delta
    .loc[info_df_w_delta['model_type'].eq('group')]
    .reset_index(drop=True)
)

In [None]:
# Sanity check: for the calibrators Uncalibrated, Temperature_Scaling and LTS, the
# seed-averaged UNet must have a delta of exactly 0 on every quality metric.
# NOTE(review): presumably because these calibrators leave the hard predictions
# unchanged — confirm. An empty selection passes this check vacuously.
unique_qual_metrics = grouped_models_df.loc[
    grouped_models_df['metric_type'] == 'quality', 'image_metric'
].unique()
for calibrator in ("Uncalibrated", "Temperature_Scaling", "LTS"):
    for quality_metric in unique_qual_metrics:
        # Seed-averaged UNet rows for this calibrator / quality-metric pair.
        rows = grouped_models_df[
            (grouped_models_df['method_name'] == 'UNet (seed=Average)') &
            (grouped_models_df['calibrator'] == calibrator) &
            (grouped_models_df['image_metric'] == quality_metric)
        ]
        assert rows['metric_delta'].eq(0).all(), (
            f"Delta from base should be 0 for the calibrator {calibrator} and the quality metric {quality_metric}, got {rows['metric_delta']}."
        )

In [None]:
# Turn the groupavg_* columns into rows of their own: each original row yields a second
# row whose image_metric / metric_score / metric_delta hold the groupavg values.
# NOTE: `grouped_models_df` is rebound at the end without its groupavg_* columns, so
# this cell cannot be re-run without first re-running the cell that creates it.
standard_image_rows = grouped_models_df.drop(
    columns=['groupavg_image_metric', 'groupavg_metric_score', 'groupavg_metric_delta'],
)
groupavg_rows = (
    grouped_models_df
    .assign(
        image_metric=grouped_models_df['groupavg_image_metric'],
        metric_score=grouped_models_df['groupavg_metric_score'],
        metric_delta=grouped_models_df['groupavg_metric_delta'],
    )
    .drop(columns=['groupavg_image_metric', 'groupavg_metric_score', 'groupavg_metric_delta'])
)
# Original rows first, groupavg rows second, with a fresh index.
grouped_models_df = pd.concat([standard_image_rows, groupavg_rows], axis=0, ignore_index=True)

In [None]:
# Go from long to wide: one row per (configuration, method, calibrator, datapoint) and
# one column per (value, metric_type, image_metric) triple. Rows that land in the same
# cell are averaged.
pivot_grouped_models_df = pd.pivot_table(
    grouped_models_df,
    values=['metric_score', 'metric_delta'],
    index=['configuration', 'method_name', 'calibrator', 'data_id', 'slice_idx'],
    columns=['metric_type', 'image_metric'],
    aggfunc='mean',
).reset_index()

In [None]:
# Flatten the multi-level pivot columns into plain names:
#   * 'metric_delta' columns become 'delta_<image_metric>';
#   * columns whose last level is empty (the former index columns) keep their first level;
#   * every other column is named by its last level, the image metric.
new_cols = [
    f'delta_{col[-1]}' if col[0] == 'metric_delta'
    else (col[0] if col[-1] == '' else col[-1])
    for col in pivot_grouped_models_df.columns.values
]
# Rename on a copy so the multi-level frame stays available unchanged.
pivot_perf_per_datpoint = pivot_grouped_models_df.copy()
pivot_perf_per_datpoint.columns = new_cols

In [None]:
# Display the flattened column names, to see which score / delta_* columns exist.
pivot_perf_per_datpoint.columns

In [None]:
# Desired display order for the facets: methods first, then calibrators.
method_order_name = ['UNet (seed=Average)', 'Ensemble (mean, logits)', 'Ensemble (mean, probs)']
calibrator_order_name = ['Uncalibrated', 'Temperature_Scaling', 'LTS', 'Vector_Scaling', 'Dirichlet_Scaling']
# Convert both columns to categoricals whose category order is the order above.
# `reorder_categories` raises unless the list holds exactly the values present in the
# column, so a missing or unexpected method/calibrator fails loudly here.
pivot_perf_per_datpoint['method_name'] = (
    pivot_perf_per_datpoint['method_name']
    .astype('category')
    .cat.reorder_categories(method_order_name)
)
pivot_perf_per_datpoint['calibrator'] = (
    pivot_perf_per_datpoint['calibrator']
    .astype('category')
    .cat.reorder_categories(calibrator_order_name)
)
# Sorting on the categorical columns now follows the category order set above.
sorted_pivot_perf_per_datpoint = pivot_perf_per_datpoint.sort_values(by=['method_name', 'calibrator'])

## Looking at change in *predicted ensemble* calibration vs change in Dice.

In [None]:
from ese.experiment.analysis.plot_utils import add_corr_coefficients

In [None]:
# Scatter of 'delta_Image_ECE' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    kind='scatter',
    x='delta_Image_ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.006, 0.006), ylim=(-0.13, 0.13))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Total ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_Image_Foreground-ECE' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Foreground ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_Image_Edge-ECE' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Total Edge ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_Image_Foreground-Edge-ECE' (x) against 'delta_Dice' (y), one panel
# per method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Foreground Edge ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_Image_ELM' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_ELM', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.006, 0.006), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_ELM', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Total ELM Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_Image_Foreground-ELM' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-ELM', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_Image_Foreground-ELM', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Foreground ELM Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

## Now we can look at the average ensemble member calibration vs change in dice.

In [None]:
# Scatter of 'delta_GroupAvg_Image_ECE' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    kind='scatter',
    x='delta_GroupAvg_Image_ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.006, 0.006), ylim=(-0.13, 0.13))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Total Group Avg ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_GroupAvg_Image_Foreground-ECE' (x) against 'delta_Dice' (y), one
# panel per method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Foreground Group Avg ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_GroupAvg_Image_Edge-ECE' (x) against 'delta_Dice' (y), one panel
# per method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Group Avg Edge ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_GroupAvg_Image_Foreground-Edge-ECE' (x) against 'delta_Dice' (y),
# one panel per method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-Edge-ECE', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Group Avg Foreground Edge ECE Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_GroupAvg_Image_ELM' (x) against 'delta_Dice' (y), one panel per
# method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_ELM', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.006, 0.006), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_ELM', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Total Group Avg ELM Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)

In [None]:
# Scatter of 'delta_GroupAvg_Image_Foreground-ELM' (x) against 'delta_Dice' (y), one
# panel per method (rows) and calibrator (columns).
g = sns.relplot(
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-ELM', y='delta_Dice',
    row='method_name', col='calibrator',
    hue='method_name', style='calibrator',
    height=4,
    facet_kws={'margin_titles': True},
)
g.set_titles("")  # NOTE(review): with margin_titles=True the empty template may be ignored — confirm
# Same fixed axis window for every panel; points outside it are not shown.
g.set(xlim=(-0.15, 0.15), ylim=(-0.12, 0.12))
# Vertical / horizontal spacing between panels.
g.fig.subplots_adjust(hspace=0.1, wspace=0.15)
# Project helper, given the same data and x/y/row/col names as the relplot;
# presumably annotates each panel with its correlation coefficient — confirm.
add_corr_coefficients(
    g,
    data=sorted_pivot_perf_per_datpoint,
    x='delta_GroupAvg_Image_Foreground-ELM', y='delta_Dice',
    row='method_name', col='calibrator',
)
# Figure-level title; the grid top is lowered to 0.9 to leave room for it.
g.fig.suptitle('Relationship Between Foreground Group Avg ELM Delta and Dice Score Delta', size=20)
g.fig.subplots_adjust(top=0.9)