In [1]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import os 
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

# Results loader object does everything
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 
# For using yaml configs.
%load_ext yamlmagic

In [2]:
%%yaml results_cfg 

# EXPERIMENT SETS:
# - Calibration (no weighting): 01_18_24_All_Metrics
# - Only Foreground Loss: 01_21_24_Foreground_Calibrators
# - Balanced Loss: 01_21_24_Balanced_CE_Calibrators

log:
    root: /storage/vbutoi/scratch/ESE/inference
    load_pixel_meters: True 
    remove_shared_columns: True
    drop_nan_metric_rows: True
    min_fg_pixels: 100
    inference_paths:
        - "01_21_24_Foreground_Calibrators/WMH_Individual_Uncalibrated"
        - "01_21_24_Foreground_Calibrators/WMH_Individual_TempScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Individual_VectorScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Individual_DirichletScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Individual_LTS"
        - "01_21_24_Foreground_Calibrators/WMH_Individual_NectarScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_Uncalibrated"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_TempScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_VectorScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_DirichletScaling"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_LTS"
        - "01_21_24_Foreground_Calibrators/WMH_Ensemble_NectarScaling"
    
calibration:
    conf_interval:
        - 0.5
        - 1.
    num_bins: 10
    square_diff: False 
    neighborhood_width: 3

cal_metrics:
    - ECE:
        _fn: ese.experiment.metrics.ece.ece_loss
    - CW_ECE:
        _fn: ese.experiment.metrics.ece.cw_ece_loss
    - Edge_ECE:
        _fn: ese.experiment.metrics.ece.edge_ece_loss
    - ELM:
        _fn: ese.experiment.metrics.elm.elm_loss
    - Foreground_ECE:
        _fn: ese.experiment.metrics.ece.ece_loss
        ignore_index: 0
    - Foreground_CW_ECE:
        _fn: ese.experiment.metrics.ece.cw_ece_loss
        ignore_index: 0
    - Foreground_Edge_ECE:
        _fn: ese.experiment.metrics.ece.edge_ece_loss
        ignore_index: 0       
    - Foreground_ELM:
        _fn: ese.experiment.metrics.elm.elm_loss
        ignore_index: 0

<IPython.core.display.Javascript object>

In [7]:
from ese.experiment.analysis.inference import load_cal_inference_stats

image_info_df = load_cal_inference_stats(
    results_cfg=results_cfg
)

Dropping NaN metric rows. Started with 137592 rows.
Dropping rows with NaN metric score. Dropped to 137588 rows.
Finished loading inference stats.
Log amounts: log.root                                                                                               log_set                                              
/storage/vbutoi/scratch/ESE/inference/01_21_24_Foreground_Calibrators/WMH_Ensemble_DirichletScaling    20240121_204347-ZFCG-010b6351ac41abec9e24ef71dfbb0408    5880
                                                                                                       20240121_204351-YCZ4-0fb1ebd6b1636fd7660e1a34952f219d    5880
/storage/vbutoi/scratch/ESE/inference/01_21_24_Foreground_Calibrators/WMH_Ensemble_LTS                 20240121_213421-ZFF0-ee240399f60946ca81df7c2eb72e4fa8    5880
                                                                                                       20240121_213425-0Z7E-793ebbf6d709fb9f2325f197a3eed8ea    5880
/storage/vbutoi/scratch

In [8]:
image_info_df['model_class'].unique()

array(['ese.experiment.models.UNet',
       'ese.experiment.models.calibrators.Temperature_Scaling',
       'ese.experiment.models.calibrators.Vector_Scaling',
       'ese.experiment.models.calibrators.Dirichlet_Scaling',
       'ese.experiment.models.calibrators.LTS',
       'ese.experiment.models.calibrators.NECTAR_Scaling',
       'ese.experiment.models.unet.UNet'], dtype=object)

## We are going to remove the case where there are very few pixels, cause unrealistic outliers.

In [4]:
# Sort the image_info_df by method name, so everything appears nicely
image_info_df = image_info_df.sort_values(by=['method_name', 'calibrator'])
# Make sure that the model_class 'Uncalibrated' is first
image_info_df['calibrator'] = image_info_df['calibrator'].astype('category')
image_info_df['calibrator'] = image_info_df['calibrator'].cat.reorder_categories(['Uncalibrated', 'Temperature_Scaling', 'Vector_Scaling', 'Dirichlet_Scaling', 'LTS', 'NECTAR_Scaling'])

ValueError: items in new_categories are not the same as in old categories

## Let's looks at the calibration scores of our models.

In [None]:
g = sns.catplot(
    data=image_info_df,
    x="calibrator",
    y="ECE",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set column spacingj
g.fig.subplots_adjust(wspace=0.5)
# # Set the y-axis limits
# g.set(ylim=(0.0, 0.25))
# Set the title of the plot
g.fig.suptitle("ECE by Calibration Method and Model Class")
# Move the title slightly up
g.fig.subplots_adjust(top=0.9)

In [None]:
g = sns.catplot(
    data=image_info_df,
    x="calibrator",
    y="Foreground_ECE",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set column spacing
# # Set the y-axis limits
# g.set(ylim=(0.0, 0.25))
# Set the title of the plot
g.fig.suptitle("Foreground ECE by Calibration Method and Model Class")
# Move the title slightly up
g.fig.subplots_adjust(top=0.9)

In [None]:
g = sns.catplot(
    data=image_info_df,
    x="calibrator",
    y="CW_ECE",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set column spacing
# # Set the y-axis limits
# g.set(ylim=(0.0, 0.25))
g.fig.suptitle("CW ECE by Calibration Method and Model Class")
# Move the title slightly up
g.fig.subplots_adjust(top=0.9)

In [None]:
g = sns.catplot(
    data=image_info_df,
    x="model_class",
    y="Edge_ECE",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set column spacing
# # Set the y-axis limits
# g.set(ylim=(0.0, 0.25))
g.fig.suptitle("Edge ECE by Calibration Method and Model Class")
# Move the title slightly up
g.fig.subplots_adjust(top=0.9)

In [None]:
g = sns.catplot(
    data=image_info_df,
    x="calibrator",
    y="ELM",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set column spacing
# # Set the y-axis limits
# g.set(ylim=(0.0, 0.25))
g.fig.suptitle("ELM by Calibration Method and Model Class")
# Move the title slightly up
g.fig.subplots_adjust(top=0.9)

## Now we can look at the quality averages themselves, first looking slice-wise.

In [None]:
table_df = image_info_df.groupby(["method_name", "calibrator", "image_metric", "ensemble"])['metric_score'].mean().reset_index()
dice_table = image_info_df[image_info_df["image_metric"] == "Dice"]
hd95_table = image_info_df[image_info_df["image_metric"] == "HD95"]
# Sort these by method name so they are consistent in the figures
dice_table = dice_table.sort_values(by=['method_name'])
hd95_table = hd95_table.sort_values(by=['method_name'])

In [None]:
g = sns.catplot(
    data=hd95_table,
    x="model_class",
    y="metric_score",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set the title of the bar plot
g.fig.suptitle("Hausdorff Distance for Different Calibration Methods (Per Slice)")
# Give the title a bit of spacing from the plot
g.fig.subplots_adjust(top=0.90)
# Set the y axis to be between 4 and 8
g.set(ylim=(4, 9))

In [None]:
g = sns.catplot(
    data=dice_table,
    x="model_class",
    y="metric_score",
    hue="method_name",
    kind="bar",
    height=5,
    aspect=2
)
# Set the title of the bar plot
g.fig.suptitle("Dice Score for Different Calibration Methods (Per Slice)")
# Give the title a bit of spacing from the plot
g.fig.subplots_adjust(top=0.90)
# Set the y axis to be between 0.5 and 1.0
g.set(ylim=(0.7, 0.8))

## Now we want to consider these averaged within subjects.

In [None]:
# subj_image_info_df = image_info_df.groupby(["method_name", "model_class", "qual_metric", "ensemble", "data_id"])['qual_score'].mean().reset_index()

# subject_dice_table = subj_image_info_df[subj_image_info_df["qual_metric"] == "Dice"]
# subject_hd95_table = subj_image_info_df[subj_image_info_df["qual_metric"] == "HD95"]
# # sort these by method name so they are consistent in the tables
# subject_dice_table = subject_dice_table.sort_values(by=['method_name'])
# subject_hd95_table = subject_hd95_table.sort_values(by=['method_name'])

In [None]:
# g = sns.catplot(
#     data=subject_hd95_table,
#     x="model_class",
#     y="qual_score",
#     hue="method_name",
#     kind="bar",
#     height=4,
#     aspect=2
# )
# # Set the title of the bar plot
# g.fig.suptitle("Hausdorff Distance for Different Calibration Methods (Per Subject)")
# # Give the title a bit of spacing from the plot
# g.fig.subplots_adjust(top=0.90)
# # Set the y axis to be between 4 and 8
# g.set(ylim=(4, 14))

In [None]:
# g = sns.catplot(
#     data=subject_dice_table,
#     x="model_class",
#     y="qual_score",
#     hue="method_name",
#     kind="bar",
#     height=4,
#     aspect=2
# )
# # Set the title of the bar plot
# g.fig.suptitle("Dice Score for Different Calibration Methods (Per Subject)")
# # Give the title a bit of spacing from the plot
# g.fig.subplots_adjust(top=0.90)
# # Set the y axis to be between 0.5 and 1.0
# g.set(ylim=(0.6, 0.9))

## Make some tables to show these relationships in

In [None]:
# # Custom formatting function to display 3 significant digits
# def format_sigfigs(x, num_sigfigs):
#     if isinstance(x, (int, float)):
#         format_str = '{:.' + str(num_sigfigs) + 'g}'
#         return format_str.format(x)  # Using format to display in scientific notation with specified significant digits
#     else:
#         return x  # Return the value as is if it's not numeric

# # Applying the formatting function to the pivot table
# formatted_dice_table = dice_table.applymap(format_sigfigs, num_sigfigs=3)
# # Applying the formatting function to the pivot table
# formatted_hd95_table = hd95_table.applymap(format_sigfigs, num_sigfigs=4)

In [None]:
# formatted_dice_table.pivot(index='method_name', columns='model_class', values='qual_score')

In [None]:
# formatted_hd95_table.pivot(index='method_name', columns='model_class', values='qual_score')

## Let's look first at the distribution of errors per configuration.

In [None]:
# from ese.experiment.analysis.plot_utils import build_ensemble_vs_individual_cmap

In [None]:
# dice_image_df = image_info_df[image_info_df['qual_metric'] == 'Dice']
# # Use seaborn to create KDE plot for each configuration
# g = sns.displot(
#     data=dice_image_df.sort_values('configuration'), 
#     x='qual_score', 
#     hue='configuration', 
#     kind='kde',
#     palette=build_ensemble_vs_individual_cmap(dice_image_df),
#     alpha=0.8
#     )

In [None]:
# dice_image_subject_df = dice_image_df.groupby(['configuration', 'data_id'])['qual_score'].mean().reset_index()
# g = sns.displot(
#     data=dice_image_subject_df.sort_values('configuration'), 
#     x='qual_score', 
#     hue='configuration', 
#     kind='kde',
#     palette=build_ensemble_vs_individual_cmap(dice_image_df),
#     alpha=0.8
#     )