In [None]:
import sys
# Make the local project and library checkouts importable from this notebook.
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import os 
# DATAPATH is built as a ':'-joined list of roots; only one root is listed here.
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

# Results loader object does everything
# NOTE(review): the variables `root` and `rs` are not referenced by any other cell
# visible in this notebook -- confirm they are still needed.
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 2
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

# This YAML is bound to the `results_cfg` variable named on the %%yaml magic line and is
# later passed as `results_cfg=` to load_cal_inference_stats.
# NOTE(review): the meaning of each option is defined by load_cal_inference_stats, which
# is not visible here; the notes below are assumptions to confirm.
log:
    # presumably the directory that holds the inference outputs -- TODO confirm
    root: /storage/vbutoi/scratch/ESE/inference
    load_pixel_meters: False 
    remove_shared_columns: False
    add_dice_loss_rows: True
    drop_nan_metric_rows: True 
    add_baseline_rows: True 
    equal_rows_per_cfg_assert: True 
    # presumably selects which inference run under `root` is loaded -- TODO confirm
    inference_group: "02_07_24_WMH_NewMetrics"
    min_fg_pixels: 100
    
calibration:
    num_bins: 15
    square_diff: False 
    neighborhood_width: 3
    metric_cfg_file: "/storage/vbutoi/projects/ESE/ese/experiment/configs/inference/Calibration_Metrics.yaml"

In [None]:
from ese.experiment.analysis.analyze_inf import load_cal_inference_stats

# Load the per-image inference statistics described by `results_cfg` (the YAML cell above).
# NOTE(review): `load_cached=True` presumably reuses a previously saved result instead of
# recomputing it -- confirm against load_cal_inference_stats, and re-run with False after
# changing `results_cfg` if that is the case.
image_info_df = load_cal_inference_stats(
    results_cfg=results_cfg,
    load_cached=True
)

In [None]:
# Keep only the rows whose split is 'cal'.
image_info_df = image_info_df.loc[image_info_df['split'].eq('cal')]

## Let's see whether there is any hope that better ECE/ELM yields better ensembles. Note that this isn't a conclusive result, because the number of samples per image used to calculate ECE/ELM is not sufficient to estimate the actual statistical quantities.

### We have to add to each row a column holding the difference between that row's metric_score and the metric_score of the mean uncalibrated UNet for the same image and the same metric.

In [None]:
# Step 1: Select the baseline rows, i.e. the UNet averaged across seeds
# (pretrained_seed == 'Average') with no calibration applied.
average_unet_row = image_info_df.loc[
    image_info_df['pretrained_seed'].eq('Average')
    & image_info_df['calibrator'].eq('Uncalibrated')
]

In [None]:
# Columns that are used together as the key identifying one datapoint, both for the
# uniqueness check on the baseline rows and for the merge against them.
unique_datapoint_cols = [
    'data_id',
    'slice_idx',
    'image_metric',
    'groupavg_image_metric',
]

In [None]:
# Sanity check that for each datapoint we only have one average unet row.
# An empty baseline would leave nothing to compare against; fail with a clear message
# rather than through the NaN that `.max()` returns for an empty Series.
assert len(average_unet_row) > 0,\
    "No average uncalibrated UNet rows were found to use as the baseline."
# dropna=False keeps datapoints whose key columns contain NaN in the count. groupby drops
# such groups by default, while the later merge matches NaN keys to each other, so
# duplicates among them would otherwise go unnoticed and duplicate rows in the merge.
num_avg_unets_per_datapoint = average_unet_row.groupby(unique_datapoint_cols, dropna=False).size()
assert num_avg_unets_per_datapoint.max() == 1,\
    f"There should be only one row for each data_id, slice_idx, image_metric, and groupavg image metric combination, got {num_avg_unets_per_datapoint}."

In [None]:
# The baseline scores are subtracted from every row later on, so neither baseline score
# column may contain a NaN.
assert not average_unet_row['metric_score'].isna().any(), "There should be no NaNs in metric_score of UNet rows."
assert not average_unet_row['groupavg_metric_score'].isna().any(), "There should be no NaNs in groupavg_metric_score of UNet rows."

In [None]:
# Step 2: Attach the baseline scores to every row by merging on the datapoint key columns
# in `unique_datapoint_cols` (data_id, slice_idx, image_metric, groupavg_image_metric).
# The baseline columns receive the '_average_unet' suffix.
# `validate='many_to_one'` makes pandas raise a MergeError if the baseline side has more
# than one row per key, which would otherwise silently duplicate rows of image_info_df.
info_df_w_avg_unet_cols = pd.merge(
    image_info_df,
    average_unet_row[unique_datapoint_cols + ['metric_score', 'groupavg_metric_score']],
    on=unique_datapoint_cols,
    how='left',
    suffixes=('', '_average_unet'),
    validate='many_to_one'
    )

In [None]:
# Step 3: Express each score relative to the baseline (current - baseline).
info_df_w_avg_unet_cols['metric_delta'] = info_df_w_avg_unet_cols['metric_score'].sub(
    info_df_w_avg_unet_cols['metric_score_average_unet']
)
info_df_w_avg_unet_cols['groupavg_metric_delta'] = info_df_w_avg_unet_cols['groupavg_metric_score'].sub(
    info_df_w_avg_unet_cols['groupavg_metric_score_average_unet']
)
# The merged-in baseline columns are not carried forward once the deltas exist.
info_df_w_delta = info_df_w_avg_unet_cols.drop(
    columns=[
        'metric_score_average_unet',
        'groupavg_metric_score_average_unet',
    ]
)

## Now we can look at trends! We want to make some scatterplots to look at relationships between calibration scores and their relative improvement over the baseline.   

In [None]:
# Keep only the rows whose model_type is 'group' (individual seeds are no longer
# considered) and renumber the index from zero.
grouped_models_df = (
    info_df_w_delta
    .loc[info_df_w_delta['model_type'].eq('group')]
    .reset_index(drop=True)
)

In [None]:
# Sanity check: for the Uncalibrated, Temperature_Scaling and LTS calibrators, the
# averaged UNet's quality metrics are asserted to equal the baseline exactly
# (metric_delta == 0).
import warnings

# The set of quality metrics does not depend on the calibrator, so compute it once.
unique_qual_metrics = grouped_models_df[grouped_models_df['metric_type'] == 'quality']['image_metric'].unique()
# NOTE(review): the cell that orders the method_name categories lists 'Average UNet',
# not 'UNet (seed=Average)'. If the label below is not present in `method_name`, no row
# is selected and nothing is verified -- confirm which label is correct.
for calibrator in ["Uncalibrated", "Temperature_Scaling", "LTS"]:
    for quality_metric in unique_qual_metrics:
        # Checking that the delta is 0 for the calibrator and the quality_metric
        rows = grouped_models_df[
            (grouped_models_df['calibrator'] == calibrator) &
            (grouped_models_df['image_metric'] == quality_metric) &
            (grouped_models_df['method_name'] == 'UNet (seed=Average)')
        ]
        if rows.empty:
            # `.all()` on an empty selection is True, so the assertion below would pass
            # without checking anything; surface that instead of staying silent.
            warnings.warn(
                f"No rows matched calibrator {calibrator} and quality metric {quality_metric}; "
                "the zero-delta sanity check was skipped for this combination."
            )
            continue
        assert (rows['metric_delta'] == 0).all(),\
            f"Delta from base should be 0 for the calibrator {calibrator} and the quality metric {quality_metric}, got {rows['metric_delta']}."

In [None]:
# Every row is emitted twice: once as-is, and once with image_metric / metric_score /
# metric_delta replaced by their groupavg_* counterparts. The groupavg_* columns are
# dropped from both halves before the halves are stacked.
standard_image_rows = grouped_models_df.drop(
    columns=['groupavg_image_metric', 'groupavg_metric_score', 'groupavg_metric_delta']
)
groupavg_rows = grouped_models_df.assign(
    image_metric=grouped_models_df['groupavg_image_metric'],
    metric_score=grouped_models_df['groupavg_metric_score'],
    metric_delta=grouped_models_df['groupavg_metric_delta'],
).drop(
    columns=['groupavg_image_metric', 'groupavg_metric_score', 'groupavg_metric_delta']
)
# Stack the standard rows first, then the group-average rows, with a fresh index.
grouped_models_df = pd.concat(
    [standard_image_rows, groupavg_rows],
    axis=0,
    ignore_index=True,
)

In [None]:
# Pivot to one row per (configuration, method, calibrator, datapoint). Each
# (metric_type, image_metric) pair becomes a column under both 'metric_score' and
# 'metric_delta'; entries sharing a cell are averaged.
pivot_grouped_models_df = pd.pivot_table(
    grouped_models_df,
    values=['metric_score', 'metric_delta'],
    index=['configuration', 'method_name', 'calibrator', 'data_id', 'slice_idx'],
    columns=['metric_type', 'image_metric'],
    aggfunc='mean',
).reset_index()

In [None]:
def _flatten_pivot_column(col):
    """Return the flat column name for one column tuple of the pivoted frame.

    - Columns under 'metric_delta' become 'delta_<last level>'.
    - Columns whose last level is the empty string keep their first-level name.
    - All remaining columns are named after their last level.
    """
    if col[0] == 'metric_delta':
        return f'delta_{col[-1]}'
    if col[-1] == '':
        return col[0]
    return col[-1]


# Build the flat names from the multi-level columns of the pivot.
new_cols = [_flatten_pivot_column(col) for col in pivot_grouped_models_df.columns.values]
# Only the last level survives the flattening, so two columns that differ only in an
# intermediate level would collapse onto the same name. Duplicate names make column
# lookups ambiguous, so fail loudly here instead.
duplicated_new_cols = pd.Index(new_cols)[pd.Index(new_cols).duplicated()].unique().tolist()
assert not duplicated_new_cols,\
    f"Flattening the pivot columns produced duplicate column names: {duplicated_new_cols}."
# Make a copy so that we can modify the column names without touching the pivot itself.
pivot_perf_per_datpoint = pivot_grouped_models_df.copy()
pivot_perf_per_datpoint.columns = new_cols

In [None]:
# Show the distinct method names present in the pivoted table.
pivot_perf_per_datpoint['method_name'].unique()

In [None]:
# Show the distinct calibrator names present in the pivoted table.
pivot_perf_per_datpoint['calibrator'].unique()

In [None]:
# Convert both grouping columns to categoricals with an explicit category order.
# reorder_categories raises a ValueError unless the given list contains exactly the
# categories present in the column, so these lists must match the data.
pivot_perf_per_datpoint['method_name'] = (
    pivot_perf_per_datpoint['method_name']
    .astype('category')
    .cat.reorder_categories([
        'Average UNet',
        'Ensemble (mean, logits)',
        'Ensemble (mean, probs)',
        'Ensemble (product, probs)',
    ])
)

pivot_perf_per_datpoint['calibrator'] = (
    pivot_perf_per_datpoint['calibrator']
    .astype('category')
    .cat.reorder_categories([
        'Uncalibrated',
        'Vanilla',
        'Temperature_Scaling',
        'Vector_Scaling',
        'Dirichlet_Scaling',
        'LTS',
        'NECTAR_Scaling',
    ])
)

## Looking at change in *predicted ensemble* calibration vs. change in segmentation quality (Dice, HD95, BoundaryIOU).

In [None]:
# Assert that each subplot will have the same number of datapoints on it.
# Both grouping keys were converted to categoricals earlier in the notebook. Passing
# `observed=False` explicitly keeps method/calibrator combinations that never occur in
# the result with a size of 0, so an empty subplot still fails this check. The implicit
# default is deprecated and changes in newer pandas releases.
num_per_config = pivot_perf_per_datpoint.groupby(['method_name', 'calibrator'], observed=False).size()
assert len(num_per_config.unique()) == 1,\
    f"Each subplot should have the same number of datapoints on it, got {num_per_config}."

In [None]:
# Display the number of datapoints per (method_name, calibrator) pair computed above.
num_per_config

In [None]:
from ese.experiment.analysis.analysis_utils.plot_utils import plot_method_vs_calibrator_scatterplots, clump_df_datapoints

In [None]:
# List the available columns; the x/y metric names used for plotting must be among them.
pivot_perf_per_datpoint.columns

In [None]:
# Draw one method-vs-calibrator scatterplot grid for every combination of calibration
# metric (x axis, as a delta from the baseline), quality metric (y axis, as a delta from
# the baseline) and foreground option.
clump_points = False
# Number of bins handed to clump_df_datapoints when `clump_points` is enabled.
# NOTE(review): this cell previously assigned an unused `num_bins = 90` while passing a
# hard-coded 50 to clump_df_datapoints. 50 is kept because it is the value that was
# actually used -- confirm which one was intended.
num_bins = 50

for calibration_metric in ['ECE', 'CW-ECE', 'Edge-ECE', 'ECW-ECE', 'Edge-ELM', 'ELM']:
    for quality_metric in ['delta_Dice', 'delta_HD95', 'delta_BoundaryIOU']:
        for foreground_option in ['', 'Foreground-']:
            x_metric_name = f'delta_GroupAvg_Image_{foreground_option}{calibration_metric}'
            y_metric_name = quality_metric
            if clump_points:
                # Gather points into super-points.
                clumped_df = clump_df_datapoints(
                    pivot_perf_per_datpoint,
                    num_bins=num_bins,
                    x='method_name',
                    y='calibrator',
                    x_metric=x_metric_name,
                    y_metric=y_metric_name
                )
                # Assert that the number of points in the plot is the same.
                # `observed=False` is spelled out so that, if the grouping keys are
                # categorical, combinations with no rows still count as size 0.
                num_per_config = clumped_df.groupby(['method_name', 'calibrator'], observed=False).size()
                assert num_per_config.max() == num_per_config.min(),\
                    f"Each subplot should have the same number of datapoints on it, got {num_per_config}."
                # Verify that there are no NaNs in the binned_pivot_per_datapoint for rows of the chosen y metric
                assert clumped_df[y_metric_name].isna().sum() == 0,\
                    f"There should be no NaNs in {y_metric_name} in binned_pivot_per_datapoint, got {clumped_df[y_metric_name]}."
                plot_df = clumped_df
            else:
                plot_df = pivot_perf_per_datpoint

            # Plot the relationship between the two metrics
            plot_method_vs_calibrator_scatterplots(
                df=plot_df,
                x=x_metric_name,
                y=y_metric_name,
                sharex=False,
                sharey=False,
                height=5
            )