In [None]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
sns.set_context("talk")
pd.set_option('display.max_rows', 200)  # or any other number you prefer


import os 
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

from ese.experiment.analysis.analyze_inf import load_cal_inference_stats
# Results loader object does everything
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 2
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

log:
    root: "/storage/vbutoi/scratch/ESE/inference"
    inference_groups: 
        - '07_16_24_ACDC_Experiment_1_ALLSPLITS'

options:
    add_dice_loss_rows: True
    drop_nan_metric_rows: True 
    remove_shared_columns: False
    equal_rows_per_cfg_assert: False 

In [None]:
# Load the inference statistics described by `results_cfg` (populated by the %%yaml cell above).
# load_cached=False is passed explicitly; presumably this forces a fresh load instead of
# reading a cached copy -- confirm against load_cal_inference_stats.
inference_df = load_cal_inference_stats(
    results_cfg=results_cfg,
    load_cached=False,
)

In [None]:
# Add column for what the pretraining loss function was called.
def pretrain_loss_func(model_class, pretraining_loss_func_class, loss_func_class):
    """Return the short class name of the loss used for pretraining.

    When the last dotted component of `model_class` is "UNet", the name is taken
    from `loss_func_class`; for every other model class it is taken from
    `pretraining_loss_func_class`. Only the selected argument is ever accessed.
    """
    is_unet = model_class.rsplit('.', 1)[-1] == "UNet"
    chosen_path = loss_func_class if is_unet else pretraining_loss_func_class
    return chosen_path.split('.')[-1]

# Add columns for what the posthoc loss function was.
def finetune_loss_func(model_class, loss_func_class):
    """Return the short class name of the post-hoc (finetuning) loss.

    Any model class whose last dotted component is not "UNet" reports the last
    component of `loss_func_class`; a "UNet" reports the string "None".
    """
    if model_class.split('.')[-1] != "UNet":
        return loss_func_class.split('.')[-1]
    return "None"

# Add column for what the posthoc method was.
def calibration_method(model_class, experiment_model_dir):
    """Map a model class (and its experiment directory) to a calibration-method label.

    Known post-hoc model classes are translated through a lookup table. Any other
    class is labelled "SVLS" when that substring occurs in `experiment_model_dir`,
    and "None" otherwise. `experiment_model_dir` is only inspected in that fallback.
    """
    posthoc_labels = {
        "Identity": "Finetune",
        "Temperature_Scaling": "TempScaling",
        "ImageBasedTS": "IBTS",
        "LocalTS": "LTS",
    }
    short_name = model_class.split('.')[-1]
    if short_name in posthoc_labels:
        return posthoc_labels[short_name]
    return "SVLS" if "SVLS" in experiment_model_dir else "None"

def total_method(pretrain_loss_func, finetune_loss_func, calibration_method):
    """Build one label combining the calibration method with both loss names.

    The result has the form "<calibration>,PT:<pretrain loss>,FT:<finetune loss>".
    """
    label_template = "{},PT:{},FT:{}"
    return label_template.format(calibration_method, pretrain_loss_func, finetune_loss_func)

def loss_funcs(pretrain_loss_func, finetune_loss_func):
    """Describe the training recipe as "<pretrain loss> -> <finetune loss>"."""
    return "{} -> {}".format(pretrain_loss_func, finetune_loss_func)


# Register each helper above with `augment`. Judging by the column names selected in the
# next cell, each call presumably adds a column named after the function, computed from the
# columns matching its parameter names -- TODO confirm against the augment implementation.
# Order matters: total_method and loss_funcs take the names of the earlier helpers as parameters.
inference_df.augment(pretrain_loss_func)
inference_df.augment(finetune_loss_func)
inference_df.augment(calibration_method)
inference_df.augment(total_method)
inference_df.augment(loss_funcs)

In [None]:
# Only a handful of columns matter for the proportion-error analysis.
# Columns that describe how the model was trained / calibrated.
method_columns = [
    "pretrain_loss_func",
    "finetune_loss_func",
    "calibration_method",
    "total_method",
    "loss_funcs",
]
# Ground-truth proportion and the two predicted proportions.
proportion_columns = [
    "gt_proportion",
    "soft_proportion",
    "hard_proportion",
]
exp_columns = [
    "data_id",
    *method_columns,
    *proportion_columns,
    "experiment_pretrained_seed",  # Only one seed is used in this experiment, kept for generality.
    "split",
]
# Restrict inference_df to these columns, then collapse repeated rows and renumber the index.
exp_subset = inference_df[exp_columns]
exp_df = exp_subset.drop_duplicates().reset_index(drop=True)

In [None]:
# Count the distinct data_ids we evaluate on (missing values, if any, count as one entry).
exp_df['data_id'].nunique(dropna=False)

In [None]:
# For both the soft and the hard prediction, add a signed error column (prediction minus
# ground truth) and a relative error column (signed error divided by the ground truth).
gt_proportion = exp_df['gt_proportion']
for pred_type in ('soft', 'hard'):
    signed_error = exp_df[f'{pred_type}_proportion'] - gt_proportion
    exp_df[f'{pred_type}_error'] = signed_error
    exp_df[f'{pred_type}_relative_error'] = signed_error / gt_proportion

In [None]:
# Display exp_df, which now carries the error and relative-error columns added above.
exp_df

# Library Funcs

In [None]:
def prepare_error_df(raw_df, groupby_keys, value_vars, var_name="proportion_type", value_name="error"):
    """Reshape error columns from wide to long format and add their absolute value.

    Args:
        raw_df: Source frame; it is not modified.
        groupby_keys: Columns kept as identifiers on every melted row.
        value_vars: Columns whose values are stacked into the `value_name` column.
        var_name: Name of the column that records which source column a row came from.
        value_name: Name of the column holding the stacked values.

    Returns:
        The melted frame with an extra `absolute_<value_name>` column.
    """
    long_df = raw_df.melt(
        id_vars=groupby_keys,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
    )
    # Magnitude of the signed value, used by the plots and summary tables.
    abs_column = f'absolute_{value_name}'
    long_df[abs_column] = long_df[value_name].abs()
    return long_df

def process_method_names(input_df):
    """Return a copy of `input_df` sorted by data_id, de-duplicated, with a fresh index.

    The input frame is left untouched.
    """
    return (
        input_df
        .sort_values(by="data_id")
        .drop_duplicates()
        .reset_index(drop=True)
    )

# ABSOLUTE ERROR

In [None]:
# Melt the signed soft/hard error columns into long format: one row per
# (exp_columns combination, proportion_type), with `error` and `absolute_error` columns.
raw_melted_error_df = prepare_error_df(
    exp_df, 
    groupby_keys=exp_columns,
    value_vars=["soft_error", "hard_error"],
    var_name="proportion_type",
    value_name="error"
)
# Sort by data_id, drop duplicate rows and reset the index.
melted_error_df = process_method_names(raw_melted_error_df)

In [None]:
# List the distinct labels present; the category orderings in the next cell must name
# exactly these values.
print(melted_error_df['calibration_method'].unique())
print(melted_error_df['loss_funcs'].unique())

In [None]:
# Fix the order in which methods and loss combinations appear in plots and group-bys by
# converting both columns to categoricals with an explicit category order.
error_category_orders = {
    'calibration_method': [
        'None',
        "SVLS",
        'TempScaling',
        'IBTS',
        'LTS',
    ],
    'loss_funcs': [
        'PixelCELoss -> None',
        'SoftDiceLoss -> None',
        'PixelCELoss -> PixelCELoss',
        'PixelCELoss -> SoftDiceLoss',
        'SoftDiceLoss -> PixelCELoss',
        'SoftDiceLoss -> SoftDiceLoss',
    ],
}
for column_name, ordered_labels in error_category_orders.items():
    melted_error_df[column_name] = melted_error_df[column_name].astype('category')
    # reorder_categories requires the list to contain exactly the existing categories.
    melted_error_df[column_name] = melted_error_df[column_name].cat.reorder_categories(ordered_labels)

In [None]:
# Sanity check: show the number of rows per (calibration_method, loss_funcs, proportion_type, split).
# NOTE(review): no assert is performed here -- the counts are only displayed for visual
# inspection, so unequal group sizes will not stop the notebook.
melted_error_df.groupby(['calibration_method', 'loss_funcs', 'proportion_type', 'split']).size()

In [None]:
# Box plots of the absolute error: one facet per (split, proportion_type), calibration
# methods along x, coloured by loss-function combination.
abs_error_facets = dict(
    x="calibration_method",
    y="absolute_error",
    hue="loss_funcs",
    col="proportion_type",
    row="split",
    kind='box',
    height=10,  # Facet height in inches.
)
g = sns.catplot(data=melted_error_df, **abs_error_facets)

# Dashed red reference line at zero error on every facet.
for facet_ax in g.axes.ravel():
    facet_ax.axhline(0, ls='--', color='red')

# Render the figure.
plt.show()

### We need to get the actual numbers now.

In [None]:
# Average the absolute error within each (pretrained seed, proportion_type) group.
# The following cell then averages these per-seed means across seeds.
seed_type_groups = melted_error_df.groupby(["experiment_pretrained_seed", "proportion_type"])
error_per_method = seed_type_groups.agg(abs_error=("absolute_error", "mean")).reset_index()
# Same table, with the aggregate column renamed from abs_error to absolute_error.
error_per_seed = error_per_method.rename(columns={"abs_error": "absolute_error"})

In [None]:
# Build a table with one row per proportion type and one column per pretrained seed,
# holding the mean absolute error for that seed.
method_error_table = error_per_seed.pivot_table(
    values="absolute_error",
    index="proportion_type",
    columns="experiment_pretrained_seed",
    aggfunc="mean",
)
# Snapshot the per-seed columns BEFORE adding summary columns. Previously 'stdv' was
# computed after 'mean' had been inserted, so the mean itself was counted as an extra
# sample and the reported standard deviation was wrong.
per_seed_scores = method_error_table.copy()
method_error_table['mean'] = per_seed_scores.mean(axis=1)
# Sample standard deviation (pandas default ddof=1); this is NaN when only one seed exists.
method_error_table['stdv'] = per_seed_scores.std(axis=1)
# Keep only the summary columns.
method_error_table = method_error_table[['mean', 'stdv']]

In [None]:
# Show the summary table ordered by mean absolute error (ascending, the sort_values default).
method_error_table.sort_values(by='mean')

## RELATIVE ERROR

In [None]:
# Melt the soft/hard relative error columns into long format: one row per
# (exp_columns combination, proportion_type), with `relative_error` and
# `absolute_relative_error` columns.
raw_melted_relative_df = prepare_error_df(
    exp_df, 
    groupby_keys=exp_columns,
    value_vars=["soft_relative_error", "hard_relative_error"],
    var_name="proportion_type",
    value_name="relative_error"
)
# Sort by data_id, drop duplicate rows and reset the index.
melted_relative_df = process_method_names(raw_melted_relative_df)

In [None]:
# Fix the order in which methods and loss combinations appear in the relative-error plots.
# This cell previously re-ordered melted_error_df a second time (copy-paste from the
# absolute-error section), leaving melted_relative_df -- the frame plotted below -- unordered.
relative_category_orders = {
    'calibration_method': [
        'None',
        "SVLS",
        'TempScaling',
        'IBTS',
        'LTS',
    ],
    'loss_funcs': [
        'PixelCELoss -> None',
        'SoftDiceLoss -> None',
        'PixelCELoss -> PixelCELoss',
        'PixelCELoss -> SoftDiceLoss',
        'SoftDiceLoss -> PixelCELoss',
        'SoftDiceLoss -> SoftDiceLoss',
    ],
}
for column_name, ordered_labels in relative_category_orders.items():
    melted_relative_df[column_name] = melted_relative_df[column_name].astype('category')
    # reorder_categories requires the list to contain exactly the existing categories.
    melted_relative_df[column_name] = melted_relative_df[column_name].cat.reorder_categories(ordered_labels)

In [None]:
# Show which proportion_type labels exist in the melted relative-error frame.
melted_relative_df['proportion_type'].unique()

In [None]:
# Box plots of the signed relative error: one facet per (split, proportion_type),
# calibration methods along x, coloured by loss-function combination.
relative_error_facets = dict(
    x="calibration_method",
    y="relative_error",
    hue="loss_funcs",
    col="proportion_type",
    row="split",
    kind='box',
    height=10,  # Facet height in inches.
)
g = sns.catplot(data=melted_relative_df, **relative_error_facets)

# Dashed red reference line at zero error on every facet.
for facet_ax in g.axes.ravel():
    facet_ax.axhline(0, ls='--', color='red')

# Render the figure.
plt.show()

In [None]:
# Box plots of the absolute relative error: one facet per (split, proportion_type),
# calibration methods along x, coloured by loss-function combination.
abs_relative_error_facets = dict(
    x="calibration_method",
    y="absolute_relative_error",
    hue="loss_funcs",
    col="proportion_type",
    row="split",
    kind='box',
    height=10,  # Facet height in inches.
)
g = sns.catplot(data=melted_relative_df, **abs_relative_error_facets)

# Dashed red reference line at zero error on every facet.
for facet_ax in g.axes.ravel():
    facet_ax.axhline(0, ls='--', color='red')

# Render the figure.
plt.show()

# Now let's look at this per data_id.

In [None]:
# Average the absolute relative error within each (data_id, proportion type) group.
# The melted frame names its columns with underscores ("proportion_type",
# "absolute_relative_error"); the spaced spellings used here before do not exist in
# melted_relative_df and raised a KeyError.
relative_per_dataid = melted_relative_df.groupby(["data_id", "proportion_type"]).agg(
    abs_relative_error=("absolute_relative_error", "mean"),
).reset_index()
# Rename to the spaced labels that the pivot in the following cell looks up.
relative_per_dataid = relative_per_dataid.rename(columns={
    "abs_relative_error": "absolute relative error",
    "proportion_type": "proportion type",
})

In [None]:
# Build a table with one row per proportion type and one column per data_id, holding the
# mean absolute relative error for that example.
method_relative_table_dataid = relative_per_dataid.pivot_table(
    values="absolute relative error",
    index="proportion type",
    columns="data_id",
    aggfunc="mean",
)
# Snapshot the per-data_id columns BEFORE adding summary columns. Previously 'stdv' was
# computed after 'mean' had been inserted, so the mean itself was counted as an extra
# sample and the reported standard deviation was wrong.
per_dataid_scores = method_relative_table_dataid.copy()
# Summaries are taken across data_ids (not across pretrained seeds).
method_relative_table_dataid['mean'] = per_dataid_scores.mean(axis=1)
# Sample standard deviation (pandas default ddof=1).
method_relative_table_dataid['stdv'] = per_dataid_scores.std(axis=1)
# Keep only the summary columns.
method_relative_table_dataid = method_relative_table_dataid[['mean', 'stdv']]

In [None]:
# Show the summary table ordered by mean absolute relative error (ascending, the sort_values default).
method_relative_table_dataid.sort_values(by='mean')

# Finally, we look at the Dice score and Image ECE of all three methods, since we argue that Image ECE is the metric we should care about.

In [None]:
# Display the full inference frame to check which metric columns are available.
inference_df

In [None]:
def calibrator(model_pretrained_exp_root):
    """Label a run as "SVLS" or "Uncalibrated" from its pretrained experiment root.

    NOTE(review): `calibrator` was passed to `augment` below but never defined in this
    notebook, so the cell raised NameError on a fresh kernel. This definition is a
    reconstruction -- confirm against the original helper. It is based on:
      * `model_pretrained_exp_root` being selected below and dropped right after the
        augment call, which suggests it is the helper's input;
      * the "Uncalibrated (...)" / "SVLS (...)" labels used when ordering train_method;
      * `calibration_method`, which detects SVLS by a substring test on a directory.
    """
    # str() keeps missing (non-string) roots from raising on the substring test.
    return "SVLS" if "SVLS" in str(model_pretrained_exp_root) else "Uncalibrated"

# Get the relevant columns for looking at the Dice score and Image ECE
metric_cols = [
    "data_id",
    "loss_func",
    "experiment_pretrained_seed",
    "model_pretrained_exp_root",
    "split",
    "image_metric",
    "metric_score"
]
# Take these columns of the inference_df, drop other columns and delete duplicate rows.
metric_df = inference_df[metric_cols].drop_duplicates().reset_index(drop=True)
# Add the `calibrator` label column computed by the helper above.
metric_df.augment(calibrator)
# Drop the 'model_pretrained_exp_root' column, which was only needed to derive the label.
metric_df = metric_df.drop(columns=["model_pretrained_exp_root"])

In [None]:
def train_method(calibrator, loss_func):
    """Combine the calibrator label and the loss name as "<calibrator> (<loss_func>)".

    `calibrator` must be a string because it is concatenated directly.
    """
    loss_suffix = " ({})".format(loss_func)
    return calibrator + loss_suffix

# Presumably adds a `train_method` column built from the `calibrator` and `loss_func`
# columns (the group-by below relies on that column) -- confirm against augment.
metric_df.augment(train_method)

In [None]:
# Display the metric frame after the augment calls above.
metric_df

In [None]:
# Mean metric_score for every (pretrained seed, loss function, image metric, train method) group.
seed_metric_keys = [
    "experiment_pretrained_seed",
    "loss_func",
    "image_metric",
    "train_method",
]
metric_per_seed = (
    metric_df
    .groupby(seed_metric_keys)
    .agg(metric_score=("metric_score", "mean"))
    .reset_index()
)

In [None]:
# Fix the order in which the training methods are listed by making train_method a
# categorical with an explicit category order.
train_method_order = [
    'Uncalibrated (PixelCELoss)',
    'Uncalibrated (SoftDiceLoss)',
    'Uncalibrated (PixelFocalLoss)',
    'SVLS (PixelCELoss)',
    'SVLS (SoftDiceLoss)',
    'SVLS (PixelFocalLoss)',
]
metric_per_seed['train_method'] = metric_per_seed['train_method'].astype('category')
# reorder_categories requires the list to contain exactly the existing categories.
metric_per_seed['train_method'] = metric_per_seed['train_method'].cat.reorder_categories(train_method_order)

In [None]:
# Mean and standard deviation of the per-seed scores for every (train_method, image_metric).
method_metric_groups = metric_per_seed.groupby(['train_method', 'image_metric'])
score_stats = method_metric_groups.agg(
    mean_metric_score=('metric_score', 'mean'),
    std_metric_score=('metric_score', 'std'),
).reset_index()

# Go wide: one row per train_method, one (statistic, image_metric) column pair per value.
wide_stats = score_stats.pivot(
    index='train_method',
    columns='image_metric',
    values=['mean_metric_score', 'std_metric_score'],
)

# Collapse the two-level column index into "<statistic>_<image_metric>" names.
flat_column_names = []
for name_parts in wide_stats.columns.values:
    flat_column_names.append('_'.join(name_parts).strip())
wide_stats.columns = flat_column_names
result = wide_stats.reset_index()

In [None]:
# Display the flattened per-method summary of mean/std metric scores.
result

In [None]:
# Keep the train_method label plus every flattened column whose name contains 'Dice Loss'.
dice_cols = list(filter(
    lambda column_name: 'train_method' in column_name or 'Dice Loss' in column_name,
    result.columns,
))
dice_result = result.loc[:, dice_cols]

In [None]:
# Order the methods by their mean Dice Loss (ascending, the sort_values default).
dice_result.sort_values(by='mean_metric_score_Dice Loss')

In [None]:
# Keep the train_method label plus every flattened column whose name contains 'Image_ECE'.
image_ece_cols = list(filter(
    lambda column_name: 'train_method' in column_name or 'Image_ECE' in column_name,
    result.columns,
))
ece_result = result.loc[:, image_ece_cols]

In [None]:
# Order the methods by their mean Image ECE (ascending, the sort_values default).
ece_result.sort_values(by='mean_metric_score_Image_ECE')