In [None]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("talk")

import os 
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

from ese.experiment.analysis.analyze_inf import load_cal_inference_stats
# Results loader object does everything
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 2
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

log:
    root: /storage/vbutoi/scratch/ESE/inference
    inference_groups: 
        - '07_15_24_ACDC_Experiment_1'

options:
    add_dice_loss_rows: True
    drop_nan_metric_rows: True 
    remove_shared_columns: False
    equal_rows_per_cfg_assert: False 

In [6]:
# Load the calibration inference statistics described by `results_cfg` (the YAML cell above).
# load_cached=False is passed explicitly — presumably this makes the loader rebuild the frame
# from the raw inference logs instead of reusing a cached copy; TODO confirm against
# load_cal_inference_stats.
inference_df = load_cal_inference_stats(
    results_cfg=results_cfg,
    load_cached=False,
)

Dropping (datapoint, metric) pairs with NaN metric score. Dropped from 1080 -> 1080 rows.


  inference_df = pd.concat(inference_pd_collection, axis=0)


ValueError: Grouper for 'log_set' not 1-dimensional

In [None]:
# List every column available in the inference frame, one name per line.
for column_name in inference_df.keys():
    print(column_name)

In [None]:
# NOTE(review): unconditional stop — under "Run All" every cell below this one is skipped
# until this line is removed. Looks like a leftover debugging guard; confirm before deleting.
raise ValueError

In [None]:
inference_df['model_pretrained_exp_root'].unique()    

In [None]:
inference_df['loss_func_class'].unique()

In [None]:
inference_df['model_class'].unique()

In [None]:
def pretrain_loss_func(model_class, loss_func_class):
    """Pass the loss class through for plain UNet rows.

    Args:
        model_class: Dotted class path of the model; only the component after
            the last '.' is inspected.
        loss_func_class: Loss class recorded for the row.

    Returns:
        `loss_func_class` unchanged when the model's class name is "UNet",
        otherwise None (the non-UNet case has not been implemented yet).
    """
    is_unet = model_class.rsplit('.', 1)[-1] == "UNet"
    #TODO: FILL ME! (non-UNet models currently get None)
    return loss_func_class if is_unet else None

def finetune_loss_func(model_class, loss_func_class):
    """Name the loss attributed to rows whose model is not a plain UNet.

    Args:
        model_class: Dotted class path of the model; only the component after
            the last '.' is inspected.
        loss_func_class: Dotted class path of the loss recorded for the row.

    Returns:
        The string "None" when the model's class name is "UNet"; otherwise the
        last dotted component of `loss_func_class`.
    """
    # Guard clause: anything that is not a plain UNet reports its own loss name.
    if model_class.rsplit('.', 1)[-1] != "UNet":
        return loss_func_class.rsplit('.', 1)[-1]
    return "None"

def posthoc_method(model_class):
    """Turn a dotted model class path into a short method label.

    Args:
        model_class: Dotted class path of the model.

    Returns:
        "Finetune" when the last dotted component is the string "None",
        otherwise that last component itself.
    """
    short_name = model_class.rpartition('.')[2]
    return "Finetune" if short_name == "None" else short_name

# Apply the three helpers above to the inference frame.
# NOTE(review): `augment` is not a stock pandas method — presumably a DataFrame extension
# registered by the project libraries that adds a column named after the function, computed
# from the columns matching the function's argument names; TODO confirm.
inference_df.augment(pretrain_loss_func)
inference_df.augment(finetune_loss_func)
inference_df.augment(posthoc_method)

In [None]:
# Only these columns matter for the proportion-error analysis; all others are discarded.
exp_columns = [
    "data_id",
    "loss_func",
    "new_gt_proportion",  # This is after our resizing and blurring
    "gt_proportion",
    "soft_proportion",
    "hard_proportion",
    "experiment_pretrained_seed",
    "model_pretrained_exp_root",
    "split",
]
# Keep one row per distinct combination of the selected columns, re-indexed from 0.
exp_df = (
    inference_df.loc[:, exp_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Number of distinct examples being evaluated (a missing data_id, if any, counts as one value).
exp_df['data_id'].nunique(dropna=False)

In [None]:
# Give the proportion columns space-separated names (e.g. 'gt_proportion' -> 'gt proportion');
# the cells below refer to them by the spaced names.
proportion_df = exp_df.rename(columns={
    column: column.replace("_", " ")
    for column in ("gt_proportion", "new_gt_proportion", "hard_proportion", "soft_proportion")
})

In [None]:
# Signed error of each estimate (new gt / soft / hard) against the original ground-truth proportion.
for source in ("new gt", "soft", "hard"):
    proportion_df[f'{source} error'] = proportion_df[f'{source} proportion'] - proportion_df['gt proportion']
# The same errors normalised by the ground-truth proportion.
# NOTE(review): a row whose 'gt proportion' is 0 produces inf/NaN in these columns.
for source in ("new gt", "soft", "hard"):
    proportion_df[f'{source} relative error'] = proportion_df[f'{source} error'] / proportion_df['gt proportion']

In [None]:
proportion_df

In [None]:
# Important for this experiment that we only consider the 'val' and 'cal' splits because we trained on the 'train' split.
proportion_df['split'].unique()

# Library Funcs

In [None]:
def prepare_error_df(raw_df, groupby_keys, value_vars, var_name, value_name):
    """Reshape wide error columns into long format and add their magnitude.

    Args:
        raw_df: Wide DataFrame holding the identifier and error columns. It is
            not modified.
        groupby_keys: Columns kept as identifiers on every melted row.
        value_vars: Error columns to stack into a single value column.
        var_name: Name of the new column recording which entry of
            `value_vars` each row came from.
        value_name: Name of the new column holding the stacked values.

    Returns:
        A new long-format DataFrame with the identifier columns, `var_name`,
        `value_name`, and an extra 'absolute <value_name>' column containing
        the absolute value of `value_name`.
    """
    # melt builds a fresh frame, so the caller's data is left untouched.
    long_df = raw_df.melt(
        id_vars=groupby_keys,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
    )
    # Magnitude of the error, used by the plots and summary tables.
    long_df[f'absolute {value_name}'] = long_df[value_name].abs()
    return long_df


def calibrator(model_pretrained_exp_root):
    """Label a run by whether its pretrained experiment root mentions SVLS.

    Args:
        model_pretrained_exp_root: Path-like string of the pretrained experiment root.

    Returns:
        "SVLS" if that substring occurs anywhere in the path, otherwise "Uncalibrated".
    """
    return "SVLS" if "SVLS" in model_pretrained_exp_root else "Uncalibrated"


def process_method_names(input_df, value_name):
    """Filter a melted error frame and attach a readable method label.

    Rows that pair a calibrated model (calibrator != 'Uncalibrated') with the
    hard-prediction error are dropped. Each remaining row gets a
    'proportion type' label built from its calibrator, loss function and the
    kind of error it holds.

    Args:
        input_df: Melted DataFrame with at least the columns 'calibrator',
            'loss_func', 'proportion_type' and 'data_id'. It is not modified.
        value_name: Name of the melted value ("error" or "relative error");
            used to recognise the hard rows, whose 'proportion_type' is
            f'hard {value_name}'.

    Returns:
        A new DataFrame with the extra 'proportion type' column, sorted by
        'data_id', de-duplicated, and re-indexed from 0. An empty input yields
        an empty frame that still has the 'proportion type' column.
    """
    # Drop all the rows where calibrator != Uncalibrated AND the row holds the hard error.
    is_calibrated = input_df['calibrator'] != 'Uncalibrated'
    is_hard = input_df['proportion_type'] == f'hard {value_name}'
    # Explicit copy so the column assignment below never writes into a view of the input.
    df = input_df.loc[~(is_calibrated & is_hard)].copy()

    def method_label(calibrator, loss_func, proportion_type):
        # The re-derived ground truth is independent of calibrator and loss.
        if proportion_type in ["new gt error", "new gt relative error"]:
            return "New GT"
        elif calibrator == "Uncalibrated":
            # First word of the error name is "soft" or "hard".
            return "Uncalibrated " + proportion_type.split(" ")[0] + f" ({loss_func})"
        else:
            return calibrator + " soft" + f" ({loss_func})"

    # Build the labels row by row. A plain list (not DataFrame.apply) is used so that an
    # empty frame simply produces an empty column.
    df['proportion type'] = [
        method_label(cal, loss, kind)
        for cal, loss, kind in zip(df['calibrator'], df['loss_func'], df['proportion_type'])
    ]
    # Sort by data_id, then drop the duplicate rows and reset the index.
    df = df.sort_values(by="data_id")
    df = df.drop_duplicates().reset_index(drop=True)
    return df

# First, let's look at absolute error.

In [None]:
# Add the calibrator label to the proportion frame (consumed as an id column below).
proportion_df.augment(calibrator)
# Stack the three signed-error columns into long format, one row per (example, error kind).
raw_melted_error_df = prepare_error_df(
    proportion_df,
    value_name="error",
    var_name="proportion_type",
    value_vars=["new gt error", "soft error", "hard error"],
    groupby_keys=[
        "calibrator", "loss_func", "data_id", "experiment_pretrained_seed",
        "gt proportion", "new gt proportion", "soft proportion", "hard proportion",
        "split",
    ],
)
# Filter the rows and attach the readable method labels.
melted_error_df = process_method_names(raw_melted_error_df, value_name="error")

In [None]:
melted_error_df

In [None]:
melted_error_df['proportion type'].unique()

In [None]:
# Colour assigned to each method. The insertion order of this dict also serves as the
# display order of the methods, so keep the entries in the order they should appear.
custom_palette = {
    'New GT': 'red',
    'Uncalibrated hard (PixelCELoss)': 'blue',
    'Uncalibrated soft (PixelCELoss)': 'royalblue',
    'Uncalibrated hard (SoftDiceLoss)': 'darkgreen',
    'Uncalibrated soft (SoftDiceLoss)': 'mediumseagreen',
    'Uncalibrated hard (PixelFocalLoss)': 'indianred',
    'Uncalibrated soft (PixelFocalLoss)': 'lightcoral',
    'SVLS soft (PixelCELoss)': 'goldenrod',
    'SVLS soft (SoftDiceLoss)': 'gold',
    'SVLS soft (PixelFocalLoss)': 'khaki',
}

# Make the method label categorical and impose that display order on it.
melted_error_df['proportion type'] = melted_error_df['proportion type'].astype('category')
melted_error_df['proportion type'] = melted_error_df['proportion type'].cat.reorder_categories(
    list(custom_palette)
)

In [None]:
# Categorical plot of the absolute error: one facet per pretrained seed (wrapped four per
# row), one x position and one colour per method.
g = sns.catplot(
    data=melted_error_df,      # Long-format frame produced by process_method_names.
    x="proportion type",
    y="absolute error",
    col="experiment_pretrained_seed",
    hue="proportion type",
    col_wrap=4,
    sharex=False,
    aspect=1.5,
    palette=custom_palette
)

# On every facet draw a dashed reference line at zero error and blank the x tick labels
# (the legend added below identifies the methods by colour).
for ax in g.axes.flat:
    ax.axhline(0, ls='--', color='red')
    ax.set_xticklabels([])

# Leave headroom at the top of the figure for the overall title.
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Soft/Hard proportion Error', fontsize=30)

# Add a legend
g.add_legend(title='proportion Type')

# Show the plot
plt.show()

In [None]:
melted_error_df

In [None]:
# First stage of the two-stage average: the mean absolute error of every method within
# each pretrained seed. (The across-seed average is taken in the next cell.)
error_per_seed = (
    melted_error_df
    .groupby(["experiment_pretrained_seed", "proportion type"])["absolute error"]
    .mean()
    .reset_index()
)

In [None]:
# Pivot the per-seed means into a table with one row per method ('proportion type') and
# one column per pretrained seed.
method_error_table = error_per_seed.pivot_table(
    values="absolute error",
    index="proportion type",
    columns="experiment_pretrained_seed",
    aggfunc="mean",
)
# Summarise each method across seeds. Both statistics are evaluated on the seed columns
# *before* either summary column is attached: taking the std after 'mean' has been added
# would count that column as an extra seed and understate the spread.
method_error_table = method_error_table.assign(
    mean=method_error_table.mean(axis=1),
    stdv=method_error_table.std(axis=1),
)
# Only the two summary columns are needed from here on.
method_error_table = method_error_table[['mean', 'stdv']]

In [None]:
method_error_table.sort_values(by='mean')

## Second, let's look at relative error.

In [None]:
# Add the calibrator label to the proportion frame (consumed as an id column below).
proportion_df.augment(calibrator)
# Stack the three relative-error columns into long format, one row per (example, error kind).
raw_melted_relative_df = prepare_error_df(
    proportion_df,
    value_name="relative error",
    var_name="proportion_type",
    value_vars=["new gt relative error", "soft relative error", "hard relative error"],
    groupby_keys=[
        "calibrator", "loss_func", "data_id", "experiment_pretrained_seed",
        "gt proportion", "new gt proportion", "soft proportion", "hard proportion",
        "split",
    ],
)
# Filter the rows and attach the readable method labels.
melted_relative_df = process_method_names(raw_melted_relative_df, value_name="relative error")

In [None]:
melted_relative_df['proportion type'].unique()

In [None]:
# Fix the order in which the methods appear in the relative-error plots and tables.
# This must be applied to `melted_relative_df` — the frame every cell in this section
# plots and aggregates — not to the absolute-error frame from the previous section.
melted_relative_df['proportion type'] = melted_relative_df['proportion type'].astype('category')
melted_relative_df['proportion type'] = melted_relative_df['proportion type'].cat.reorder_categories([
    'New GT',
    'Uncalibrated hard (PixelCELoss)',
    'Uncalibrated soft (PixelCELoss)',
    'Uncalibrated hard (SoftDiceLoss)',
    'Uncalibrated soft (SoftDiceLoss)',
    'Uncalibrated hard (PixelFocalLoss)',
    'Uncalibrated soft (PixelFocalLoss)',
    'SVLS soft (PixelCELoss)',
    'SVLS soft (SoftDiceLoss)',
    'SVLS soft (PixelFocalLoss)',
])

# Colour assigned to each method in the plots below.
custom_palette = {
    'New GT': 'red',
    'Uncalibrated hard (PixelCELoss)': 'blue',
    'Uncalibrated soft (PixelCELoss)': 'royalblue',
    'Uncalibrated hard (SoftDiceLoss)': 'darkgreen',
    'Uncalibrated soft (SoftDiceLoss)': 'mediumseagreen',
    'Uncalibrated hard (PixelFocalLoss)': 'indianred',
    'Uncalibrated soft (PixelFocalLoss)': 'lightcoral',
    'SVLS soft (PixelCELoss)': 'goldenrod',
    'SVLS soft (SoftDiceLoss)': 'gold',
    'SVLS soft (PixelFocalLoss)': 'khaki',
}

In [None]:
# Categorical plot of the signed relative error: one facet per pretrained seed (wrapped
# four per row), one x position and one colour per method.
g = sns.catplot(
    data=melted_relative_df,      # Long-format frame produced by process_method_names.
    x="proportion type",
    y="relative error",
    col="experiment_pretrained_seed",
    hue="proportion type",
    col_wrap=4,
    sharex=False,
    aspect=1.5,
    palette=custom_palette
)

# Mean relative error of every method within every seed, overlaid on the facets below.
means = melted_relative_df.groupby(['experiment_pretrained_seed', 'proportion type'])['relative error'].mean().reset_index()

# On every facet: dashed reference line at zero error, blank x tick labels, and a marker
# at each method's mean.
for ax in g.axes.flat:
    ax.axhline(0.0, ls='--', color='red')
    ax.set_xticklabels([])
    
    # Recover the seed from the facet title by taking the text after ' = '.
    # NOTE(review): relies on the title ending in ' = <seed>' — confirm this matches
    # seaborn's facet title format for the installed version.
    seed = ax.get_title().split(' = ')[-1]
    
    # Keep only this seed's means; the seed is parsed as an integer for the comparison.
    seed_means = means[means['experiment_pretrained_seed'] == int(seed)]
    
    # Plot a purple circle at the mean position, drawn on top of the data (zorder=10).
    for _, row in seed_means.iterrows():
        ax.plot(row['proportion type'], row['relative error'], 'o', color='purple', markersize=8, zorder=10)

# Leave headroom at the top of the figure for the overall title.
# NOTE(review): this title says "(Blobs)" while the next plot of the same frame says
# "(Heart Wall)" — confirm which label is intended.
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Relative Proportion Error (Blobs)', fontsize=30)

# Add a legend
g.add_legend(title='proportion Type')

# Show the plot
plt.show()

In [None]:
# Categorical plot of the absolute relative error: one facet per pretrained seed (wrapped
# four per row), one x position and one colour per method.
g = sns.catplot(
    data=melted_relative_df,      # Long-format frame produced by process_method_names.
    x="proportion type",
    y="absolute relative error",
    col="experiment_pretrained_seed",
    hue="proportion type",
    col_wrap=4,
    sharex=False,
    aspect=1.5,
    palette=custom_palette
)

# Mean absolute relative error of every method within every seed, overlaid on the facets below.
means = melted_relative_df.groupby(['experiment_pretrained_seed', 'proportion type'])['absolute relative error'].mean().reset_index()

# On every facet: dashed reference line at zero error, blank x tick labels, and a marker
# at each method's mean.
for ax in g.axes.flat:
    ax.axhline(0.0, ls='--', color='red')
    ax.set_xticklabels([])
    
    # Recover the seed from the facet title by taking the text after ' = '.
    # NOTE(review): relies on the title ending in ' = <seed>' — confirm this matches
    # seaborn's facet title format for the installed version.
    seed = ax.get_title().split(' = ')[-1]
    
    # Keep only this seed's means; the seed is parsed as an integer for the comparison.
    seed_means = means[means['experiment_pretrained_seed'] == int(seed)]
    
    # Plot a purple circle at the mean position, drawn on top of the data (zorder=10).
    for _, row in seed_means.iterrows():
        ax.plot(row['proportion type'], row['absolute relative error'], 'o', color='purple', markersize=8, zorder=10)

# Leave headroom at the top of the figure for the overall title.
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Absolute Relative Proportion Error (Heart Wall)', fontsize=30)

# Add a legend
g.add_legend(title='proportion Type')

# Show the plot
plt.show()

# Let's look at this averaged over seeds.

In [None]:
# First stage of the two-stage average: the mean absolute relative error of every method
# within each pretrained seed. (The across-seed average is taken in the next cell.)
relative_per_seed = (
    melted_relative_df
    .groupby(["experiment_pretrained_seed", "proportion type"])["absolute relative error"]
    .mean()
    .reset_index()
)

In [None]:
# Pivot the per-seed means into a table with one row per method ('proportion type') and
# one column per pretrained seed.
method_relative_table = relative_per_seed.pivot_table(
    values="absolute relative error",
    index="proportion type",
    columns="experiment_pretrained_seed",
    aggfunc="mean",
)
# Summarise each method across seeds. Both statistics are evaluated on the seed columns
# *before* either summary column is attached: taking the std after 'mean' has been added
# would count that column as an extra seed and understate the spread.
method_relative_table = method_relative_table.assign(
    mean=method_relative_table.mean(axis=1),
    stdv=method_relative_table.std(axis=1),
)
# Only the two summary columns are needed from here on.
method_relative_table = method_relative_table[['mean', 'stdv']]

In [None]:
method_relative_table.sort_values(by='mean')

# And let's look at this over data_ids.

In [None]:
# Same two-stage average as above, but with examples instead of seeds as the unit:
# first the mean absolute relative error of every method within each data_id.
# (The across-example average is taken in the next cell.)
relative_per_dataid = (
    melted_relative_df
    .groupby(["data_id", "proportion type"])["absolute relative error"]
    .mean()
    .reset_index()
)

In [None]:
# Pivot the per-example means into a table with one row per method ('proportion type')
# and one column per data_id.
method_relative_table_dataid = relative_per_dataid.pivot_table(
    values="absolute relative error",
    index="proportion type",
    columns="data_id",
    aggfunc="mean",
)
# Summarise each method across examples. Both statistics are evaluated on the data_id
# columns *before* either summary column is attached: taking the std after 'mean' has
# been added would count that column as an extra example and understate the spread.
method_relative_table_dataid = method_relative_table_dataid.assign(
    mean=method_relative_table_dataid.mean(axis=1),
    stdv=method_relative_table_dataid.std(axis=1),
)
# Only the two summary columns are needed from here on.
method_relative_table_dataid = method_relative_table_dataid[['mean', 'stdv']]

In [None]:
method_relative_table_dataid.sort_values(by='mean')

# Finally, we want to look at the Dice score and Image ECE of all three methods as we are arguing that Image ECE is what we should care about.

In [None]:
inference_df

In [None]:
# Columns needed to compare the per-image metric scores across methods.
metric_cols = [
    "data_id",
    "loss_func",
    "experiment_pretrained_seed",
    "model_pretrained_exp_root",
    "split",
    "image_metric",
    "metric_score",
]
# Keep one row per distinct combination of those columns, re-indexed from 0.
metric_df = (
    inference_df.loc[:, metric_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Derive the calibrator label while the experiment root is still present...
metric_df.augment(calibrator)
# ...after which the root column itself is no longer needed.
metric_df = metric_df.drop("model_pretrained_exp_root", axis=1)

In [None]:
def train_method(calibrator, loss_func):
    """Build the label '<calibrator> (<loss_func>)' identifying a training setup.

    Args:
        calibrator: Calibrator name (a string).
        loss_func: Loss function name, formatted into the parentheses.

    Returns:
        The combined label, e.g. 'SVLS (PixelCELoss)'.
    """
    return calibrator + " ({})".format(loss_func)

# Attach the combined label to the metric frame; later cells read it from a
# 'train_method' column. NOTE(review): `augment` is presumably a project-provided
# DataFrame extension that names the new column after the function — TODO confirm.
metric_df.augment(train_method)

In [None]:
metric_df

In [None]:
# Mean metric score for every (pretrained seed, loss function, image metric, train method)
# combination.
metric_per_seed = (
    metric_df
    .groupby(["experiment_pretrained_seed", "loss_func", "image_metric", "train_method"])["metric_score"]
    .mean()
    .reset_index()
)

In [None]:
# Fix the display order of the training methods: every uncalibrated variant first, then
# every SVLS variant, each group ordered CE -> Dice -> Focal.
metric_per_seed['train_method'] = metric_per_seed['train_method'].astype('category')
metric_per_seed['train_method'] = metric_per_seed['train_method'].cat.reorder_categories([
    f'{calibrator_name} ({loss_name})'
    for calibrator_name in ('Uncalibrated', 'SVLS')
    for loss_name in ('PixelCELoss', 'SoftDiceLoss', 'PixelFocalLoss')
])

In [None]:
# Mean and standard deviation across seeds of every (train method, image metric) pair,
# reshaped so each train method is a single row and each (statistic, metric) pair a column.
result = (
    metric_per_seed
    .groupby(['train_method', 'image_metric'])
    .agg(
        mean_metric_score=('metric_score', 'mean'),
        std_metric_score=('metric_score', 'std'),
    )
    .reset_index()
    .pivot(
        index='train_method',
        columns='image_metric',
        values=['mean_metric_score', 'std_metric_score'],
    )
)

# Collapse the two-level column index into flat '<statistic>_<metric>' names.
result.columns = ['_'.join(levels).strip() for levels in result.columns.values]
# Turn the train_method index back into an ordinary column.
result = result.reset_index()

In [None]:
result

In [None]:
# Keep the 'train_method' column plus every column whose name contains 'Dice Loss'.
dice_cols = [
    name for name in result.columns
    if any(tag in name for tag in ('Dice Loss', 'train_method'))
]
dice_result = result.loc[:, dice_cols]

In [None]:
dice_result.sort_values(by='mean_metric_score_Dice Loss')

In [None]:
# Keep the 'train_method' column plus every column whose name contains 'Image_ECE'.
image_ece_cols = [
    name for name in result.columns
    if any(tag in name for tag in ('Image_ECE', 'train_method'))
]
ece_result = result.loc[:, image_ece_cols]

In [None]:
ece_result.sort_values(by='mean_metric_score_Image_ECE')