In [None]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("talk")

import os 
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))

from ese.experiment.analysis.analyze_inf import load_cal_inference_stats
# Results loader object does everything
from ionpy.analysis import ResultsLoader
from pathlib import Path
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 2
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

# Parsed by the yamlmagic extension into the Python variable `results_cfg`,
# which is then passed to load_cal_inference_stats(results_cfg=...).

# Where the inference logs live and which experiment group(s) to load.
log:
    root: /storage/vbutoi/scratch/ESE/inference
    inference_groups: 
        - '06_04_24_WMH_DifferenceExps'

# Boolean switches forwarded to the loader.
# NOTE(review): their exact semantics are defined in load_cal_inference_stats,
# which is not visible in this notebook - confirm there before changing them.
options:
    add_dice_loss_rows: True
    drop_nan_metric_rows: True 
    remove_shared_columns: False
    equal_rows_per_cfg_assert: False 

In [None]:
# Load the inference statistics described by `results_cfg`, bypassing any
# cached copy (load_cached=False).
inference_df = load_cal_inference_stats(results_cfg=results_cfg, load_cached=False)

In [None]:
# Restrict the analysis to a single ensemble member (pretrained seed 40) and
# to the calibration split.
# NOTE(review): `.select` is not a stock pandas DataFrame method; presumably it
# is provided by the project libraries - confirm.
inference_df = inference_df.select(
    pretrained_seed=40,
    split='cal',
)

In [None]:
# Number of distinct slice ids in the selected rows (a NaN id, if any, is
# counted as its own value).
inference_df['joint_data_slice_id'].nunique(dropna=False)

In [None]:
import itertools

import numpy as np


def _safe_quotient(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero.

    Plain division by a zero volume gives +/-inf or NaN with a warning for
    NumPy scalars and raises ZeroDivisionError for Python numbers. Infinite
    quotients would dominate the means shown in the quotient box plots, so
    undefined quotients are reported as NaN instead.
    """
    if denominator == 0:
        return np.nan
    return numerator / denominator


def _summarize_slice(rows):
    """Collect the quantities the pairwise comparison needs for one slice.

    Args:
        rows: inference rows belonging to a single slice id and a single
            calibrator.

    Returns:
        A dict with the ground-truth, soft and hard volumes (taken from the
        first row) and the 'Dice' and 'Image_ECE' metric scores (first matching
        row each), or None when `rows` is empty or either metric is missing.
    """
    if rows.empty:
        return None
    dice_scores = rows.loc[rows['image_metric'] == 'Dice', 'metric_score']
    ece_scores = rows.loc[rows['image_metric'] == 'Image_ECE', 'metric_score']
    if dice_scores.empty or ece_scores.empty:
        return None
    return {
        'gt_volume': rows['gt_volume'].values[0],
        'soft_volume': rows['soft_volume'].values[0],
        'hard_volume': rows['hard_volume'].values[0],
        'dice': dice_scores.iloc[0],
        'ece': ece_scores.iloc[0],
    }


unique_calibrators = inference_df['calibrator'].unique()

pw_error_list = []
# Compare every pair of slices that belong to the same subject.
for subj_id in inference_df['data_id'].unique():
    subject_df = inference_df[inference_df['data_id'] == subj_id]
    slice_ids = subject_df['joint_data_slice_id'].unique()

    # Summarize each (slice, calibrator) combination once up front instead of
    # re-filtering the subject's rows for every pair the slice takes part in.
    summaries = {}
    for slice_id in slice_ids:
        slice_df = subject_df[subject_df['joint_data_slice_id'] == slice_id]
        for calibrator in unique_calibrators:
            summaries[(slice_id, calibrator)] = _summarize_slice(
                slice_df[slice_df['calibrator'] == calibrator]
            )

    # combinations() over unique ids never pairs a slice with itself.
    for example_id_1, example_id_2 in itertools.combinations(slice_ids, 2):
        for calibrator in unique_calibrators:
            first = summaries.get((example_id_1, calibrator))
            second = summaries.get((example_id_2, calibrator))
            if first is None or second is None:
                # One of the slices has no usable rows for this calibrator,
                # so there is nothing to compare.
                continue

            # Change in volume from slice 1 to slice 2, as an absolute
            # difference and as a ratio, for each volume type.
            gt_volume_diff = second['gt_volume'] - first['gt_volume']
            gt_volume_quot = _safe_quotient(second['gt_volume'], first['gt_volume'])
            soft_volume_diff = second['soft_volume'] - first['soft_volume']
            soft_volume_quot = _safe_quotient(second['soft_volume'], first['soft_volume'])
            hard_volume_diff = second['hard_volume'] - first['hard_volume']
            hard_volume_quot = _safe_quotient(second['hard_volume'], first['hard_volume'])

            dice_1, dice_2 = first['dice'], second['dice']
            ece_1, ece_2 = first['ece'], second['ece']

            # The "*_error" entries are the predicted change minus the
            # ground-truth change.
            pw_error_list.append({
                'calibrator': calibrator,
                'example_id_1': example_id_1,
                'example_id_2': example_id_2,
                'subj_combo': f'{example_id_1},{example_id_2}',
                'dice_1': dice_1,
                'dice_2': dice_2,
                'ece_1': ece_1,
                'ece_2': ece_2,
                'mean_dice': np.mean([dice_1, dice_2]),
                'mean_ece': np.mean([ece_1, ece_2]),
                'gt_volume_diff': gt_volume_diff,
                'gt_volume_quot': gt_volume_quot,
                'soft_volume_diff': soft_volume_diff,
                'soft_volume_quot': soft_volume_quot,
                'soft_diff_error': soft_volume_diff - gt_volume_diff,
                'soft_quot_error': soft_volume_quot - gt_volume_quot,
                'hard_volume_diff': hard_volume_diff,
                'hard_volume_quot': hard_volume_quot,
                'hard_diff_error': hard_volume_diff - gt_volume_diff,
                'hard_quot_error': hard_volume_quot - gt_volume_quot,
            })

# One row per (slice pair, calibrator).
pairwise_df = pd.DataFrame(pw_error_list)

# Test 1: Per-subject, how do the real volume differences compare with the predicted differences?

In [None]:
# Reshape to long format: one row per (pair, calibrator, volume type), with the
# volume type in 'Pred_Type' and its value in 'Volume Difference'.
pw_diff_df = pairwise_df.melt(
    id_vars=['calibrator', 'example_id_1', 'example_id_2', 'subj_combo'],
    value_vars=['gt_volume_diff', 'soft_volume_diff', 'hard_volume_diff'],
    var_name='Pred_Type',
    value_name='Volume Difference',
)


In [None]:
# Make 'Pred_Type' categorical with an explicit order (soft, hard, ground
# truth) so the hue levels are drawn in that order.
pw_diff_df['Pred_Type'] = (
    pw_diff_df['Pred_Type']
    .astype('category')
    .cat.reorder_categories(['soft_volume_diff', 'hard_volume_diff', 'gt_volume_diff'])
)

In [None]:
# Box plot of the pairwise volume differences: one group per calibrator and
# one box per volume type (hue).
g = sns.catplot(
    data=pw_diff_df,
    kind='box',
    x='calibrator',
    y='Volume Difference',
    hue='Pred_Type',
    height=8,
    aspect=1.5,           # width = aspect * height
    palette="Set2",
    linewidth=2,          # box edge width
    showfliers=False,     # hide outlier points
    showmeans=True,       # mark the mean of every box
    meanprops=dict(
        marker="o",
        markerfacecolor="yellow",
        markeredgecolor="black",
        markersize="8",
    ),
)
plt.title('Volume Differences Between Predictions')

# Test 2: Per-subject, how does the real volume quotient compare with the predicted quotient?

In [None]:
# Reshape to long format: one row per (pair, calibrator, volume type), with the
# volume type in 'Pred_Type' and its value in 'Volume Quotient'.
pw_quot_df = pairwise_df.melt(
    id_vars=['calibrator', 'example_id_1', 'example_id_2', 'subj_combo'],
    value_vars=['gt_volume_quot', 'soft_volume_quot', 'hard_volume_quot'],
    var_name='Pred_Type',
    value_name='Volume Quotient',
)


In [None]:
# Make 'Pred_Type' categorical with an explicit order (soft, hard, ground
# truth) so the hue levels are drawn in that order.
pw_quot_df['Pred_Type'] = (
    pw_quot_df['Pred_Type']
    .astype('category')
    .cat.reorder_categories(['soft_volume_quot', 'hard_volume_quot', 'gt_volume_quot'])
)

In [None]:
# Box plot of the pairwise volume quotients: one group per calibrator and one
# box per volume type (hue).
g = sns.catplot(
    data=pw_quot_df,
    kind='box',
    x='calibrator',
    y='Volume Quotient',
    hue='Pred_Type',
    height=8,
    aspect=1.5,           # width = aspect * height
    palette="Set2",
    linewidth=2,          # box edge width
    showfliers=False,     # hide outlier points
    showmeans=True,       # mark the mean of every box
    meanprops=dict(
        marker="o",
        markerfacecolor="yellow",
        markeredgecolor="black",
        markersize="8",
    ),
)
plt.title('Volume Quotient Comparison')

# Test 3: What does the distribution of difference errors look like?

In [None]:
# Reshape to long format: one row per (pair, calibrator, prediction type), with
# the soft/hard difference error in 'Volume Diff Measurement Error'.
pw_diff_error_df = pairwise_df.melt(
    id_vars=['calibrator', 'example_id_1', 'example_id_2', 'subj_combo'],
    value_vars=['soft_diff_error', 'hard_diff_error'],
    var_name='Pred_Type',
    value_name='Volume Diff Measurement Error',
)

In [None]:
# Create a larger figure and keep an explicit handle on its axes.
fig, ax = plt.subplots(figsize=(12, 8))

# Box plot of the difference errors: calibrators along the x-axis, one box per
# prediction type (hue), short whiskers and no outlier points.
sns.boxplot(
    x='calibrator',
    y='Volume Diff Measurement Error',
    hue='Pred_Type',      # Separate by 'Pred_Type'
    data=pw_diff_error_df,
    palette="Set2",       # Use a color palette
    linewidth=2,          # Set the linewidth of the edge
    showfliers=False,     # Do not show outliers
    whis=0.5,             # Whiskers reach at most 0.5 * IQR beyond the box
    showmeans=True,       # Show the mean
    meanprops={"marker":"o", "markerfacecolor":"yellow", "markeredgecolor":"black", "markersize":"8"},
    ax=ax,
)

# Enhance the plot. The x-axis shows the calibrators (prediction type is the
# hue), so it is labelled accordingly.
ax.set_title('Volume Diff Measurement Error by Prediction Type', fontsize=20)
ax.set_xlabel('Calibrator', fontsize=15)
ax.set_ylabel('Volume Diff Measurement Error', fontsize=15)
ax.tick_params(labelsize=12)

# Remove the top and right spines for a cleaner look
sns.despine(ax=ax)

# Dashed red reference line at zero error.
ax.axhline(y=0, color='r', linestyle='--', zorder=3)

# Show the plot
plt.show()

# Test 3.5: Per-subject, how do the real and predicted volume differences compare? (Same difference errors as Test 3, drawn with catplot.)

In [None]:
# Same difference-error data drawn through catplot: one group per calibrator
# and one box per prediction type (hue).
g = sns.catplot(
    data=pw_diff_error_df,
    kind='box',
    x='calibrator',
    y='Volume Diff Measurement Error',
    hue='Pred_Type',
    height=8,
    aspect=1.5,           # width = aspect * height
    palette="Set2",
    linewidth=2,          # box edge width
    showfliers=False,     # hide outlier points
    showmeans=True,       # mark the mean of every box
    meanprops=dict(
        marker="o",
        markerfacecolor="yellow",
        markeredgecolor="black",
        markersize="8",
    ),
)

# Dashed red reference line at zero error.
plt.axhline(y=0, color='r', linestyle='--', zorder=3)
plt.title('Volume Diff Measurement Error by Subject Pair', fontsize=20)

# Test 4: What does the distribution of quotient errors look like?

In [None]:
# Reshape to long format: one row per (pair, calibrator, prediction type), with
# the soft/hard quotient error in 'Volume Quotient Measurement Error'.
pw_quot_error_df = pairwise_df.melt(
    id_vars=['calibrator', 'example_id_1', 'example_id_2', 'subj_combo'],
    value_vars=['soft_quot_error', 'hard_quot_error'],
    var_name='Pred_Type',
    value_name='Volume Quotient Measurement Error',
)

In [None]:
# Create a larger figure and keep an explicit handle on its axes.
fig, ax = plt.subplots(figsize=(12, 8))

# Box plot of the quotient errors: calibrators along the x-axis, one box per
# prediction type (hue), short whiskers and no outlier points.
sns.boxplot(
    x='calibrator',
    y='Volume Quotient Measurement Error',
    hue='Pred_Type',      # Separate by 'Pred_Type'
    data=pw_quot_error_df,
    palette="Set2",       # Use a color palette
    linewidth=2,          # Set the linewidth of the edge
    showfliers=False,     # Do not show outliers
    whis=0.5,             # Whiskers reach at most 0.5 * IQR beyond the box
    showmeans=True,       # Show the mean
    meanprops={"marker":"o", "markerfacecolor":"yellow", "markeredgecolor":"black", "markersize":"8"},
    ax=ax,
)

# Enhance the plot. The x-axis shows the calibrators (prediction type is the
# hue), so it is labelled accordingly.
ax.set_title('Volume Quotient Measurement Error by Prediction Type', fontsize=20)
ax.set_xlabel('Calibrator', fontsize=15)
ax.set_ylabel('Volume Quotient Measurement Error', fontsize=15)
ax.tick_params(labelsize=12)

# Remove the top and right spines for a cleaner look
sns.despine(ax=ax)

# Dashed red reference line at zero error.
ax.axhline(y=0, color='r', linestyle='--', zorder=3)

# Show the plot
plt.show()

In [None]:
# Quotient-error data drawn through catplot: one group per calibrator and one
# box per prediction type (hue).
g = sns.catplot(
    x='calibrator',
    y='Volume Quotient Measurement Error',
    kind='box',
    data=pw_quot_error_df,
    hue='Pred_Type',
    palette="Set2",       # Use a color palette
    linewidth=2,          # Set the linewidth of the edge
    aspect=1.5,           # Adjust the aspect ratio
    height=8,
    showfliers=False,     # Do not show outliers
    showmeans=True,       # Show the mean
    meanprops={"marker":"o", "markerfacecolor":"yellow", "markeredgecolor":"black", "markersize":"8"}
)

# Add a dashed red line at y = 0
plt.axhline(y=0, color='r', linestyle='--', zorder=3)
# Title the figure so it can be read on its own, in the same style as the
# difference-error catplot.
plt.title('Volume Quotient Measurement Error by Subject Pair', fontsize=20)