In [None]:
import sys
sys.path.append('/storage/vbutoi/projects')
sys.path.append('/storage/vbutoi/libraries')
sys.path.append('/storage/vbutoi/projects/ESE')
sys.path.append('/storage/vbutoi/projects/UniverSeg')

import os 
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
# Ionpy imports
from ionpy.analysis import ResultsLoader
# Local imports
from ese.analysis.analyze_inf import load_cal_inference_stats
from ese.analysis.analysis_utils.plot_utils import get_prop_color_palette
from ese.analysis.analysis_utils.parse_sweep import get_global_optimal_parameter, get_per_subject_optimal_values
# Global seaborn look: "darkgrid" axes style with the "talk" plotting context.
sns.set_style("darkgrid")
sns.set_context("talk")
# DATAPATH is assembled as a ':'-separated list of dataset roots (a single entry here).
# NOTE(review): presumably read by the ionpy / ese data-loading code -- confirm.
os.environ['DATAPATH'] = ':'.join((
       '/storage/vbutoi/datasets',
))
# ESE scratch directory and a results-loader handle.
# NOTE(review): neither `root` nor `rs` is referenced by the cells visible in this notebook.
root = Path("/storage/vbutoi/scratch/ESE")
rs = ResultsLoader()

# For using code without restarting.
%load_ext autoreload
%autoreload 2
# For using yaml configs.
%load_ext yamlmagic

In [None]:
%%yaml results_cfg 

# This whole mapping is bound to `results_cfg` and passed to `load_cal_inference_stats`.
log:
    # Exactly one `root` should be active; the commented-out entries are the
    # other benchmark runs (OCTA, ISLES, Roads) kept for quick switching.
    # root: '/storage/vbutoi/scratch/ESE/inference/10_26_24_OCTA_Benchmark'
    # root: '/storage/vbutoi/scratch/ESE/inference/10_26_24_ISLES_Benchmark'
    # root: '/storage/vbutoi/scratch/ESE/inference/10_26_24_Roads_FULLRES_Benchmark'
    root: '/storage/vbutoi/scratch/ESE/inference/10_26_24_WMH_Benchmark'
    # NOTE(review): presumably the group of runs under `root` to load -- confirm.
    inference_group: 'Sweep_Temperature'

# NOTE(review): the semantics of these flags are defined by the loader and are not
# visible in this notebook -- confirm against ese.analysis.analyze_inf.
options:
    verify_graceful_exit: True
    equal_rows_per_cfg_assert: False 

### Useful cell for controlling the plotting functions.

In [3]:
# Column used as the swept parameter: x axis of every relplot and `sweep_key`
# of the optimal-parameter searches.
x_key = 'temperature'
# Column used as the error metric: y axis of every relplot and `y_key` of the
# optimal-parameter searches.
y_key = 'soft_RAVE'
# x tick positions: from 0 in steps of 0.1, stopping before 3.1.
xtick_range = np.arange(0, 3.1, 0.1)
# Colormap name handed to `get_prop_color_palette` for the per-subject plots.
cmap = 'magma_r'
# `aspect` argument of each seaborn facet (width = aspect * height).
aspect = 2
# y-axis limits applied to every facet.
y_lims = (-0.5, 4)

# Plotting Calls

In [None]:
# Load the inference statistics described by the `results_cfg` YAML cell.
# NOTE(review): `load_cached=True` presumably reuses a previously cached table
# rather than re-reading the logs -- confirm, and disable after re-running inference.
inference_df = load_cal_inference_stats(
    results_cfg=results_cfg,
    load_cached=True
)

In [5]:
def dataset(inference_data_class):
    """Return the last dot-separated component of ``inference_data_class``.

    A string that contains no ``.`` is returned unchanged; a string ending in
    ``.`` yields the empty string.
    """
    _, _, last_component = inference_data_class.rpartition('.')
    return last_component

# Hand the `dataset` function defined above to the frame's `augment` method.
# NOTE(review): `augment` is not a stock pandas method -- presumably an ionpy
# extension that adds a 'dataset' column computed from the column named like the
# function's argument ('inference_data_class'); confirm.
inference_df.augment(dataset)

In [6]:
# for ikey in inference_df.keys():
#     print(ikey)

In [7]:
# Columns carried into the analysis frame: error metrics, grouping keys,
# volumes and identifiers.
cols_to_keep = [
    'soft_abs_area_estimation_error', 'soft_log_abs_area_estimation_error',
    'hard_abs_area_estimation_error', 'hard_log_abs_area_estimation_error',
    'soft_RAVE', 'hard_RAVE',
    'dataset', 'loss_func_class', 'temperature',
    'hard_volume', 'soft_volume', 'gt_volume',
    'data_id', 'split'
]
# Restrict to those columns, drop repeated rows, renumber, and finally order the
# rows by split name so that the 'cal' rows come before the other splits.
exp_df = (
    inference_df[cols_to_keep]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values('split', ascending=True)
)

In [8]:
##############################################################################################################
# This cell is quite important, it allows us to see the base soft volume for each data_id and loss_func_class
##############################################################################################################
# We want to add a base soft volume column to let us see what the uncalibrated volume is.
# Temperature whose rows provide the reference ("base") soft volume.
# NOTE(review): 1.01 rather than 1.0 -- presumably the sweep value closest to the
# identity temperature; confirm against the sweep configuration.
base_temperature = 1.01
# Select the reference rows with a tolerance rather than `==`: an exact float
# comparison silently matches nothing if the stored temperatures carry rounding
# error or a narrower dtype, which would leave `base_soft_volume` all-NaN.
is_base_temperature = np.isclose(exp_df['temperature'], base_temperature)
bsv = (
    exp_df.loc[is_base_temperature, ['data_id', 'loss_func_class', 'soft_volume']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Composite key (data_id + '_' + loss_func_class) on both frames, so one lookup
# identifies a (subject, loss function) pair. The key column is kept on exp_df.
bsv['data_id_loss_func'] = bsv['data_id'] + '_' + bsv['loss_func_class']
exp_df['data_id_loss_func'] = exp_df['data_id'] + '_' + exp_df['loss_func_class']
# The individual key columns are no longer needed on the lookup table.
bsv = bsv.drop(columns=['data_id', 'loss_func_class'])
# Mapping: composite key -> soft volume at the base temperature. If a key occurs
# more than once, the last occurrence wins.
bsv_dict = dict(zip(bsv['data_id_loss_func'], bsv['soft_volume']))
# Broadcast the base soft volume onto every row of exp_df; keys without a row at
# the base temperature get NaN.
exp_df['base_soft_volume'] = exp_df['data_id_loss_func'].map(bsv_dict)

In [None]:
# Plot `y_key` against `x_key`, one line per loss function and one facet row per
# split; repeated observations at the same x are aggregated by seaborn's line plot.
# `relplot` is a figure-level function: it creates its own figure, sized through
# `height` / `aspect`. A preceding `plt.figure(...)` call is therefore not needed --
# it would only leave an extra, empty figure in the cell output.
g = sns.relplot(
    data=exp_df,
    x=x_key,
    y=y_key,
    hue='loss_func_class',
    row='split',
    kind='line',
    height=10,
    aspect=aspect,
    # The legend is only drawn when the swept parameter is the temperature.
    legend=(x_key == 'temperature')
)

g.set(xticks=xtick_range, ylim=y_lims)
# Title the grid's own figure explicitly instead of relying on pyplot's
# "current figure" state.
g.figure.suptitle(f'{exp_df["dataset"].unique()[0]}: {y_key} vs {x_key}', fontsize=30)
# Reserve headroom so the title does not overlap the top facet.
g.figure.subplots_adjust(top=0.9)

In [10]:
# For every (split, loss_func_class) group, ask the sweep parser for the globally
# optimal value of `x_key` with respect to `y_key`, then order the resulting rows
# by `y_key` (ascending).
# NOTE(review): the optimisation direction (presumably minimising `y_key`) is
# decided inside `get_global_optimal_parameter` -- confirm.
global_opt_temp_df = get_global_optimal_parameter(
    exp_df, 
    sweep_key=x_key, 
    y_key=y_key,
    group_keys=['split', 'loss_func_class']
).sort_values(y_key)

In [11]:
# global_opt_temp_df

In [None]:
# Per-subject curves of `y_key` against `x_key`: one line per data_id, with the
# line colours produced by `get_prop_color_palette` from the 'gt_volume' column.
gt_volume_palette = get_prop_color_palette(
    exp_df,
    hue_key='data_id',
    magnitude_key='gt_volume',
    cmap=cmap,
)
# Facets: loss function across columns, split down the rows.
g = sns.relplot(
    data=exp_df, x=x_key, y=y_key,
    hue='data_id', palette=gt_volume_palette,
    row='split', col='loss_func_class',
    kind='line', height=8, aspect=aspect,
    legend=False,
)
g.set(xticks=xtick_range, ylim=y_lims)

# Figure-wide title, with headroom reserved above the facets.
dataset_name = exp_df["dataset"].unique()[0]
plt.suptitle(f'{dataset_name}: {y_key} vs {x_key}', fontsize=30)
plt.subplots_adjust(top=0.9)

In [None]:
# Per-subject curves of `y_key` against `x_key`: one line per data_id, with the
# line colours produced by `get_prop_color_palette` from the 'hard_volume' column.
hard_volume_palette = get_prop_color_palette(
    exp_df,
    hue_key='data_id',
    magnitude_key='hard_volume',
    cmap=cmap,
)
# Facets: loss function across columns, split down the rows.
g = sns.relplot(
    data=exp_df, x=x_key, y=y_key,
    hue='data_id', palette=hard_volume_palette,
    row='split', col='loss_func_class',
    kind='line', height=8, aspect=aspect,
    legend=False,
)

# Shared ticks and y-range across all facets.
g.set(xticks=xtick_range, ylim=y_lims)

In [None]:
# Per-subject curves of `y_key` against `x_key`: one line per data_id, with the
# line colours produced by `get_prop_color_palette` from the 'base_soft_volume' column.
base_soft_volume_palette = get_prop_color_palette(
    exp_df,
    hue_key='data_id',
    magnitude_key='base_soft_volume',
    cmap=cmap,
)
# Facets: loss function across columns, split down the rows.
g = sns.relplot(
    data=exp_df, x=x_key, y=y_key,
    hue='data_id', palette=base_soft_volume_palette,
    row='split', col='loss_func_class',
    kind='line', height=8, aspect=aspect,
    legend=False,
)

# Shared ticks and y-range across all facets.
g.set(xticks=xtick_range, ylim=y_lims)

In [15]:
# Per-subject counterpart of the global search: within each (split, loss_func_class)
# group, look up the optimal `x_key` with respect to `y_key`, carrying 'hard_volume'
# along via `keep_keys`. With `return_optimal_values=True` two objects are returned.
# NOTE(review): the exact contents of `opt_temps_per_subj` vs. `opt_df` are defined
# in ese.analysis.analysis_utils.parse_sweep; later cells rely on `opt_df` having
# 'data_id', 'split', 'loss_func_class', 'hard_volume' and 'temperature' columns.
opt_temps_per_subj, opt_df = get_per_subject_optimal_values(
    exp_df, 
    sweep_key=x_key, 
    y_key=y_key,
    group_keys=['split', 'loss_func_class'],
    keep_keys=['hard_volume'],
    return_optimal_values=True
)

In [None]:
opt_df

In [None]:
# Number of distinct data_ids in opt_df (a missing value, if present, is counted too).
opt_df['data_id'].nunique(dropna=False)

In [18]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Models and evaluation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
# Import additional regression models
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
# Loss function whose per-subject optima are modelled below.
# Alternative: loss_fn = 'ese.losses.SoftDiceLoss'
loss_fn = 'ese.losses.PixelCELoss'
# Keep only the rows of opt_df that belong to that loss function.
df = opt_df.loc[opt_df['loss_func_class'].eq(loss_fn)]

In [20]:
# The 'cal' split is the training set and the 'val' split the validation set.
df_train = df.loc[df['split'] == 'cal']
df_val = df.loc[df['split'] == 'val']

# Feature matrix: the single 'hard_volume' column, selected with a list so it
# stays two-dimensional. Target: the 'temperature' column of the same rows.
X_train, y_train = df_train[['hard_volume']], df_train['temperature']
X_val, y_val = df_val[['hard_volume']], df_val['temperature']

In [21]:
# Define the models to test, including additional regressors
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Kernel Ridge Regression': KernelRidge(),
    'Huber Regressor': HuberRegressor(),
    'Polynomial Regression (degree=2)': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Polynomial Regression (degree=3)': make_pipeline(PolynomialFeatures(degree=3), LinearRegression()),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Extra Trees Regressor': ExtraTreesRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=42),
    'Support Vector Regression': SVR(),
    'MLP Regressor': MLPRegressor(random_state=42, max_iter=1000),
}

In [22]:
# Dictionary to store the results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    # Predict on validation set
    y_pred = model.predict(X_val)
    # Evaluate
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    # Store the results
    results[name] = {'MSE': mse, 'R2 Score': r2}

In [23]:
# Baseline: predict one global optimal temperature -- the one selected on the
# 'cal' split for the current loss function -- for every validation sample.
loss_fn_global_temp = global_opt_temp_df[global_opt_temp_df['loss_func_class'] == loss_fn]
cal_temperature = loss_fn_global_temp.loc[loss_fn_global_temp['split'] == 'cal', 'temperature']
# Extract the scalar with `.item()`: calling `float()` directly on a
# single-element Series is deprecated in recent pandas and slated to raise.
# `.item()` also fails loudly (ValueError) unless exactly one row matched.
cal_estimate_global_temp = float(cal_temperature.item())
# Constant prediction vector, one entry per validation sample.
y_pred_global_temp = np.full(len(y_val), cal_estimate_global_temp)
# Evaluate with the same metrics as the fitted models and store the result
# under its own key so it is ranked alongside them.
mse = mean_squared_error(y_val, y_pred_global_temp)
r2 = r2_score(y_val, y_pred_global_temp)
results['Global Optimal Temperature'] = {'MSE': mse, 'R2 Score': r2}

In [None]:
# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results).T  # Transpose for better format
# We want to srt by the best MSE
results_df = results_df.sort_values('MSE')
print("Model Performance:")
display(results_df)

In [None]:
# Plotting the predictions vs actual values for each model in subplots
import math

# Determine the layout of the subplot grid
num_models = len(models)
cols = 2  # Reduce the number of columns for larger subplots
rows = math.ceil(num_models / cols)  # Calculate the number of rows needed

# Increase the figure size for larger subplots
fig, axs = plt.subplots(rows, cols, figsize=(cols * 8, rows * 6))
axs = axs.flatten()  # Flatten the array of axes for easy iteration

# Plot each model's predictions in a subplot
for idx, (name, model) in enumerate(models.items()):
    y_pred = model.predict(X_val)
    axs[idx].scatter(X_val, y_val, color='black', label='Actual Validation Data')
    axs[idx].scatter(X_val, y_pred, color='blue', label='Predicted Data')
    axs[idx].set_title(f"{name}")
    axs[idx].set_xlabel('Hard Volume')
    axs[idx].set_ylabel('Temperature')
    axs[idx].legend()

# Hide any unused subplots if the grid is larger than the number of models
for idx in range(num_models, len(axs)):
    fig.delaxes(axs[idx])

plt.tight_layout()
plt.show()