In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from matplotlib.patches import Patch
import matplotlib.colors as mcolors
import matplotlib.patheffects as path_effects
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import xarray as xr
import seaborn as sns
import sys
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.stats import norm, spearmanr, ks_2samp, gaussian_kde, truncnorm
from scipy.spatial.distance import mahalanobis
import pickle

# Requires output of the second LHS run with spotpy (narrow)

# Pick the machine-specific data root once and derive every location from it.
# NOTE: `'win' in sys.platform` is also True on macOS (platform string 'darwin'),
# so the prefix is tested instead of a substring.
if sys.platform.startswith('win'):
    data_root = "E:/OneDrive/PhD/PhD/Data/"
else:
    data_root = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/"

# Directory with the LHS result tables.
path = data_root + "Hintereisferner/COSIPY/MiscTests/LHS/"
# Parent directory of the LHS folder.
filepath = data_root + "Hintereisferner/COSIPY/MiscTests/"
# Manually filtered snowline table (file name says 1999-2010).
tsla = pd.read_csv(data_root + "Hintereisferner/Climate/snowlines/HEF-snowlines-1999-2010_manual_filtered.csv")
# Output directory for figures.
figpath = data_root + "Hintereisferner/Figures/"


In [None]:
# Load the narrow LHS table; the first CSV column is used as the index.
df = pd.read_csv(path + "LHS-narrow_1D20m_1999_2010_fullprior.csv", index_col=0)

# An index whose smallest label is 3000 is treated as faulty and replaced by 0..n-1.
index_is_faulty = df.index.min() == 3000
if index_is_faulty:
    print("Faulty index, reset")
    df = df.reset_index(drop=True)

df

In [None]:
## Create summary of min/max bounds for individual parameters (wide LHS run)
# `startswith('win')` instead of `'win' in sys.platform`: the substring test is
# also True on macOS, whose platform string is 'darwin'.
if sys.platform.startswith('win'):
    df_wide = pd.read_csv("E:/OneDrive/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/LHS-wide_1D20m_1999_2010_fullprior.csv", index_col=0)
else:
    df_wide = pd.read_csv("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/LHS-wide_1D20m_1999_2010_fullprior.csv", index_col=0)

# The seven sampled parameters; this list is reused by later cells.
param_names = ["rrr_factor", "alb_ice", "alb_snow", "alb_firn", "albedo_aging", "albedo_depth", "roughness_ice"]
df_wide_sub = df_wide[param_names]

# One row per parameter, one column per statistic.
summary_stats = pd.DataFrame({
    'min': df_wide_sub.min(),
    'max': df_wide_sub.max(),
    'mean': df_wide_sub.mean(),
    'std': df_wide_sub.std()
})

print(summary_stats)


In [None]:
# Per-parameter min/max/mean/std of the table held in `df`.
df_narrow_sub = df[param_names]

# Each statistic is a DataFrame method of the same name; call them in turn so the
# result has one column per statistic and one row per parameter.
summary_stats = pd.DataFrame(
    {stat: getattr(df_narrow_sub, stat)() for stat in ("min", "max", "mean", "std")}
)

print(summary_stats)

In [None]:
# Evaluation window: observations with start < date <= end are kept.
time_start_dt = pd.to_datetime("2000-01-01")  # config starts with spinup - need to add 1 year
time_end_dt = pd.to_datetime("2009-12-31")

tsla_true_obs = tsla.copy()
tsla_true_obs['LS_DATE'] = pd.to_datetime(tsla_true_obs['LS_DATE'])
print("Start date:", time_start_dt)
print("End date:", time_end_dt)

in_window = (tsla_true_obs['LS_DATE'] > time_start_dt) & (tsla_true_obs['LS_DATE'] <= time_end_dt)
# .copy() detaches the filtered rows from the parent frame; without it the column
# assignments below act on a slice and raise SettingWithCopyWarning.
tsla_true_obs = tsla_true_obs.loc[in_window].copy().set_index('LS_DATE')

# Express the standard deviation as a fraction of the glacier elevation range,
# row by row.
elevation_range = tsla_true_obs['glacier_DEM_max'] - tsla_true_obs['glacier_DEM_min']
tsla_true_obs['SC_stdev'] = tsla_true_obs['SC_stdev'] / elevation_range

# 20 m (the model resolution) as a fraction of the elevation range of the first row.
thres_unc = (20) / (tsla_true_obs['glacier_DEM_max'].iloc[0] - tsla_true_obs['glacier_DEM_min'].iloc[0])
print(thres_unc)

## Floor the observational uncertainty at the model resolution; larger values are kept.
sc_norm = np.where(tsla_true_obs['SC_stdev'] < thres_unc, thres_unc, tsla_true_obs['SC_stdev'])
tsla_true_obs['SC_stdev'] = sc_norm
tsla_true_obs


In [None]:
## Load MB data
rgi_id = "RGI60-11.00897"
# `startswith('win')` instead of `'win' in sys.platform`: the substring test is
# also True on macOS ('darwin').
if sys.platform.startswith('win'):
    geod_ref = pd.read_csv("E:/OneDrive/PhD/PhD/Data/Hugonnet_21_MB/dh_11_rgi60_pergla_rates.csv")
else:
    geod_ref = pd.read_csv("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hugonnet_21_MB/dh_11_rgi60_pergla_rates.csv")

# Keep the rows of this glacier for the 2000-2010 period, then only the rate and
# its error column.
geod_ref = geod_ref.loc[geod_ref['rgiid'] == rgi_id]
geod_ref = geod_ref.loc[geod_ref['period'] == "2000-01-01_2010-01-01"]
geod_ref = geod_ref[['dmdtda', 'err_dmdtda']]
print(geod_ref)

In [None]:
## Prepare loglike calculations - snowline metrics: Maximum TSLA + Std and normal timeseries? Rate of change?
tsla_true_obs['year'] = tsla_true_obs.index.year

# For every year, the row whose normalized snowline is highest.
max_tsl_data = tsla_true_obs.loc[tsla_true_obs.groupby('year')["TSL_normalized"].idxmax()]
#max_tsl_data['SC_stdev'] = tsla_true_obs['SC_stdev']

# .copy(): the column selection is a slice of max_tsl_data; adding 'rownumber' to
# it without copying raises SettingWithCopyWarning.
max_tsl_with_uncertainty = max_tsl_data[["TSL_normalized", 'SC_stdev']].copy()

# Position of each yearly-maximum date inside tsla_true_obs.
# NOTE(review): get_loc returns a plain integer only if the date index is unique - confirm.
matching_row_numbers = [tsla_true_obs.index.get_loc(date) for date in max_tsl_with_uncertainty.index if date in tsla_true_obs.index]
print("Matching row numbers:", matching_row_numbers)

max_tsl_with_uncertainty['rownumber'] = matching_row_numbers
print(max_tsl_with_uncertainty)


In [None]:
# Columns 13..70 of df (58 columns), transposed so that every column is one row of df.
# NOTE(review): presumably these are the 58 simulated snowline values per run - confirm
# against the CSV layout.
tsl_first_col, n_tsl_cols = 13, 58
modtsls = df.iloc[:, tsl_first_col:tsl_first_col + n_tsl_cols].T
print(modtsls)

# Rows of modtsls at the positions stored in 'rownumber'.
max_modtsls = modtsls.iloc[max_tsl_with_uncertainty['rownumber']]
print(max_modtsls)

# Column 12 of df as a single-row frame.
# NOTE(review): presumably the simulated mass balance - confirm.
mb_mod = df.iloc[:, [12]].T
print(mb_mod)

def loglike_tsla_func(sim_tsla, eval_tsla, sigma_tsla):
    """Return the Gaussian log-likelihood of the simulation, averaged per observation.

    Parameters
    ----------
    sim_tsla : simulated values, element-wise compatible with `eval_tsla`.
    eval_tsla : observed values; its length is the divisor of the average.
    sigma_tsla : standard deviation of each observation.

    Returns
    -------
    Sum of the point-wise normal log-densities divided by ``len(eval_tsla)``.
    """
    variance = sigma_tsla**2
    squared_misfit = (eval_tsla-sim_tsla)**2
    # log-normalisation term plus variance-scaled squared misfit, per point
    per_point = np.log(2 * np.pi * variance) + (squared_misfit / variance)
    total_loglike = -0.5 * np.sum(per_point)
    return total_loglike / len(eval_tsla)


#def loglike_tsla_func(sim_tsla, eval_tsla, sigma_tsla, n_tsla):
#    loglike_tsla = -0.5 * np.sum(np.log(2 * np.pi * sigma_tsla**2) + ((eval_tsla - sim_tsla)**2 / sigma_tsla**2)) / n_tsla
#    return loglike_tsla

def loglike_mb_func(sim_mb, eval_mb, sigma_mb):
    """Return the Gaussian log-likelihood of `sim_mb` given observation and sigma.

    Works element-wise; nothing is summed or averaged.
    """
    variance = sigma_mb**2
    squared_misfit = (eval_mb-sim_mb)**2
    return -0.5 * (np.log(2 * np.pi * variance) + (squared_misfit / variance))

def bias_mb_func(sim_mb, eval_mb):
    """Return the signed bias ``sim_mb - eval_mb`` (positive when the simulation is higher)."""
    return sim_mb - eval_mb

def bias_tsla_func(sim_tsla, eval_tsla):
    """Return the mean signed difference ``sim_tsla - eval_tsla`` over all points."""
    differences = sim_tsla - eval_tsla
    return np.mean(differences)

In [None]:
# Observation vectors as plain arrays, shared by the likelihood and the bias calls.
geod_mb_obs = geod_ref['dmdtda'].values
geod_mb_sigma = geod_ref['err_dmdtda'].values
tsla_obs = tsla_true_obs['TSL_normalized'].values
tsla_sigma = tsla_true_obs['SC_stdev'].values

# mb_mod is a single-row frame; transposed, it is one column that apply() hands to
# the scoring function in one piece.
mb_logp = mb_mod.T.apply(loglike_mb_func, axis=0, eval_mb=geod_mb_obs, sigma_mb=geod_mb_sigma)
print(mb_logp)

# Every column of modtsls is scored against the observed series.
tsl_logp = modtsls.apply(loglike_tsla_func, axis=0, eval_tsla=tsla_obs, sigma_tsla=tsla_sigma)
print(tsl_logp)

# Signed biases computed on the same inputs.
mb_bias = mb_mod.T.apply(bias_mb_func, axis=0, eval_mb=geod_mb_obs)
tsla_bias = modtsls.apply(bias_tsla_func, axis=0, eval_tsla=tsla_obs)

In [None]:
## Load ALBEDO observations
# `startswith('win')` instead of `'win' in sys.platform`: the substring test is
# also True on macOS ('darwin').
if sys.platform.startswith('win'):
    alb_obs_data = xr.open_dataset(r"E:\OneDrive\PhD\PhD\Data\Hintereisferner\Climate\HEF_processed_HRZ-30CC-filter_albedos.nc")
else:
    alb_obs_data = xr.open_dataset("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Climate/HEF_processed_HRZ-30CC-filter_albedos.nc")
#has nans where no glacier -> can build glacier-wide mean albedo for additional logp
# Put the observations in chronological order.
alb_obs_data = alb_obs_data.sortby("time")
alb_obs_data

In [None]:
# Observed albedo series and its uncertainty, both taken from the observation dataset.
sigma_alb = alb_obs_data['sigma_albedo']
eval_alb = alb_obs_data['median_albedo']

# Directory with one albedo file per LHS run.
# `startswith('win')` instead of `'win' in sys.platform`: the substring test is
# also True on macOS ('darwin').
if sys.platform.startswith('win'):
    alb_dir = "E:/OneDrive/PhD/PhD/Data/Hintereisferner/Output/albedo_files/LHS/alb_only/"
else:
    alb_dir = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Output/albedo_files/LHS/alb_only/"

list_logp_alb = []
list_sim_alb = []
list_bias_alb = []

n_runs = len(df)  # progress total taken from the table instead of a hard-coded number
for i,r in df.iterrows():
    if i % 300 == 0:
        print(f"Processing file {i}/{n_runs}")
    # File names embed the parameter values rounded to 4 decimals.
    rrr_factor = round(r['rrr_factor'],4)
    alb_ice = round(r['alb_ice'],4)
    alb_snow = round(r['alb_snow'],4)
    alb_firn = round(r['alb_firn'],4)
    alb_aging = round(r['albedo_aging'],4)
    alb_depth = round(r['albedo_depth'],4)
    roughness_ice = round(r['roughness_ice'], 4)

    filename = f"HEF_COSMO_1D20m_1999_2010_HORAYZON_IntpPRES_LHS-narrow_19990101-20091231_RRR-{rrr_factor}_{alb_snow}_{alb_ice}_{alb_firn}_{alb_aging}_{alb_depth}_0.24_{roughness_ice}_4.0_0.0026_num2.nc"

    # Open inside a context manager so each file is closed before the next one is
    # opened; .load() pulls the selected values into memory while it is still open.
    with xr.open_dataarray(alb_dir + filename) as sim_file:
        # sort by time, then keep exactly the observation time steps
        sim_alb = sim_file.sortby("time").sel(time=alb_obs_data.time).load()

    sim_values = sim_alb.data
    list_sim_alb.append(sim_values)

    ## compute logp albedo for file: Gaussian log-likelihood averaged over len(sim_alb)
    logp_alb_all = -0.5 * np.sum(np.log(2 * np.pi * sigma_alb**2) + ((sim_values-eval_alb)**2 / sigma_alb**2))
    avg_logp_alb = logp_alb_all / len(sim_alb)
    list_logp_alb.append(avg_logp_alb.item())

    # mean signed bias (simulation minus observation), reusing the snowline helper
    bias_alb_all = bias_tsla_func(sim_values, eval_alb)
    list_bias_alb.append(bias_alb_all.item())
    


"""
list_logp_albrel = []
list_logp_albhigh = []
list_logp_alblow = []

indices = []
for i,r in df.iterrows():
    if i % 300 == 0:
        print(f"Processing file {i}/3000")
    rrr_factor = round(r['rrr_factor'],4)
    alb_ice = round(r['alb_ice'],4)
    alb_snow = round(r['alb_snow'],4)
    alb_firn = round(r['alb_firn'],4)
    alb_aging = round(r['albedo_aging'],4)
    alb_depth = round(r['albedo_depth'],4)
    if condition == "fullprior":
        filename = f"HEF_COSMO_1D20m_1999_2010_HORAYZON_LHS_19990101-20091231_RRR-{rrr_factor}_{alb_snow}_{alb_ice}_{alb_firn}_{alb_aging}_{alb_depth}_0.24_1.7_4.0_0.0026_num2.nc"
        if 'win' in sys.platform:
            sim_alb = xr.open_dataset("E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/AWScomp/LHS/"+\
                filename)
        else:
            sim_alb = xr.open_dataset("/mnt/C4AEBBABAEBB9500/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/Output/albedo_files/"+\
                filename)
    else:
        filename = f"HEF_COSMO_1D20m_1999_2010_HORAYZON_LHSnoRRR_19990101-20091231_RRR-{rrr_factor}_{alb_snow}_{alb_ice}_{alb_firn}_{alb_aging}_{alb_depth}_0.24_1.7_4.0_0.0026_num2AWSmetrics.csv"
        if 'win' in sys.platform:
            sim_alb = xr.open_dataset("E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/AWScomp/LHS_lowsnow/"+\
                filename)
        else:
            sim_alb = xr.open_dataset("/mnt/C4AEBBABAEBB9500/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/Output/albedo_files/"+\
                filename)
            
    ## Process albedo 
    #ensure same order
    sim_alb = sim_alb.isel(lon=0).sortby("HGT")
    
    # prepare data
    rel, high, low = prepare_logp_alb(sim_alb, alb_obs_data)
    list_logp_albrel.append(rel)
    list_logp_albhigh.append(high)
    list_logp_alblow.append(low)
"""

## keep albedo out for now, some studies say glacier-average albedo is a useful predictor - could implement this as a metric as well    

In [None]:
# Wrap the per-run albedo results in Series; both get a default 0..n-1 index.
alb_logp, alb_bias = (pd.Series(values) for values in (list_logp_alb, list_bias_alb))
alb_logp

In [None]:
sim_list = [f"sim{i+1}" for i in range(58)]
print(sim_list)

# Log-likelihood of each target, plus their unweighted sum.
df['mb_logp'] = mb_logp
df['tsla_logp'] = tsl_logp
# alb_logp and alb_bias are built from plain lists and carry a fresh 0..n-1 index,
# so both are assigned by position (.values). Assigning the Series directly would
# align on index labels and produce NaN wherever df's index differs from 0..n-1.
df['alb_logp'] = alb_logp.values
df['joint_like'] = df['mb_logp'] + df['tsla_logp'] + df['alb_logp']

# Signed biases (simulation minus observation) of each target.
df['bias_mb'] = mb_bias
df['bias_tsla'] = tsla_bias
df['bias_alb'] = alb_bias.values


df

In [None]:
## Quick check: parameter medians among the 250 runs with the highest mb_logp
top_mb_runs = df.sort_values(by="mb_logp", ascending=False).head(250)
for param in ('rrr_factor', 'albedo_aging', 'alb_snow'):
    print(top_mb_runs[param].median())

In [None]:
## Print min and max of every log-likelihood column to compare their ranges
for title, column in (("MB", "mb_logp"), ("TSLA", "tsla_logp"), ("Alb", "alb_logp")):
    scores = df[column]
    print(f"{title} Logp Range:")
    print(scores.min(), scores.max())

In [None]:
plt.rcParams.update({'font.size': 22})
subset = df.copy()

## Spearman correlation between parameters and biases (after Rounce et al.)
## The sample comes from a random uniform design, so very little correlation
## between parameters is expected at this stage.
corr_columns = ['rrr_factor','alb_ice','alb_snow','alb_firn','albedo_aging','albedo_depth','roughness_ice','bias_mb','bias_tsla','bias_alb']
correlation_matrix = subset[corr_columns].corr(method="spearman")

# Column name -> math-text label used on the axes.
y_label_dict = {
    'rrr_factor': r'$p_{f}$',
    'alb_ice': r'$\alpha_{ice}$',
    'alb_snow': r'$\alpha_{fs}$',
    'alb_firn': r'$\alpha_{firn}$',
    'albedo_aging': r'$\alpha_{aging}$',
    'albedo_depth': r'$\alpha_{depth}$',
    'roughness_ice': r'$z0_{ice}$',
    'mb_logp': r'$\mathcal{L}(MB|\theta)$',
    'tsla_logp': r'$\mathcal{L}(TSLA|\theta)$',
    'alb_logp': r'$\mathcal{L}(ALB|\theta)$',
    'joint_like': r'$\mathcal{L}(total|\theta)$',
    'bias_mb': r'$\Delta$MB',
    'bias_tsla': r'$\Delta$TSLA',
    'bias_alb': r'$\Delta$ALB',
}
# Relabel rows and columns with the math-text names.
new_index = [y_label_dict[name] for name in correlation_matrix.index]
correlation_matrix.index = new_index
correlation_matrix.columns = new_index

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(16, 9), dpi=150)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)


In [17]:
### First step, filter data - work on an independent (deep) copy of the scored table
df_copy = df.copy(deep=True)

def compute_nroy_threshold(model_outputs, observations, sigmas, scale=0, delta_sigma=3, return_mask=False, clip=False):
    """
    Measure how far the average log-likelihood of the best run drops when that
    run is displaced by ``delta_sigma`` standard deviations, and optionally
    turn the drop into a Not-Ruled-Out-Yet (NROY) acceptance mask.

    Parameters:
    - model_outputs: array-like, shape (n_runs, n_points)
    - observations: array-like, shape (n_points,)
    - sigmas: array-like, shape (n_points,) or (n_runs, n_points); a 1-D input
      is repeated for every run
    - scale: float, constant added to every average log-likelihood
    - delta_sigma: float, size of the displacement in units of sigma
    - return_mask: bool, selects which tuple is returned (see below)
    - clip: bool, if True the displaced values are clipped to [0, 1]

    Returns:
    - if return_mask is False:
      (drop_high, drop_low, drop_avg, best_idx, best_ll), where the drops are
      the loss in average log-likelihood for the +/- displacement and their mean
    - if return_mask is True:
      (threshold, best_ll, nroy_mask) with threshold = best_ll - drop_avg and
      nroy_mask a boolean array marking runs whose average log-likelihood is
      at least the threshold
    """
    runs = np.asarray(model_outputs)
    obs = np.asarray(observations)
    sig = np.asarray(sigmas)

    # Unpacking also enforces a 2-D input.
    n_runs, _ = runs.shape

    # One sigma vector for all runs -> repeat it to (n_runs, n_points).
    if sig.ndim == 1:
        sig = np.tile(sig, (n_runs, 1))

    # Average Gaussian log-likelihood of every run.
    avg_ll_per_run = np.mean(norm.logpdf(runs, loc=obs, scale=sig), axis=1) + scale

    # Best-scoring run.
    best_idx = np.argmax(avg_ll_per_run)
    best_ll = avg_ll_per_run[best_idx]
    best_run = runs[best_idx]
    best_sigma = sig[best_idx]

    # Displace the best run (not the observations) up and down by delta_sigma * sigma.
    offset = delta_sigma * best_sigma
    shifted_high = best_run + offset
    shifted_low = best_run - offset

    # Optional restriction to the physical range (0, 1), e.g. for normalized snowlines.
    if clip:
        shifted_high, shifted_low = (np.clip(values, 0, 1) for values in (shifted_high, shifted_low))

    def _drop(values):
        # Loss in average log-likelihood relative to the best run.
        return best_ll - (np.mean(norm.logpdf(values, loc=obs, scale=best_sigma)) + scale)

    drop_high = _drop(shifted_high)
    drop_low = _drop(shifted_low)
    drop_avg = (drop_high + drop_low) / 2.0

    if not return_mask:
        return drop_high, drop_low, drop_avg, best_idx, best_ll

    # Acceptance threshold and the runs that pass it.
    threshold = best_ll - drop_avg
    nroy_mask = avg_ll_per_run >= threshold
    return threshold, best_ll, nroy_mask

### Beware of this drop_avg ...



In [None]:
### Likelihood drop of the best snowline run under a 3-sigma displacement
# The drop gives a scoring threshold that is less arbitrary than "best ten percent".
# Open question from the original note: does it work if several solutions perform similarly?
drop_high, drop_low, drop_avg, best_idx, best_ll = compute_nroy_threshold(
    model_outputs=modtsls.T,
    observations=tsla_true_obs['TSL_normalized'],
    sigmas=tsla_true_obs['SC_stdev'],
    scale=0,
    delta_sigma=3,
    return_mask=False,
    clip=False,
)

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Avg. Log-likelihood drop at +3σ: {drop_avg:.4f}",
    f"High Log-likelihood drop at +3σ: {drop_high:.4f}",
    f"Low Log-likelihood drop at +3σ: {drop_low:.4f}",
):
    print(report_line)

In [None]:
from scipy.special import logit, expit

# Keep values strictly inside (0, 1) so that the logit stays finite.
epsilon = 1e-6
lower_bound, upper_bound = epsilon, 1 - epsilon
clipped_modtsls = np.clip(modtsls, lower_bound, upper_bound)
clipped_obs = np.clip(tsla_true_obs['TSL_normalized'], lower_bound, upper_bound)

# 1. Move simulations and observations to logit space.
logit_modtsls = logit(clipped_modtsls)
logit_obs = logit(clipped_obs)

# 2. Propagate the uncertainty: sigma is divided by x * (1 - x) evaluated at the
#    simulated value, so every point of every run gets its own sigma.
stdev_array = tsla_true_obs['SC_stdev']
# Column vector (n_points, 1) so it broadcasts across the (n_points, n_runs) frame.
stdev_reshaped = stdev_array.values[:, np.newaxis]
logit_sigmas = stdev_reshaped / (clipped_modtsls * (1 - clipped_modtsls))

# 3. Same analysis in logit space; clipping of the displaced values stays off.
drop_high, drop_low, drop_avg, best_idx, best_ll = compute_nroy_threshold(
    model_outputs=logit_modtsls.T,
    observations=logit_obs,
    sigmas=logit_sigmas.T,  # (n_runs, n_points), matching model_outputs
    delta_sigma=3,
    clip=False,
)

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Avg. Log-likelihood drop at +3σ: {drop_avg:.4f}",
    f"High Log-likelihood drop at +3σ: {drop_high:.4f}",
    f"Low Log-likelihood drop at +3σ: {drop_low:.4f}",
):
    print(report_line)

In [None]:
# Same analysis for the mass balance: one observation, one sigma.
drop_high, drop_low, drop_avg, best_idx, best_ll = compute_nroy_threshold(
    model_outputs=mb_mod.T,
    observations=geod_ref['dmdtda'],
    sigmas=geod_ref['err_dmdtda'],
    scale=0,
    delta_sigma=3,
    return_mask=False,
    clip=False,
)

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Avg. Log-likelihood drop at +3σ: {drop_avg:.4f}",
    f"High Log-likelihood drop at +3σ: {drop_high:.4f}",
    f"Low Log-likelihood drop at +3σ: {drop_low:.4f}",
):
    print(report_line)


In [None]:
# Same analysis for albedo: the per-run arrays collected in list_sim_alb are stacked
# into one (n_runs, n_points) matrix.
drop_high, drop_low, drop_avg, best_idx, best_ll = compute_nroy_threshold(
    model_outputs=np.asarray(list_sim_alb),
    observations=eval_alb.data,
    sigmas=sigma_alb.data,
    scale=0,
    delta_sigma=3,
    return_mask=False,
    clip=False,
)

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Avg. Log-likelihood drop at +3σ: {drop_avg:.4f}",
    f"High Log-likelihood drop at +3σ: {drop_high:.4f}",
    f"Low Log-likelihood drop at +3σ: {drop_low:.4f}",
):
    print(report_line)

In [None]:
## Conservative (3-sigma) cut-off: per target, a reference score minus an allowed drop of 4.5
# NOTE(review): the reference scores are hard-coded - presumably copied from the
# printed "Best avg log-likelihood" values of the threshold cells; update them when
# the inputs change.
mb_cutoff = 0.4243 - 4.5
tsla_cutoff = -26.5530 + -4.5
alb_cutoff = 1.3205 - 4.5 #4.5

# 2 sigma cutoff
#mb_cutoff = 0.4243 - 2.0025
#tsla_cutoff = -23.9576 + -3.1491
#alb_cutoff = 0.7417 - 2.8157

# The joint likelihood is the sum of the three terms, so the joint cut-off is the
# sum of the three individual cut-offs.
cutoff_thres = mb_cutoff + tsla_cutoff + alb_cutoff

# .copy(): later cells write columns into filtered_data. Without the copy it is a
# slice of df_copy and those writes raise SettingWithCopyWarning.
filtered_data = df_copy.loc[df_copy['joint_like'] >= cutoff_thres].copy()
filtered_data

In [None]:
scale_logps = True

if scale_logps:
    # NOTE: this cell overwrites the three *_logp columns of filtered_data with their
    # standardized values. Re-running it without rebuilding filtered_data computes the
    # statistics on already-standardized scores.

    # =============================================================================
    # Raw log-likelihood scores of the retained runs
    # =============================================================================
    L_snow = filtered_data['tsla_logp'].copy()
    L_mass = filtered_data['mb_logp'].copy()
    L_albedo = filtered_data['alb_logp'].copy()

    # =============================================================================
    # Descriptive statistics
    # =============================================================================
    def describe_scores(scores, label):
        """Print min/max/median/mean/std/range of `scores` under `label` and return them as a dict."""
        stats = {
            'min': np.min(scores),
            'max': np.max(scores),
            'median': np.median(scores),
            'mean': np.mean(scores),
            'std': np.std(scores),
            'range': np.max(scores) - np.min(scores)
        }
        print(f"Statistics for {label}:")
        for key, value in stats.items():
            print(f" {key:>6}: {value:8.2f}")
        print()
        return stats

    print("Raw Statistics:")
    stats_snow   = describe_scores(L_snow,  "Snowline")
    stats_mass   = describe_scores(L_mass,  "Mass Balance")
    stats_albedo = describe_scores(L_albedo,"Albedo")

    # =============================================================================
    # Mean shift (the mean, not the median, is subtracted)
    # =============================================================================
    L_snow_shifted   = L_snow - stats_snow['mean']
    L_mass_shifted   = L_mass - stats_mass['mean']
    L_albedo_shifted = L_albedo - stats_albedo['mean']

    print("After Mean Shift:")
    describe_scores(L_snow_shifted, "Snowline (Shifted)")
    describe_scores(L_mass_shifted, "Mass Balance (Shifted)")
    describe_scores(L_albedo_shifted, "Albedo (Shifted)")

    # =============================================================================
    # Standardization: shifted scores divided by the raw standard deviation
    # =============================================================================
    L_snow_std   = L_snow_shifted / stats_snow['std']
    L_mass_std   = L_mass_shifted / stats_mass['std']
    L_albedo_std = L_albedo_shifted / stats_albedo['std']

    # =============================================================================
    # Plotting: one column per target, rows = raw / shifted / standardized
    # =============================================================================
    stage_names = ("Raw", "Shifted", "Standardized+Shifted")
    targets = (
        ("TSLA", 'skyblue',    (L_snow,   L_snow_shifted,   L_snow_std)),
        ("MB",   'lightgreen', (L_mass,   L_mass_shifted,   L_mass_std)),
        ("Alb",  'salmon',     (L_albedo, L_albedo_shifted, L_albedo_std)),
    )

    fig, axes = plt.subplots(3, 3, figsize=(18, 12))
    for col_idx, (target_name, color, stages) in enumerate(targets):
        for row_idx, (stage_name, values) in enumerate(zip(stage_names, stages)):
            panel = axes[row_idx, col_idx]
            sns.histplot(values, bins=40, kde=True, color=color, ax=panel)
            panel.set_title(f"{target_name} ({stage_name})")

    fig.tight_layout()

    # Replace the raw scores by the standardized ones for the following cells.
    filtered_data.loc[:,'tsla_logp'] = L_snow_std
    filtered_data.loc[:,'mb_logp'] = L_mass_std
    filtered_data.loc[:,'alb_logp'] = L_albedo_std


In [24]:
## Store stats to load in pymc
# Bundle the three statistics dictionaries under one key each.
all_stats = {
    "snow": stats_snow,
    "mass": stats_mass,
    "albedo": stats_albedo,
}

# Save to pickle
# `startswith('win')` instead of `'win' in sys.platform`: the substring test is
# also True on macOS ('darwin').
# NOTE(review): the target folder is ".../LHS/LHS/" - confirm the doubled directory is intended.
if sys.platform.startswith('win'):
    stats_file = "E:/OneDrive/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/LHS/loglike_stats.pkl"
else:
    stats_file = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/LHS/loglike_stats.pkl"

with open(stats_file, "wb") as f:
    pickle.dump(all_stats, f)

In [None]:
## Print min and max of every log-likelihood column of the filtered runs
for title, column in (("MB", "mb_logp"), ("TSLA", "tsla_logp"), ("Alb", "alb_logp")):
    scores = filtered_data[column]
    print(f"{title} Logp Range:")
    print(scores.min(), scores.max())

In [None]:
# Parameters and biases that enter the correlation analysis
cols_to_correlate = ['rrr_factor','alb_ice','alb_snow','alb_firn','albedo_aging','albedo_depth','roughness_ice','bias_mb','bias_tsla','bias_alb']
data_for_corr = filtered_data[cols_to_correlate]

# Spearman rank correlation and p-value matrices, wrapped in labelled frames
corr_matrix, p_matrix = spearmanr(data_for_corr)
frame_labels = dict(index=data_for_corr.columns, columns=data_for_corr.columns)
correlation_df = pd.DataFrame(corr_matrix, **frame_labels)
pval_df = pd.DataFrame(p_matrix, **frame_labels)

# --- Significance mask (1.0 where p < alpha, else 0.0) ---
alpha_threshold = 0.01
significance_mask = (pval_df < alpha_threshold).astype(float)

# --- Axis labels: column name -> math-text ---
y_label_dict = {
    'rrr_factor': r'$p_{f}$',
    'alb_ice': r'$\alpha_{ice}$',
    'alb_snow': r'$\alpha_{fs}$',
    'alb_firn': r'$\alpha_{firn}$',
    'albedo_aging': r'$\alpha_{aging}$',
    'albedo_depth': r'$\alpha_{depth}$',
    'roughness_ice': r'$z0_{ice}$',
    'mb_logp': r'$\mathcal{L}(MB|\theta)$',
    'tsla_logp': r'$\mathcal{L}(TSLA|\theta)$',
    'alb_logp': r'$\mathcal{L}(ALB|\theta)$',
    'joint_like': r'$\mathcal{L}(total|\theta)$',
    'bias_mb': r'$\Delta{B_{geod}}$',
    'bias_tsla': r'$\Delta$SLA',
    'bias_alb': r'$\Delta\bar{\alpha}$',
}

new_labels = [y_label_dict[name] for name in correlation_df.index]
correlation_df.index = new_labels
correlation_df.columns = new_labels

# --- Plot ---
fig, ax = plt.subplots(1, 1, figsize=(16, 9), dpi=300)

# Base heatmap with annotated coefficients on a fixed [-1, 1] colour scale
sns.heatmap(
    correlation_df,
    cmap="coolwarm",
    vmin=-1, vmax=1,
    annot=True, fmt=".2f",
    mask=None,
    ax=ax,
    linewidths=0.5,
    linecolor='gray',
    alpha=1,
)

# Per cell: fade it with a translucent white patch when not significant, outline it
# in black when significant. A NaN p-value satisfies neither test and gets no patch.
n_vars = len(correlation_df)
for i in range(n_vars):
    for j in range(n_vars):
        p_value = pval_df.iloc[i, j]
        if p_value >= alpha_threshold:
            ax.add_patch(plt.Rectangle((j, i), 1, 1, color='white', alpha=0.6, zorder=3))
        elif p_value < alpha_threshold:
            ax.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', lw=1.5))


fig.tight_layout()
# Saving is currently disabled on every platform:
#plt.savefig("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/Fig03_correlation_3sigma_filter.png", bbox_inches="tight")

In [None]:
### Correlation on the filtered subset: correlated parameters motivate a PCA of the 7-D space
# Joint score recomputed from the current per-target columns of filtered_data.
filtered_data.loc[:, 'joint_like'] = filtered_data['tsla_logp'] + filtered_data['mb_logp'] + filtered_data['alb_logp']

joint_corr_columns = ['rrr_factor','alb_ice','alb_snow','alb_firn','albedo_aging','albedo_depth','roughness_ice','joint_like','bias_mb','bias_tsla','bias_alb']
correlation_matrix = filtered_data[joint_corr_columns].corr(method="spearman")

# Column name -> math-text label.
y_label_dict = {
    'rrr_factor': r'$p_{f}$',
    'alb_ice': r'$\alpha_{ice}$',
    'alb_snow': r'$\alpha_{fs}$',
    'alb_firn': r'$\alpha_{firn}$',
    'albedo_aging': r'$\alpha_{aging}$',
    'albedo_depth': r'$\alpha_{depth}$',
    'roughness_ice': r'$z0_{ice}$',
    'mb_logp': r'$\mathcal{L}(MB|\theta)$',
    'tsla_logp': r'$\mathcal{L}(TSLA|\theta)$',
    'alb_logp': r'$\mathcal{L}(ALB|\theta)$',
    'joint_like': r'$\mathcal{L}(total|\theta)$',
    'bias_mb': r'$\Delta$MB',
    'bias_tsla': r'$\Delta$SLA',
    'bias_alb': r'$\Delta\bar{\alpha}$',
}
# Relabel rows and columns with the math-text names.
new_index = [y_label_dict[name] for name in correlation_matrix.index]
correlation_matrix.index = new_index
correlation_matrix.columns = new_index
"""
# Plot the correlation heatmap6
fig, ax = plt.subplots(1,1, figsize=(16,9), dpi=150)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
fig.tight_layout()
"""


In [None]:
## PCA on the parameters only (no scores, no biases)
param_features = ['rrr_factor','alb_ice','alb_snow','alb_firn','albedo_aging','albedo_depth','roughness_ice']
n_components = len(param_features)

# Standardize every parameter before the decomposition.
scaler2 = StandardScaler()
param_scaled = scaler2.fit_transform(filtered_data[param_features])

# Keep all components: as many as there are parameters.
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(param_scaled)

# Component scores with columns PC1..PCn.
pca_df = pd.DataFrame(pca_data, columns=[f'PC{k}' for k in range(1, n_components + 1)])

explained_variances = pca.explained_variance_ratio_
loadings = pca.components_

# Bars: variance ratio of each component; step line: running total.
component_index = range(1, len(explained_variances) + 1)
fig, ax = plt.subplots(figsize=(16, 9), dpi=150)
ax.bar(component_index, explained_variances, alpha=0.7, align='center', label='Individual Explained Variance')
ax.step(component_index, explained_variances.cumsum(), where='mid', label='Cumulative Explained Variance')
ax.set_xlabel('Principal Component Index')
ax.set_ylabel('Explained Variance Ratio')
ax.legend(loc='best')
plt.show()

In [None]:
# Loadings table: rows = parameters, columns = principal components.
# Requires the fitted `pca` object and the `param_features` list.
param_frame = filtered_data[param_features]
loadings = pd.DataFrame(
    pca.components_.T,
    index=param_frame.columns,
    columns=[f"PC{k}" for k in range(1, param_frame.shape[1] + 1)],
)
print(loadings)


In [30]:
## Continued after clustering!

In [31]:
## Clustering input: the three per-target log-likelihood columns
# Column order: snowline, albedo, mass balance.
likelihood_columns = ['tsla_logp','alb_logp','mb_logp']
loglikelihoods = filtered_data[likelihood_columns]

# Standardize each column so that no target dominates the cluster distances.
scaler = StandardScaler()
loglikelihoods_scaled = scaler.fit_transform(loglikelihoods)

In [None]:
# Elbow method: within-cluster sum of squares (KMeans inertia) for k = 1..9.
K_range = range(1, 10)
wcss = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=3).fit(loglikelihoods_scaled)
    wcss.append(kmeans.inertia_)

# Elbow curve: inertia against the number of clusters.
plt.plot(K_range, wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

"""
bic_scores = []
for k in range(1, 10):
    gmm = GaussianMixture(n_components=k, random_state=42)
    gmm.fit(loglikelihoods_scaled)
    bic_scores.append(gmm.bic(loglikelihoods_scaled))

# Plot BIC scores
plt.plot(range(1, 10), bic_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC Score')
plt.title('BIC for GMM')
plt.show()
"""

In [None]:
optimal_k = 4  # Choose based on the elbow method
kmeans = KMeans(n_clusters=optimal_k, random_state=42)

clusters = kmeans.fit_predict(loglikelihoods_scaled)

# Add the cluster labels to the dataframe.
# Labels left over from a previous run of this cell are removed first.
# `errors='ignore'` makes the drop a no-op when the column does not exist yet,
# without hiding unrelated exceptions the way a bare `except: pass` does.
filtered_data.drop(columns='KMeans_Cluster', errors='ignore', inplace=True)
filtered_data.loc[:,'KMeans_Cluster'] = clusters

"""
optimal_gmm_k = 3 # Choose based on BIC
gmm = GaussianMixture(n_components=optimal_gmm_k, random_state=42)
subset["GMM_Cluster"] = gmm.fit_predict(loglikelihoods_scaled)
"""

In [None]:
# One box plot per column, grouped by KMeans cluster: the seven model
# parameters followed by the three log-likelihood scores.
boxplot_columns = [
    'rrr_factor', 'alb_ice', 'alb_snow', 'alb_firn', 'albedo_aging',
    'albedo_depth', 'roughness_ice', 'mb_logp', 'tsla_logp', 'alb_logp',
]
fig, ax = plt.subplots(2, 5, dpi=150, figsize=(16, 9))
list_axis = list(ax.flat)  # row-major: top row first, then bottom row
for panel, col in zip(list_axis, boxplot_columns):
    sns.boxplot(x=filtered_data["KMeans_Cluster"], y=filtered_data[col], ax=panel)
fig.tight_layout()

In [None]:
# Silhouette score of the KMeans labels stored in 'KMeans_Cluster'.
# The variable keeps its historical name, but the printed label now says
# KMeans because no Gaussian mixture model is fitted in this notebook section.
gmm_silhouette = silhouette_score(loglikelihoods_scaled, filtered_data["KMeans_Cluster"])
print(f"KMeans Silhouette Score: {gmm_silhouette:.2f}")


In [36]:
## Repeat based on joint_like

In [None]:
# Same elbow analysis as before, but clustering on the joint likelihood only.
#loglikelihoods = filtered_data[['tsla_logp','alb_logp','mb_logp']] # Last 3 columns: mass balance, albedo, snowlines
loglikelihoods = filtered_data.loc[:, ['joint_like']]
# Standardise before clustering.
scaler3 = StandardScaler()
loglikelihoods_scaled = scaler3.fit(loglikelihoods).transform(loglikelihoods)

# Within-cluster sum of squares (KMeans inertia) for k = 1..9.
K_range = range(1, 10)
wcss = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=3).fit(loglikelihoods_scaled)
    wcss.append(kmeans.inertia_)

# Elbow curve: inertia against the number of clusters.
plt.plot(K_range, wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

"""
bic_scores = []
for k in range(1, 10):
    gmm = GaussianMixture(n_components=k, random_state=42)
    gmm.fit(loglikelihoods_scaled)
    bic_scores.append(gmm.bic(loglikelihoods_scaled))

# Plot BIC scores
plt.plot(range(1, 10), bic_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC Score')
plt.title('BIC for GMM')
plt.show()
"""

In [None]:
optimal_k = 4  # Choose based on the elbow method
kmeans2 = KMeans(n_clusters=optimal_k, random_state=42)

clusters2 = kmeans2.fit_predict(loglikelihoods_scaled)

# Add the cluster labels to the dataframe.
# Labels left over from a previous run of this cell are removed first.
# `errors='ignore'` makes the drop a no-op when the column does not exist yet,
# without hiding unrelated exceptions the way a bare `except: pass` does.
filtered_data.drop(columns='KMeans_Cluster2', errors='ignore', inplace=True)
filtered_data.loc[:,'KMeans_Cluster2'] = clusters2

"""
optimal_gmm_k = 3 # Choose based on BIC
gmm = GaussianMixture(n_components=optimal_gmm_k, random_state=42)
subset["GMM_Cluster"] = gmm.fit_predict(loglikelihoods_scaled)
"""

In [None]:
# Cluster each log-likelihood score on its own (1-D KMeans) and show how the
# scores split across the resulting clusters.
score_columns = ['mb_logp', 'tsla_logp', 'alb_logp']
n_clusters = 4  # Number of clusters

# The per-score labels go on a copy so `filtered_data` itself stays unchanged.
temp_df = filtered_data.copy()

fig, axes = plt.subplots(1, 3, figsize=(16, 9), dpi=150, sharey=False)

for panel, col in zip(axes, score_columns):
    label_column = f'KMeans_{col}'

    # KMeans on the single score column
    X = filtered_data[[col]].values
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    temp_df[label_column] = kmeans.fit_predict(X)

    # Score values per cluster as a jittered strip plot
    sns.stripplot(
        x=temp_df[label_column],
        y=temp_df[col],
        palette='Set1',
        ax=panel,
        alpha=0.6,
        jitter=0.1,
        size=4
    )

    # Dashed horizontal line at every cluster centre
    for center in kmeans.cluster_centers_.ravel():
        panel.axhline(center, color='black', linestyle='--', linewidth=1)

    panel.set_title(f"KMeans on {col}")
    panel.set_xlabel("Cluster")
    panel.set_ylabel("Log-likelihood")

plt.tight_layout()
plt.show()

In [40]:
# Hand-picked KMeans label per score; later cells read it as the "best"
# cluster of the corresponding single-score KMeans run.
# The labels differ between the two machines - presumably because KMeans label
# numbering is arbitrary; re-check these values whenever the clustering is re-run.
# NOTE(review): 'win' in sys.platform is also true for 'darwin' (macOS).
dic_best_scores = (
    {'mb_logp': 1, 'tsla_logp': 2, 'alb_logp': 2}
    if 'win' in sys.platform
    else {'mb_logp': 1, 'tsla_logp': 3, 'alb_logp': 0}
)

In [None]:
# Number of rows in the joint-likelihood cluster singled out on this machine
# (label 2 on Windows, label 3 otherwise).
selected_label = 2 if 'win' in sys.platform else 3
print(len(filtered_data.loc[filtered_data['KMeans_Cluster2'] == selected_label]))

In [None]:
# --- Setup for the cluster box-plot figure (saved as Fig04 at the end of this cell) ---
# NOTE: this changes the global matplotlib font size for all later figures too.
plt.rcParams.update({'font.size': 22})
# Columns shown, one panel each: seven model parameters, three scores, joint likelihood.
cols = ['rrr_factor','alb_ice','alb_snow', 'alb_firn', 'albedo_aging','albedo_depth', 'roughness_ice', 'mb_logp',
        'tsla_logp','alb_logp','joint_like']
# KDE colour per score, and the temp_df column holding that score's own KMeans labels.
colors = {"mb_logp": "#D81B1B", "alb_logp": "#1E80E5", "tsla_logp": "#A5781B"}
best_keys = {"mb_logp": "KMeans_mb_logp", "alb_logp": "KMeans_alb_logp", "tsla_logp": "KMeans_tsla_logp"}
# LaTeX panel titles; later cells read this dict as well.
y_label_dict = {'rrr_factor': r'$p_{f}$', 'alb_ice': r'$\alpha_{ice}$', 'alb_snow': r'$\alpha_{fs}$','alb_firn': r'$\alpha_{firn}$', 'albedo_aging': r'$\alpha_{aging}$',
                'albedo_depth': r'$\alpha_{depth}$','roughness_ice': r'$z0_{ice}$', 'mb_logp': r'$\mathcal{L}(B_{geod}|\theta,{X})$',
                'tsla_logp': r'$\mathcal{L}(SLA|\theta,{X})$', 'alb_logp': r'$\mathcal{L}(\bar{\alpha}|\theta,{X})$', 'joint_like': r'$\mathcal{L}(total|\theta,{X})$'}

# 2 x 6 grid: eleven data panels plus ax[0, 5], which is used below as the explainer panel.
fig, ax = plt.subplots(2, 6, dpi=300, figsize=(20, 12), sharex=False)
list_axis = [ax[0, 0], ax[0, 1], ax[0, 2], ax[0, 3], ax[0, 4], ax[1, 0], ax[1, 1], ax[1, 2], ax[1, 3], ax[1, 4], ax[1, 5]]
# Wide horizontal spacing leaves room for the KDE inset drawn to the right of each panel.
plt.subplots_adjust(wspace=0.9, hspace=0.22)

## KS-test not super useful here, just set alpha level to extremely high value so we keep it all clean and don't have to bother with KS-test
alpha_level = 0.99 #0.05
# Opacity applied to boxes whose p-value is >= alpha_level.
nonsignificant_alpha = 0.3
main_color = '#066555'

# Per-(parameter, cluster) records collected in the loop below; turned into DataFrames in later cells.
ks_results_list = []
boxplot_results_list = []

# One panel per column: box plots of the column per joint-likelihood cluster
# ('KMeans_Cluster2'), plus an inset to the right holding KDEs of the same column
# restricted to the selected cluster of each single-score KMeans run.
for i, col in enumerate(cols):
    axis = list_axis[i]
    axis.grid(False)
    # Hide the panel when the column is missing from the data.
    if col not in filtered_data.columns:
        axis.axis('off')
        continue

    cluster_labels = sorted(filtered_data['KMeans_Cluster2'].unique())
    data_by_cluster = [filtered_data.loc[filtered_data['KMeans_Cluster2'] == c, col] for c in cluster_labels]

    # Reference sample for the KS test: the column over all rows, not just one cluster.
    raw_data_for_param = filtered_data[col]

    p_values = []
    for c in cluster_labels:
        cluster_data = filtered_data.loc[filtered_data['KMeans_Cluster2'] == c, col]

        # Defaults recorded when the cluster is too small to test.
        ks_stat, p_val = -1, 1.0

        # Check for sufficient data
        if len(cluster_data) > 1:
            # Two-sample KS test of this cluster against the full column.
            ks_stat, p_val = ks_2samp(cluster_data, raw_data_for_param)

            # Recompute the box-plot statistics by hand so they can be tabulated later.
            median = np.median(cluster_data)
            q1, q3 = np.percentile(cluster_data, [25, 75])
            iqr = q3 - q1
            # Determine whisker positions
            whisker_low_limit = q1 - 1.5 * iqr
            whisker_high_limit = q3 + 1.5 * iqr
            # Whiskers end at the most extreme data points inside the 1.5*IQR limits.
            data_within_whiskers = cluster_data[(cluster_data >= whisker_low_limit) & (cluster_data <= whisker_high_limit)]
            whisker_low = data_within_whiskers.min()
            whisker_high = data_within_whiskers.max()
            # Everything outside the limits counts as a flier.
            fliers = cluster_data[(cluster_data < whisker_low_limit) | (cluster_data > whisker_high_limit)].values

            boxplot_results_list.append({
                    'Parameter': col,
                    'Cluster': c,
                    'median': median,
                    'q1': q1,
                    'q3': q3,
                    'whisker_low': whisker_low,
                    'whisker_high': whisker_high,
                    'p_value': p_val,
                    'fliers': fliers
                    })

        p_values.append(p_val)

        # A KS record is written for every cluster, including the untested ones (ks_stat = -1).
        ks_results_list.append({
            'Parameter': col,
            'Cluster': c,
            'Compared_Against': 'Total_Raw_Data', # reference sample used in the test
            'KS_Statistic': ks_stat,
            'P_Value': p_val,
            'Is_Significant': p_val < alpha_level
        })

    # Box plots, one box per cluster in sorted label order
    bp = axis.boxplot(data_by_cluster, patch_artist=True, widths=0.6, vert=True,
                      boxprops=dict(facecolor=main_color, edgecolor='black'),
                      medianprops=dict(color='black'),
                      whiskerprops=dict(color='black'),
                      capprops=dict(color='black'),
                      flierprops=dict(marker='o', color='black', alpha=0.5))

    # Horizontal reference line: median of the column over all rows.
    axis.axhline(filtered_data[col].median(), color="black", zorder=-1)
    axis.set_title(y_label_dict.get(col, col))

    # Fade every artist of a box whose p-value is >= alpha_level.
    # Whiskers and caps come in pairs per box, hence the 2*j / 2*j+1 indexing.
    for j, box in enumerate(bp['boxes']):
        if p_values[j] >= alpha_level:
            box.set_alpha(nonsignificant_alpha)
            bp['medians'][j].set_alpha(nonsignificant_alpha)
            bp['whiskers'][2*j].set_alpha(nonsignificant_alpha)
            bp['whiskers'][2*j+1].set_alpha(nonsignificant_alpha)
            bp['caps'][2*j].set_alpha(nonsignificant_alpha)
            bp['caps'][2*j+1].set_alpha(nonsignificant_alpha)
            if j < len(bp['fliers']):
                 bp['fliers'][j].set_alpha(nonsignificant_alpha * 0.8)

    # Inset axes anchored at the right edge of the panel (x = 1 in panel axes coordinates).
    ax_inset = inset_axes(list_axis[i], width="50%", height="100%", loc='upper left',
                        bbox_to_anchor=(1, 0, 0.6, 1), bbox_transform=list_axis[i].transAxes, borderpad=0)

    # Loop through each score type (MB, ALB, TSLA) to plot on the same inset
    for score_key, color in colors.items():
        # Skip scores for which the score column, its label column or the selected label is missing.
        if score_key not in temp_df.columns: continue
        cluster_key = best_keys[score_key]
        if cluster_key not in temp_df.columns: continue
        best_cluster = dic_best_scores.get(score_key)
        if best_cluster is None: continue

        # Values of `col` for the rows in this score's selected cluster
        data = temp_df.loc[temp_df[cluster_key] == best_cluster, col]

        # Skip if there's no data to plot
        if data.empty or data.isna().all():
            continue

        # Vertical KDE; cut=0 limits the curve to the range of the data.
        sns.kdeplot(y=data, ax=ax_inset, fill=True, cut=0, color=color, alpha=0.3, linewidth=1.0)

        # Mark the quartiles on the KDE (needs more than one data point).
        if len(data) > 1:
            # 25th, 50th and 75th percentiles of the selected data
            q25, q50, q75 = np.percentile(data, [25, 50, 75])

            # Separate KDE used only to look up the density at each percentile
            kde_func = gaussian_kde(data)

            # Dashed lines for the quartiles, solid line for the median
            percentiles = [q25, q50, q75]
            styles = ['--', '-', '--']
            kde_values = kde_func(percentiles)

            # Horizontal line from x=0 to the looked-up density, at the height of the percentile
            for q, kde_v, style in zip(percentiles, kde_values, styles):
                ax_inset.plot([0, kde_v], [q, q], linestyle=style, color=color, linewidth=1.2)


    # Give the box-plot panel the same y-range as its inset so both line up.
    ylim1, ylim2 = ax_inset.get_ylim()
    # i == 9 is the 'alb_logp' panel; its lower limit is extended by 4 on both axes.
    if i != 9:
        list_axis[i].set_ylim(ylim1, ylim2)
    else:
        list_axis[i].set_ylim(ylim1-4, ylim2)
        ax_inset.set_ylim(ylim1-4, ylim2)
    ax_inset.grid(False)
    ax_inset.axis("off")

# --- Axis formatting (remains unchanged) ---
# --- Axis formatting ---
# Fixed y ticks for the second panel of the top row.
ax[0, 1].set_yticks(np.arange(0.12, 0.24 + 0.01, 0.03))
# Top row: no x tick labels or axis label; bottom row: label the x axis.
for a in ax[0]:
    a.set_xticklabels([])
    a.set_xlabel("")

for a in ax[1]:
    a.set_xlabel("Cluster")

"""
### MODIFICATION: Create a custom legend for the significance colors
legend_patches = [
    Patch(facecolor=significant_color, edgecolor='black', label=f'Significant (p < {alpha})'),
    Patch(facecolor=nonsignificant_color, edgecolor='black', linestyle='--', label=f'Not Significant (p ≥ {alpha})')
]
# Place legend in the empty subplot space (ax[1,5] is used for the explainer, so find another spot)
ax[0, 5].legend(handles=legend_patches, loc='center', fontsize=16)
ax[0, 5].axis('off') # Turn off the axis for the legend plot
"""
# --- Explainer panel (ax[0, 5]): an annotated box plot of synthetic data ---
# Dummy data; the seed is set on NumPy's global random state.
np.random.seed(42)
dummy_data = np.random.normal(loc=10, scale=5, size=100)
ax_explainer = ax[0, 5]

# Plot a simple vertical boxplot
bp = ax_explainer.boxplot(dummy_data, vert=True, patch_artist=True, widths=0.6,
               boxprops=dict(facecolor='#066555', color='black'),
               medianprops=dict(color='black'),
               whiskerprops=dict(color='black'),
               capprops=dict(color='black'),
               flierprops=dict(marker='o', color='black', alpha=0.3))

# Clean up axis for better visuals
ax_explainer.set_xticks([])
ax_explainer.set_yticks([])

# Annotation positions: quartiles and the whisker ends of the dummy data
Q1 = np.percentile(dummy_data, 25)
Q2 = np.median(dummy_data)
Q3 = np.percentile(dummy_data, 75)
whisker_low = np.min(dummy_data[dummy_data > Q1 - 1.5 * (Q3 - Q1)])
whisker_high = np.max(dummy_data[dummy_data < Q3 + 1.5 * (Q3 - Q1)])
outlier = max(dummy_data)  # only referenced by the disabled annotation below
# Stand-in for the horizontal median line drawn in the data panels.
ax_explainer.axhline(y=Q3+2.0, color="black")
# Annotate parts; the xytext offsets are hand-tuned for this dummy sample.
ax_explainer.annotate("LHS Median", xy=(1, Q3+2.0), xytext=(0.71, Q3+2.5),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='None', edgecolor='None', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Q1", xy=(1, Q1), xytext=(0.75, Q1-0.68),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Median", xy=(1, Q2), xytext=(0.75, Q2),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='#066555', edgecolor='#066555', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Q3", xy=(1, Q3), xytext=(0.75, Q3+0.68),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Q1 - 1.5xIQR", xy=(1, whisker_low), xytext=(0.55, whisker_low - 0.9),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))

ax_explainer.annotate("Q3 + 1.5xIQR", xy=(1, whisker_high), xytext=(0.55, whisker_high + 0.75),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.set_ylim(-4,22)
"""
# Optional: annotate outlier

if outlier > whisker_high + 0.2:  # only if clearly visible
    ax_explainer.annotate("Outlier", xy=(1, outlier), xytext=(1.2, outlier),
                          #arrowprops=dict(arrowstyle="->"),
                          fontsize=14, va='center')
"""
# Inset to the right of the explainer panel, mirroring the KDE insets of the data panels.
# bbox_to_anchor is (x0, y0, width, height) in the explainer's axes coordinates.
ax_inset = inset_axes(ax_explainer, width="60%", height="100%", loc='upper left',
                        bbox_to_anchor=(1, 0, 0.6, 1), bbox_transform=ax_explainer.transAxes, borderpad=0)
# Dummy data for KDEs (three distributions)
dummy_mb = np.random.normal(loc=12, scale=2.5, size=100)
dummy_tsla = np.random.normal(loc=22, scale=2.5, size=100)
dummy_alb = np.random.normal(loc=2, scale=2.5, size=100)

# Plot vertical KDEs
#sns.kdeplot(y=dummy_mb, ax=ax_inset, label="MB", fill=True, color="#D81B1B", linewidth=1.1)
#sns.kdeplot(y=dummy_tsla, ax=ax_inset, label="TSLA", fill=True, color="#A5781B", linewidth=1.1)
#
# sns.kdeplot(y=dummy_alb, ax=ax_inset, label="ALB", fill=True, color="#1E80E5", linewidth=1.1)

dummy_dic = {"MB": dummy_mb, "TSLA": dummy_tsla, "ALB": dummy_alb}
color_dic = {"MB": "#D81B1B", "ALB": "#1E80E5", "TSLA": "#A5781B"}
# For each dummy score: vertical KDE plus lines at the quartiles
for score_key, color in color_dic.items():
    q25 = np.percentile(dummy_dic[score_key], 25)
    q50 = np.median(dummy_dic[score_key])
    q75 = np.percentile(dummy_dic[score_key], 75)
    # Plot KDE vertically
    kde_line = sns.kdeplot(y=dummy_dic[score_key], ax=ax_inset, fill=True, color=color, label=score_key, linewidth=1.1)

    # Separate KDE used only to look up the density at each percentile
    kde_func = gaussian_kde(dummy_dic[score_key])
    kde_vals = kde_func(q25), kde_func(q50), kde_func(q75)

    # Horizontal line from x=0 to the looked-up density (dashed for quartiles, solid for the median)
    for q, kde_v, style in zip([q25, q50, q75], kde_vals, ['--', '-', '--']):
        ax_inset.plot([0, kde_v[0]], [q, q], linestyle=style, color=color, linewidth=1.0)

# Text labels at the location (loc=12, 22, 2) of the three dummy distributions
ax_inset.text(0.01, 12, r"$B_{geod}$", color="#D81B1B", fontsize=24, fontweight='bold', va='center')
ax_inset.text(0.01, 22, r"$SLA$", color="#A5781B", fontsize=24, fontweight='bold', va='center')
ax_inset.text(0.01, 2, r"$\bar{\alpha}$", color="#1E80E5", fontsize=24, fontweight='bold', va='center')

## Panel labels, top row: a) and f) at the outer positions
fig.text(0.09, 0.9, 'a)', transform=fig.transFigure, fontsize=24)
fig.text(0.79, 0.9, 'f)', transform=fig.transFigure, fontsize=24)

# Six evenly spaced x positions between 0.09 and 0.79; the inner four carry b) to e)
x_positions = np.linspace(0.09, 0.79, 6)
labels = ['b)', 'c)', 'd)', 'e)']

for x, label in zip(x_positions[1:-1], labels):  # skip first (a) and last (f)
    fig.text(x, 0.9, label, transform=fig.transFigure, fontsize=24)

# Panel labels, bottom row: g) and l) at the outer positions
fig.text(0.09, 0.48, 'g)', transform=fig.transFigure, fontsize=24)
fig.text(0.79, 0.48, 'l)', transform=fig.transFigure, fontsize=24)

# Same six x positions as the top row; the inner four carry h) to k)
x_positions = np.linspace(0.09, 0.79, 6)
labels = ['h)', 'i)', 'j)', 'k)']

for x, label in zip(x_positions[1:-1], labels):  # skip first (g) and last (l)
    fig.text(x, 0.48, label, transform=fig.transFigure, fontsize=24)

ax_inset.axis('off')  # Clean look
# Save the figure as PDF to the platform-specific figure folder.
if 'win' in sys.platform:
    plt.savefig("E:/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/Fig04_boxplot_distributions_clusters.pdf", bbox_inches="tight")
else:
    plt.savefig("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/Fig04_boxplot_distributions_clusters.pdf", bbox_inches="tight")

In [None]:
# Hand-computed box-plot statistics, one row per (parameter, cluster).
bp_results_df = (
    pd.DataFrame(boxplot_results_list)
    .sort_values(by=['Parameter', 'Cluster'])
)
bp_results_df

In [None]:
# KS-test results, one row per (parameter, cluster).
ks_results_df = (
    pd.DataFrame(ks_results_list)
    .sort_values(by=['Parameter', 'Cluster'])
)
ks_results_df

In [None]:
# Variant of the cluster box-plot figure without KS-test bookkeeping and without saving.
# NOTE(review): depends on `y_label_dict`, `temp_df` and `dic_best_scores` from earlier cells.
# Parameters to plot
cols = ['rrr_factor','alb_ice','alb_snow', 'alb_firn', 'albedo_aging','albedo_depth', 'roughness_ice', 'mb_logp',
        'tsla_logp','alb_logp','joint_like']

# KDE colour per score, and the temp_df column holding that score's own KMeans labels.
colors = {"mb_logp": "#D81B1B", "alb_logp": "#1E80E5", "tsla_logp": "#A5781B"}
best_keys = {"mb_logp": "KMeans_mb_logp", "alb_logp": "KMeans_alb_logp", "tsla_logp": "KMeans_tsla_logp"}

# 2 x 6 grid: eleven data panels; ax[0, 5] is filled by hand further down in this cell
fig, ax = plt.subplots(2,6, dpi=300, figsize=(20,12), sharex=False)
list_axis = [ax[0,0], ax[0,1], ax[0,2], ax[0,3], ax[0,4], ax[1,0], ax[1,1], ax[1,2], ax[1,3], ax[1,4], ax[1,5]]
plt.subplots_adjust(wspace=.9, hspace=0.2)
for i, col in enumerate(cols):
    axis = list_axis[i]
    # One sample per joint-likelihood cluster, in sorted label order
    data_by_cluster = [filtered_data.loc[filtered_data['KMeans_Cluster2'] == c, col] for c in sorted(filtered_data['KMeans_Cluster2'].unique())]

    bp = axis.boxplot(data_by_cluster, patch_artist=True, widths=0.6, vert=True,
                   boxprops=dict(
                       facecolor='#066555',
                       color='black',
                       linestyle='--'  # Dashed box outline
                       ),
               medianprops=dict(color='black'),
               whiskerprops=dict(color='black'),
               capprops=dict(color='black'),
               flierprops=dict(marker='o', color='black', alpha=0.3))

    axis.set_title(y_label_dict[col])


    # Inset anchored at the right edge of the panel for the per-score KDEs.
    # bbox_to_anchor is (x0, y0, width, height) in the panel's axes coordinates.
    ax_inset = inset_axes(list_axis[i], width="50%", height="100%", loc='upper left',
                          bbox_to_anchor=(1, 0, 0.6, 1), bbox_transform=list_axis[i].transAxes, borderpad=0)
    #sns.kdeplot(y=temp_df.loc[temp_df['KMeans_mb_logp'] == dic_best_scores['mb_logp'], col], fill=True, ax=ax_inset, color="#D81B1B", alpha=0.3, label="test", linewidth=1.)
    #sns.kdeplot(y=temp_df.loc[temp_df['KMeans_alb_logp'] == dic_best_scores['alb_logp'], col], fill=True, ax=ax_inset, color="#1E80E5", alpha=0.3, label="test", linewidth=1.)
    #sns.kdeplot(y=temp_df.loc[temp_df['KMeans_tsla_logp'] == dic_best_scores['tsla_logp'], col], fill=True, ax=ax_inset, color="#A5781B", alpha=0.3, label="test", linewidth=1.)

    for score_key, color in colors.items():
            # Values of `col` for the rows in this score's selected cluster
            cluster_key = best_keys[score_key]
            data = temp_df.loc[temp_df[cluster_key] == dic_best_scores[score_key], col]

            # Quartiles of the selected data
            q25 = np.percentile(data, 25)
            q50 = np.median(data)
            q75 = np.percentile(data, 75)
            # Plot KDE vertically; cut=0 limits the curve to the range of the data
            kde_line = sns.kdeplot(y=data, ax=ax_inset, fill=True, cut=0, color=color, alpha=0.3, linewidth=1.)

            # Separate KDE used only to look up the density at each percentile
            kde_func = gaussian_kde(data)
            kde_vals = kde_func(q25), kde_func(q50), kde_func(q75)

            # Horizontal line from x=0 to the looked-up density (dashed for quartiles, solid for the median)
            for q, kde_v, style in zip([q25, q50, q75], kde_vals, ['--', '-', '--']):
                ax_inset.plot([0, kde_v[0]], [q, q], linestyle=style, color=color, linewidth=1.0)

    # Give the box-plot panel the same y-range as its inset so both line up.
    list_axis[i].set_ylim(ax_inset.get_ylim())
    ax_inset.axis("off")
# Fixed y ticks for the second panel of the top row.
ax[0,1].set_yticks(np.arange(0.12,0.24+0.01,0.03))

for a in ax[0]:  # ax[0] is the first row
    a.set_xticklabels([])
    a.set_xlabel("")  # Optional: also hide the label
## design axis[0,5] by hand: an annotated box plot of synthetic data as explainer panel
# Dummy data; the seed is set on NumPy's global random state.
np.random.seed(42)
dummy_data = np.random.normal(loc=10, scale=5, size=100)
ax_explainer = ax[0, 5]

# Plot a simple vertical boxplot
bp = ax_explainer.boxplot(dummy_data, vert=True, patch_artist=True, widths=0.6,
               boxprops=dict(facecolor='#066555', color='black'),
               medianprops=dict(color='black'),
               whiskerprops=dict(color='black'),
               capprops=dict(color='black'),
               flierprops=dict(marker='o', color='black', alpha=0.3))

# Clean up axis for better visuals
ax_explainer.set_xticks([])
ax_explainer.set_yticks([])

# Annotation positions: quartiles and the whisker ends of the dummy data
Q1 = np.percentile(dummy_data, 25)
Q2 = np.median(dummy_data)
Q3 = np.percentile(dummy_data, 75)
whisker_low = np.min(dummy_data[dummy_data > Q1 - 1.5 * (Q3 - Q1)])
whisker_high = np.max(dummy_data[dummy_data < Q3 + 1.5 * (Q3 - Q1)])
outlier = max(dummy_data)  # only referenced by the disabled annotation below

# Annotate parts; the xytext offsets are hand-tuned for this dummy sample.
ax_explainer.annotate("Q1", xy=(1, Q1), xytext=(0.75, Q1-0.68),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Median", xy=(1, Q2), xytext=(0.75, Q2),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='#066555', edgecolor='#066555', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Q3", xy=(1, Q3), xytext=(0.75, Q3+0.68),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.annotate("Q1 - 1.5xIQR", xy=(1, whisker_low), xytext=(0.55, whisker_low - 0.9),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))

ax_explainer.annotate("Q3 + 1.5xIQR", xy=(1, whisker_high), xytext=(0.55, whisker_high + 0.75),
                      #arrowprops=dict(arrowstyle="->"),
                      fontsize=14, va='center',
                      bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.01'))
ax_explainer.set_ylim(-4,22)
"""
# Optional: annotate outlier

if outlier > whisker_high + 0.2:  # only if clearly visible
    ax_explainer.annotate("Outlier", xy=(1, outlier), xytext=(1.2, outlier),
                          #arrowprops=dict(arrowstyle="->"),
                          fontsize=14, va='center')
"""
# Inset to the right of the explainer panel, mirroring the KDE insets of the data panels.
# bbox_to_anchor is (x0, y0, width, height) in the explainer's axes coordinates.
ax_inset = inset_axes(ax_explainer, width="60%", height="100%", loc='upper left',
                        bbox_to_anchor=(1, 0, 0.6, 1), bbox_transform=ax_explainer.transAxes, borderpad=0)
# Dummy data for KDEs (three distributions)
dummy_mb = np.random.normal(loc=12, scale=2.5, size=100)
dummy_tsla = np.random.normal(loc=22, scale=2.5, size=100)
dummy_alb = np.random.normal(loc=2, scale=2.5, size=100)

# Plot vertical KDEs
#sns.kdeplot(y=dummy_mb, ax=ax_inset, label="MB", fill=True, color="#D81B1B", linewidth=1.1)
#sns.kdeplot(y=dummy_tsla, ax=ax_inset, label="TSLA", fill=True, color="#A5781B", linewidth=1.1)
#
# sns.kdeplot(y=dummy_alb, ax=ax_inset, label="ALB", fill=True, color="#1E80E5", linewidth=1.1)

dummy_dic = {"MB": dummy_mb, "TSLA": dummy_tsla, "ALB": dummy_alb}
color_dic = {"MB": "#D81B1B", "ALB": "#1E80E5", "TSLA": "#A5781B"}
# For each dummy score: vertical KDE plus lines at the quartiles
for score_key, color in color_dic.items():
    q25 = np.percentile(dummy_dic[score_key], 25)
    q50 = np.median(dummy_dic[score_key])
    q75 = np.percentile(dummy_dic[score_key], 75)
    # Plot KDE vertically
    kde_line = sns.kdeplot(y=dummy_dic[score_key], ax=ax_inset, fill=True, color=color, label=score_key, linewidth=1.1)

    # Separate KDE used only to look up the density at each percentile
    kde_func = gaussian_kde(dummy_dic[score_key])
    kde_vals = kde_func(q25), kde_func(q50), kde_func(q75)

    # Horizontal line from x=0 to the looked-up density (dashed for quartiles, solid for the median)
    for q, kde_v, style in zip([q25, q50, q75], kde_vals, ['--', '-', '--']):
        ax_inset.plot([0, kde_v[0]], [q, q], linestyle=style, color=color, linewidth=1.0)

# Text labels at the location (loc=12, 22, 2) of the three dummy distributions
ax_inset.text(0.01, 12, r"$B_{geod}$", color="#D81B1B", fontsize=24, fontweight='bold', va='center')
ax_inset.text(0.01, 22, r"$SLA$", color="#A5781B", fontsize=24, fontweight='bold', va='center')
ax_inset.text(0.01, 2, r"$\bar{\alpha}$", color="#1E80E5", fontsize=24, fontweight='bold', va='center')

ax_inset.axis('off')  # Clean look
# Saving is disabled for this variant of the figure.
#if 'win' in sys.platform:
#    plt.savefig("E:/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/boxplot_distributions_clusters.png", bbox_inches="tight")
#else:
#    plt.savefig("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/boxplot_distributions_clusters.png", bbox_inches="tight")

In [None]:
# Silhouette score of the KMeans labels stored in 'KMeans_Cluster2'.
# Assumes `loglikelihoods_scaled` still holds the scaled data those labels
# were fitted on - confirm when cells are run out of order.
# The variable keeps its historical name, but the printed label now says
# KMeans because no Gaussian mixture model is fitted in this notebook section.
gmm_silhouette = silhouette_score(loglikelihoods_scaled, filtered_data["KMeans_Cluster2"])
print(f"KMeans Silhouette Score: {gmm_silhouette:.2f}")

# Pairwise scatter matrix of parameters and scores, coloured by cluster.
sns.pairplot(filtered_data, hue="KMeans_Cluster2", vars=['rrr_factor','alb_ice','alb_snow','alb_firn','albedo_aging','albedo_depth','roughness_ice','alb_logp','mb_logp','tsla_logp','joint_like'])
plt.suptitle("Pairplot of Model Parameters by Cluster", y=1.02)
plt.show()


In [None]:
## Compute IQR and CI ratios between best performing cluster and all data sets as a measure of spread

def compute_cir_iqr_ratios(full_df, best_df, param_features, ci=0.95):
    """
    Compute credible-interval (CIR) and interquartile-range (IQR) width ratios.

    For every column in `param_features`, the width of the central `ci`
    interval and of the interquartile range in `best_df` is divided by the
    corresponding width in `full_df`.

    Parameters:
    full_df : DataFrame
        Full parameter set (denominator of the ratios).
    best_df : DataFrame
        Subset of simulations to compare against the full set
        (numerator of the ratios).
    param_features : list of str
        Columns for which the ratios are computed; must exist in both frames.
    ci : float
        Credible interval level in [0, 1] (default: 0.95).

    Returns:
    DataFrame indexed by parameter with the columns 'CIR' and 'IQR_ratio'.
    A ratio is NaN when the corresponding width in `full_df` is zero.

    Raises:
    ValueError
        If `ci` lies outside [0, 1]. A negative level would otherwise swap the
        lower and upper quantiles and silently produce meaningless ratios.
    """
    if not 0 <= ci <= 1:
        raise ValueError(f"ci must lie in [0, 1], got {ci!r}")

    lower_q = (1 - ci) / 2
    upper_q = 1 - lower_q

    def _spread(values, low, high):
        """Distance between the `low` and `high` quantiles of `values`."""
        low_val, high_val = np.quantile(values, [low, high])
        return high_val - low_val

    ratios = {}
    for col in full_df[param_features].columns:
        # Width of the central credible interval in both data sets
        full_ci = _spread(full_df[col], lower_q, upper_q)
        top_ci = _spread(best_df[col], lower_q, upper_q)

        # Width of the interquartile range in both data sets
        full_iqr = _spread(full_df[col], 0.25, 0.75)
        top_iqr = _spread(best_df[col], 0.25, 0.75)

        ratios[col] = {
            "CIR": top_ci / full_ci if full_ci != 0 else np.nan,
            "IQR_ratio": top_iqr / full_iqr if full_iqr != 0 else np.nan,
        }

    return pd.DataFrame(ratios).T

# Compare the spread of the selected joint-likelihood cluster with the full sample.
# The selected label is machine dependent: the earlier cell that prints the
# cluster size uses label 2 on Windows and 3 elsewhere. Both branches here
# used 3, which made the platform switch a no-op; the Windows branch now
# follows that earlier cell.
# NOTE(review): confirm that 2 is still the intended label on the Windows machine.
best_joint_cluster = 2 if 'win' in sys.platform else 3
results = compute_cir_iqr_ratios(
    filtered_data,
    filtered_data.loc[filtered_data['KMeans_Cluster2'] == best_joint_cluster],
    param_features=param_features,
)
print(results)

In [None]:
# Ratio table with the parameter names moved from the index into the 'index' column.
# The name `blabla` is kept because the bar-plot cell below reads it.
blabla = results.reset_index()
# Dictionary for LaTeX-style labels of the model parameters
y_label_dict_short = {
    'rrr_factor': r'$p_{f}$', 
    'alb_ice': r'$\alpha_{ice}$', 
    'alb_snow': r'$\alpha_{fs}$',
    'alb_firn': r'$\alpha_{firn}$', 
    'albedo_aging': r'$\alpha_{aging}$',
    'albedo_depth': r'$\alpha_{depth}$',
    'roughness_ice': r'$z0_{ice}$'
}

# Replace the parameter names in the 'index' column by their LaTeX labels.
# Uses the dictionary defined in this cell (it was previously unused) so the
# cell no longer depends on `y_label_dict` from the figure cell; the labels
# for these seven parameters are identical in both dictionaries.
blabla['index'] = blabla['index'].map(y_label_dict_short)
blabla

In [None]:
# Side-by-side horizontal bar charts of the two spread ratios per parameter.
# NOTE: sns.set changes the global plotting style for all later figures too.
sns.set(style="whitegrid", font_scale=1.2)
fig, axs = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# (column in `blabla`, colour palette, panel title) for each panel
panel_specs = [
    ('CIR', 'viridis', 'Credible Interval Ratio'),
    ('IQR_ratio', 'plasma', 'Interquartile Range Ratio'),
]
for panel, (metric, palette_name, panel_title) in zip(axs, panel_specs):
    sns.barplot(x=metric, y='index', data=blabla, ax=panel, palette=palette_name)
    # Red dashed reference line at a ratio of 1.0
    panel.axvline(1.0, color='red', linestyle='--')
    panel.set_title(panel_title)
    panel.set_xlim(0, 1.2)

# Only the left panel has its y-axis label cleared.
axs[0].set_ylabel("")

# Layout tweaks
plt.tight_layout()
#if 'win' in sys.platform:
#    plt.savefig("E:/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/cir_iqr_ratios.png", bbox_inches="tight")
#else:
#    plt.savefig("/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Figures/cir_iqr_ratios.png", bbox_inches="tight")

In [None]:
## Investigate loadings
loadings = pca.components_.T
n_components_to_keep = 6  # based on your earlier comment

# Communalities: variance of each parameter explained by the top components
communalities = np.sum(loadings[:, :n_components_to_keep]**2, axis=1)
communalities_df = pd.DataFrame({
    'Parameter': param_features,
    'Communality': communalities
}).sort_values(by='Communality', ascending=False)

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, len(explained_variances)+1), explained_variances*100, 'o-', linewidth=2)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance (%)')
plt.title('Scree Plot')
plt.grid(True)
plt.show()

loadings_df = pd.DataFrame(loadings[:, :6], columns=[f'PC{i+1}' for i in range(6)], index=param_features)

plt.figure(figsize=(10, 6))
sns.heatmap(loadings_df, annot=True, cmap='coolwarm', center=0)
plt.title('PCA Loadings for First 6 Components')
plt.show()


In [None]:
# Display the communalities table, sorted from highest to lowest communality.
communalities_df

In [None]:
# Spearman correlation between the principal-component scores and the likelihood scores.
score_and_joint = score_columns + ['joint_like']
# Join by position: the index of `filtered_data` is reset so its rows line up
# with the rows of `pca_df`.
df_pca = pd.concat([pca_df, filtered_data[score_and_joint].reset_index(drop=True)], axis=1)
# Rows = principal components, columns = scores. Both lists are taken from the
# frames they describe instead of being hard-coded (PC1..PC7), so this cell
# keeps working if the number of PCA parameters changes.
corr = df_pca.corr(method="spearman").loc[list(pca_df.columns), score_and_joint]
# Heatmap of the correlations
plt.figure(figsize=(12, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.tight_layout()
plt.show()

print(corr)

In [57]:
# Attach the joint-likelihood KMeans labels (`clusters2`) to the PCA scores as column 'cluster'.
df_pca.loc[:,'cluster'] = clusters2

In [None]:
# Build a continuous colormap from four anchor colours and preview it as a gradient strip.
newcolors_sorted = ["gray", "#008b8b", "#6b8e23", "#6a5acd"]
cmap = mcolors.LinearSegmentedColormap.from_list("custom_cmap", newcolors_sorted, N=256)

# A single-row image with values 0..1 shows the whole colormap.
gradient = np.linspace(0, 1, 256)[np.newaxis, :]
plt.figure(figsize=(6, 1))
plt.imshow(gradient, aspect="auto", cmap=cmap)
plt.axis("off")
plt.title("Adjusted colormap")
plt.show()

In [59]:
# Shared colour limits so every PCA scatter panel maps joint_like to the same colours.
joint_like_values = df_pca['joint_like']
vmin, vmax = joint_like_values.min(), joint_like_values.max()
# Draw grid lines and ticks below the plotted artists.
plt.rcParams['axes.axisbelow'] = True

def plot_pca_axis(axis, data, x_data, y_data, loading_index, ylabel):
    """Draw one PC-vs-PC scatter panel with loading arrows and a correlation note.

    NOTE: a function with the same name and an extended signature is defined
    in a later cell of this notebook; once that cell has run, this definition
    is shadowed.

    The function reads the notebook globals ``marker_map``, ``vmin``, ``vmax``,
    ``corr``, ``axes``, ``pca`` and ``param_features``.

    Parameters
    ----------
    axis : matplotlib Axes
        Panel to draw on.
    data : pandas.DataFrame
        Must contain the columns named by ``x_data`` and ``y_data`` plus
        ``'cluster'`` and ``'joint_like'``.
    x_data, y_data : str
        Column names plotted on x and y; also used as row labels into ``corr``.
    loading_index : int
        Row of ``pca.components_`` used for the y component of the loading
        arrows. The x component always comes from row 0.
    ylabel : str
        Label of the y axis.

    Returns
    -------
    The scatter artist of the last cluster drawn, or ``None`` when ``data``
    contains no clusters.
    """
    sc = None  # stays None when there is nothing to plot

    # One scatter call per cluster so that each cluster gets its own marker;
    # all calls share vmin/vmax so the colours stay comparable.
    for cluster in data['cluster'].unique():
        subset = data[data['cluster'] == cluster]
        sc = axis.scatter(
            subset[x_data], subset[y_data],
            c=subset['joint_like'], cmap='jet',
            marker=marker_map[cluster],
            vmin=vmin, vmax=vmax,
            edgecolor='k', linewidth=0.5,
            label=f'Cluster {cluster}', s=30,
            alpha=0.8
        )

    # Correlation annotation
    pc1_corr = corr.loc[x_data]
    pc2_corr = corr.loc[y_data]
    text = (
        f"{x_data} Corr — "+r"$B_{geod}$: "+f"{pc1_corr['mb_logp']:.2f}, SLA: {pc1_corr['tsla_logp']:.2f}, "+r"$\bar{\alpha}$: "+f"{pc1_corr['alb_logp']:.2f}, total: {pc1_corr['joint_like']:.2f}\n"
        f"{y_data} Corr — "+r"$B_{geod}$: "+f"{pc2_corr['mb_logp']:.2f}, SLA: {pc2_corr['tsla_logp']:.2f}, "+r"$\bar{\alpha}$: "+f"{pc2_corr['alb_logp']:.2f}, total: {pc2_corr['joint_like']:.2f}"
    )
    # The panels axes[4] and axes[5] get the note placed further down.
    if axis in [axes[4],axes[5]]:
        axis.text(0.01, -0.13, text, transform=axis.transAxes, fontsize=14, va='top')
    else:
        axis.text(0.01, -0.05, text, transform=axis.transAxes, fontsize=14, va='top')

    # Add PCA loadings (scaled by 3 for visibility), labelled with the raw
    # parameter name.
    for i, param in enumerate(param_features):
        loading_x = pca.components_[0, i]
        loading_y = pca.components_[loading_index, i]
        axis.arrow(0, 0, loading_x * 3, loading_y * 3, color='black', alpha=1, head_width=0.05)
        axis.text(loading_x * 3., loading_y * 3., param, fontsize=18, ha='center', va='center')

    axis.axhline(0, color='black', lw=0.5)
    axis.axvline(0, color='black', lw=0.5)
    axis.grid(True)

    # Labels and zero lines
    axis.set_ylabel(ylabel)
    axis.axhline(0, color='gray', linestyle='--', linewidth=0.7)
    axis.axvline(0, color='gray', linestyle='--', linewidth=0.7)
    return sc
    

In [None]:
highlight_cluster = 3

# Marker shapes: the highlighted cluster is drawn as a triangle, every other
# cluster takes the next entry of other_markers (octagon, pentagon, diamond),
# wrapping around if there are more clusters than markers.
highlight_marker = '^'
other_markers = ['8', 'p', 'D']

unique_clusters = sorted(df_pca['cluster'].unique())
marker_map = {}
other_marker_idx = 0
for c in unique_clusters:
    if c != highlight_cluster:
        marker_map[c] = other_markers[other_marker_idx % len(other_markers)]
        other_marker_idx += 1
        continue
    marker_map[c] = highlight_marker

# Log-likelihood gap of every run relative to the run with the highest
# joint likelihood.
max_ll = df_pca['joint_like'].max()
df_pca['ll_difference'] = max_ll - df_pca['joint_like']

# Principal components spanning the space in which the distance is measured.
pc_cols_for_dist = [f'PC{k}' for k in range(1, 8)]  # 'PC1' ... 'PC7'

all_coords = df_pca[pc_cols_for_dist].values
best_run_coords = df_pca.loc[df_pca['joint_like'].idxmax(), pc_cols_for_dist].values

# Mahalanobis distance of each run from the best run, using a diagonal inverse
# covariance built from the PCA's per-component explained variances.
inv_cov_matrix = np.diag(1 / pca.explained_variance_[:len(pc_cols_for_dist)])
df_pca['param_distance'] = [
    mahalanobis(run_coords, best_run_coords, inv_cov_matrix)
    for run_coords in all_coords
]

# Global font size for the figures created from here on.
plt.rcParams['font.size'] = 24

# Short math-text label for each model parameter, used to annotate plots.
y_label_dict_short = dict(
    rrr_factor=r'$p_{f}$',
    alb_ice=r'$\alpha_{ice}$',
    alb_snow=r'$\alpha_{fs}$',
    alb_firn=r'$\alpha_{firn}$',
    albedo_aging=r'$\alpha_{aging}$',
    albedo_depth=r'$\alpha_{depth}$',
    roughness_ice=r'$z0_{ice}$',
)

def plot_pca_axis(axis, data, x_data, y_data, loading_index, ylabel, marker_map,
                  color_mode='log_likelihood', highlight_cluster=None):
    """Draw one PC-vs-PC scatter panel with loading arrows and a loadings note.

    The function reads the notebook globals ``pca``, ``param_features``,
    ``y_label_dict_short``, ``path_effects`` and ``axes`` (the flattened array
    of panels, used only to decide how far below the panel the note is put).

    Parameters
    ----------
    axis : matplotlib Axes
        Panel to draw on.
    data : pandas.DataFrame
        Must contain ``x_data``, ``y_data``, ``'cluster'`` and the column
        selected by ``color_mode``.
    x_data, y_data : str
        Column names plotted on x and y. The explained variance and the
        arrow x components are always taken from component 0, i.e. ``x_data``
        is expected to be the first principal component.
    loading_index : int
        Row of ``pca.components_`` that belongs to ``y_data``.
    ylabel : str
        Label of the y axis.
    marker_map : dict
        Maps cluster label -> marker; clusters not in the map are drawn as 'x'.
    color_mode : str
        ``'ll_difference'`` or ``'param_distance'``; any other value colours
        the points by ``'joint_like'``.
    highlight_cluster : optional
        Cluster drawn last, larger and with a black edge.

    Returns
    -------
    (sc, cbar_label) : the scatter artist of the last cluster drawn (``None``
    if ``data`` has no clusters) and the matching colourbar label.
    """
    # Colour column, colormap, label and limits for the selected mode.
    if color_mode == 'll_difference':
        color_col = 'll_difference'
        cmap = 'inferno_r'
        cbar_label = 'Log-Likelihood Difference from Best'
        # Upper limit at the 99th percentile so a few extreme runs do not
        # compress the colour scale.
        vmin, vmax = 0, data[color_col].quantile(0.99)
    elif color_mode == 'param_distance':
        color_col = 'param_distance'
        cmap = 'inferno_r'
        cbar_label = 'Parameter Distance from Best Run'
        vmin, vmax = 0, data[color_col].quantile(0.99)
    else:  # default: 'log_likelihood'
        color_col = 'joint_like'
        cmap = 'jet'
        cbar_label = 'Joint Log-Likelihood'
        vmin, vmax = data[color_col].min(), data[color_col].max()

    sc = None

    # Draw the highlighted cluster last so it ends up on top.
    clusters_to_plot = sorted(data['cluster'].unique())
    if highlight_cluster is not None and highlight_cluster in clusters_to_plot:
        clusters_to_plot.remove(highlight_cluster)
        clusters_to_plot.append(highlight_cluster)

    for cluster in clusters_to_plot:
        subset = data[data['cluster'] == cluster]
        is_highlight = (cluster == highlight_cluster)

        if is_highlight:
            marker_size, edge_color, line_w, z_order, alpha = 60, 'black', 0.8, 10, 0.9
        else:
            marker_size, edge_color, line_w, z_order, alpha = 40, 'grey', 0.6, 5, 0.7

        sc = axis.scatter(
            subset[x_data], subset[y_data],
            c=subset[color_col],
            cmap=cmap,
            marker=marker_map.get(cluster, 'x'),
            s=marker_size,
            vmin=vmin, vmax=vmax,
            edgecolor=edge_color,
            linewidth=line_w,
            zorder=z_order,
            alpha=alpha
        )

    def _top_loadings(component, n_top=3):
        """Format the n_top largest-magnitude loadings (ascending by magnitude)."""
        strongest = np.argsort(np.abs(component))[-n_top:]
        return [
            f"{y_label_dict_short.get(param_features[i], param_features[i])} ({component[i]:+.2f})"
            for i in strongest
        ]

    # Explained-variance / top-loadings annotation below the panel.
    var_x = pca.explained_variance_ratio_[0]  # x_data is always PC1
    var_y = pca.explained_variance_ratio_[loading_index]
    top_x_params = _top_loadings(pca.components_[0])
    top_y_params = _top_loadings(pca.components_[loading_index])
    text = (
        f"{x_data} (Var: {var_x:.1%}) | Loadings: {', '.join(top_x_params)}\n"
        f"{y_data} (Var: {var_y:.1%}) | Loadings: {', '.join(top_y_params)}"
    )
    # The panels axes[4] and axes[5] get the note placed further down.
    if axis in [axes[4],axes[5]]:
        axis.text(0.01, -0.13, text, transform=axis.transAxes, fontsize=13, va='top')
    else:
        axis.text(0.01, -0.05, text, transform=axis.transAxes, fontsize=13, va='top')

    # White outline so arrows and labels stay readable on top of the points.
    text_effect = [path_effects.Stroke(linewidth=2, foreground='white'), path_effects.Normal()]
    padding = 0.05  # gap between the arrow tip and its label

    for i, param in enumerate(param_features):
        loading_x = pca.components_[0, i] * 3  # scaled by 3 for visibility
        loading_y = pca.components_[loading_index, i] * 3

        arrow = axis.arrow(0, 0, loading_x, loading_y, color='black', alpha=1,
                head_width=0.07, zorder=11, length_includes_head=True)
        arrow.set_path_effects(text_effect)

        # Push the label away from the origin, past the arrow tip.
        if loading_x > 0:
            ha = 'left'
            text_x = loading_x + padding
        else:
            ha = 'right'
            text_x = loading_x - padding

        if loading_y > 0:
            va = 'bottom'
            text_y = loading_y + padding
        else:
            va = 'top'
            text_y = loading_y - padding

        # Same fallback as in the annotation above: use the raw parameter name
        # when no short label is defined instead of raising KeyError.
        txt = axis.text(text_x, text_y, y_label_dict_short.get(param, param), fontsize=20,
                        ha=ha, va=va, zorder=12)
        txt.set_path_effects(text_effect)

    axis.set_ylabel(ylabel)
    axis.axhline(0, color='gray', linestyle='--', linewidth=0.7)
    axis.axvline(0, color='gray', linestyle='--', linewidth=0.7)

    axis.axhline(0, color='black', lw=0.5)
    axis.axvline(0, color='black', lw=0.5)
    axis.grid(True)

    return sc, cbar_label  # the label lets the caller title the shared colourbar


# choose here 'log_likelihood', 'll_difference', or 'param_distance'
selected_color_mode = 'll_difference'

# `axes` is also read as a global inside plot_pca_axis, so it has to be the
# flattened array before the first call.
fig, axes = plt.subplots(3, 2, dpi=300, figsize=(16, 12), sharex=True, constrained_layout=True)
axes = axes.flatten()

# One panel per PC2 ... PC7, each plotted against PC1. All panels share the
# same colour limits, so the scatter returned by the last call can drive the
# common colourbar.
for panel_idx, panel_axis in enumerate(axes, start=1):
    sc, cbar_label = plot_pca_axis(
        panel_axis, df_pca, "PC1", f"PC{panel_idx + 1}", panel_idx, f"PC {panel_idx + 1}",
        marker_map=marker_map, color_mode=selected_color_mode,
        highlight_cluster=highlight_cluster,
    )

# Add colorbar using the dynamic label
cbar = fig.colorbar(sc, ax=axes.ravel().tolist(), location='right', pad=0.03, aspect=40)
cbar.set_label(cbar_label)

# Legend: one handle per cluster, reusing highlight_cluster, unique_clusters and
# marker_map built at the top of this cell. Clusters are numbered from 1 in the
# legend. A former " (Best)" suffix was tied to a condition (c == 42) that was
# never true, so it never appeared and is left out here.
legend_handles = [
    mlines.Line2D([], [], color='black' if c == highlight_cluster else 'grey',
                  marker=marker_map[c],
                  linestyle='None', markersize=9,
                  label=f'Cluster {c+1}')
    for c in unique_clusters
]

# Place the legend below the panels
fig.legend(handles=legend_handles,
           loc='lower center',
           bbox_to_anchor=(0.5, -0.07),
           ncol=4, # Display all 4 side-by-side
           frameon=False)

# figpath is set per platform in the first cell of the notebook.
plt.savefig(figpath + "FigS14_pca_loadings_colored.png", bbox_inches="tight")


In [None]:
## Estimate additional error term based on residuals for MCMC later
# Work on a copy: 'mb_res' is added below, and assigning into a slice of
# filtered_data would trigger pandas' SettingWithCopyWarning.
subset = filtered_data.loc[filtered_data['KMeans_Cluster2'] == 3].copy()
subset['mb_res'] = subset['mb'] - geod_ref['dmdtda'].item()

# Snowlines: every column starting with 'sim' is one simulated time step.
sim_columns = [col for col in subset.columns if col.startswith('sim')]
snowline_sims = subset[sim_columns]
obs = tsla_true_obs['TSL_normalized']
assert snowline_sims.shape[1] == len(obs), "Mismatch in number of time steps!"
# Absolute residuals |modelled - observed|
residuals = np.abs(snowline_sims - obs.values)
# Split into seasons (assumes the index of tsla_true_obs is datetime)
residuals.columns = tsla_true_obs.index
is_summer = residuals.columns.month.isin([6, 7, 8, 9])
residuals_summer = residuals.loc[:, is_summer]
residuals_winter = residuals.loc[:, ~is_summer]

# Albedo
# NOTE(review): subset.index is used to index into list_sim_alb — this assumes
# the index labels of filtered_data match positions in that list; confirm.
subset_alb_list = [list_sim_alb[x] for x in subset.index]
alb_obs_values = alb_obs_data.median_albedo.values
residuals_alb = [x - alb_obs_values for x in subset_alb_list]
df_albedo_res = pd.DataFrame(residuals_alb)
df_abs_albedo_res = df_albedo_res.abs()
time_index = pd.to_datetime(alb_obs_data.coords['time'].values)
df_abs_albedo_res.columns = time_index

is_summer = df_abs_albedo_res.columns.month.isin([6, 7, 8, 9])

residuals_albedo_summer = df_abs_albedo_res.loc[:, is_summer]
residuals_albedo_winter = df_abs_albedo_res.loc[:, ~is_summer]
df_abs_albedo_res

In [None]:
### Per-run mean residuals, averaged over the time-step columns of each season
alb_mean_summer_residual = residuals_albedo_summer.mean(axis="columns")
alb_mean_winter_residual = residuals_albedo_winter.mean(axis="columns")
tsla_mean_summer_residual = residuals_summer.mean(axis="columns")
tsla_mean_winter_residual = residuals_winter.mean(axis="columns")
# The mass-balance residual is taken as is: one value per run, no averaging.
mb_mean_residual = subset['mb_res'].copy()
mb_mean_residual

In [None]:
## Create histograms
fig, ax = plt.subplots(1,3, figsize=(16,9), dpi=300)
# Albedo: per-run mean absolute residual, by season
ax[0].hist(alb_mean_summer_residual, label="Summer", edgecolor="black", alpha=0.7)
ax[0].hist(alb_mean_winter_residual, label="Winter", edgecolor="black", alpha=0.7)
ax[0].set_title("Albedo")
ax[0].legend()  # the Summer/Winter labels are otherwise never shown
# Snowline altitude: per-run mean absolute residual, by season
ax[1].hist(tsla_mean_summer_residual, label="Summer", edgecolor="black", alpha=0.7)
ax[1].hist(tsla_mean_winter_residual, label="Winter", edgecolor="black", alpha=0.7)
ax[1].set_title("TSLA")
ax[1].legend()
# Mass balance: one residual per run
ax[2].hist(mb_mean_residual, edgecolor="black", alpha=0.7)
ax[2].set_title("Mass balance")

# Spread of the per-run residuals, reported as prior scales.
print("Prior scales for systematic uncertainty term in MCMC")
print("Prior sigma for mass balance: ", mb_mean_residual.std())
print("--------------------")
print("Prior sigma for summer TSLA: ", tsla_mean_summer_residual.std())
print("Prior sigma for winter TSLA: ", tsla_mean_winter_residual.std())
print("--------------------")
print("Prior sigma for summer ALB: ", alb_mean_summer_residual.std())
print("Prior sigma for winter ALB: ", alb_mean_winter_residual.std())

In [None]:
## Based on CRI and IQR ratio, we set constrained priors (IQR range) for albedo aging, depth, precipitation factor while we keep the other parameters wide.
# NOTE(review): this cell used to contain the undefined name `blabla`, which
# raised NameError and stopped "Restart & Run All" here. It is removed so the
# notebook runs through; if a deliberate stop is wanted, raise an explicit
# exception with a message instead.

In [None]:
def fit_truncnorm_with_iqr(df, iqr_cols=None):
    """Build one truncated-normal distribution per column from sample statistics.

    For every column of ``df`` the location is the sample mean, the scale is
    the sample standard deviation (``ddof=1``) and the truncation bounds are
    the 2.5 % and 97.5 % sample quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        One numeric column per parameter.
    iqr_cols : iterable of str, optional
        Accepted for backward compatibility only. It does not influence the
        result: the same 2.5-97.5 % bounds are applied to every column, exactly
        as in the previous implementation where both branches were identical.
        NOTE(review): if narrower (e.g. 25-75 %) bounds are intended for these
        columns, that has to be re-introduced deliberately.

    Returns
    -------
    trunc_params : dict
        Column name -> frozen ``scipy.stats.truncnorm`` distribution.
    distr_dict : dict
        Column name -> tuple ``(mu, sigma, lower, upper)``.

    Raises
    ------
    ValueError
        If a column has a zero or undefined standard deviation, because the
        bounds cannot be standardised in that case.
    """
    lower_bounds = df.quantile(0.025)
    upper_bounds = df.quantile(0.975)

    trunc_params = {}
    distr_dict = {}
    for col in df.columns:
        mu = df[col].mean()
        sigma = df[col].std(ddof=1)
        if not sigma > 0:  # also catches NaN (e.g. a single-row frame)
            raise ValueError(
                f"Column {col!r} has standard deviation {sigma}; "
                "cannot build a truncated normal distribution."
            )
        lower, upper = lower_bounds[col], upper_bounds[col]

        # truncnorm takes its bounds in units of `scale`, relative to `loc`.
        a, b = (lower - mu) / sigma, (upper - mu) / sigma

        distr_dict[col] = (mu, sigma, lower, upper)
        trunc_params[col] = truncnorm(a=a, b=b, loc=mu, scale=sigma)

    return trunc_params, distr_dict

# Parameter columns of the selected runs from which the priors are built.
mcmc_param_names = [
    "rrr_factor", "alb_ice", "alb_snow", "alb_firn",
    "albedo_aging", "albedo_depth", "roughness_ice",
]
mcmc_df = subset[mcmc_param_names]

# Columns flagged for IQR-based bounds (passed on to the fitting function).
iqr_cols = ["rrr_factor", "albedo_aging", "albedo_depth"]

trunc_params, trunc_dists = fit_truncnorm_with_iqr(mcmc_df, iqr_cols)

trunc_dists

In [None]:
def plot_truncnorm(mcmc_df, trunc_dists, sigma_overrides=None):
    """Plot each column's histogram together with its truncated-normal density.

    Side effect: ``trunc_dists`` is updated in place. For every plotted column
    the stored tuple is rewritten as ``(mu, sigma, lower, upper)`` with the
    sigma that was actually used, i.e. including any override. Code that reads
    ``trunc_dists`` afterwards sees the overridden values.

    Parameters
    ----------
    mcmc_df : pandas.DataFrame
        One column per parameter; plotted as a density histogram.
    trunc_dists : dict
        Column name -> ``(mu, sigma, lower, upper)``.
    sigma_overrides : dict, optional
        Column name -> sigma replacing the stored one. Defaults to 0.1 for
        ``alb_ice``, ``alb_snow`` and ``alb_firn`` and 9 for ``roughness_ice``.
    """
    if sigma_overrides is None:
        sigma_overrides = {
            "alb_ice": 0.1,
            "alb_snow": 0.1,
            "alb_firn": 0.1,
            "roughness_ice": 9,
        }

    n_params = len(mcmc_df.columns)
    # squeeze=False always returns a 2-D array, so indexing below also works
    # when the frame has a single column.
    fig, axes = plt.subplots(n_params, 1, figsize=(6, 3 * n_params), squeeze=False)
    axes = axes.ravel()

    for i, col in enumerate(mcmc_df.columns):

        mu, sigma, lower, upper = trunc_dists[col]
        sigma = sigma_overrides.get(col, sigma)

        # Convert bounds to standard normal space
        a, b = (lower - mu) / sigma, (upper - mu) / sigma

        # Evaluate the density on a grid between the truncation bounds
        x = np.linspace(lower, upper, 100)
        func = truncnorm(a, b, loc=mu, scale=sigma)
        trunc_dists[col] = (mu, sigma, lower, upper)  # persist the sigma used
        pdf = func.pdf(x)

        # Plot histogram of actual data
        sns.histplot(mcmc_df[col], kde=False, bins=30, stat="density", alpha=0.5, ax=axes[i], color="gray", label="Observed Data")

        # Plot fitted truncated normal distribution
        axes[i].plot(x, pdf, 'r-', lw=2, label=f'Trunc. Normal Fit\nμ={mu:.2f}, σ={sigma:.2f}')

        axes[i].set_title(f"Truncated Normal Fit: {col}")
        axes[i].legend()

    plt.tight_layout()
    plt.show()

# Run the function (this also writes the overridden sigmas into trunc_dists)
plot_truncnorm(mcmc_df, trunc_dists)

In [None]:
# Print the stored (mu, sigma, lower, upper) tuple of every parameter.
for key, (prior_mu, prior_sigma, prior_lower, prior_upper) in trunc_dists.items():
    print("Variable: ", key)
    print("Mu: ", prior_mu)
    print("Sigma: ", prior_sigma)
    print("Lower: ", prior_lower)
    print("Upper: ", prior_upper)