In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import xarray as xr
import seaborn as sns
import sys
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# This script requires the output of the first 500 LHS samples (wide), which were generated with the respective spotpy script.

# Select the machine-specific data locations.
# A plain substring test ('win' in sys.platform) also matches "darwin" (macOS) and
# would wrongly pick the Windows drive paths there, so test the prefix instead.
# "cygwin" is kept on the Windows branch because the substring test matched it too.
if sys.platform.startswith(("win", "cygwin")):
    path = "E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/"
    filepath = "E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/"
    figpath = "E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/Figures/"
    tsla_csv = "E:/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/Climate/snowlines/HEF-snowlines-1999-2010_manual_filtered.csv"
else:
    #path = "/mnt/C4AEBBABAEBB9500/OneDrive - uibk.ac.at/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/"
    # Temporary solution due to issues with OneDrive!
    path = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/COSIPY/MiscTests/LHS/"
    figpath = "/home/niki/Dokumente/NewOneDrive/"
    filepath = "/home/niki/Dokumente/NewOneDrive/"
    tsla_csv = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Climate/snowlines/HEF-snowlines-1999-2010_manual_filtered.csv"

# Snowline observations; read once, after the platform-specific location is known.
tsla = pd.read_csv(tsla_csv)

In [None]:
# Table of LHS runs; the first CSV column becomes the index.
lhs_csv = path + "LHS-wide_1D20m_1999_2010_fullprior.csv"
df = pd.read_csv(lhs_csv, index_col=0)
df

In [None]:
## Prepare TSLA data
time_start_dt = pd.to_datetime("2000-01-01") #config starts with spinup - need to add 1year
time_end_dt = pd.to_datetime("2009-12-31")

tsla_true_obs = tsla.copy()
tsla_true_obs['LS_DATE'] = pd.to_datetime(tsla_true_obs['LS_DATE'])
print("Start date:", time_start_dt)
print("End date:", time_end_dt)

# Keep observations with time_start_dt < LS_DATE <= time_end_dt.
in_period = (tsla_true_obs['LS_DATE'] > time_start_dt) & (tsla_true_obs['LS_DATE'] <= time_end_dt)
# Take an explicit copy of the selection and avoid inplace=True, so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
tsla_true_obs = tsla_true_obs.loc[in_period].copy().set_index('LS_DATE')
if tsla_true_obs.empty:
    # Without this guard the .iloc[0] lookups below fail with a bare IndexError.
    raise ValueError(f"No snowline observations between {time_start_dt} and {time_end_dt}.")

# Normalize the standard deviation by each row's glacier elevation range.
elev_range = tsla_true_obs['glacier_DEM_max'] - tsla_true_obs['glacier_DEM_min']
tsla_true_obs['SC_stdev'] = tsla_true_obs['SC_stdev'] / elev_range

# 20 (model resolution, per the note below) expressed in the same normalized units,
# using the elevation range of the first row.
thres_unc = (20) / (tsla_true_obs['glacier_DEM_max'].iloc[0] - tsla_true_obs['glacier_DEM_min'].iloc[0])
print(thres_unc)

## Set observational uncertainty to at least the model resolution (20m); keep larger values as they are
sc_norm = np.where(tsla_true_obs['SC_stdev'] < thres_unc, thres_unc, tsla_true_obs['SC_stdev'])
tsla_true_obs['SC_stdev'] = sc_norm


In [None]:
## Load MB data
rgi_id = "RGI60-11.00897"
# 'win' in sys.platform would also match "darwin" (macOS); test the prefix instead.
if sys.platform.startswith(("win", "cygwin")):
    geod_csv = "E:/OneDrive/PhD/PhD/Data/Hugonnet_21_MB/dh_11_rgi60_pergla_rates.csv"
else:
    geod_csv = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hugonnet_21_MB/dh_11_rgi60_pergla_rates.csv"
geod_ref = pd.read_csv(geod_csv)

# Narrow down to this glacier and the 2000-2010 period, keeping rate and its error.
geod_ref = geod_ref.loc[geod_ref['rgiid'] == rgi_id]
geod_ref = geod_ref.loc[geod_ref['period'] == "2000-01-01_2010-01-01"]
geod_ref = geod_ref[['dmdtda', 'err_dmdtda']]
if len(geod_ref) != 1:
    # The values are later broadcast against every run as a single reference,
    # so anything other than exactly one row would fail or misalign downstream.
    raise ValueError(f"Expected exactly one geodetic reference row for {rgi_id}, found {len(geod_ref)}.")
print(geod_ref)

In [None]:
# Positional columns 13..70 (58 columns), transposed so that each run is a column.
# NOTE(review): presumably these are the simulated snowline values per observation date - confirm.
modtsls = df.iloc[:, 13:71].T
print(modtsls)

# Positional column 12, kept two-dimensional and transposed to a single row.
# NOTE(review): presumably the simulated mass balance per run - confirm.
mb_mod = df.iloc[:, [12]].T
print(mb_mod)

def loglike_tsla_func(sim_tsla, eval_tsla, sigma_tsla):
    """Average Gaussian log-likelihood of simulated values against observations.

    Parameters:
    - sim_tsla: simulated values
    - eval_tsla: observed values (its length is used for the average)
    - sigma_tsla: observation standard deviations

    Returns:
    - the summed pointwise Gaussian log-density divided by len(eval_tsla)
    """
    variance = sigma_tsla**2
    misfit_sq = (eval_tsla - sim_tsla)**2
    pointwise = np.log(2 * np.pi * variance) + (misfit_sq / variance)
    total_loglike = -0.5 * np.sum(pointwise)
    return total_loglike / len(eval_tsla)

def loglike_mb_func(sim_mb, eval_mb, sigma_mb):
    """Elementwise Gaussian log-likelihood of simulated values against a reference.

    Parameters:
    - sim_mb: simulated values
    - eval_mb: reference value(s)
    - sigma_mb: standard deviation of the reference

    Returns:
    - the Gaussian log-density for each element (no summation or averaging)
    """
    variance = sigma_mb**2
    misfit_sq = (eval_mb - sim_mb)**2
    return -0.5 * (np.log(2 * np.pi * variance) + (misfit_sq / variance))


In [None]:
# Log-likelihood of every simulated mass balance against the geodetic reference.
mb_obs = geod_ref['dmdtda'].values
mb_obs_err = geod_ref['err_dmdtda'].values
mb_logp = mb_mod.T.apply(loglike_mb_func, axis=0, eval_mb=mb_obs, sigma_mb=mb_obs_err)
print(mb_logp)

# Average log-likelihood per column of modtsls against the snowline observations.
tsl_obs = tsla_true_obs['TSL_normalized'].values
tsl_obs_err = tsla_true_obs['SC_stdev'].values
tsl_logp = modtsls.apply(loglike_tsla_func, axis=0, eval_tsla=tsl_obs, sigma_tsla=tsl_obs_err)
print(tsl_logp)

In [None]:
## Load ALBEDO observations
# 'win' in sys.platform would also match "darwin" (macOS); test the prefix instead.
if sys.platform.startswith(("win", "cygwin")):
    alb_obs_path = r"E:\OneDrive\PhD\PhD\Data\Hintereisferner\Climate\HEF_processed_HRZ-20CC-filter_albedos.nc"
else:
    alb_obs_path = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Climate/HEF_processed_HRZ-20CC-filter_albedos.nc"
alb_obs_data = xr.open_dataset(alb_obs_path)
# Has NaNs where there is no glacier -> can build a glacier-wide mean albedo for an additional logp
alb_obs_data = alb_obs_data.sortby("time")
alb_obs_data

In [None]:
# Observed albedo and its uncertainty, taken from the observation dataset.
sigma_alb = alb_obs_data.sigma_albedo
eval_alb = alb_obs_data.median_albedo

# 'win' in sys.platform would also match "darwin" (macOS); test the prefix instead.
if sys.platform.startswith(("win", "cygwin")):
    alb_sim_dir = "E:/OneDrive/PhD/PhD/Data/Hintereisferner/Output/albedo_files/LHS-wide/"
else:
    alb_sim_dir = "/mnt/C4AEBBABAEBB9500/OneDrive/PhD/PhD/Data/Hintereisferner/Output/albedo_files/LHS-wide/"

n_runs = len(df)  # reported in the progress message instead of a hard-coded total
list_logp_alb = []
list_sim_alb = []
for i,r in df.iterrows():
    if i % 300 == 0:
        print(f"Processing file {i}/{n_runs}")
    # The parameter values, rounded to 4 decimals, make up the output file name.
    rrr_factor = round(r['rrr_factor'],4)
    alb_ice = round(r['alb_ice'],4)
    alb_snow = round(r['alb_snow'],4)
    alb_firn = round(r['alb_firn'],4)
    alb_aging = round(r['albedo_aging'],4)
    alb_depth = round(r['albedo_depth'],4)
    roughness_ice = round(r['roughness_ice'], 4)
    filename = f"HEF_COSMO_1D20m_1999_2010_HORAYZON_IntpPRES_LHS-wide_19990101-20091231_RRR-{rrr_factor}_{alb_snow}_{alb_ice}_{alb_firn}_{alb_aging}_{alb_depth}_0.24_{roughness_ice}_4.0_0.0026_num2.nc"

    # The context manager closes the NetCDF handle after each run; the original
    # left one open file per simulation. .load() pulls the values into memory
    # before the file is closed.
    with xr.open_dataset(alb_sim_dir + filename) as sim_ds:
        sim_alb = sim_ds["ALBEDO_weighted"].resample(time="1D").mean()
        # Keep only the observation dates, ordered by time
        sim_alb = sim_alb.sel(time=alb_obs_data.time)
        sim_alb = sim_alb.sortby("time").load()

    ## compute logp albedo for file
    list_sim_alb.append(sim_alb.data)
    # NOTE(review): the sum runs over an xarray object, which presumably skips NaNs,
    # while the average below divides by the full length - confirm this is intended.
    logp_alb_all = -0.5 * np.sum(np.log(2 * np.pi * sigma_alb**2) + ((sim_alb.data-eval_alb)**2 / sigma_alb**2))
    avg_logp_alb = logp_alb_all / len(sim_alb)
    list_logp_alb.append(avg_logp_alb.item())

In [None]:
# Labels "sim1" ... "sim79"
sim_list = ["sim" + str(num) for num in range(1, 80)]
print(sim_list)

# Attach the three per-run log-likelihoods to the run table.
for logp_name, logp_values in (("mb_logp", mb_logp), ("tsla_logp", tsl_logp), ("alb_logp", list_logp_alb)):
    df[logp_name] = logp_values

# Joint log-likelihood: plain sum of the three components.
df['joint_like'] = df['mb_logp'].add(df['tsla_logp']).add(df['alb_logp'])

df

In [None]:
## Check ranges of logp to ensure equal weight..
for heading, logp_col in (
    ("MB Logp Range:", "mb_logp"),
    ("TSLA Logp Range:", "tsla_logp"),
    ("Alb Logp Range:", "alb_logp"),
):
    print(heading)
    print(df[logp_col].min(), df[logp_col].max())

In [35]:
# Switch for the optional rescaling below; off by default.
scale_logps = False

if scale_logps:
    # Min-max normalization of each log-likelihood column to [0, 1]
    for logp_col in ("alb_logp", "mb_logp", "tsla_logp"):
        col_min = df[logp_col].min()
        col_max = df[logp_col].max()
        df[logp_col] = (df[logp_col] - col_min) / (col_max - col_min)

    for heading, logp_col in (
        ("New MB Logp Range:", "mb_logp"),
        ("New TSLA Logp Range:", "tsla_logp"),
        ("New Alb Logp Range:", "alb_logp"),
    ):
        print(heading)
        print(df[logp_col].min(), df[logp_col].max())

    # Recompute the joint value from the rescaled components.
    df['joint_like'] = df['mb_logp'].add(df['tsla_logp']).add(df['alb_logp'])

In [None]:
param_features = ["rrr_factor","alb_ice","alb_snow","alb_firn","albedo_aging","albedo_depth","roughness_ice"]
logp_columns = ['alb_logp', 'tsla_logp', 'mb_logp', 'joint_like']

## Repeat correlation for only the runs at or above the 90th percentile of joint_like
best_ten = df[param_features + logp_columns]
joint_cutoff = np.nanpercentile(best_ten['joint_like'], 90)
best_ten = best_ten.loc[best_ten['joint_like'] >= joint_cutoff]

plt.rcParams.update({'font.size': 14})
# Correlation matrix over parameters and log-likelihoods (best_ten already holds exactly these columns)
correlation_matrix = best_ten.corr()

# Plot the correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Matrix Between Parameters and Log-Likelihood")
plt.show()


In [40]:
def compute_ll_drop_from_model_vs_obs(model_outputs, observations, sigmas, delta_sigma=2):
    """
    Compute the drop in average log-likelihood when the best-fitting simulation
    is shifted by +delta_sigma standard deviations.

    Points whose log-likelihood is NaN (for example a missing observation or
    model value) are left out of the averages. Previously a single NaN made
    every run average NaN, so run 0 was silently reported as the best run.

    Parameters:
    - model_outputs: array (n_runs, n_points); a 1-D array is treated as one run
    - observations: array (n_points,)
    - sigmas: scalar, array (n_points,) or array (n_runs, n_points)
    - delta_sigma: float, size of the shift in units of sigma (applied in the
      positive direction only)

    Returns:
    - best_run_index: index of the run with the highest average log-likelihood
    - best_avg_ll: average log-likelihood of that run
    - drop: best_avg_ll minus the average log-likelihood of the shifted run

    Raises:
    - ValueError: if no run has any finite log-likelihood value
    """
    model_outputs = np.atleast_2d(np.asarray(model_outputs, dtype=float))
    observations = np.asarray(observations, dtype=float)
    # Read-only broadcast view instead of a tiled copy; also accepts scalar sigmas.
    sigmas = np.broadcast_to(np.asarray(sigmas, dtype=float), model_outputs.shape)

    def _nan_mean_last_axis(values):
        # Mean over the last axis ignoring NaNs; NaN where nothing is left to average.
        usable = ~np.isnan(values)
        counts = usable.sum(axis=-1)
        totals = np.where(usable, values, 0.0).sum(axis=-1)
        with np.errstate(invalid="ignore", divide="ignore"):
            return np.where(counts > 0, totals / counts, np.nan)

    # Log-likelihood of each model value given observation and sigma
    loglikelihoods = norm.logpdf(model_outputs, loc=observations, scale=sigmas)
    avg_ll_per_run = _nan_mean_last_axis(loglikelihoods)

    if np.all(np.isnan(avg_ll_per_run)):
        raise ValueError("No run has a finite log-likelihood; check inputs for NaNs or non-positive sigmas.")

    # Best run (runs without any usable point are skipped)
    best_run_index = np.nanargmax(avg_ll_per_run)
    best_avg_ll = avg_ll_per_run[best_run_index]

    # Shift the best run by delta_sigma standard deviations and re-evaluate it
    best_sigmas = sigmas[best_run_index]
    shifted_model = model_outputs[best_run_index] + delta_sigma * best_sigmas
    shifted_loglikelihoods = norm.logpdf(shifted_model, loc=observations, scale=best_sigmas)
    shifted_avg_ll = np.float64(_nan_mean_last_axis(shifted_loglikelihoods))

    drop = best_avg_ll - shifted_avg_ll

    return best_run_index, best_avg_ll, drop


In [None]:
# Best run and log-likelihood drop for the snowline data, using a 3-sigma shift.
# The name drop_2sigma is kept as-is even though delta_sigma=3 is passed here.
tsla_ll_stats = compute_ll_drop_from_model_vs_obs(
    modtsls.T,
    tsla_true_obs['TSL_normalized'],
    tsla_true_obs['SC_stdev'],
    delta_sigma=3,
)
best_idx, best_ll, drop_2sigma = tsla_ll_stats

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Log-likelihood drop at +3σ: {drop_2sigma:.4f}",
):
    print(report_line)


In [None]:
# Best run and log-likelihood drop for the mass-balance data, using a 3-sigma shift.
# The name drop_2sigma is kept as-is even though delta_sigma=3 is passed here.
mb_ll_stats = compute_ll_drop_from_model_vs_obs(
    mb_mod.T,
    geod_ref['dmdtda'],
    geod_ref['err_dmdtda'],
    delta_sigma=3,
)
best_idx, best_ll, drop_2sigma = mb_ll_stats

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Log-likelihood drop at +3σ: {drop_2sigma:.4f}",
):
    print(report_line)

In [None]:
# Best run and log-likelihood drop for the albedo data, using a 3-sigma shift.
# The name drop_2sigma is kept as-is even though delta_sigma=3 is passed here.
alb_ll_stats = compute_ll_drop_from_model_vs_obs(
    np.asarray(list_sim_alb),
    eval_alb.data,
    sigma_alb,
    delta_sigma=3,
)
best_idx, best_ll, drop_2sigma = alb_ll_stats

for report_line in (
    f"Best run index: {best_idx}",
    f"Best avg log-likelihood: {best_ll:.4f}",
    f"Log-likelihood drop at +3σ: {drop_2sigma:.4f}",
):
    print(report_line)

In [None]:
## Create thresholds based on this experiment
# NOTE(review): the numbers are hard-coded; presumably copied from the printed
# "best avg log-likelihood" and "drop" values of the preceding cells - confirm,
# in particular the sign of the second TSLA term.
mb_terms = (0.4242, 4.4493)
alb_terms = (0.9257, 5.7602)
tsla_terms = (-28.0132, -1.9042)

minimum_mb = mb_terms[0] - mb_terms[1]
minimum_alb = alb_terms[0] - alb_terms[1]
minimum_tsla = tsla_terms[0] + tsla_terms[1]

# Joint cutoff: sum of the three per-dataset minima
minimum_thres = minimum_mb + minimum_alb + minimum_tsla
print("Conservative cutoff at ", minimum_thres)

In [None]:
# Keep the runs whose joint log-likelihood reaches the cutoff.
passes_cutoff = df['joint_like'] >= minimum_thres
final_subset = df.loc[passes_cutoff]
print(len(final_subset))
final_subset['joint_like'].hist()

In [None]:
# One panel per column, filled row by row: filtered subset drawn over the full sample.
hist_columns = [
    'rrr_factor', 'alb_ice',
    'alb_snow', 'alb_firn',
    'albedo_aging', 'albedo_depth',
    'roughness_ice', 'joint_like',
]

fig, ax = plt.subplots(4,2, figsize=(16,9), dpi=150)
for panel, column in zip(ax.flat, hist_columns):
    panel.hist(final_subset[column], edgecolor="black", alpha=0.7, label="Filter")
    # zorder=-1 keeps the full sample behind the filtered subset
    panel.hist(df[column], edgecolor="black", alpha=0.7, label="Full", zorder=-1)
    # Title each panel; without it the panels cannot be told apart.
    panel.set_title(column)
# Leave room for the panel titles
fig.tight_layout()
ax[0,0].legend()

In [None]:
# Compare the NaN-ignoring min/max of each parameter in the filtered subset with the full sample.
for param in param_features:
    subset_lo = np.nanmin(final_subset[param])
    subset_hi = np.nanmax(final_subset[param])
    sampled_lo = np.nanmin(df[param])
    sampled_hi = np.nanmax(df[param])
    print(f"Param {param} bounds are: {subset_lo} - {subset_hi}.")
    print(f"Min/Max Sampled bounds are: {sampled_lo} - {sampled_hi}.")
    print("----------")