## Setting Up:

In [None]:
import sys, os
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM
import traceback

import pickle
import pandas as pd
import warnings
from tqdm.notebook import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
import xarray as xr
import massbalancemachine as mbm
from collections import defaultdict
import logging
import torch.nn as nn
from skorch.helper import SliceDataset
from datetime import datetime
from skorch.callbacks import EarlyStopping, LRScheduler, Checkpoint
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset
import itertools
import random
import csv

from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.nn_helpers import *
from scripts.xgb_helpers import *
from scripts.NN_networks import *
from scripts.geodata import *

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magics: load the autoreload extension and switch it to mode 2,
# so that edited modules are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2

# Turn off PyTorch's "always warn" mode.
# NOTE(review): `torch` is not imported by name in the visible import list
# (only `torch.nn as nn`); presumably it is re-exported by one of the
# `from scripts.* import *` lines - confirm.
torch.set_warn_always(False)

# Configuration object used by every later cell (data path, seed, features).
# The bare `cfg` expression displays it as the cell output.
cfg = mbm.SwitzerlandConfig()
cfg

In [None]:
# Plot styles: matplotlib style sheet plus a few named colours.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)
colors = get_cmap_hex(cm.batlow, 10)
color_dark_blue, color_pink = colors[0], '#c51b7d'

# RGI ids: read the glacier id table, strip whitespace from the column
# headers, then sort by and index on the glacier short name.
rgi_df = (
    pd.read_csv(cfg.dataPath + path_glacier_ids, sep=',')
    .rename(columns=lambda header: header.strip())
    .sort_values(by='short_name')
    .set_index('short_name')
)

# Climate variables of interest
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical variables of interest
vois_topographical = [
    "aspect_sgi", "slope_sgi", "hugonnet_dhdt", "consensus_ice_thickness",
    "millan_v"
]

In [None]:
# Seed everything with the configured seed for reproducibility.
seed_all(cfg.seed)

# Report GPU availability; when a GPU is present, release CUDA memory
# through the project helper.
if not torch.cuda.is_available():
    print("CUDA is NOT available")
else:
    print("CUDA is available")
    free_up_cuda()

    # Optional, currently disabled: limit CPU usage of the random search
    # torch.set_num_threads(2)  # or 1
    # os.environ["OMP_NUM_THREADS"] = "1"
    # os.environ["MKL_NUM_THREADS"] = "1"


## Read GL data:

In [None]:
# Point mass balance measurements (winter and annual) of all glaciers.
data_glamos = pd.read_csv(cfg.dataPath + path_PMB_GLAMOS_csv +
                          'CH_wgms_dataset_all.csv')
# Drop taelliboden and plainemorte if in there (the filter is a no-op when
# neither glacier is present).
data_glamos = data_glamos[~data_glamos['GLACIER'].isin(
    ['taelliboden', 'plainemorte'])]

# Glaciers with data of potential clear sky radiation.
# Format to same names as stakes: the glacier name is the part between
# 'xr_direct_' and '.zarr'. Directory entries that do not follow this pattern
# (hidden files, temporary files, ...) are skipped; calling .group() on the
# failed match would otherwise raise an AttributeError.
pcsr_name_matches = (re.search(r'xr_direct_(.*?)\.zarr', f)
                     for f in os.listdir(cfg.dataPath + path_pcsr + 'zarr/'))
glDirect = np.sort(
    [match.group(1) for match in pcsr_name_matches if match is not None])

# Difference between the radiation glacier list and the stake glacier list
# (computed by the project helper Diff).
restgl = np.sort(Diff(list(glDirect), list(data_glamos.GLACIER.unique())))

print('Glaciers with potential clear sky radiation data:\n', glDirect)
print('Number of glaciers:', len(glDirect))
print('Glaciers without potential clear sky radiation data:\n', restgl)

# Filter out glaciers without data:
data_glamos = data_glamos[data_glamos.GLACIER.isin(glDirect)]

print('-------------------')
print('Number of glaciers:', len(data_glamos['GLACIER'].unique()))
print('Number of winter and annual samples:', len(data_glamos))
print('Number of annual samples:',
      len(data_glamos[data_glamos.PERIOD == 'annual']))
print('Number of winter samples:',
      len(data_glamos[data_glamos.PERIOD == 'winter']))

## Input data:
### Input dataset:

In [None]:
# Root logger: INFO level, messages prefixed with a timestamp.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Input and output locations handed to process_or_load_data.
paths = dict(
    csv_path=cfg.dataPath + path_PMB_GLAMOS_csv,
    era5_climate_data=cfg.dataPath + path_ERA5_raw +
    'era5_monthly_averaged_data.nc',
    geopotential_data=cfg.dataPath + path_ERA5_raw +
    'era5_geopotential_pressure.nc',
    radiation_save_path=cfg.dataPath + path_pcsr + 'zarr/',
)

# Transform data to monthly format (run or load data).
# NOTE(review): RUN is forwarded as run_flag; presumably False loads the
# previously written output file instead of recomputing it - confirm in
# process_or_load_data.
RUN = False
dataloader_gl = process_or_load_data(
    cfg=cfg,
    paths=paths,
    run_flag=RUN,
    data_glamos=data_glamos,
    vois_climate=vois_climate,
    vois_topographical=vois_topographical,
    output_file='CH_wgms_dataset_monthly_NN.csv',
)
data_monthly = dataloader_gl.data

## Blocking on glaciers:

In [None]:
# Glaciers present in the monthly dataset versus the requested test glaciers
existing_glaciers = set(dataloader_gl.data.GLACIER.unique())
missing_glaciers = [
    name for name in TEST_GLACIERS if name not in existing_glaciers
]

if len(missing_glaciers) > 0:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Every glacier of the dataset that is not a test glacier is used for training
train_glaciers = [
    name for name in existing_glaciers if name not in TEST_GLACIERS
]

# Monthly rows of the test glaciers
data_test = dataloader_gl.data.loc[dataloader_gl.data.GLACIER.isin(
    TEST_GLACIERS)]
print('Size of monthly test data:', data_test.shape[0])

# Monthly rows of the training glaciers
data_train = dataloader_gl.data.loc[dataloader_gl.data.GLACIER.isin(
    train_glaciers)]
print('Size of monthly train data:', data_train.shape[0])

# Size of the test set relative to the training set (guarded against an
# empty training set)
if len(data_train) != 0:
    test_perc = (len(data_test) / len(data_train)) * 100
    print('Percentage of test size: {:.2f}%'.format(test_perc))
else:
    print("Warning: No training data available!")

# Number of annual versus winter measurements in the training rows
print('-------------\nTrain:')
print('Number of monthly winter and annual samples:', data_train.shape[0])
print('Number of monthly annual samples:',
      data_train[data_train.PERIOD == 'annual'].shape[0])
print('Number of monthly winter samples:',
      data_train[data_train.PERIOD == 'winter'].shape[0])

# Same breakdown for the test rows (the two subsets are kept for later use)
data_test_annual = data_test.loc[data_test.PERIOD == 'annual']
data_test_winter = data_test.loc[data_test.PERIOD == 'winter']

print('Test:')
print('Number of monthly winter and annual samples:', data_test.shape[0])
print('Number of monthly annual samples:', data_test_annual.shape[0])
print('Number of monthly winter samples:', data_test_winter.shape[0])

# And for the whole monthly dataset
print('Total:')
print('Number of monthly rows:', dataloader_gl.data.shape[0])
print('Number of annual rows:',
      dataloader_gl.data[dataloader_gl.data.PERIOD == 'annual'].shape[0])
print('Number of winter rows:',
      dataloader_gl.data[dataloader_gl.data.PERIOD == 'winter'].shape[0])

# Row counts of the original (non-monthly) measurement table
print('-------------\nIn annual format:')
print('Number of annual train rows:',
      data_glamos[data_glamos.GLACIER.isin(train_glaciers)].shape[0])
print('Number of annual test rows:',
      data_glamos[data_glamos.GLACIER.isin(TEST_GLACIERS)].shape[0])


In [None]:
# Split on the GLACIER column, holding out TEST_GLACIERS as the test set.
splits, test_set, train_set = get_CV_splits(
    dataloader_gl,
    test_split_on='GLACIER',
    test_splits=TEST_GLACIERS,
    random_state=cfg.seed,
)

# Summary of the resulting test and train sets
print('Test glaciers: ({}) {}'.format(
    len(test_set['splits_vals']),
    test_set['splits_vals'],
))
# Test size expressed as a percentage of the training size
test_perc = (len(test_set['df_X']) / len(train_set['df_X'])) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', len(test_set['df_X']))
print('Train glaciers: ({}) {}'.format(
    len(train_set['splits_vals']),
    train_set['splits_vals'],
))
print('Size of train set:', len(train_set['df_X']))

In [None]:
# Validation and train split:
# Work on a copy so that adding the target column below does not modify the
# DataFrame stored in train_set['df_X'] in place.
data_train = train_set['df_X'].copy()
data_train['y'] = train_set['y']
dataloader = mbm.dataloader.DataLoader(cfg, data=data_train)

# 80 % / 20 % split of the training data into train and validation parts.
train_itr, val_itr = dataloader.set_train_test_split(test_size=0.2)

# Get all indices of the training and validation datasets at once from the
# iterators. Once called, the iterators are empty.
train_indices, val_indices = list(train_itr), list(val_itr)

# Training rows and targets
df_X_train = data_train.iloc[train_indices]
y_train = df_X_train['POINT_BALANCE'].values

# Validation rows and targets
df_X_val = data_train.iloc[val_indices]
y_val = df_X_val['POINT_BALANCE'].values

## Neural Network:

In [None]:
# Use CPU for training, apparently faster
param_init = dict(device='cpu')

# Early stopping on the validation loss with a patience of 15 epochs; the
# threshold makes very small improvements not count as progress.
early_stop = EarlyStopping(monitor='valid_loss', threshold=1e-4, patience=15)

# Halve the learning rate when the validation loss has not improved by at
# least 1 % (relative threshold) for 5 epochs.
lr_scheduler_cb = LRScheduler(
    policy=ReduceLROnPlateau,
    monitor='valid_loss',
    mode='min',
    factor=0.5,
    patience=5,
    threshold=0.01,
    threshold_mode='rel',
    verbose=True,
)

# Placeholders returned by my_train_split; initialized hereafter
dataset, dataset_val = None, None


def my_train_split(ds, y=None, **fit_params):
    """Return the module-level ``dataset`` and ``dataset_val`` as a pair.

    All arguments are ignored; the function only hands back whatever the two
    globals hold at call time (both are ``None`` until they are assigned).
    """
    train_and_validation = (dataset, dataset_val)
    return train_and_validation


# Default hyper-parameters of the network and its optimizer
params = dict(
    lr=0.001,
    batch_size=128,
    optimizer=torch.optim.Adam,
    optimizer__weight_decay=1e-05,
    module__hidden_layers=[128, 128, 64, 32],
    module__dropout=0.2,
    module__use_batchnorm=True,
)

# Input features: elevation difference, pcsr and the topographical variables,
# followed by the climate variables
features_topo = ['ELEVATION_DIFFERENCE', 'pcsr', *vois_topographical]
feature_columns = [*features_topo, *vois_climate]

# Register the features in the configuration before reading the non-feature
# field list from it
cfg.setFeatures(feature_columns)

# Restrict train / validation / test frames to features + non-feature fields
all_columns = feature_columns + cfg.fieldsNotFeatures
df_X_train_subset = df_X_train.loc[:, all_columns]
df_X_val_subset = df_X_val.loc[:, all_columns]
df_X_test_subset = test_set['df_X'].loc[:, all_columns]

print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of validation dataset:', df_X_val_subset.shape)
print('Shape of testing dataset:', df_X_test_subset.shape)
print('Running with features:', feature_columns)

# Input dimension of the network
nInp = len(feature_columns)

### Permutation importance:

In [None]:
# Saved model weights (replace with the actual date if needed)
model_filename = "nn_model_2025-08-06.pt"

# Hyper-parameters the model was trained with, stored as a pickle
# (replace with the actual date if needed)
params_filename = "nn_params_2025-08-06.pkl"
with open(f"models/{params_filename}", "rb") as f:
    custom_params = pickle.load(f)

# From here on the stored hyper-parameters replace the defaults
params = custom_params

# Arguments used to rebuild the regressor around the stored weights
args = dict(
    module=FlexibleNetwork,
    nbFeatures=nInp,
    module__input_dim=nInp,
    module__dropout=params['module__dropout'],
    module__hidden_layers=params['module__hidden_layers'],
    module__use_batchnorm=params['module__use_batchnorm'],
    train_split=my_train_split,
    batch_size=params['batch_size'],
    verbose=1,
    iterator_train__shuffle=True,
    lr=params['lr'],
    max_epochs=300,
    optimizer=params['optimizer'],
    optimizer__weight_decay=params['optimizer__weight_decay'],
    callbacks=[
        ('early_stop', early_stop),
        ('lr_scheduler', lr_scheduler_cb),
    ],
)

# Load the model and make sure it lives on the CPU
loaded_model = mbm.models.CustomNeuralNetRegressor.load_model(
    cfg, model_filename, **args, **param_init)
loaded_model = loaded_model.set_params(device='cpu').to('cpu')

# Evaluate on the test set; the RMSE is the baseline for the permutation
# importance below
(grouped_ids, scores_NN, ids_NN,
 y_pred_NN) = evaluate_model_and_group_predictions(loaded_model,
                                                   df_X_test_subset,
                                                   test_set['y'], cfg, mbm)

baseline_score = scores_NN['rmse']
print('Baseline RMSE:', baseline_score)

In [None]:
# Permutation importance over the whole test set: shuffle one feature at a
# time and record how much the RMSE increases relative to the baseline.
rng = np.random.default_rng(cfg.seed)
n_repeats = 10
importances = {feature: [] for feature in feature_columns}

# Compute baseline
_, scores_baseline, _, _ = evaluate_model_and_group_predictions(
    loaded_model, df_X_test_subset, test_set['y'], cfg, mbm)
baseline_score = scores_baseline['rmse']
print(f"Baseline RMSE: {baseline_score:.4f}")

for col in tqdm(feature_columns):
    for _ in range(n_repeats):
        # Shuffle only the current feature column
        df_permuted = df_X_test_subset.copy()
        df_permuted[col] = rng.permutation(df_permuted[col].values)

        # Evaluate model on permuted data
        _, scores_perm, _, _ = evaluate_model_and_group_predictions(
            loaded_model, df_permuted, test_set['y'], cfg, mbm)

        # Positive = worse performance
        importances[col].append(scores_perm['rmse'] - baseline_score)

# Aggregate results: mean and spread of the RMSE increase per feature,
# most important feature first
mean_per_feature = [np.mean(importances[col]) for col in feature_columns]
std_per_feature = [np.std(importances[col]) for col in feature_columns]
df_importances = pd.DataFrame({
    "feature": feature_columns,
    "mean_importance": mean_per_feature,
    "std_importance": std_per_feature,
}).sort_values(by="mean_importance", ascending=False)

plot_permutation_importance(df_importances, top_n=20)

In [None]:
# Names of the features whose mean RMSE increase exceeds 0.02, ordered from
# least to most important
df_importances.query('mean_importance > 0.02').sort_values(
    by='mean_importance')['feature'].values

### Dependence per glacier:

In [None]:
# Mean elevation of the annual point measurements of each glacier, ordered
# from the highest to the lowest glacier.
gl_per_el = data_glamos[data_glamos.PERIOD == 'annual'].groupby(
    ['GLACIER'])['POINT_ELEVATION'].mean()
gl_per_el = gl_per_el.sort_values(ascending=False)

# Test glaciers ordered by increasing mean elevation. They are selected with
# a membership mask instead of `gl_per_el[TEST_GLACIERS]`, which raises a
# KeyError as soon as one test glacier has no annual measurement in
# data_glamos (the blocking cell explicitly allows for missing test glaciers).
test_gl_per_el = gl_per_el[gl_per_el.index.isin(
    TEST_GLACIERS)].sort_values().index

In [None]:
# Per-glacier permutation importance: for every test glacier, shuffle one
# feature at a time and plot the mean RMSE increase over the repeats.

# Set up RNG and storage
rng = np.random.default_rng(cfg.seed)
n_repeats = 10
top_n = 20  # Top N features to show

# Targets as a plain array so they can be selected by row position; they are
# row-aligned with df_X_test_subset (both are passed together to the
# evaluation helper elsewhere in this notebook).
y_test_all = np.asarray(test_set['y'])

# Prepare subplot layout: one panel per glacier that is actually plotted
n_glaciers = len(test_gl_per_el)
ncols = 3  # or any layout you prefer
nrows = int(np.ceil(n_glaciers / ncols))

# squeeze=False always returns a 2D array of axes, so flatten() is safe
fig, axes = plt.subplots(nrows=nrows,
                         ncols=ncols,
                         figsize=(6 * ncols, 5 * nrows),
                         squeeze=False)
axes = axes.flatten()

for idx, glacier_name in enumerate(test_gl_per_el):
    ax = axes[idx]

    # Subset data. Features and targets are selected with the same positional
    # mask: indexing the targets with the index of the re-indexed subset
    # (0..n-1) would always return the first n targets of the test set,
    # whatever the glacier.
    glacier_mask = (df_X_test_subset.GLACIER == glacier_name).to_numpy()
    glacier_test_set = df_X_test_subset[glacier_mask].reset_index(drop=True)
    glacier_y = y_test_all[glacier_mask]

    # Initialize importances dictionary
    importances = {col: [] for col in feature_columns}

    # Compute baseline RMSE
    _, scores_baseline, _, _ = evaluate_model_and_group_predictions(
        loaded_model, glacier_test_set, glacier_y, cfg, mbm)
    baseline_score = scores_baseline['rmse']

    # Permutation importance
    for col in tqdm(feature_columns, desc=f"Processing {glacier_name}"):
        for _ in range(n_repeats):
            df_permuted = glacier_test_set.copy()
            df_permuted[col] = rng.permutation(df_permuted[col].values)
            _, scores_perm, _, _ = evaluate_model_and_group_predictions(
                loaded_model, df_permuted, glacier_y, cfg, mbm)
            perm_score = scores_perm['rmse']
            # Positive = the model got worse without this feature
            importance = perm_score - baseline_score
            importances[col].append(importance)

    # Aggregate: keep the top_n features with the largest mean RMSE increase
    df_importances = pd.DataFrame({
        "feature": feature_columns,
        "mean_importance": [np.mean(importances[col]) for col in feature_columns],
        "std_importance": [np.std(importances[col]) for col in feature_columns],
    }).sort_values(by="mean_importance", ascending=False).head(top_n)

    # Plot in subplot (most important feature on top)
    ax.barh(df_importances["feature"], df_importances["mean_importance"],
            xerr=df_importances["std_importance"], align="center")
    ax.set_title(f"{glacier_name} (RMSE baseline: {baseline_score:.2f})")
    ax.invert_yaxis()
    ax.set_xlabel("Permutation Importance (↑ = more important)")

# Turn off unused subplots
for i in range(n_glaciers, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

### Partial dependence:

In [None]:
# Use CPU for training, apparently faster
param_init = dict(device='cpu')

# Early stopping on the validation loss with a patience of 15 epochs; the
# threshold makes very small improvements not count as progress.
early_stop = EarlyStopping(monitor='valid_loss', threshold=1e-4, patience=15)

# Halve the learning rate when the validation loss has not improved by at
# least 1 % (relative threshold) for 5 epochs.
lr_scheduler_cb = LRScheduler(
    policy=ReduceLROnPlateau,
    monitor='valid_loss',
    mode='min',
    factor=0.5,
    patience=5,
    threshold=0.01,
    threshold_mode='rel',
    verbose=True,
)

# Placeholders returned by my_train_split; initialized hereafter
dataset, dataset_val = None, None


def my_train_split(ds, y=None, **fit_params):
    """Return the module-level ``dataset`` and ``dataset_val`` as a pair.

    All arguments are ignored; the function only hands back whatever the two
    globals hold at call time (both are ``None`` until they are assigned).
    """
    train_and_validation = (dataset, dataset_val)
    return train_and_validation


# Default hyper-parameters of the network and its optimizer
params = dict(
    lr=0.001,
    batch_size=128,
    optimizer=torch.optim.Adam,
    optimizer__weight_decay=1e-05,
    module__hidden_layers=[128, 128, 64, 32],
    module__dropout=0.2,
    module__use_batchnorm=True,
)

# Input features: elevation difference, pcsr and the topographical variables,
# followed by the climate variables
features_topo = ['ELEVATION_DIFFERENCE', 'pcsr', *vois_topographical]
feature_columns = [*features_topo, *vois_climate]

# Register the features in the configuration before reading the non-feature
# field list from it
cfg.setFeatures(feature_columns)

# Restrict train / validation / test frames to features + non-feature fields
all_columns = feature_columns + cfg.fieldsNotFeatures
df_X_train_subset = df_X_train.loc[:, all_columns]
df_X_val_subset = df_X_val.loc[:, all_columns]
df_X_test_subset = test_set['df_X'].loc[:, all_columns]

print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of validation dataset:', df_X_val_subset.shape)
print('Shape of testing dataset:', df_X_test_subset.shape)
print('Running with features:', feature_columns)

# Input dimension of the network
nInp = len(feature_columns)

In [None]:
# Saved model weights (replace with the actual date if needed)
model_filename = "nn_model_2025-08-06.pt"
# Hyper-parameters the model was trained with, stored as a pickle
# (replace with the actual date if needed)
params_filename = "nn_params_2025-08-06.pkl"
with open(f"models/{params_filename}", "rb") as f:
    custom_params = pickle.load(f)

# From here on the stored hyper-parameters replace the defaults
params = custom_params

# Arguments used to rebuild the regressor around the stored weights
args = dict(
    module=FlexibleNetwork,
    nbFeatures=nInp,
    module__input_dim=nInp,
    module__dropout=params['module__dropout'],
    module__hidden_layers=params['module__hidden_layers'],
    module__use_batchnorm=params['module__use_batchnorm'],
    train_split=my_train_split,
    batch_size=params['batch_size'],
    verbose=1,
    iterator_train__shuffle=True,
    lr=params['lr'],
    max_epochs=300,
    optimizer=params['optimizer'],
    optimizer__weight_decay=params['optimizer__weight_decay'],
    callbacks=[
        ('early_stop', early_stop),
        ('lr_scheduler', lr_scheduler_cb),
    ],
)

# Load the model and make sure it lives on the CPU
loaded_model = mbm.models.CustomNeuralNetRegressor.load_model(
    cfg, model_filename, **args, **param_init)
loaded_model = loaded_model.set_params(device='cpu').to('cpu')

# Evaluate on the test set, overall and separately per season
(grouped_ids, scores_NN, ids_NN,
 y_pred_NN) = evaluate_model_and_group_predictions(loaded_model,
                                                   df_X_test_subset,
                                                   test_set['y'], cfg, mbm)
scores_annual, scores_winter = compute_seasonal_scores(
    grouped_ids,
    target_col='target',
    pred_col='pred',
)

baseline_score = scores_NN['rmse']
print('Baseline RMSE:', baseline_score)

In [None]:
# Display the number of input features (one partial-dependence curve is
# drawn per feature).
len(feature_columns)

In [None]:
def custom_pdp(model, df_X, y, cfg, mbm, feature, grid_resolution=20):
    """Compute a one-dimensional partial dependence curve for ``feature``.

    The feature is fixed, for all rows at once, to each value of an evenly
    spaced grid spanning its observed range; the model is evaluated on the
    modified rows and the mean of the grouped predictions is recorded.

    Args:
        model: Model passed on to ``evaluate_model_and_group_predictions``.
        df_X: DataFrame of input rows; it is not modified.
        y: Targets forwarded unchanged to the evaluation helper.
        cfg: Configuration object forwarded to the evaluation helper.
        mbm: Handle forwarded to the evaluation helper.
        feature: Name of the column of ``df_X`` to vary.
        grid_resolution: Number of grid points between the column's minimum
            and maximum.

    Returns:
        A tuple ``(feature_values, pdp)``: the grid as a NumPy array and the
        list of mean grouped predictions, one per grid value.
    """

    def _mean_prediction_at(fixed_value):
        # Fix this feature at 'fixed_value' on a fresh copy of the rows
        frame = df_X.copy()
        frame[feature] = fixed_value
        grouped_preds, _, _, _ = evaluate_model_and_group_predictions(
            model, frame, y, cfg, mbm)
        # Average across grouped predictions
        return grouped_preds['pred'].mean()

    lowest, highest = df_X[feature].min(), df_X[feature].max()
    feature_values = np.linspace(lowest, highest, grid_resolution)
    pdp = [_mean_prediction_at(value) for value in feature_values]

    return feature_values, pdp


# Subset the DataFrame where PERIOD == 'annual', together with the matching
# targets. Both are selected with the same positional mask: the index of the
# re-indexed subset is simply 0..n-1 and would pick the first n targets of
# the test set instead of the annual ones. drop=True keeps the old index from
# being added as an extra 'index' column, as in the per-glacier subset of the
# permutation-importance cell.
annual_mask = (df_X_test_subset['PERIOD'] == 'annual').to_numpy()
df_X_test_subset_annual = df_X_test_subset[annual_mask].reset_index(drop=True)
annual_idx = np.flatnonzero(annual_mask)  # row positions of the annual rows
y_test_subset_annual = np.asarray(test_set['y'])[annual_idx]

# One panel per feature, four per row (16 features give the 4 x 4 grid on a
# 20 x 10 figure); squeeze=False always returns a 2D array of axes.
n_pdp_cols = 4
n_pdp_rows = max(1, int(np.ceil(len(feature_columns) / n_pdp_cols)))
fig, axs = plt.subplots(n_pdp_rows,
                        n_pdp_cols,
                        figsize=(20, 2.5 * n_pdp_rows),
                        sharex=False,
                        sharey=False,
                        squeeze=False)
axs = axs.flatten()

for i, feature in enumerate(tqdm(feature_columns)):
    # The annual rows are paired with the annual targets (same length and
    # order), not with the targets of the whole test set.
    values, pdp_vals = custom_pdp(loaded_model, df_X_test_subset_annual,
                                  y_test_subset_annual, cfg, mbm, feature)
    axs[i].plot(values, pdp_vals)
    axs[i].set_xlabel(feature)
    axs[i].set_ylabel("Mean pred. MB")

# Remove panels that are left without a feature
for unused_ax in axs[len(feature_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()


In [None]:
# RUN = True
# if RUN:
#     # Setup logging
#     log_filename = f'logs/nn_vars_search_progress_{datetime.now().strftime("%Y-%m-%d")}.csv'

#     fieldnames = list(
#         sampled_params[0].keys()) + ['valid_loss', 'status', 'error']

#     with open(log_filename, mode='w', newline='') as log_file:
#         writer = csv.DictWriter(log_file, fieldnames=fieldnames)
#         writer.writeheader()

#     results = []
#     for i, sampled_feat in enumerate(sampled_params):
#         try:
#             print(f"\n--- Running config {i+1}/{len(sampled_params)} ---")
#             print(sampled_feat)

#             features_topo = [
#                 'ELEVATION_DIFFERENCE',
#             ] + list(sampled_feat['topographical'])

#             features_climate = list(sampled_feat['climate'])

#             feature_columns = features_topo + features_climate

#             cfg.setFeatures(feature_columns)

#             all_columns = feature_columns + cfg.fieldsNotFeatures
#             df_X_train_subset = df_X_train[all_columns]
#             df_X_val_subset = df_X_val[all_columns]
#             df_X_test_subset = test_set['df_X'][all_columns]

#             nInp = len(feature_columns)

#             # Initialize network
#             args = {
#                 'module':
#                 FlexibleNetwork,
#                 'nbFeatures':
#                 nInp,
#                 'module__input_dim':
#                 nInp,
#                 'module__dropout':
#                 params['module__dropout'],
#                 'module__hidden_layers':
#                 params['module__hidden_layers'],
#                 'train_split':
#                 my_train_split,
#                 'batch_size':
#                 params['batch_size'],
#                 'verbose':
#                 1,
#                 'iterator_train__shuffle':
#                 True,
#                 'lr':
#                 params['lr'],
#                 'max_epochs':
#                 200,
#                 'optimizer':
#                 params['optimizer'],
#                 'optimizer__weight_decay':
#                 params['optimizer__weight_decay'],
#                 'module__use_batchnorm':
#                 params['module__use_batchnorm'],
#                 'callbacks': [
#                     ('early_stop', early_stop),
#                     ('lr_scheduler', lr_scheduler_cb),
#                 ]
#             }

#             custom_nn = mbm.models.CustomNeuralNetRegressor(
#                 cfg, **args, **param_init)
#             custom_nn.seed_all()

#             features, metadata = custom_nn._create_features_metadata(
#                 df_X_train_subset)

#             features_val, metadata_val = custom_nn._create_features_metadata(
#                 df_X_val_subset)

#             # Define the dataset for the NN
#             dataset = mbm.data_processing.AggregatedDataset(cfg,
#                                                             features=features,
#                                                             metadata=metadata,
#                                                             targets=y_train)
#             dataset = mbm.data_processing.SliceDatasetBinding(
#                 SliceDataset(dataset, idx=0), SliceDataset(dataset, idx=1))

#             dataset_val = mbm.data_processing.AggregatedDataset(
#                 cfg,
#                 features=features_val,
#                 metadata=metadata_val,
#                 targets=y_val)
#             dataset_val = mbm.data_processing.SliceDatasetBinding(
#                 SliceDataset(dataset_val, idx=0),
#                 SliceDataset(dataset_val, idx=1))

#             custom_nn.fit(dataset.X, dataset.y)
#             # Extract final validation loss
#             valid_loss_best = custom_nn.history[-1]['valid_loss']

#             grouped_ids, scores_NN, ids_NN, y_pred_NN = evaluate_model_and_group_predictions(
#                 custom_nn, df_X_test_subset, test_set['y'], cfg, mbm)

#             scores_annual, scores_winter = compute_seasonal_scores(
#                 grouped_ids, target_col='target', pred_col='pred')

#             row = {
#                 **sampled_feat, 'valid_loss': valid_loss_best,
#                 'status': 'success',
#                 'error': ''
#             }
#         except Exception:
#             err_msg = traceback.format_exc()
#             print(err_msg)
#             row = {
#                 **sampled_feat, 'valid_loss': None,
#                 'status': 'fail',
#                 'error': err_msg
#             }
#         # Append result to log
#         with open(log_filename, mode='a', newline='') as log_file:
#             writer = csv.DictWriter(log_file, fieldnames=fieldnames)
#             writer.writerow(row)