## Setting Up:

In [1]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
import xarray as xr
import massbalancemachine as mbm
from collections import defaultdict
import logging
import torch.nn as nn
from skorch.helper import SliceDataset

from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.xgb_helpers import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# cfg = mbm.SwitzerlandConfig()
cfg = mbm.Config()

In [2]:
seed_all(cfg.seed)
free_up_cuda()

# Plot styles:
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)
colors = get_cmap_hex(cm.batlow, 10)
color_dark_blue = colors[0]
color_pink = '#c51b7d'

# RGI Ids:
# Read rgi ids:
rgi_df = pd.read_csv(path_glacier_ids, sep=',')
rgi_df.rename(columns=lambda x: x.strip(), inplace=True)
rgi_df.sort_values(by='short_name', inplace=True)
rgi_df.set_index('short_name', inplace=True)

vois_climate = [
    't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'u10', 'v10'
]

vois_topographical = [
    # "aspect", # OGGM
    # "slope", # OGGM
    "aspect_sgi",  # SGI
    "slope_sgi",  # SGI
    "hugonnet_dhdt",  # OGGM
    "consensus_ice_thickness",  # OGGM
    "millan_v",  # OGGM
]

## Read GL data:

In [3]:
data_glamos = pd.read_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv')

print('Number of glaciers:', len(data_glamos['GLACIER'].unique()))
print('Number of winter and annual samples:', len(data_glamos))
print('Number of annual samples:',
      len(data_glamos[data_glamos.PERIOD == 'annual']))
print('Number of winter samples:',
      len(data_glamos[data_glamos.PERIOD == 'winter']))

# Capitalize glacier names:
glacierCap = {}
for gl in data_glamos['GLACIER'].unique():
    if isinstance(gl, str):  # Ensure the glacier name is a string
        if gl.lower() == 'claridenu':
            glacierCap[gl] = 'Clariden_U'
        elif gl.lower() == 'claridenl':
            glacierCap[gl] = 'Clariden_L'
        else:
            glacierCap[gl] = gl.capitalize()
    else:
        print(f"Warning: Non-string glacier name encountered: {gl}")

data_glamos.head(2)

Number of glaciers: 31
Number of winter and annual samples: 31875
Number of annual samples: 7587
Number of winter samples: 24288


Unnamed: 0,YEAR,POINT_ID,GLACIER,FROM_DATE,TO_DATE,POINT_LAT,POINT_LON,POINT_ELEVATION,POINT_BALANCE,PERIOD,RGIId,aspect,slope,topo,hugonnet_dhdt,consensus_ice_thickness,millan_v,aspect_sgi,slope_sgi,topo_sgi
0,2006,adler_28,adler,20051017,20061011,46.010637,7.855896,3096.507742,-2.592,annual,RGI60-11.02764,254.483669,19.483581,3039.0,-1.504815,43.181839,3.591626,281.110176,20.345155,3015.42
1,2006,adler_54,adler,20051017,20061011,46.010052,7.858628,3141.50652,-2.502,annual,RGI60-11.02764,273.466595,12.549861,3091.0,-0.970061,41.090096,0.975349,280.805856,10.27517,3076.348


### Glaciers with pot. radiadation data:

In [4]:
# Glaciers with data of potential clear sky radiation
# Format to same names as stakes:
glDirect = np.sort([
    re.search(r'xr_direct_(.*?)\.nc', f).group(1)
    for f in os.listdir(path_pcsr + 'csv/')
])

restgl = np.sort(Diff(list(glDirect), list(data_glamos.GLACIER.unique())))

print('Glaciers with potential clear sky radiation data:\n', glDirect)
print('Number of glaciers:', len(glDirect))
print('Glaciers without potential clear sky radiation data:\n', restgl)

# Filter out glaciers without data:
data_glamos = data_glamos[data_glamos.GLACIER.isin(glDirect)]

# Look at the data of the ERA5 dataset:
xr.open_dataset(path_ERA5_raw + 'era5_monthly_averaged_data.nc')

Glaciers with potential clear sky radiation data:
 ['adler' 'albigna' 'aletsch' 'allalin' 'arolla' 'basodino' 'clariden'
 'corbassiere' 'corvatsch' 'findelen' 'forno' 'gietro' 'gorner' 'gries'
 'hohlaub' 'joeri' 'limmern' 'morteratsch' 'murtel' 'oberaar' 'otemma'
 'pers' 'pizol' 'plainemorte' 'plattalva' 'rhone' 'sanktanna'
 'schwarzbach' 'schwarzberg' 'sexrouge' 'silvretta' 'taelliboden' 'tortin'
 'tsanfleuron']
Number of glaciers: 34
Glaciers without potential clear sky radiation data:
 ['arolla' 'joeri' 'pers']


## Input data:
### Input dataset:

In [5]:
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Transform data to monthly format (run or load data):
paths = {
    'csv_path': path_PMB_GLAMOS_csv,
    'era5_climate_data': path_ERA5_raw + 'era5_monthly_averaged_data.nc',
    'geopotential_data': path_ERA5_raw + 'era5_geopotential_pressure.nc',
    'radiation_save_path': path_pcsr + 'csv/'
}
RUN = False
dataloader_gl = process_or_load_data(run_flag=RUN,
                                     data_glamos=data_glamos,
                                     paths=paths,
                                     cfg=cfg,
                                     vois_climate=vois_climate,
                                     vois_topographical=vois_topographical)
data_monthly = dataloader_gl.data

2025-04-08 12:01:30,422 - INFO - Loaded preprocessed data.
2025-04-08 12:01:30,422 - INFO - Number of monthly rows: 280720
2025-04-08 12:01:30,446 - INFO - Number of annual rows: 90870
2025-04-08 12:01:30,476 - INFO - Number of winter rows: 189850


## Blocking on glaciers:

In [6]:
test_glaciers = [
    'tortin', 'plattalva', 'sanktanna', 'schwarzberg', 'hohlaub', 'pizol',
    'corvatsch', 'tsanfleuron', 'forno'
]

# Ensure all test glaciers exist in the dataset
existing_glaciers = set(dataloader_gl.data.GLACIER.unique())
missing_glaciers = [g for g in test_glaciers if g not in existing_glaciers]

if missing_glaciers:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Define training glaciers correctly
train_glaciers = [i for i in existing_glaciers if i not in test_glaciers]

data_test = dataloader_gl.data[dataloader_gl.data.GLACIER.isin(test_glaciers)]
print('Size of test data:', len(data_test))

data_train = dataloader_gl.data[dataloader_gl.data.GLACIER.isin(
    train_glaciers)]
print('Size of train data:', len(data_train))

if len(data_train) == 0:
    print("Warning: No training data available!")
else:
    test_perc = (len(data_test) / len(data_train)) * 100
    print('Percentage of test size: {:.2f}%'.format(test_perc))

# Number of annual versus winter measurements:
print('Train:')
print('Number of winter and annual samples:', len(data_train))
print('Number of annual samples:',
      len(data_train[data_train.PERIOD == 'annual']))
print('Number of winter samples:',
      len(data_train[data_train.PERIOD == 'winter']))

# Same for test
data_test_annual = data_test[data_test.PERIOD == 'annual']
data_test_winter = data_test[data_test.PERIOD == 'winter']

print('Test:')
print('Number of winter and annual samples:', len(data_test))
print('Number of annual samples:', len(data_test_annual))
print('Number of winter samples:', len(data_test_winter))

print('Total:')
print('Number of monthly rows:', len(dataloader_gl.data))
print('Number of annual rows:',
      len(dataloader_gl.data[dataloader_gl.data.PERIOD == 'annual']))
print('Number of winter rows:',
      len(dataloader_gl.data[dataloader_gl.data.PERIOD == 'winter']))

Size of test data: 29024
Size of train data: 251696
Percentage of test size: 11.53%
Train:
Number of winter and annual samples: 251696
Number of annual samples: 83148
Number of winter samples: 168548
Test:
Number of winter and annual samples: 29024
Number of annual samples: 7722
Number of winter samples: 21302
Total:
Number of monthly rows: 280720
Number of annual rows: 90870
Number of winter rows: 189850


In [7]:
splits, test_set, train_set = get_CV_splits(dataloader_gl,
                                            test_split_on='GLACIER',
                                            test_splits=test_glaciers,
                                            random_state=cfg.seed)

print('Test glaciers: ({}) {}'.format(len(test_set['splits_vals']),
                                      test_set['splits_vals']))
test_perc = (len(test_set['df_X']) / len(train_set['df_X'])) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', len(test_set['df_X']))
print('Train glaciers: ({}) {}'.format(len(train_set['splits_vals']),
                                       train_set['splits_vals']))
print('Size of train set:', len(train_set['df_X']))

# visualiseSplits(test_set['y'], train_set['y'], splits)
# visualiseInputs(train_set, test_set, vois_climate)

Test glaciers: (9) ['corvatsch' 'forno' 'hohlaub' 'pizol' 'plattalva' 'sanktanna'
 'schwarzberg' 'tortin' 'tsanfleuron']
Percentage of test size: 11.53%
Size of test set: 29024
Train glaciers: (22) ['adler' 'albigna' 'aletsch' 'allalin' 'basodino' 'clariden' 'corbassiere'
 'findelen' 'gietro' 'gorner' 'gries' 'limmern' 'morteratsch' 'murtel'
 'oberaar' 'otemma' 'plainemorte' 'rhone' 'schwarzbach' 'sexrouge'
 'silvretta' 'taelliboden']
Size of train set: 251696


## Neural Network:

In [8]:
feature_columns = train_set['df_X'].columns.difference(cfg.metaData)
# feature_columns = feature_columns.drop(cfg.notMetaDataNotFeatures)
feature_columns = list(feature_columns)
feature_columns

['ALTITUDE_CLIMATE',
 'ELEVATION_DIFFERENCE',
 'GLACIER',
 'GLWD_ID',
 'PERIOD',
 'POINT_BALANCE',
 'POINT_ELEVATION',
 'POINT_LAT',
 'POINT_LON',
 'YEAR',
 'aspect_sgi',
 'consensus_ice_thickness',
 'fal',
 'hugonnet_dhdt',
 'millan_v',
 'pcsr',
 'slhf',
 'slope_sgi',
 'sshf',
 'ssrd',
 'str',
 't2m',
 'tp',
 'u10',
 'v10']

In [9]:
# Grid search
# For each of the XGBoost parameter, define the grid range
param_grid = {'lr': [0.001, 0.01], 'max_epochs': [1000, 2000]}

feature_columns = [
    'ELEVATION_DIFFERENCE'
] + list(vois_climate) + list(vois_topographical) + ['pcsr']
all_columns = feature_columns + cfg.fieldsNotFeatures
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of testing dataset:', test_set['df_X'][all_columns].shape)
print('Running with features:', feature_columns)

param_init = {}
# param_init['device'] = 'cuda:0'
# # param_init['tree_method'] = 'hist'
# param_init["random_state"] = cfg.seed
# param_init["n_jobs"] = cfg.numJobs
param_init['device'] = 'cpu'

vois_climate = [
    't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'u10', 'v10'
]

vois_topographical = [
    "aspect_sgi",
    "slope_sgi",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
]

nInp = len(feature_columns)
network = nn.Sequential(
    nn.Linear(nInp, 12),
    nn.ReLU(),
    nn.Linear(12, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
)

Shape of training dataset: (251696, 26)
Shape of testing dataset: (29024, 26)
Running with features: ['ELEVATION_DIFFERENCE', 't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'u10', 'v10', 'aspect_sgi', 'slope_sgi', 'hugonnet_dhdt', 'consensus_ice_thickness', 'millan_v', 'pcsr']


In [10]:
custom_nn = mbm.models.CustomNeuralNetRegressor(
    cfg,
    network,
    nbFeatures=nInp,
    train_split=
    False,  # train_split is disabled since cross validation is handled by the splits variable hereafter
    batch_size=16,
    verbose=0,
    iterator_train__shuffle=True,
    **param_init)

In [11]:
features, metadata = custom_nn._create_features_metadata(df_X_train_subset,)

bounds_features = {
    k:
    (np.min(train_set['df_X'][k].values), np.max(train_set['df_X'][k].values))
    for k in feature_columns
}
norm = mbm.data_processing.Normalizer(bounds_features)
norm_features = norm.normalize(features)

# Define the dataset for the NN
dataset = mbm.data_processing.AggregatedDataset(cfg,
                                                features=norm_features,
                                                metadata=metadata,
                                                targets=train_set['y'])
splits = dataset.mapSplitsToDataset(splits)

# Use SliceDataset to make the dataset accessible as a numpy array for scikit learn
dataset = [SliceDataset(dataset, idx=0), SliceDataset(dataset, idx=1)]

print(dataset[0].shape, dataset[1].shape)

(28365,) (28365,)


In [12]:
sample = dataset[0][0]
print("Type:", type(sample))
print("Shape:", getattr(sample, "shape", "No shape"))

Type: <class 'numpy.ndarray'>
Shape (if possible): (224,)


In [13]:
print(type(features))
print(features.shape)
print(type(features[0]))

<class 'numpy.ndarray'>
(251696, 16)
<class 'numpy.ndarray'>


In [14]:
custom_nn.fit(dataset[0], dataset[1])

RuntimeError: shape '[-1, 16]' is invalid for input of size 104