## Setting Up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
from calendar import month_abbr
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
import xarray as xr
import massbalancemachine as mbm
from collections import defaultdict
import logging
import cartopy.io.img_tiles as cimgt
from cartopy import crs as ccrs, feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from matplotlib.patches import Patch

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import geopandas as gpd
from matplotlib.patches import Wedge, Patch
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.xgb_helpers import *
from scripts.geodata import *

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magics: load the autoreload extension and (mode 2) re-import
# modules before each cell runs, so edits to the `scripts` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project configuration for Switzerland; later cells read `cfg.seed`,
# `cfg.numJobs` and `cfg.fieldsNotFeatures` from it.
cfg = mbm.SwitzerlandConfig()

In [None]:
# Seed with the configured value and call the project's CUDA clean-up helper.
seed_all(cfg.seed)
free_up_cuda()

# Plot styles:
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)
colors = get_cmap_hex(cm.batlow, 10)
color_dark_blue, color_pink = colors[0], '#c51b7d'

# RGI ids: one row per glacier, column names stripped of surrounding
# whitespace, sorted by and indexed on the glacier short name.
rgi_df = (pd.read_csv(path_glacier_ids, sep=',')
          .rename(columns=lambda x: x.strip())
          .sort_values(by='short_name')
          .set_index('short_name'))

# Climate variables of interest.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical variables of interest (source noted per entry).
vois_topographical = [
    # "aspect", # OGGM
    # "slope", # OGGM
    "aspect_sgi",  # SGI
    "slope_sgi",  # SGI
    "hugonnet_dhdt",  # OGGM
    "consensus_ice_thickness",  # OGGM
    "millan_v",  # OGGM
]

## Read GLAMOS data:

In [None]:
# Point mass balance measurements (winter and annual) for all glaciers.
data_glamos = pd.read_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv')

is_annual = data_glamos.PERIOD == 'annual'
is_winter = data_glamos.PERIOD == 'winter'
print('Number of glaciers:', len(data_glamos['GLACIER'].unique()))
print('Number of winter and annual samples:', len(data_glamos))
print('Number of annual samples:', int(is_annual.sum()))
print('Number of winter samples:', int(is_winter.sum()))

# Capitalize glacier names:
# The two Clariden glaciers get a hand-written display name; every other
# glacier is simply capitalized. Non-string names are reported and skipped.
special_names = {'claridenu': 'Clariden_U', 'claridenl': 'Clariden_L'}
glacierCap = {}
for gl in data_glamos['GLACIER'].unique():
    if not isinstance(gl, str):
        print(f"Warning: Non-string glacier name encountered: {gl}")
        continue
    glacierCap[gl] = special_names.get(gl.lower(), gl.capitalize())

data_glamos.head(2)

### Glaciers with potential clear-sky radiation data:

In [None]:
# Glaciers with data of potential clear sky radiation
# Format to same names as stakes:
# The glacier name is the part between `xr_direct_` and `.zarr`. Entries in
# the folder that do not follow this naming (hidden files, temporary files,
# ...) are skipped; calling `.group(1)` on a failed match would otherwise
# raise AttributeError and abort the cell.
zarr_name_pattern = re.compile(r'xr_direct_(.*?)\.zarr')
zarr_matches = (zarr_name_pattern.search(f)
                for f in os.listdir(path_pcsr + 'zarr/'))
glDirect = np.sort([m.group(1) for m in zarr_matches if m is not None])

# Glacier names returned by the project's `Diff` helper when comparing the
# radiation list with the glaciers present in the stake data.
restgl = np.sort(Diff(list(glDirect), list(data_glamos.GLACIER.unique())))

print('Glaciers with potential clear sky radiation data:\n', glDirect)
print('Number of glaciers:', len(glDirect))
print('Glaciers without potential clear sky radiation data:\n', restgl)

# Filter out glaciers without data:
data_glamos = data_glamos[data_glamos.GLACIER.isin(glDirect)]

# Look at the data of the ERA5 dataset:
xr.open_dataset(path_ERA5_raw + 'era5_monthly_averaged_data.nc')

In [None]:
# Glacier outlines: SGI 2016 inventory shapefile and the RGI outlines.
sgi_inventory_dir = os.path.join(path_SGI_topo, 'inventory_sgi2016_r2020')
glacier_outline_sgi = gpd.read_file(
    os.path.join(sgi_inventory_dir, 'SGI_2016_glaciers_copy.shp'))
glacier_outline_rgi = gpd.read_file(path_rgi_outlines)

In [None]:
# Per-glacier summary table, indexed by GLACIER:
#   - total number of measurements (largest first),
#   - mean stake position,
#   - number of measurements per PERIOD,
#   - whether the glacier belongs to the train or the test set.
glacier_info = (data_glamos.groupby('GLACIER').size()
                .sort_values(ascending=False)
                .rename('Nb. measurements')
                .to_frame())

glacier_loc = data_glamos.groupby('GLACIER')[['POINT_LAT', 'POINT_LON']].mean()

glacier_period = (data_glamos.groupby(['GLACIER', 'PERIOD']).size()
                  .unstack()
                  .fillna(0)
                  .astype(int))

glacier_info = (glacier_loc
                .merge(glacier_info, on='GLACIER')
                .merge(glacier_period, on='GLACIER'))

test_glaciers = [
    'tortin',
    'plattalva',
    'sanktanna',
    'schwarzberg',
    'hohlaub',
    'pizol',
    'corvatsch',
    'tsanfleuron',
    'forno',
]

# Label each row by looking its index label up in the test list.
glacier_info['Train/Test glacier'] = [
    'Test' if name in test_glaciers else 'Train'
    for name in glacier_info.index
]
glacier_info.head(2)

### Assign glaciers to river basin names:

In [None]:
# === Load RGI glacier IDs ===
rgi_df = pd.read_csv(path_glacier_ids)
rgi_df.columns = [name.strip() for name in rgi_df.columns]
rgi_df = rgi_df.sort_values(by='short_name')
rgi_df = rgi_df.set_index('short_name')

# === Load SGI region geometries ===
sgi_inventory_dir = os.path.join(path_SGI_topo, 'inventory_sgi2016_r2020')
SGI_regions = gpd.read_file(
    os.path.join(sgi_inventory_dir, 'sgi_regions.geojson'))

# Clean object columns: strip surrounding whitespace, one column at a time
for text_col in SGI_regions.select_dtypes(include='object').columns:
    SGI_regions[text_col] = SGI_regions[text_col].str.strip()

# Drop duplicated / incomplete rows, then index by the region key
SGI_regions = (SGI_regions.drop_duplicates()
               .dropna()
               .set_index('pk_sgi_region'))

# === Map to Level 0 river basins ===
# The first character of the SGI id selects the level-0 catchment.
catchment_lv0 = dict(A='Rhine', B='Rhone', C='Po', D='Adige', E='Danube')
rgi_df['rvr_lv0'] = rgi_df['sgi-id'].str.get(0).map(catchment_lv0)

# === Map to Level 1 river basins using SGI regions ===
def get_river_basin(sgi_id):
    """Return the level-1 river basin name for an SGI glacier id.

    The region key is the part of ``sgi_id`` before the first ``-``; it is
    looked up in the index of the module-level ``SGI_regions`` table and the
    ``river_basin_name`` of that region is returned.

    Args:
        sgi_id: SGI glacier identifier. Anything that is not a string
            (e.g. NaN or None for a glacier without an SGI id) is treated
            as "unknown" instead of raising ``AttributeError``.

    Returns:
        The basin name, or ``None`` when the id is missing, the region key
        is not in ``SGI_regions`` or the region has no basin name.
    """
    if not isinstance(sgi_id, str):
        return None

    key = sgi_id.split('-')[0]
    if key not in SGI_regions.index:
        return None

    basin = SGI_regions.loc[key, 'river_basin_name']
    if isinstance(basin, pd.Series):
        # The key occurs several times in the index: use the first
        # non-missing basin name, if there is one.
        names = basin.dropna().unique()
        return names[0] if len(names) else None
    return basin if pd.notna(basin) else None

# Level-1 basin for every glacier, looked up through its SGI id
rgi_df['rvr_lv1'] = rgi_df['sgi-id'].map(get_river_basin)

# Final formatting: the index keeps its values but is called GLACIER
rgi_df = rgi_df.rename_axis('GLACIER')
rgi_df.head()

In [None]:
# Attach both river-basin levels to the per-glacier table; the left join
# keeps glaciers that have no entry in `rgi_df`.
glacier_info = pd.merge(glacier_info,
                        rgi_df[['rvr_lv0', 'rvr_lv1']],
                        how='left',
                        on='GLACIER')
glacier_info.head()

## Geoplots:
### Per glacier:

In [None]:
# Overview map: SGI glacier outlines plus one marker per glacier, sized by
# the number of measurements and coloured by train/test membership.
# Create the figure object
fig = plt.figure(figsize=(18, 10))

# Set map domain
latN = 48
latS = 45.8
lonW = 5.8
lonE = 10.5
projPC = ccrs.PlateCarree()

# Create map axes
ax2 = plt.axes(projection=projPC)
ax2.set_extent([lonW, lonE, latS, latN], crs=ccrs.Geodetic())

# Add base features
ax2.add_feature(cfeature.COASTLINE)
ax2.add_feature(cfeature.LAKES)
ax2.add_feature(cfeature.RIVERS)
ax2.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# ==== SGI REGION COLORING ====
# Eight colours are available; `zip` below stops at the shorter sequence,
# so a ninth (or later) basin would get no entry in `color_map`.
sgi_colors = [
    '#e7e1ef', '#d4b9da', '#c994c7', '#df65b0', '#e7298a', '#ce1256',
    '#980043', '#67001f'
]

unique_regions = SGI_regions['river_basin_name'].unique()
unique_regions.sort()
color_map = dict(zip(unique_regions, sgi_colors[:len(unique_regions)]))

# The colour column is only consumed by the region plot that is currently
# commented out.
SGI_regions['color'] = SGI_regions['river_basin_name'].map(color_map)
# SGI_regions.plot(ax=ax2,
#                  transform=projPC,
#                  color=SGI_regions['color'],
#                  alpha=0.6)

# Glacier outlines
glacier_outline_sgi.plot(ax=ax2, transform=projPC, color='black')

# Scatterplot: glacier points
# Marker size follows the raw measurement count; colour follows the
# train/test label.
custom_palette = {'Train': color_dark_blue, 'Test': '#b2182b'}
g = sns.scatterplot(data=glacier_info,
                    x='POINT_LON',
                    y='POINT_LAT',
                    size='Nb. measurements',
                    hue='Train/Test glacier',
                    # hue = 'rvr_lv1',
                    sizes=(100, 2000),
                    alpha=0.6,
                    palette=custom_palette,
                    # palette = color_map,
                    transform=projPC,
                    ax=ax2,
                    zorder=10)

# Gridlines
# NOTE: `gl` is rebound here to the gridliner object.
gl = ax2.gridlines(draw_labels=True,
                   linewidth=1,
                   color='gray',
                   alpha=0.5,
                   linestyle='--')
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = {'size': 16, 'color': 'black'}
gl.ylabel_style = {'size': 16, 'color': 'black'}
gl.top_labels = gl.right_labels = False

# ==== LEGEND SETUP ====

# 1. SGI region legend handles
# Built but not shown: only the commented-out `all_handles` variant below
# would add them to the legend.
sgi_handles = [
    Patch(facecolor=color_map[region], label=region)
    for region in unique_regions
]

# 2. Seaborn scatterplot legend handles
scatter_handles, scatter_labels = g.get_legend_handles_labels()
print(scatter_labels)
# 3. Combine and show all legends
# all_handles = sgi_handles + scatter_handles
all_handles = scatter_handles
#all_labels = list(color_map.keys()) + scatter_labels
all_labels = scatter_labels

# Replace the legend seaborn created with one built from the same handles.
ax2.legend(handles=all_handles,
           labels=all_labels,
           #bbox_to_anchor=(1.05, 1),
           loc='upper right',
           frameon=True,
           fontsize=18,
           title='',
           title_fontsize=18)

plt.tight_layout()
plt.show()

### sqrt scaling:

In [None]:
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.enums import Resampling as RResampling
import numpy as np

# Open the original raster
tif_path = "landesforstinventar-vegetationshoehenmodell_relief_sentinel_2024_2056.tif"

# Target grid: WGS84 at roughly 100 m (~0.0009 deg) per pixel
target_res = 0.0009
output_crs = "EPSG:4326"  # WGS84

with rasterio.open(tif_path) as src:
    # Geotransform and shape of the coarser, reprojected grid
    transform, width, height = calculate_default_transform(
        src.crs,
        output_crs,
        src.width,
        src.height,
        *src.bounds,
        resolution=target_res,
    )

    # Source metadata, overridden with the destination grid definition
    kwargs = {
        **src.meta,
        'crs': output_crs,
        'transform': transform,
        'width': width,
        'height': height,
    }

    # Destination buffer with the dtype of the first band
    destination = np.empty((height, width), dtype=src.dtypes[0])

    # Reproject band 1; averaging reduces noise when downsampling
    reproject(
        source=rasterio.band(src, 1),
        destination=destination,
        src_transform=src.transform,
        src_crs=src.crs,
        dst_transform=transform,
        dst_crs=output_crs,
        resampling=Resampling.average,
    )

    # Image extent as [left, right, bottom, top], derived from the pixel
    # size and the upper-left corner stored in the affine transform.
    pixel_w, _, west, _, pixel_h, north = transform[:6]
    extent = [west, west + pixel_w * width,
              north + pixel_h * height, north]

In [None]:
# Display the raster extent computed in the reprojection cell as
# [left, right, bottom, top] in the output CRS.
extent

In [None]:
from matplotlib.lines import Line2D

# ---- 1. Preprocessing ----
# Marker sizes grow with the square root of the measurement count.
glacier_info['sqrt_size'] = np.sqrt(glacier_info['Nb. measurements'])

# Dataset-wide range of the square-rooted counts
sqrt_min, sqrt_max = (glacier_info['sqrt_size'].min(),
                      glacier_info['sqrt_size'].max())

# Smallest and largest scatter marker size (points^2)
sizes = (30, 1500)


def scaled_size(val, min_out=sizes[0], max_out=sizes[1]):
    """Map a measurement count to a marker size.

    ``sqrt(val)`` is rescaled linearly so that the dataset-wide
    ``sqrt_min`` maps to ``min_out`` and ``sqrt_max`` to ``max_out``.
    When all glaciers have the same count the midpoint of the output
    range is returned.
    """
    if sqrt_max == sqrt_min:
        return (min_out + max_out) / 2
    fraction = (np.sqrt(val) - sqrt_min) / (sqrt_max - sqrt_min)
    return min_out + (max_out - min_out) * fraction


# Marker size of every glacier in the actual plot
glacier_info['scaled_size'] = glacier_info['Nb. measurements'].map(scaled_size)

# ---- 2. Create figure and base map ----
# Same map as the per-glacier figure, but cropped in latitude, with the
# reprojected relief raster as background and sqrt-scaled marker sizes.
fig = plt.figure(figsize=(18, 10))

#latN, latS = 48, 45.8
latN, latS = 47.1, 45.8
lonW, lonE = 5.8, 10.5
projPC = ccrs.PlateCarree()
ax2 = plt.axes(projection=projPC)
ax2.set_extent([lonW, lonE, latS, latN], crs=ccrs.Geodetic())

ax2.add_feature(cfeature.COASTLINE)
ax2.add_feature(cfeature.LAKES)
ax2.add_feature(cfeature.RIVERS)
ax2.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# Add the image to the cartopy map
ax2.imshow(
    destination,
    origin='upper',
    extent=extent,
    transform=ccrs.PlateCarree(),  # raster was reprojected to EPSG:4326 in the reprojection cell
    cmap='gray',  # or any other colormap
    alpha=0.6,    # transparency
    zorder=0
)

# Glacier outlines
glacier_outline_sgi.plot(ax=ax2, transform=projPC, color='black')

# ---- 3. Scatterplot ----
custom_palette = {'Train': color_dark_blue, 'Test': '#b2182b'}

# `scaled_size` already spans the `sizes` range, so seaborn's own size
# normalisation onto `sizes` leaves the values unchanged.
g = sns.scatterplot(
    data=glacier_info,
    x='POINT_LON',
    y='POINT_LAT',
    size='scaled_size',
    hue='Train/Test glacier',
    sizes=sizes,
    alpha=0.6,
    palette=custom_palette,
    transform=projPC,
    ax=ax2,
    zorder=10,
    legend=True  # keep seaborn's handles: the hue entries are reused in the custom legend below
)

# ---- 4. Gridlines ----
# NOTE: `gl` is rebound here to the gridliner object.
gl = ax2.gridlines(draw_labels=True,
                   linewidth=1,
                   color='gray',
                   alpha=0.5,
                   linestyle='--')
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = {'size': 16, 'color': 'black'}
gl.ylabel_style = {'size': 16, 'color': 'black'}
gl.top_labels = gl.right_labels = False

# ---- 5. Custom Combined Legend ----

# Hue legend handles
# Keep only the 'Train' / 'Test' entries; seaborn also emits size entries
# and section titles that are replaced by the hand-made ones below.
handles, labels = g.get_legend_handles_labels()
expected_labels = list(custom_palette.keys())
hue_entries = [(h, l) for h, l in zip(handles, labels) if l in expected_labels]

# Size legend values and handles
# Reference measurement counts shown in the legend. Values outside the
# range of the data are extrapolated by `scaled_size`.
size_values = [30, 100, 1000, 6000]
size_handles = [
    Line2D([], [], marker='o', linestyle='None',
           markersize=np.sqrt(scaled_size(val)),  # scatter sizes are areas (pt^2), Line2D markersize is a length (pt)
           markerfacecolor='gray', alpha=0.6,
           label=f'{val}')
    for val in size_values
]

# Separator label
# Only used by the commented-out variant of the combined legend.
separator_handle = Patch(facecolor='none', edgecolor='none', label='Nb. measurements')

# Combine all legend entries
# combined_handles = [h for h, _ in hue_entries] + [separator_handle] + size_handles
# combined_labels = [l for _, l in hue_entries] + ['Nb. measurements'] + [str(v) for v in size_values]

# same but without separator
combined_handles = [h for h, _ in hue_entries] + size_handles
combined_labels = [l for _, l in hue_entries] + [str(v) for v in size_values]

# Final legend
ax2.legend(combined_handles,
           combined_labels,
           title='Nb. measurements',
           loc='lower right',
           frameon=True,
           fontsize=14,
           title_fontsize=15,
           borderpad=1.2,
           labelspacing=1.2, 
           ncol = 3)
# ax2.set_title('Glacier measurement locations', fontsize = 25)
plt.tight_layout()
plt.show()


## Input data:

In [None]:
# Initialize logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Transform data to monthly format (run or load data):
# Input / output locations handed to the processing helper.
paths = dict(
    csv_path=path_PMB_GLAMOS_csv,
    era5_climate_data=path_ERA5_raw + 'era5_monthly_averaged_data.nc',
    geopotential_data=path_ERA5_raw + 'era5_geopotential_pressure.nc',
    radiation_save_path=path_pcsr + 'zarr/',
)
RUN = False  # forwarded as `run_flag` to the helper
dataloader_gl = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos,
    paths=paths,
    cfg=cfg,
    vois_climate=vois_climate,
    vois_topographical=vois_topographical,
)
data_monthly = dataloader_gl.data

In [None]:
test_glaciers = [
    'tortin',
    'plattalva',
    'sanktanna',
    'schwarzberg',
    'hohlaub',
    'pizol',
    'corvatsch',
    'tsanfleuron',
    'forno',
]

# Ensure all test glaciers exist in the dataset
existing_glaciers = set(dataloader_gl.data.GLACIER.unique())
missing_glaciers = [
    name for name in test_glaciers if name not in existing_glaciers
]
if missing_glaciers:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Every glacier in the dataset that is not a test glacier is used for training
train_glaciers = [
    name for name in existing_glaciers if name not in test_glaciers
]

in_test = dataloader_gl.data.GLACIER.isin(test_glaciers)
data_test = dataloader_gl.data[in_test]
print('Size of test data:', len(data_test))

in_train = dataloader_gl.data.GLACIER.isin(train_glaciers)
data_train = dataloader_gl.data[in_train]
print('Size of train data:', len(data_train))

# Test size relative to the train size (only defined with training data)
if len(data_train):
    test_perc = (len(data_test) / len(data_train)) * 100
    print('Percentage of test size: {:.2f}%'.format(test_perc))
else:
    print("Warning: No training data available!")

# Number of annual versus winter measurements:
print('Train:')
print('Number of winter and annual samples:', len(data_train))
print('Number of annual samples:',
      int((data_train.PERIOD == 'annual').sum()))
print('Number of winter samples:',
      int((data_train.PERIOD == 'winter').sum()))

# Same for test
data_test_annual = data_test[data_test.PERIOD == 'annual']
data_test_winter = data_test[data_test.PERIOD == 'winter']

print('Test:')
print('Number of winter and annual samples:', len(data_test))
print('Number of annual samples:', len(data_test_annual))
print('Number of winter samples:', len(data_test_winter))

# Whole (monthly) dataset
print('Total:')
print('Number of monthly rows:', len(dataloader_gl.data))
print('Number of annual rows:',
      int((dataloader_gl.data.PERIOD == 'annual').sum()))
print('Number of winter rows:',
      int((dataloader_gl.data.PERIOD == 'winter').sum()))

#### Heatmap annual:

In [None]:
# Heatmap for the annual measurements; `glacierCap` supplies the
# capitalised glacier names (presumably used as labels - confirm in the
# plotHeatmap helper).
plotHeatmap(test_glaciers, data_glamos, glacierCap, period='annual')

#### Heatmap winter:

In [None]:
# Heatmap for the winter measurements; `glacierCap` supplies the
# capitalised glacier names (presumably used as labels - confirm in the
# plotHeatmap helper).
plotHeatmap(test_glaciers, data_glamos, glacierCap, period='winter')

#### CV splits:

In [None]:
# Split by glacier: `test_glaciers` form the test set, and the helper also
# returns the cross-validation splits.
splits, test_set, train_set = get_CV_splits(
    dataloader_gl,
    test_split_on='GLACIER',
    test_splits=test_glaciers,
    random_state=cfg.seed,
)

n_test = len(test_set['df_X'])
n_train = len(train_set['df_X'])

print('Test glaciers: ({}) {}'.format(len(test_set['splits_vals']),
                                      test_set['splits_vals']))
# Test size relative to the train size
test_perc = (n_test / n_train) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', n_test)
print('Train glaciers: ({}) {}'.format(len(train_set['splits_vals']),
                                       train_set['splits_vals']))
print('Size of train set:', n_train)

visualiseSplits(test_set['y'], train_set['y'], splits)
visualiseInputs(train_set, test_set, vois_climate)

In [None]:
# Number of measurements per year:
# One stacked bar per YEAR, split by PERIOD.
counts_per_year = dataloader_gl.data.groupby(['YEAR', 'PERIOD']).size().unstack()
counts_per_year.plot.bar(stacked=True,
                         figsize=(20, 5),
                         color=[color_dark_blue, '#abd9e9'])
# plt.title('Number of measurements per year for all glaciers', fontsize = 25)
# get legend
plt.legend(title='Period', fontsize=18, title_fontsize=20, ncol=2)

## XGBoost:

In [None]:
# Grid search
# For each of the XGBoost parameter, define the grid range
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    # number of trees (too many = overfitting, too few = underfitting)
    'n_estimators': [50, 100, 200, 300, 400, 500, 600, 700],
    'learning_rate': [0.01, 0.1, 0.15, 0.2, 0.25, 0.3],
}

# Parameters shared by every model, independent of the grid
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    "random_state": cfg.seed,
    "n_jobs": cfg.numJobs,
}

# Climate variables of interest
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical variables of interest
vois_topographical = [
    "aspect_sgi",
    "slope_sgi",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
]

### Predictions of custom parameters:

In [None]:
# Fit one XGBoost model with hand-picked hyper-parameters on the train set
# and evaluate it on the held-out test glaciers.
# NOTE: n_estimators=800 lies outside `param_grid` (which stops at 700).
custom_params = {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 800}

# Feature columns:
feature_columns = [
    'ELEVATION_DIFFERENCE'
] + list(vois_climate) + list(vois_topographical) + ['pcsr']
# feature_columns = ['ELEVATION_DIFFERENCE'
#                    ] + list(vois_climate) + list(vois_topographical)
# Columns passed to the model: the features plus the configured
# non-feature fields.
all_columns = feature_columns + cfg.fieldsNotFeatures
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of testing dataset:', test_set['df_X'][all_columns].shape)
print('Running with features:', feature_columns)

# Custom values override the shared initial parameters on key clashes.
params = {**param_init, **custom_params}
print(params)
custom_model = mbm.models.CustomXGBoostRegressor(cfg, **params)

# Fit on train data:
custom_model.fit(train_set['df_X'][all_columns], train_set['y'])

# Make predictions on test
# The model was configured with device 'cuda:0' in `param_init`; switch it
# to the CPU before predicting.
custom_model = custom_model.set_params(device='cpu')
features_test, metadata_test = custom_model._create_features_metadata(
    test_set['df_X'][all_columns])
y_pred = custom_model.predict(features_test)
print('Shape of the test:', features_test.shape)

# Make predictions aggr to meas ID:
y_pred_agg = custom_model.aggrPredict(metadata_test, features_test)

# Calculate scores
# The score is reported as an absolute value because the model returns it
# with a negative sign (per the original note - confirm in the model class).
score = custom_model.score(test_set['df_X'][all_columns],
                           test_set['y'])  # negative
print('Overall score:', np.abs(score))

grouped_ids = getDfAggregatePred(test_set, y_pred_agg, all_columns)
PlotPredictions(grouped_ids, y_pred, metadata_test, test_set, custom_model)
plt.suptitle(f'MBM tested on {test_glaciers}', fontsize=20)
plt.tight_layout()

In [None]:
# Aggregate predictions to annual or winter:
# Predicted vs. observed plot built from the aggregated test predictions
# in `grouped_ids` (presumably one panel per glacier - confirm in the helper).
PlotIndividualGlacierPredVsTruth(grouped_ids, figsize=(20, 15))

In [None]:
# Plot for the fitted model over `feature_columns` (presumably feature
# importance, going by the helper's name - confirm in FIPlot).
FIPlot(custom_model, feature_columns, vois_climate)