## Setting up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point
from scipy.spatial.distance import cdist
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold, train_test_split, GroupShuffleSplit
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
from oggm import cfg, utils, workflow, tasks
import logging
import geopandas as gpd
import xarray as xr
from ast import literal_eval

import config
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.xgb_helpers import *

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Load the autoreload extension in mode 2 so that edited modules are
# re-imported before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# Seed with the configured value, then free CUDA memory in case none is left.
seed_all(config.SEED)
free_up_cuda()

# Matplotlib style sheet used for the figures.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Working directory handed to the glacier-mask / glacier-grid calls.
custom_working_dir = '../../../data/OGGM/'

# Short names of the climate variables available in the dataset.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
]
# Topographical variables.
# (variant without 'topo': ['aspect', 'slope', 'dis_from_border'])
voi_topographical = [
    'aspect',
    'slope',
    'dis_from_border',
    'topo',
]


In [None]:
# RGI id table: strip whitespace from the column names, then sort by and
# index on the glacier short name.
rgi_df = (pd.read_csv(path_rgi, sep=',')
          .rename(columns=lambda name: name.strip())
          .sort_values(by='short_name')
          .set_index('short_name'))

In [None]:
# GLAMOS measurement table (CH_wgms_dataset.csv); show the first two rows.
glamos_csv = f'{path_PMB_GLAMOS_csv}CH_wgms_dataset.csv'
data_glamos = pd.read_csv(glamos_csv)
data_glamos.head(2)

## Glacier grid:

In [None]:
glacierName = 'gries'
# Look up the RGI v6 id of the glacier and keep only its measurements.
rgi_gl = rgi_df.loc[glacierName, 'rgi_id.v6']
data_gl = data_glamos[data_glamos.RGIId == rgi_gl]
dataset_gl = mbm.Dataset(data=data_gl,
                         region_name='CH',
                         data_path=path_PMB_GLAMOS_csv)
ds, glacier_indices, gdir = dataset_gl.get_glacier_mask(custom_working_dir)

# Create pandas dataframe of glacier grid
years = data_gl['YEAR'].unique()
df_grid_annual = dataset_gl.create_glacier_grid(custom_working_dir)
# Add metadata that is not in WGMS dataset
df_grid_annual["PERIOD"] = "annual"
df_grid_annual['GLACIER'] = glacierName

# Load monthly glacier grid (preprocessed in other notebooks)
df_grid_monthly = pd.read_csv(path_glacier_grid + f'{glacierName}_grid.csv')
dataloader = mbm.DataLoader(data=df_grid_monthly,
                            random_seed=config.SEED,
                            meta_data_columns=config.META_DATA)

# unique() keeps the order of appearance, so the first/last entries are not
# necessarily the earliest/latest year: report min and max instead.
print(f'\nNumber of years: {len(years)}, '
      f'from {years.min()} to {years.max()}')
print('\nNumber of total (yearly) measurements:', len(df_grid_annual))
df_grid_monthly.head(2)

In [None]:
# Glacier attributes from OGGM, drawn with the viridis colormap.
viridis_cmap = sns.color_palette("viridis", as_cmap=True)
plotGlAttr(ds, cmap=viridis_cmap)

# Glacier grid together with the stakes.
plotGlGrid(df_grid_annual, data_gl)

## Stakes data:

In [None]:
# Measurements of the selected glacier and the matching mbm dataset.
is_selected_glacier = data_glamos.RGIId == rgi_gl
data_gl = data_glamos[is_selected_glacier]
dataset_gl = mbm.Dataset(
    data=data_gl,
    region_name='CH',
    data_path=path_PMB_GLAMOS_csv,
)

# Sample counts: all rows first, then one line per measurement period.
print('Number of winter and annual samples:', len(data_gl))
for period in ('annual', 'winter'):
    n_period = len(data_gl[data_gl.PERIOD == period])
    print(f'Number of {period} samples:', n_period)
plotNumMeasPerYear(data_gl, glacierName)

In [None]:
# Monthly data loader for this glacier, built from the climate and
# topographical variable lists.
dataloader_gl = getMonthlyDataLoaderOneGl(
    glacierName,
    vois_climate,
    voi_topographical,
)

# Train / test sets and cross-validation splits, test split made on YEAR.
cv_output = getCVSplits(dataloader_gl, test_split_on='YEAR')
splits, test_set, train_set = cv_output

for label, subset in (('Train', train_set), ('Test', test_set)):
    print(f'{label} years:', subset['splits_vals'])

# Plot splits
visualiseSplits(test_set['y'], train_set['y'], splits)


## All combinations of features:

In [None]:
# Feature list when every variable is used.
feature_columns = ['ELEVATION_DIFFERENCE', *voi_topographical, *vois_climate]
print('Feature columns:', feature_columns)

# Candidate subsets of each variable group (powerset with min_length 3 for
# the climate variables, 2 for the topographical ones).
iterable_voi = list(powerset(vois_climate, min_length=3))
iterable_topo = list(powerset(voi_topographical, min_length=2))

# Every (climate subset, topographical subset) pairing, climate-major order.
combinations_voi_topo = [(climate_subset, topo_subset)
                         for climate_subset in iterable_voi
                         for topo_subset in iterable_topo]

print('Number of combinations:', len(combinations_voi_topo))

In [None]:
# Create custom XGB regressor
custom_params = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.01,
    'gamma': 1
}
param_init = {}
param_init['device'] = 'cuda:0'
param_init['tree_method'] = 'hist'
param_init["random_state"] = config.SEED
# merge two dictionaries
params = {**custom_params, **param_init}

In [None]:
%%time

glaciersToRun = ['aletsch', 'gries', 'silvretta']

# Create the output folder up front so the final to_csv cannot fail on a
# missing directory after the whole search has run.
os.makedirs('results', exist_ok=True)

for glacierName in glaciersToRun:
    results_csv = f'results/combinations_climate_topo_{glacierName}.csv'

    # If this glacier has already been run, skip it
    if os.path.exists(results_csv):
        print(f'{glacierName} already run, skipping...')
        continue
    print('Running for', glacierName)

    # Add climate features and transform to monthly format
    dataloader_gl = getMonthlyDataLoaderOneGl(glacierName, vois_climate,
                                              voi_topographical)

    # Get train, test and validation data
    splits, test_set, train_set = getCVSplits(dataloader_gl,
                                              test_split_on='YEAR')

    # Columns kept next to the features in every training subset.
    extra_columns = config.META_DATA + config.NOT_METADATA_NOT_FEATURES

    val_score = []
    for (voi, topo) in tqdm(combinations_voi_topo, desc='Number of comb.'):
        feature_columns = ['ELEVATION_DIFFERENCE'] + voi + topo

        # Make a cross-validation split
        splits_cv = dataloader_gl.get_cv_split(n_splits=3,
                                               type_fold='group-meas-id')

        # Select those features in the dataset
        df_X_train_subset = train_set['df_X'][feature_columns + extra_columns]
        groups_subset = df_X_train_subset['ID'].values

        # cross_val_score clones the estimator and fits it once per fold, so
        # no separate fit on the full training set is needed beforehand.
        custom_xgboost = mbm.models.CustomXGBoostRegressor(**params)
        scores_cv = cross_val_score(custom_xgboost,
                                    df_X_train_subset,
                                    y=train_set['y'],
                                    groups=groups_subset,
                                    cv=splits_cv)
        val_score.append(scores_cv.mean())

    # One row per combination, best (highest) score first.
    dfcombi = pd.DataFrame(combinations_voi_topo, columns=['voi', 'topo'])
    dfcombi['val_score'] = val_score
    dfcombi.sort_values(by='val_score', ascending=False, inplace=True)
    # Save the results
    dfcombi.to_csv(results_csv, index=False)

In [None]:
# Read the saved search results of every glacier and stack them in one frame.
frames = []
for glacierName in glaciersToRun:
    dfcombi_ = pd.read_csv(
        f'results/combinations_climate_topo_{glacierName}.csv',
        # 'voi' and 'topo' hold Python literals written as text; parse them
        # back into Python objects.
        converters={
            "voi": literal_eval,
            "topo": literal_eval
        })
    dfcombi_ = dfcombi_.rename(columns={'voi': 'climate'})
    # Give a hash to each combination of climate and topo variables:
    dfcombi_['climate-topo-hash'] = [
        makeCombNum(climate_vars, topo_vars) for climate_vars, topo_vars in
        zip(dfcombi_['climate'], dfcombi_['topo'])
    ]
    # Make val_score positive
    dfcombi_['val_score'] = dfcombi_['val_score'].abs()
    dfcombi_['glacier'] = glacierName
    frames.append(dfcombi_)

# Concatenate once instead of growing a frame inside the loop; fall back to
# an empty frame when there is nothing to load.
dfcombi = pd.concat(frames) if frames else pd.DataFrame()

# For every glacier, take the N best combinations and measure how often each
# variable appears among them.
N = 50
all_features = list(vois_climate) + list(voi_topographical)
feature_types = (['climate'] * len(vois_climate) +
                 ['topo'] * len(voi_topographical))

climate_frames, topo_frames = [], []
for glacier in dfcombi.glacier.unique():
    # val_score is stored as a positive value, so ascending order puts the
    # lowest scores first.
    N_best = dfcombi[dfcombi.glacier == glacier].sort_values(
        by='val_score')[:N]
    mean_score = N_best['val_score'].mean()

    # Count occurrences by variable NAME and align them with the declared
    # variable order. Writing the counts positionally would pair them with
    # the wrong labels (grouped counts come back sorted alphabetically and
    # skip variables that never occur).
    climate_weight_map = (N_best['climate'].explode().value_counts().reindex(
        vois_climate, fill_value=0).to_numpy(dtype=float))
    topo_weight_map = (N_best['topo'].explode().value_counts().reindex(
        voi_topographical, fill_value=0).to_numpy(dtype=float))

    dfWeights = pd.DataFrame({
        'weight':
        np.concatenate([climate_weight_map, topo_weight_map], axis=0),
        'feature_type':
        feature_types,
        'feature':
        all_features
    })
    # Share of the N best combinations that contain the variable.
    dfWeights['freq_var'] = dfWeights['weight'] / N
    dfWeights['glacier'] = glacier
    dfWeights['mean_score'] = mean_score
    dfWeights_climate = dfWeights[dfWeights.feature_type ==
                                  'climate'].sort_values(by='freq_var',
                                                         ascending=False)
    dfWeights_topo = dfWeights[dfWeights.feature_type == 'topo'].sort_values(
        by='freq_var', ascending=False)

    climate_frames.append(dfWeights_climate)
    topo_frames.append(dfWeights_topo)

# Concatenate once per group; fall back to empty frames when no glacier ran.
dfWeights_climate_all = (pd.concat(climate_frames)
                         if climate_frames else pd.DataFrame())
dfWeights_topo_all = pd.concat(topo_frames) if topo_frames else pd.DataFrame()
dfWeights_all = pd.concat([dfWeights_climate_all, dfWeights_topo_all])

In [None]:
# Plot the frequency of each variable: one row per feature type, one column
# per glacier. FacetGrid creates its own figure, so no plt.figure() call is
# made here (it would only leave an empty extra figure behind).
#
# sharex=False: every facet has its own set and order of categories (climate
# vs. topo variables, sorted per glacier), so a shared x axis would put the
# tick labels of one facet under the bars of another.
g = sns.FacetGrid(
    dfWeights_all,
    col="glacier",
    row='feature_type',
    sharex=False,
)
g.map(sns.barplot, "feature", "freq_var", orient='v', alpha=0.5)

# axes_dict is keyed by (feature_type, glacier) because both row and col are
# set on the grid.
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis="x", rotation=90)
    ax.set_title(col_val)
    # Mean score of the best combinations of this facet's glacier.
    mean_score = dfWeights_all[dfWeights_all.glacier ==
                               col_val[1]]['mean_score'].iloc[0]
    legend_text = r"$\mathrm{val\ MSE}=%.1f$" % (mean_score, )
    ax.text(
        0.05,
        0.88,
        legend_text,
        transform=ax.transAxes,
        verticalalignment="bottom",
        fontsize=16,
    )

## Test best combination for a glacier:

In [None]:
glacierName = 'gries'

# Monthly data loader with climate and topographical variables, followed by
# the train / test sets and cross-validation splits (test split on YEAR).
dataloader_gl = getMonthlyDataLoaderOneGl(
    glacierName,
    vois_climate,
    voi_topographical,
)
splits, test_set, train_set = getCVSplits(dataloader_gl, test_split_on='YEAR')

# Variables of this glacier whose frequency among the best combinations
# exceeds 0.5, per feature type.
frequent_here = ((dfWeights_all.glacier == glacierName)
                 & (dfWeights_all.freq_var > 0.5))
voi = dfWeights_all[(dfWeights_all.feature_type == 'climate')
                    & frequent_here].feature.values
topo = dfWeights_all[(dfWeights_all.feature_type == 'topo')
                     & frequent_here].feature.values

# The search itself runs on every climate variable and every topographical
# variable except 'topo'.
# Alternative selection based on the frequencies computed above:
# feature_columns = ['ELEVATION_DIFFERENCE'] + list(voi) + list(topo)
feature_columns = ['ELEVATION_DIFFERENCE', *vois_climate, *voi_topographical]
feature_columns.remove('topo')
all_columns = [
    *feature_columns,
    *config.META_DATA,
    *config.NOT_METADATA_NOT_FEATURES,
]

# Search space: candidate values for each XGBoost parameter.
parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1],
}

# Device, tree method and random seed.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    "random_state": config.SEED,
}

# Training subset restricted to the selected columns.
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of the dataset:', df_X_train_subset.shape)
print('Running with features: ', feature_columns)

# Randomised search with a CustomXGBoostRegressor instance.
custom_xgboost = mbm.models.CustomXGBoostRegressor(**param_init)
custom_xgboost.randomsearch(
    parameters=parameters,
    n_iter=20,
    splits=splits,
    features=df_X_train_subset,
    targets=train_set['y'],
    num_jobs=-1,
    random_seed=config.SEED,
)

# Keep the winning parameters (also bound to `params`) and the estimator.
search = custom_xgboost.param_search
best_params = search.best_params_
params = best_params
best_estimator = search.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", search.best_score_)

In [None]:
# Set to CPU for predictions:
xgb = best_estimator.set_params(device='cpu')

# Slice the test set once with the column list used for training, instead of
# rebuilding the same slice (and the same column list) several times.
df_X_test = test_set['df_X'][all_columns]

# Make predictions on test
features_test, metadata_test = xgb._create_features_metadata(
    df_X_test, config.META_DATA)
y_pred = xgb.predict(features_test)
print('Shape of the test:', features_test.shape)

# Make predictions aggr to meas ID:
y_pred_agg = xgb.aggrPredict(metadata_test, config.META_DATA, features_test)

# Calculate scores
score = xgb.score(df_X_test, test_set['y'])  # negative
mse, rmse, mae, pearson_corr = xgb.evalMetrics(metadata_test, y_pred,
                                               test_set['y'])

# Aggregate predictions to annual or winter:
df_pred = df_X_test.copy()
df_pred['target'] = test_set['y']
grouped_ids = df_pred.groupby('ID').agg({'target': 'mean', 'YEAR': 'first'})
grouped_ids['pred'] = y_pred_agg
# PERIOD of each measurement ID, taken from the same slice.
grouped_ids['PERIOD'] = df_pred.groupby('ID')['PERIOD'].first()

fig = plt.figure(figsize=(15, 5))
ax = plt.subplot(1, 2, 1)
predVSTruth(ax, grouped_ids, mae, rmse, pearson_corr)
ax.set_title('Target vs prediction')

ax = plt.subplot(1, 2, 2)

plotMeanPred(grouped_ids, ax)
ax.legend()
ax.set_title('Mean yearly target and prediction')
plt.suptitle(f'XGBoost on {glacierName.title()} (split years)', fontsize=20)
plt.tight_layout()

In [None]:
# Label the importances with the features the model was trained on: all
# climate and topographical variables except 'topo'. Including 'topo' here
# would give one label more than the model has features.
feature_columns = [
    column for column in ['ELEVATION_DIFFERENCE'] + list(vois_climate) +
    list(voi_topographical) if column != 'topo'
]
# NOTE(review): confirm that FIPlot expects the columns in this order.
FIPlot(best_estimator, feature_columns, vois_climate)

In [None]:
# Whole grid:
# Load the monthly glacier grid and keep only the columns used for training.
grid_csv = path_glacier_grid + f'{glacierName}_grid.csv'
df_grid_monthly = pd.read_csv(grid_csv)[all_columns]

# Set to CPU for predictions:
xgb = xgb.set_params(device='cpu')

# Predict on every row of the glacier grid.
features_grid, metadata_grid = xgb._create_features_metadata(
    df_grid_monthly,
    config.META_DATA,
)
print('Shape of the dataset:', features_grid.shape)
y_pred_grid = xgb.predict(features_grid)

# Predictions aggregated to measurement ID.
y_pred_grid_agg = xgb.aggrPredict(
    metadata_grid,
    config.META_DATA,
    features_grid,
)

# One row per ID with its YEAR and aggregated prediction.
grouped_ids = df_grid_monthly.groupby('ID')[['YEAR']].mean()
grouped_ids['pred'] = y_pred_grid_agg

# Average over all grid points of each year to get the glacier-wide SMB.
grouped_ids = grouped_ids.groupby('YEAR').mean()

# Reference series: drop obvious duplicates, derive YEAR from 'date1' and
# divide 'Annual Balance' by 1000 (presumably a unit conversion to the
# m w.e. shown on the y axis; confirm against the data source).
df_target = pd.read_csv(f'{path_SMB_GLAMOS_csv}fix/{glacierName}_fix.csv')
df_target = transformDates(df_target).drop_duplicates()
df_target['YEAR'] = df_target['date1'].apply(lambda x: pd.to_datetime(x).year)
df_target['Annual Balance'] = df_target['Annual Balance'] / 1000
df_target = df_target[['YEAR', 'Annual Balance']].set_index('YEAR')

# Predicted vs. reference glacier-wide SMB (reference shown after 1960).
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
grouped_ids.plot(y='pred', label='Predicted SMB', ax=ax, color=color_xgb)

recent_target = df_target[df_target.index > 1960]
recent_target.plot(
    y='Annual Balance',
    label='GLAMOS SMB',
    ax=ax,
    color=color_tim,
)

ax.set_title(f'{glacierName.title()} SMB')
ax.set_ylabel('SMB (m w.e.)')
plt.tight_layout()

### Compare to the geodetic mass balance (GeoB):

In [None]:
# Volume-change table (semicolon separated); the first two rows hold units.
geodetic_path = '../../../data/GLAMOS/glacier-wide/volumechange_2023_r2023/volumechange_2023_r2023_old.csv'
geodetic_csv = pd.read_csv(geodetic_path, sep=';').iloc[2:]


def _to_sgi_id(raw_id):
    """Turn 'xx-yy...' into 'XX/yy': upper-case first part, both stripped."""
    parts = raw_id.split('-')
    return parts[0].upper().strip() + '/' + parts[1].strip()


geodetic_csv['glacier id'] = geodetic_csv['glacier id'].apply(_to_sgi_id)

# Shorter column names for the fields used in the comparison.
geodetic_csv = geodetic_csv.rename(
    columns={
        'glacier id': 'sgi-id',
        'start date of observation': 'FROM_DATE',
        'end date of observation': 'TO_DATE',
        'annual geodetic mass balance': 'Bgeod',
    })
geodetic_csv.head(2)

In [None]:
# SGI id of the current glacier, then its rows of the geodetic table.
sgi_id = rgi_df.loc[glacierName, 'sgi-id'].strip()
is_this_glacier = geodetic_csv['sgi-id'] == sgi_id
gl_geoMB = geodetic_csv[is_this_glacier]


# Year helper used to fill the FROM_YEAR / TO_YEAR columns.
def assignHydrYear(date):
    """Return the calendar year of ``date``.

    ``date`` is parsed with ``pd.to_datetime``. No shift is applied inside
    this function; the value returned is the plain ``.year`` of the parsed
    date.
    """
    return pd.to_datetime(date).year


# gl_geoMB is a filtered selection of geodetic_csv: take an explicit copy
# before adding columns, so the assignments below act on an independent
# frame and not on a possible view of the parent table.
gl_geoMB = gl_geoMB.copy()

# Year columns: the start year is shifted by one, the end year is kept.
gl_geoMB['FROM_YEAR'] = pd.to_datetime(
    gl_geoMB['FROM_DATE']).apply(assignHydrYear) + 1
gl_geoMB['TO_YEAR'] = pd.to_datetime(gl_geoMB['TO_DATE']).apply(assignHydrYear)
gl_geoMB = gl_geoMB[gl_geoMB['FROM_YEAR'] > 1961].copy()
gl_geoMB['B-Period'] = gl_geoMB['FROM_YEAR'].astype(
    str) + '-' + gl_geoMB['TO_YEAR'].astype(str)
gl_geoMB['Bgeod'] = gl_geoMB['Bgeod'].astype(float)
gl_geoMB['volume change'] = gl_geoMB['volume change'].astype(float)

# Mean of the predicted and of the reference series over each period
# (label-based slices, both ends included).
geodPred_ML, geodPred_TIM = [], []
for from_year, to_year in zip(gl_geoMB['FROM_YEAR'], gl_geoMB['TO_YEAR']):
    geodPred_ML.append(grouped_ids.loc[from_year:to_year].mean().values[0])
    geodPred_TIM.append(df_target.loc[from_year:to_year].mean().values[0])

# Long format: one row per (period, type) for the grouped bar plot.
geodPred_df = pd.DataFrame({
    'Bgeod':
    np.concatenate([gl_geoMB['Bgeod'].values, geodPred_ML, geodPred_TIM]),
    'Type':
    np.concatenate([
        np.tile('Bgeod', len(gl_geoMB)),
        np.tile('ML', len(gl_geoMB)),
        np.tile('PDD', len(gl_geoMB))
    ]),
    'Period':
    np.concatenate(
        [gl_geoMB['B-Period'], gl_geoMB['B-Period'], gl_geoMB['B-Period']])
})

fig = plt.figure(figsize=(15, 5))
ax = plt.subplot(1, 1, 1)
sns.barplot(geodPred_df,
            x='Period',
            y='Bgeod',
            hue='Type',
            ax=ax,
            orient='v',
            alpha=0.5,
            palette=sns.color_palette(["green", color_xgb, color_tim]))
# Set the title before tight_layout so the layout accounts for it.
ax.set_title(glacierName.title() + ' GeoMB')
plt.tight_layout()