## Setting Up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point
from scipy.spatial.distance import cdist
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold, train_test_split, GroupShuffleSplit
from calendar import month_abbr
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
from oggm import cfg, utils, workflow, tasks
import logging
import geopandas as gpd
import xarray as xr

import config
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.xgb_helpers import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [None]:
# Seed through the project helper, using the seed stored in the config module.
seed_all(config.SEED)

# Short names of the climate variables available in the dataset, and of the
# topographical variables used next to them.
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
vois_topographical = ['aspect', 'slope', 'dis_from_border']

# Project helper; called up front in case CUDA memory is short.
free_up_cuda()

# Plot styles:
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Palette with 15 colours sampled from the devon colormap.
cmap = cm.devon
glacier_hex_codes = get_cmap_hex(cmap, 15)
color_palette_glaciers = sns.color_palette(glacier_hex_codes)

# For bars and lines (a lighter alternative that was tried: '#878787'):
color_diff_xgb = '#4d4d4d'

colors = get_cmap_hex(cm.batlow, 10)
color_xgb, color_xgb_winter = colors[0], colors[1]

color_tim = '#c51b7d'

# Violin and boxplots:
colors_temp_freq = sns.color_palette(get_cmap_hex(cm.devon, 8))
boxplot_style = dict(
    width=.6,
    showcaps=False,
    palette=colors_temp_freq,
    flierprops=dict(marker="x"),
    showmeans=True,
    meanprops=dict(markerfacecolor="white"),
)

# Marker symbols used in the plots.
marker_tim, marker_xgb, marker_std = 's', 'o', '_'

custom_working_dir = '../../../data/OGGM/'

In [None]:
# RGI ids: read the id table, strip whitespace from the column names, then
# sort by and index on the glacier short name.
path_rgi = '../../../data/GLAMOS/CH_glacier_ids_long.csv'
rgi_df = (pd.read_csv(path_rgi, sep=',')
          .rename(columns=lambda name: name.strip())
          .sort_values(by='short_name')
          .set_index('short_name'))

In [None]:
# Stake measurements over all glaciers, read from the GLAMOS CSV directory.
glamos_csv = path_PMB_GLAMOS_csv + 'CH_wgms_dataset.csv'
data_glamos = pd.read_csv(glamos_csv)
# Show the first two rows as the cell output.
data_glamos.head(2)

In [None]:
# Heatmap of mean mass balance per glacier and year.
# Mean point balance for every (glacier, year) pair ...
mean_mb_per_glacier = data_glamos.groupby(['GLACIER',
                                           'YEAR'])['POINT_BALANCE'].mean()
# ... reshaped into a glacier x year table.
mb_long = pd.DataFrame(mean_mb_per_glacier).reset_index()
matrix = mb_long.pivot(index='GLACIER',
                       columns='YEAR',
                       values='POINT_BALANCE')
matrix = matrix.sort_values(by='GLACIER')

# Mean stake elevation per glacier, used to order the rows from the lowest
# to the highest glacier.
gl_per_el = data_glamos.groupby(['GLACIER'])['POINT_ELEVATION'].mean()
glaciers_low_to_high = gl_per_el.sort_values(ascending=True).index
matrix = matrix.loc[glaciers_low_to_high]

# Make the index an ordered categorical that follows the current row order.
matrix.index = pd.Categorical(matrix.index,
                              categories=matrix.index,
                              ordered=True)

fig, ax = plt.subplots(figsize=(20, 20))
heatmap_options = {
    'center': 0,
    'cmap': cm.vik_r,
    'cbar_kws': {'label': '[m w.e. $a^{-1}$]'},
}
sns.heatmap(data=matrix, ax=ax, **heatmap_options)

In [None]:
# Plot the mean stake elevation per glacier, ordered from lowest to highest.
fig, ax = plt.subplots(figsize=(10, 2))
elevation_low_to_high = gl_per_el.sort_values(ascending=True)
sns.lineplot(elevation_low_to_high, ax=ax, color='gray', marker='v')
# Blank out the x tick labels and the y axis label.
ax.set_xticklabels('', rotation=90)
ax.set_ylabel('')
plt.tight_layout()


In [None]:
# Number of rows (measurements) per glacier, in ascending order, as a bar chart.
measurements_per_glacier = data_glamos.groupby(['GLACIER']).size()
num_gl = measurements_per_glacier.sort_values()
num_gl.plot.bar(figsize=(15, 5), cmap=cmap)
plt.title('Number of total measurements per glacier since 1961')

In [None]:
# Cell output: how many distinct glaciers there are, followed by their names.
glacier_names = data_glamos.GLACIER.unique()
len(glacier_names), glacier_names

In [None]:
# RUN = True rebuilds the monthly dataset from the stake data and writes it to
# disk; RUN = False reads the previously written CSV instead.
RUN = True
monthly_csv = path_PMB_GLAMOS_csv + 'CH_wgms_dataset_monthly.csv'

if not RUN:
    # read data
    data_monthly = pd.read_csv(monthly_csv)
    dataloader_gl = mbm.DataLoader(data=data_monthly,
                                   random_seed=config.SEED,
                                   meta_data_columns=config.META_DATA)
else:
    glaciers_in_data = data_glamos.GLACIER.unique()
    print('Running on {} glaciers'.format(len(glaciers_in_data)))
    print('Glaciers:', glaciers_in_data)

    # Wrap the stake data in a massbalancemachine Dataset.
    dataset_gl = mbm.Dataset(data=data_glamos,
                             region_name='CH',
                             data_path=path_PMB_GLAMOS_csv)

    annual_rows = data_glamos[data_glamos.PERIOD == 'annual']
    winter_rows = data_glamos[data_glamos.PERIOD == 'winter']
    print('Number of winter and annual samples:', len(data_glamos))
    print('Number of annual samples:', len(annual_rows))
    print('Number of winter samples:', len(winter_rows))

    # Climate data files that will be matched with the coordinates of the
    # stake data.
    era5_climate_data = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
    geopotential_data = path_ERA5_raw + 'era5_geopotential_pressure.nc'

    # Match the climate features, from the ERA5Land netCDF file, to each
    # stake measurement.
    dataset_gl.get_climate_features(climate_data=era5_climate_data,
                                    geopotential_data=geopotential_data,
                                    change_units=True)

    print('Converting to monthly resolution')
    # For each record, convert to a monthly time resolution.
    dataset_gl.convert_to_monthly(meta_data_columns=config.META_DATA,
                                  vois_climate=vois_climate,
                                  vois_topographical=vois_topographical)

    # DataLoader built on the monthly stake data.
    dataloader_gl = mbm.DataLoader(data=dataset_gl.data,
                                   random_seed=config.SEED,
                                   meta_data_columns=config.META_DATA)

    print('Number of monthly rows:', len(dataloader_gl.data))
    print('Columns in the dataset:', dataloader_gl.data.columns)

    # Save the monthly data so that later runs can set RUN = False.
    dataloader_gl.data.to_csv(monthly_csv, index=False)

print('Number of monthly rows:', len(dataloader_gl.data))

In [None]:
# Sanity check of variables: for one glacier and one year, plot the monthly
# mean of the target and of every climate variable, one panel per variable.
sanity_glacier = 'aletsch'
sanity_year = 1961
var_to_plot = ['POINT_BALANCE'] + vois_climate

df = dataloader_gl.data
df = df[(df.GLACIER == sanity_glacier) & (df.YEAR == sanity_year)]
df = df.groupby(['MONTHS'])[var_to_plot].mean().reset_index()

# Order the rows by calendar month: `month_abbr` lists the abbreviated month
# names at indices 1..12, so the list position is the month number.
month_names = list(month_abbr)
df['month_nb'] = df.MONTHS.apply(
    lambda x: month_names.index(x.capitalize()))
df = df.sort_values(by='month_nb')

fig, axes = plt.subplots(2, 4, figsize=(12, 5))

# Pair panels with variables instead of indexing `var_to_plot` by panel
# number, and keep the axes array under its own name rather than rebinding
# it inside the loop.
for panel, var in zip(axes.flatten(), var_to_plot):
    df.plot(x='MONTHS', y=var, marker='o', ax=panel)
    panel.set_title(var)

plt.tight_layout()

## Split on glaciers:

In [None]:
# Heatmap of mean mass balance per glacier and year (same computation as the
# heatmap cell in the set-up section).
mean_mb_per_glacier = data_glamos.groupby(['GLACIER',
                                           'YEAR'])['POINT_BALANCE'].mean()
mb_long = pd.DataFrame(mean_mb_per_glacier).reset_index()
matrix = mb_long.pivot(index='GLACIER',
                       columns='YEAR',
                       values='POINT_BALANCE')
matrix = matrix.sort_values(by='GLACIER')

# Order the rows by the mean stake elevation of each glacier, lowest first.
gl_per_el = data_glamos.groupby(['GLACIER'])['POINT_ELEVATION'].mean()
glaciers_low_to_high = gl_per_el.sort_values(ascending=True).index
matrix = matrix.loc[glaciers_low_to_high]

# Make the index an ordered categorical that follows the current row order.
matrix.index = pd.Categorical(matrix.index,
                              categories=matrix.index,
                              ordered=True)

fig, ax = plt.subplots(figsize=(20, 20))
heatmap_options = {
    'center': 0,
    'cmap': cm.vik_r,
    'cbar_kws': {'label': '[m w.e. $a^{-1}$]'},
}
sns.heatmap(data=matrix, ax=ax, **heatmap_options)

In [None]:
# Glaciers held out for testing; every other glacier in the data is used for
# training.
test_glaciers = [
    'tortin', 'limmern', 'taelliboden', 'sanktanna', 'schwarzberg', 'hohlaub',
    'rhone'
]
train_glaciers = [
    name for name in data_glamos.GLACIER.unique()
    if name not in test_glaciers
]

is_test_row = data_glamos.GLACIER.isin(test_glaciers)
is_train_row = data_glamos.GLACIER.isin(train_glaciers)
data_test = data_glamos[is_test_row]
data_train = data_glamos[is_train_row]

# NOTE: this is the size of the test set relative to the TRAIN set, not
# relative to the full dataset.
test_to_train = len(data_test) / len(data_train)
test_perc = test_to_train * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))

In [None]:
# Build the cross-validation splits, holding out the test glaciers.
splits, test_set, train_set = getCVSplits(dataloader_gl,
                                          test_split_on='GLACIER',
                                          test_splits=test_glaciers)

held_out = test_set['splits_vals']
kept_in = train_set['splits_vals']
n_test_rows = len(test_set['df_X'])
n_train_rows = len(train_set['df_X'])

print('Test glaciers: ({}) {}'.format(len(held_out), held_out))
# NOTE: the percentage is test rows relative to TRAIN rows, not to all rows.
test_perc = (n_test_rows / n_train_rows) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', n_test_rows)
print('Train glaciers: ({}) {}'.format(len(kept_in), kept_in))
print('Size of train set:', n_train_rows)
visualiseSplits(test_set['y'], train_set['y'], splits)


In [None]:
# Histograms of the target, elevation, year and every climate variable:
# train set on the top row, test set on the bottom row, one column per
# variable.
hist_columns = ['POINT_BALANCE', 'POINT_ELEVATION', 'YEAR'
                ] + list(vois_climate)
hist_titles = ['PMB', 'ELV', 'YEARS'] + list(vois_climate)

f, axes = plt.subplots(2,
                       len(hist_columns),
                       figsize=(16, 6),
                       sharey='row',
                       sharex='col')

# One entry per row of panels: data, colour, y label of the first panel and
# per-column bin overrides (the test elevation histogram uses 50 bins).
hist_rows = [
    (train_set['df_X'], color_xgb, 'Frequency (train)', {}),
    (test_set['df_X'], color_tim, 'Frequency (test)', {
        'POINT_ELEVATION': 50
    }),
]

for row, (df_row, color, ylabel, bins_per_column) in enumerate(hist_rows):
    for col, column in enumerate(hist_columns):
        extra = {}
        if column in bins_per_column:
            extra['bins'] = bins_per_column[column]
        # Each variable is drawn exactly once per row; drawing `t2m` both
        # explicitly and in a loop over the climate variables would
        # over-plot that panel.
        df_row[column].plot.hist(ax=axes[row, col],
                                 color=color,
                                 alpha=0.6,
                                 density=False,
                                 **extra)
    axes[row, 0].set_ylabel(ylabel)

# Titles only on the top row.
for title, panel in zip(hist_titles, axes[0]):
    panel.set_title(title)

# Rotate the x ticks and drop the x labels on every panel.
for panel in axes.flatten():
    panel.tick_params(axis='x', rotation=45)
    panel.set_xlabel('')

plt.tight_layout()

### All variables:

In [None]:
%%time
# Grid search over the XGBoost hyper-parameters.
# The search space is bound to `param_grid` because that is the name passed
# to `gridsearch` below and to `plotGridSearchParams` in a later cell.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    # number of trees (too many = overfitting, too few = underfitting)
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 0.15, 0.2, 0.25, 0.3],
}
# Former name of the search space, kept as an alias for code that still
# refers to it.
param_ranges = param_grid

# Constructor arguments for the regressor: training runs on the first CUDA
# device with the histogram tree method.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    'random_state': config.SEED,
    'n_jobs': config.NUM_JOBS,
}

# Feature columns:
feature_columns = [
    'ELEVATION_DIFFERENCE', 'POINT_ELEVATION', 'ALTITUDE_CLIMATE'
] + list(vois_climate) + list(vois_topographical)
all_columns = feature_columns + config.META_DATA + config.NOT_METADATA_NOT_FEATURES
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of testing dataset:', test_set['df_X'][all_columns].shape)
print('Running with features:', feature_columns)

model_file = 'xgb_gl_split_GS.pkl'

# RUN = True runs the search and saves the result; RUN = False loads the
# previously saved model instead.
RUN = True
if RUN:
    # Create a CustomXGBoostRegressor instance
    custom_xgboost = mbm.models.CustomXGBoostRegressor(**param_init)
    # Exhaustive search over `param_grid` on the CV splits. A randomised
    # search was used before through `custom_xgboost.randomsearch(...)`,
    # with `n_iter=45` and otherwise the same arguments.
    custom_xgboost.gridsearch(
        parameters=param_grid,
        splits=splits,
        features=df_X_train_subset,
        targets=train_set['y'],
    )
    # save best model
    custom_xgboost.save_model(model_file)
else:
    # read model
    # NOTE(review): the code below reads `param_search` from this object, so
    # `load_model` is assumed to restore it in place - confirm.
    custom_xgboost = mbm.models.CustomXGBoostRegressor()
    custom_xgboost.load_model(model_file)

# Get best parameters and estimator
best_params = custom_xgboost.param_search.best_params_
best_estimator = custom_xgboost.param_search.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", custom_xgboost.param_search.best_score_)

In [None]:
# Project plotting helper, called with the best estimator from the parameter
# search, the CV splits, the training set and the feature column names.
visualiseValPreds(best_estimator, splits, train_set, feature_columns)

In [None]:
# Project plotting helpers for the parameter search, called with the fitted
# search wrapper, the search space and the best parameter combination.
# The search space is passed as `param_ranges`, which the grid-search cell
# defines.
best_search_params = custom_xgboost.param_search.best_params_
plotGridSearchScore(custom_xgboost)
plotGridSearchParams(custom_xgboost, param_ranges, best_search_params)

In [None]:
# Evaluate the best estimator on the held-out test glaciers and plot
# predictions against targets, separately for annual and winter measurements.

# Set to CPU for predictions:
xgb = best_estimator.set_params(device='cpu')

# Make predictions on test
features_test, metadata_test = xgb._create_features_metadata(
    test_set['df_X'][all_columns], config.META_DATA)
y_pred = xgb.predict(features_test)
print('Shape of the test:', features_test.shape)

# Make predictions aggr to meas ID:
y_pred_agg = xgb.aggrPredict(metadata_test, config.META_DATA, features_test)

# Calculate scores
score = xgb.score(test_set['df_X'][all_columns], test_set['y'])  # negative
mse, rmse, mae, pearson_corr = xgb.evalMetrics(metadata_test, y_pred,
                                               test_set['y'])

# Aggregate predictions to annual or winter: one row per measurement ID.
df_pred = test_set['df_X'][all_columns].copy()
df_pred['target'] = test_set['y']
grouped_ids = df_pred.groupby('ID').agg({
    'target': 'mean',
    'YEAR': 'first',
    'POINT_ID': 'first'
})
# NOTE(review): assumes `y_pred_agg` is ordered like (or indexed by) the
# sorted IDs of the groupby above - confirm against `aggrPredict`.
grouped_ids['pred'] = y_pred_agg
# Reuse `all_columns` instead of rebuilding the same column list.
grouped_ids['PERIOD'] = test_set['df_X'][all_columns].groupby(
    'ID')['PERIOD'].first()
# The glacier name is the part of POINT_ID before the first underscore.
grouped_ids['GLACIER'] = grouped_ids['POINT_ID'].apply(
    lambda x: x.split('_')[0])

# Only measurements up to and including this year are plotted.
last_year_plotted = 2021
grouped_ids = grouped_ids[grouped_ids.YEAR <= last_year_plotted]

# NOTE(review): `mae`, `rmse` and `pearson_corr` come from a single
# `evalMetrics` call on the whole test set and are passed unchanged to both
# the annual and the winter panel - confirm that period-specific scores are
# not intended.
fig = plt.figure(figsize=(15, 10))
ax = plt.subplot(2, 2, 1)
grouped_ids_annual = grouped_ids[grouped_ids.PERIOD == 'annual']
predVSTruth(ax, grouped_ids_annual, mae, rmse, pearson_corr)
ax.set_title('Annual MB', fontsize=24)

# Sort into a new frame: the subset comes from a boolean mask, so an in-place
# sort would modify a slice of `grouped_ids`.
grouped_ids_annual = grouped_ids_annual.sort_values(by='YEAR')
ax = plt.subplot(2, 2, 2)
plotMeanPred(grouped_ids_annual, ax)

if 'winter' in grouped_ids.PERIOD.unique():
    grouped_ids_winter = grouped_ids[grouped_ids.PERIOD == 'winter']
    ax = plt.subplot(2, 2, 3)
    predVSTruth(ax, grouped_ids_winter, mae, rmse, pearson_corr)
    ax.set_title('Winter MB', fontsize=24)

    ax = plt.subplot(2, 2, 4)
    grouped_ids_winter = grouped_ids_winter.sort_values(by='YEAR')
    plotMeanPred(grouped_ids_winter, ax)

plt.suptitle(f'XGBoost tested on {test_glaciers}', fontsize=20)
plt.tight_layout()

In [None]:
# Project plotting helper, called with the best estimator, the feature column
# names and the climate variable names.
FIPlot(best_estimator, feature_columns, vois_climate)