## Setting up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point
from scipy.spatial.distance import cdist
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold, train_test_split, GroupShuffleSplit

import cupy as cp
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
from oggm import cfg, utils, workflow, tasks
import logging
import geopandas as gpd
import xarray as xr

import config
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.xgb_helpers import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [None]:
# Seed with the project-wide seed from the config module.
seed_all(config.SEED)

# In case of no memory, uncomment:
# free_up_cuda()

# --- Plot styles -----------------------------------------------------------
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Default colormap and a 15-colour palette derived from it.
cmap = cm.devon
color_palette_glaciers = sns.color_palette(get_cmap_hex(cmap, 15))

# Colours for bars and lines (a lighter alternative grey is '#878787').
color_diff_xgb = '#4d4d4d'
colors = get_cmap_hex(cm.batlow, 2)
color_xgb = colors[0]
color_tim = '#c51b7d'

# Keyword arguments shared by the violin and box plots.
colors_temp_freq = sns.color_palette(get_cmap_hex(cm.devon, 8))
boxplot_style = dict(
    width=.6,
    showcaps=False,
    palette=colors_temp_freq,
    flierprops=dict(marker="x"),
    showmeans=True,
    meanprops=dict(markerfacecolor="white"),
)

# Marker symbols.
marker_tim, marker_xgb, marker_std = 's', 'o', '_'

# Directory passed to the glacier mask / grid helpers further down.
custom_working_dir = '../../../data/OGGM/'

In [None]:
# RGI ids: load the glacier id table, strip whitespace from the column
# names, then sort by and index on the glacier short name.
path_rgi = '../../../data/GLAMOS/CH_glacier_ids_long.csv'
rgi_df = (
    pd.read_csv(path_rgi, sep=',')
    .rename(columns=str.strip)
    .sort_values(by='short_name')
    .set_index('short_name')
)

In [None]:
# Read stakes data over all glaciers:
# NOTE(review): `path_PMB_GLAMOS_csv` is not defined in this notebook; it is
# presumably provided by one of the `scripts.*` star-imports — confirm.
data_glamos = pd.read_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset.csv')
# Show the first two rows of the loaded table.
data_glamos.head(2)

In [None]:
# Count the rows per glacier (group size), sort ascending and show as a bar plot.
num_gl = data_glamos.groupby(['GLACIER']).size().sort_values()
num_gl.plot(kind='bar', figsize=(15, 5), cmap=cmap)
plt.title('Number of total measurements per glacier since 1961')

In [None]:
# Number of distinct glacier names in the dataset, followed by the names themselves.
len(data_glamos.GLACIER.unique()), data_glamos.GLACIER.unique()

In [None]:
# Specify the short names of the climate variables available in the dataset
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
# Topographical variables; used together with the climate variables to build
# the model's feature columns in the training cells below.
voi_topographical = ['aspect', 'slope', 'dis_from_border']

## Gries:

### Create grid:

In [None]:
# Select the glacier and look up its RGI v6 id in the id table.
glacierName = 'gries'
rgi_gl = rgi_df.loc[glacierName]['rgi_id.v6']

# Keep only the stake rows that belong to this glacier.
data_gl = data_glamos[data_glamos.RGIId == rgi_gl]
dataset_gl = mbm.Dataset(data=data_gl,
                         region_name='CH',
                         data_path=path_PMB_GLAMOS_csv)
ds, glacier_indices, gdir = dataset_gl.get_glacier_mask(custom_working_dir)

# Create pandas dataframe of glacier grid
years = data_gl['YEAR'].unique()
df_grid_annual = dataset_gl.create_glacier_grid(custom_working_dir)
# Add metadata that is not in WGMS dataset
df_grid_annual["PERIOD"] = "annual"
df_grid_annual['GLACIER'] = glacierName

# Load monthly glacier grid (preprocessed in other notebooks)
df_grid_monthly = pd.read_csv(path_glacier_grid + f'{glacierName}_grid.csv')
dataloader = mbm.DataLoader(data=df_grid_monthly,
                            meta_data_columns=config.META_DATA)

# `unique()` returns values in order of first appearance, so the first and
# last entries of `years` are not guaranteed to be the earliest and latest
# year. Report the actual min/max instead.
first_year = data_gl['YEAR'].min()
last_year = data_gl['YEAR'].max()
print(f'\nNumber of years: {len(years)}, from {first_year} to {last_year}')
print('\nNumber of total (yearly) measurements:', len(df_grid_annual))
df_grid_monthly.head(2)

In [None]:
# Plot glacier attributes of oggm:
# (`ds` is the first value returned by `dataset_gl.get_glacier_mask(...)`)
plotGlAttr(ds, cmap=cm.devon)

# Plot glacier grid with stakes:
# (grid from `create_glacier_grid`, stakes from the glacier's GLAMOS rows)
plotGlGrid(df_grid_annual, data_gl)

### Train ML model on stakes:

In [None]:
# Same per-glacier selection as in the grid-creation cell, repeated here.
is_this_glacier = data_glamos.RGIId == rgi_gl
data_gl = data_glamos[is_this_glacier]
dataset_gl = mbm.Dataset(
    data=data_gl,
    region_name='CH',
    data_path=path_PMB_GLAMOS_csv,
)

# Report the total sample count, then the count per measurement period.
n_per_period = {
    p: int((data_gl.PERIOD == p).sum())
    for p in ('annual', 'winter')
}
print('Number of winter and annual samples:', len(data_gl))
print('Number of annual samples:', n_per_period['annual'])
print('Number of winter samples:', n_per_period['winter'])
plotNumMeasPerYear(data_gl, glacierName)

In [None]:
# Histogram of the grid-point elevations, overlaid with one dashed vertical
# line per distinct stake elevation.
fig, ax = plt.subplots(figsize=(15, 5))
df_grid_annual.POINT_ELEVATION.hist(ax=ax, bins=20, alpha=0.5, color=color_xgb)

stake_el = data_gl.POINT_ELEVATION.unique()
stake_line_style = dict(color=color_tim, linestyle='--', alpha=0.2)
for el in stake_el:
    ax.axvline(el, **stake_line_style)
ax.set_title('Elevation distribution of grid points (blue) and stakes (pink)')

In [None]:
# Add climate features and transform to monthly format
dataloader_gl = getMonthlyDataLoaderOneGl(glacierName, vois_climate,
                                          voi_topographical)

# Get train, test and validation data
# NOTE(review): `test_split_on='YEAR'` presumably holds out whole years for
# the test set (the printouts below list the train/test years) — confirm.
splits, test_set, train_set = getCVSplits(dataloader_gl, test_split_on='YEAR')

print('Train years:', train_set['splits_vals'])
print('Test years:', test_set['splits_vals'])

# Plot splits
visualiseSplits(test_set['y'], train_set['y'], splits)


In [None]:
%%time
# Hyper-parameter search via `randomsearch` with `n_iter=20`.
# Candidate values for each XGBoost parameter:
parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1],
}

# Fixed arguments handed to the regressor constructor.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    'random_state': config.SEED,
}

# Feature columns: elevation difference, climate variables, then
# topographical variables.
feature_columns = ['ELEVATION_DIFFERENCE', *vois_climate, *voi_topographical]
all_columns = feature_columns + config.META_DATA + config.NOT_METADATA_NOT_FEATURES
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of the dataset:', df_X_train_subset.shape)
print('Running with features:', feature_columns)

# Build the regressor and run the search on the training folds.
custom_xgboost = mbm.models.CustomXGBoostRegressor(**param_init)
custom_xgboost.randomsearch(parameters=parameters,
                            n_iter=20,
                            splits=splits,
                            features=df_X_train_subset,
                            targets=train_set['y'])

# Save the model under a glacier-specific file name.
custom_xgboost.save_model(f'xgb_{glacierName}.pkl')

# Best parameters, estimator and score from the search object.
search_result = custom_xgboost.param_search
best_params = params = search_result.best_params_
best_estimator = search_result.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", search_result.best_score_)

In [None]:
# Switch the best estimator to CPU for predictions.
xgb = best_estimator.set_params(device='cpu')

# Split the test frame into features and metadata, then predict per row.
features_test, metadata_test = xgb._create_features_metadata(
    test_set['df_X'][all_columns], config.META_DATA)
y_pred = xgb.predict(features_test)
print('Shape of the test:', features_test.shape)

# Predictions aggregated to measurement ID.
y_pred_agg = xgb.aggrPredict(metadata_test, config.META_DATA, features_test)

# Scores (author's note on `score`: negative).
score = xgb.score(test_set['df_X'][all_columns], test_set['y'])
mse, rmse, mae, pearson_corr = xgb.evalMetrics(
    metadata_test, y_pred, test_set['y'])

# One row per measurement ID: mean target, first YEAR, aggregated prediction
# and the measurement PERIOD (annual or winter).
df_pred = test_set['df_X'][all_columns].assign(target=test_set['y'])
rows_by_id = df_pred.groupby('ID')
grouped_ids = rows_by_id.agg({'target': 'mean', 'YEAR': 'first'})
grouped_ids['pred'] = y_pred_agg
grouped_ids['PERIOD'] = rows_by_id['PERIOD'].first()

# Left panel: prediction vs. truth. Right panel: mean target and prediction.
fig = plt.figure(figsize=(15, 5))
ax = plt.subplot(1, 2, 1)
predVSTruth(ax, grouped_ids, mae, rmse, pearson_corr)

ax = plt.subplot(1, 2, 2)
plotMeanPred(grouped_ids, ax)
ax.legend()
ax.set_title('Mean yearly target and prediction')

plt.suptitle(f'XGBoost on {glacierName.title()} (split years)', fontsize=20)
plt.tight_layout()

In [None]:
# Feature-importance plot.
# List the feature names in the same order as the columns the model was
# trained on (ELEVATION_DIFFERENCE, climate variables, topographical
# variables). The previous ordering put the topographical variables first.
# NOTE(review): this matters if `FIPlot` pairs the names with the estimator's
# importances by position — confirm against its implementation.
feature_columns = ['ELEVATION_DIFFERENCE'
                   ] + list(vois_climate) + list(voi_topographical)
FIPlot(best_estimator, feature_columns, vois_climate)

### Make predictions for whole grid:

In [None]:
## Whole grid:
# Load the preprocessed monthly grid of this glacier and keep only the
# columns that were used for training.
df_grid_monthly = pd.read_csv(path_glacier_grid + f'{glacierName}_grid.csv')
df_grid_monthly = df_grid_monthly[all_columns]

# Set to CPU for predictions:
xgb = xgb.set_params(device='cpu')

# Make predictions on whole glacier grid
features_grid, metadata_grid = xgb._create_features_metadata(
    df_grid_monthly, config.META_DATA)
print('Shape of the dataset:', features_grid.shape)
y_pred_grid = xgb.predict(features_grid)

# Make predictions aggr to meas ID:
y_pred_grid_agg = xgb.aggrPredict(metadata_grid, config.META_DATA,
                                  features_grid)

# One row per grid-point ID. Use 'first' for the year label (as the test-set
# evaluation does) rather than 'mean': it keeps the original YEAR dtype and
# cannot produce a fractional year.
grouped_ids = df_grid_monthly.groupby('ID').agg({'YEAR': 'first'})
grouped_ids['pred'] = y_pred_grid_agg

# Average (not sum) over all grid points of each year to get the
# glacier-wide SMB.
grouped_ids = grouped_ids.groupby('YEAR').mean()

# Reference glacier-wide balance series for the same glacier.
df_target = pd.read_csv(path_SMB_GLAMOS_csv + 'fix/' +
                        f'{glacierName}_fix.csv')
df_target = transformDates(df_target)
# Remove obvious duplicates:
df_target = df_target.drop_duplicates()
df_target['YEAR'] = df_target['date1'].apply(lambda x: pd.to_datetime(x).year)
# NOTE(review): the division by 1000 presumably converts mm w.e. to m w.e.
# (the axis label below is 'm w.e.') — confirm the unit of the source file.
df_target['Annual Balance'] = df_target['Annual Balance'] / (1000)
df_target = df_target[['YEAR', 'Annual Balance']].set_index('YEAR')

# Predicted and reference series on the same axes (reference from 1961 on).
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
grouped_ids.plot(y='pred', label='Predicted SMB', ax=ax, color=color_xgb)

df_target[df_target.index > 1960].plot(y='Annual Balance',
                                       label='GLAMOS SMB',
                                       ax=ax,
                                       color=color_tim)

ax.set_title(f'{glacierName.title()} SMB')
ax.set_ylabel('SMB (m w.e.)')
plt.tight_layout()

## Silvretta:

### Create grid:

In [None]:
# Select the glacier and look up its RGI v6 id in the id table.
glacierName = 'silvretta'
rgi_gl = rgi_df.loc[glacierName]['rgi_id.v6']

# Keep only the stake rows that belong to this glacier.
data_gl = data_glamos[data_glamos.RGIId == rgi_gl]
dataset_gl = mbm.Dataset(data=data_gl,
                         region_name='CH',
                         data_path=path_PMB_GLAMOS_csv)
ds, glacier_indices, gdir = dataset_gl.get_glacier_mask(custom_working_dir)

# Create pandas dataframe of glacier grid
years = data_gl['YEAR'].unique()
df_grid_annual = dataset_gl.create_glacier_grid(custom_working_dir)
# Add metadata that is not in WGMS dataset
df_grid_annual["PERIOD"] = "annual"
df_grid_annual['GLACIER'] = glacierName

# Load monthly glacier grid (preprocessed in other notebooks)
df_grid_monthly = pd.read_csv(path_glacier_grid + f'{glacierName}_grid.csv')
dataloader = mbm.DataLoader(data=df_grid_monthly,
                            meta_data_columns=config.META_DATA)

# `unique()` returns values in order of first appearance, so the first and
# last entries of `years` are not guaranteed to be the earliest and latest
# year. Report the actual min/max instead.
first_year = data_gl['YEAR'].min()
last_year = data_gl['YEAR'].max()
print(f'\nNumber of years: {len(years)}, from {first_year} to {last_year}')
print('\nNumber of total (yearly) measurements:', len(df_grid_annual))
df_grid_monthly.head(2)

In [None]:
# Plot glacier attributes of oggm:
# (`ds` is the first value returned by `dataset_gl.get_glacier_mask(...)`)
plotGlAttr(ds, cmap=cm.devon)

# Plot glacier grid with stakes:
# (grid from `create_glacier_grid`, stakes from the glacier's GLAMOS rows)
plotGlGrid(df_grid_annual, data_gl)

### Train ML model on stakes:

In [None]:
# Same per-glacier selection as in the grid-creation cell, repeated here.
is_this_glacier = data_glamos.RGIId == rgi_gl
data_gl = data_glamos[is_this_glacier]

dataset_gl = mbm.Dataset(
    data=data_gl,
    region_name='CH',
    data_path=path_PMB_GLAMOS_csv,
)

# Report the total sample count, then the count per measurement period.
n_per_period = {
    p: int((data_gl.PERIOD == p).sum())
    for p in ('annual', 'winter')
}
print('Number of winter and annual samples:', len(data_gl))
print('Number of annual samples:', n_per_period['annual'])
print('Number of winter samples:', n_per_period['winter'])
plotNumMeasPerYear(data_gl, glacierName)

In [None]:
# Histogram of the grid-point elevations, overlaid with one dashed vertical
# line per distinct stake elevation.
fig, ax = plt.subplots(figsize=(15, 5))
df_grid_annual.POINT_ELEVATION.hist(ax=ax, bins=20, alpha=0.5, color=color_xgb)

stake_el = data_gl.POINT_ELEVATION.unique()
stake_line_style = dict(color=color_tim, linestyle='--', alpha=0.2)
for el in stake_el:
    ax.axvline(el, **stake_line_style)
ax.set_title('Elevation distribution of grid points (blue) and stakes (pink)')

In [None]:
# Add climate features and transform to monthly format
dataloader_gl = getMonthlyDataLoaderOneGl(glacierName, vois_climate,
                                          voi_topographical)

# Get train, test and validation data
# NOTE(review): `test_split_on='YEAR'` presumably holds out whole years for
# the test set (the printouts below list the train/test years) — confirm.
splits, test_set, train_set = getCVSplits(dataloader_gl, test_split_on='YEAR')

print('Train years:', train_set['splits_vals'])
print('Test years:', test_set['splits_vals'])

# Plot splits
visualiseSplits(test_set['y'], train_set['y'], splits)


In [None]:
%%time
# Hyper-parameter search via `randomsearch` with `n_iter=20`.
# Candidate values for each XGBoost parameter:
parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1],
}

# Fixed arguments handed to the regressor constructor.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    'random_state': config.SEED,
}

# Feature columns: elevation difference, climate variables, then
# topographical variables.
feature_columns = ['ELEVATION_DIFFERENCE', *vois_climate, *voi_topographical]
all_columns = feature_columns + config.META_DATA + config.NOT_METADATA_NOT_FEATURES
df_X_train_subset = train_set['df_X'][all_columns]
print('Shape of the dataset:', df_X_train_subset.shape)
print('Running with features:', feature_columns)

# Build the regressor and run the search on the training folds.
custom_xgboost = mbm.models.CustomXGBoostRegressor(**param_init)
custom_xgboost.randomsearch(parameters=parameters,
                            n_iter=20,
                            splits=splits,
                            features=df_X_train_subset,
                            targets=train_set['y'])

# Save the model under a glacier-specific file name.
custom_xgboost.save_model(f'xgb_{glacierName}.pkl')

# Best parameters, estimator and score from the search object.
search_result = custom_xgboost.param_search
best_params = params = search_result.best_params_
best_estimator = search_result.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", search_result.best_score_)

In [None]:
# Switch the best estimator to CPU for predictions.
xgb = best_estimator.set_params(device='cpu')

# Split the test frame into features and metadata, then predict per row.
features_test, metadata_test = xgb._create_features_metadata(
    test_set['df_X'][all_columns], config.META_DATA)
y_pred = xgb.predict(features_test)
print('Shape of the test:', features_test.shape)

# Predictions aggregated to measurement ID.
y_pred_agg = xgb.aggrPredict(metadata_test, config.META_DATA, features_test)

# Scores (author's note on `score`: negative).
score = xgb.score(test_set['df_X'][all_columns], test_set['y'])
mse, rmse, mae, pearson_corr = xgb.evalMetrics(
    metadata_test, y_pred, test_set['y'])

# One row per measurement ID: mean target, first YEAR, aggregated prediction
# and the measurement PERIOD (annual or winter).
df_pred = test_set['df_X'][all_columns].assign(target=test_set['y'])
rows_by_id = df_pred.groupby('ID')
grouped_ids = rows_by_id.agg({'target': 'mean', 'YEAR': 'first'})
grouped_ids['pred'] = y_pred_agg
grouped_ids['PERIOD'] = rows_by_id['PERIOD'].first()

# Left panel: prediction vs. truth. Right panel: mean target and prediction.
fig = plt.figure(figsize=(15, 5))
ax = plt.subplot(1, 2, 1)
predVSTruth(ax, grouped_ids, mae, rmse, pearson_corr)

ax = plt.subplot(1, 2, 2)
plotMeanPred(grouped_ids, ax)
ax.legend()
ax.set_title('Mean yearly target and prediction')

plt.suptitle(f'XGBoost on {glacierName.title()} (split years)', fontsize=20)
plt.tight_layout()

In [None]:
# Feature-importance plot.
# List the feature names in the same order as the columns the model was
# trained on (ELEVATION_DIFFERENCE, climate variables, topographical
# variables). The previous ordering put the topographical variables first.
# NOTE(review): this matters if `FIPlot` pairs the names with the estimator's
# importances by position — confirm against its implementation.
feature_columns = ['ELEVATION_DIFFERENCE'
                   ] + list(vois_climate) + list(voi_topographical)
FIPlot(best_estimator, feature_columns, vois_climate)

### Make predictions for whole grid:

In [None]:
## Whole grid:
# Load the preprocessed monthly grid of this glacier and keep only the
# columns that were used for training.
df_grid_monthly = pd.read_csv(path_glacier_grid + f'{glacierName}_grid.csv')
df_grid_monthly = df_grid_monthly[all_columns]

# Set to CPU for predictions:
xgb = xgb.set_params(device='cpu')

# Make predictions on whole glacier grid
features_grid, metadata_grid = xgb._create_features_metadata(
    df_grid_monthly, config.META_DATA)
print('Shape of the dataset:', features_grid.shape)
y_pred_grid = xgb.predict(features_grid)

# Make predictions aggr to meas ID:
y_pred_grid_agg = xgb.aggrPredict(metadata_grid, config.META_DATA,
                                  features_grid)

# One row per grid-point ID. Use 'first' for the year label (as the test-set
# evaluation does) rather than 'mean': it keeps the original YEAR dtype and
# cannot produce a fractional year.
grouped_ids = df_grid_monthly.groupby('ID').agg({'YEAR': 'first'})
grouped_ids['pred'] = y_pred_grid_agg

# Average (not sum) over all grid points of each year to get the
# glacier-wide SMB.
grouped_ids = grouped_ids.groupby('YEAR').mean()

# Reference glacier-wide balance series for the same glacier.
df_target = pd.read_csv(path_SMB_GLAMOS_csv + 'fix/' +
                        f'{glacierName}_fix.csv')
df_target = transformDates(df_target)
# Remove obvious duplicates:
df_target = df_target.drop_duplicates()
df_target['YEAR'] = df_target['date1'].apply(lambda x: pd.to_datetime(x).year)
# NOTE(review): the division by 1000 presumably converts mm w.e. to m w.e.
# (the axis label below is 'm w.e.') — confirm the unit of the source file.
df_target['Annual Balance'] = df_target['Annual Balance'] / (1000)
df_target = df_target[['YEAR', 'Annual Balance']].set_index('YEAR')

# Predicted and reference series on the same axes (reference from 1961 on).
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
grouped_ids.plot(y='pred', label='Predicted SMB', ax=ax, color=color_xgb)

df_target[df_target.index > 1960].plot(y='Annual Balance',
                                       label='GLAMOS SMB',
                                       ax=ax,
                                       color=color_tim)

ax.set_title(f'{glacierName.title()} SMB')
ax.set_ylabel('SMB (m w.e.)')
plt.tight_layout()