## Setting up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point
from scipy.spatial.distance import cdist
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold, train_test_split, GroupShuffleSplit

import cupy as cp
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
from oggm import cfg, utils, workflow, tasks
import logging
import geopandas as gpd
import xarray as xr

from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
# from scripts.glacier_wide import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [None]:
# Seed the random number generators for reproducibility
# (presumably -- seed_all and SEED are defined outside this notebook cell).
seed_all(SEED)

# Use the project's matplotlib style sheet for every figure below.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

In [None]:
# Lookup table of glacier identifiers, indexed (and sorted) by glacier short name.
path_rgi = '../../../data/GLAMOS/CH_glacier_ids_long.csv'
rgi_df = (
    pd.read_csv(path_rgi, sep=',')
    # Strip surrounding whitespace from the column headers before using them.
    .rename(columns=lambda x: x.strip())
    .sort_values(by='short_name')
    .set_index('short_name')
)

In [None]:
# Load the GLAMOS stake dataset (CH_wgms_dataset.csv) and preview the first rows.
data_glamos = pd.read_csv(f'{path_PMB_GLAMOS_csv}CH_wgms_dataset.csv')
data_glamos.head(2)

## Gries:

### Create grid:

In [None]:
# Glacier handled in this section and its RGI v6 identifier.
glacierName = 'gries'
rgi_gl = rgi_df.loc[glacierName, 'rgi_id.v6']

# Keep only the GLAMOS rows of this glacier and wrap them in an mbm Dataset.
data_gl = data_glamos.loc[data_glamos['RGIId'] == rgi_gl]
dataset_gl = mbm.Dataset(
    data=data_gl,
    region_name='CH',
    data_path=path_PMB_GLAMOS_csv,
)
dataset_gl.data.head(2)

In [None]:
# Glacier mask (plus indices and glacier directory) from the OGGM working directory.
custom_working_dir = '../../../data/OGGM/'
ds, glacier_indices, gdir = dataset_gl.get_glacier_mask(custom_working_dir)

# One panel per masked glacier attribute, drawn left to right.
fig, ax = plt.subplots(1, 4, figsize=(15, 5))
_panels = (
    ('masked_slope', 'Slope'),
    ('masked_elev', 'Elevation'),
    ('masked_aspect', 'Aspect'),
    ('masked_dis', 'Dis from border'),
)
for _axis, (_var, _title) in zip(ax, _panels):
    getattr(ds, _var).plot(ax=_axis)
    _axis.set_title(_title)
plt.tight_layout()

In [None]:
# Create pandas dataframe of glacier grid
years = data_gl['YEAR'].unique()
# unique() returns the years in order of first appearance, not sorted, so the
# reported range has to use the true minimum and maximum.
print('Number of years: {} from {} to {}'.format(len(years), min(years),
                                                 max(years)))
df_grid = dataset_gl.create_glacier_grid(custom_working_dir)
# Add metadata that is not in WGMS dataset
df_grid["PERIOD"] = "annual"
df_grid['GLACIER'] = glacierName
print('Length of df_grid:', len(df_grid))
df_grid.head(2)

In [None]:
# Overlay the GLAMOS stake locations on the glacier grid points.
# Only the grid rows of year 2006 are drawn.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
df_grid_one_year = df_grid[df_grid['YEAR'] == 2006]
_layers = (
    (df_grid_one_year, {'s': 1, 'label': 'OGGM grid'}),
    (data_gl, {'s': 8, 'label': 'stakes', 'marker': 'x'}),
)
for _frame, _style in _layers:
    ax.scatter(_frame.POINT_LON, _frame.POINT_LAT, **_style)
ax.legend()
ax.set_title(f'OGGM grid and GLAMOS stakes for {df_grid.GLACIER.iloc[0]}')

#### Add climate variables & convert to monthly:

In [None]:
# Short names of the climate variables attached to every record.
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
# Topographical variables in use (a longer variant would also include
# 'dis_from_border' and 'topo').
voi_topographical = ['aspect', 'slope']
# Columns handed to mbm as metadata columns.
meta_data_columns = ["RGIId", "POINT_ID", "ID", "N_MONTHS", "MONTHS", "PERIOD"]

# RUN toggles regeneration of the monthly grid CSV; when False the previously
# saved file is simply read back (it must already exist).
RUN = False
_grid_csv = path_glacier_grid + f'{glacierName}_grid.csv'
if RUN:
    dataset_grid = mbm.Dataset(data=df_grid,
                               region_name='CH',
                               data_path=path_PMB_GLAMOS_csv)

    # ERA5 files that are matched against the grid point coordinates.
    era5_climate_data = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
    geopotential_data = path_ERA5_raw + 'era5_geopotential_pressure.nc'

    # Attach the climate features to each grid record.
    dataset_grid.get_climate_features(climate_data=era5_climate_data,
                                      geopotential_data=geopotential_data,
                                      change_units=True)
    print('Shape after adding climate variables:', dataset_grid.data.shape)

    # Expand every record to a monthly time resolution.
    dataset_grid.convert_to_monthly(meta_data_columns=meta_data_columns,
                                    vois_climate=vois_climate,
                                    vois_topographical=voi_topographical)
    print('Shape after converting to monthly format:', dataset_grid.data.shape)

    # Cache the result on disk.
    dataset_grid.data.to_csv(_grid_csv, index=False)

# From here on df_grid is the monthly grid read from the cached CSV.
df_grid = pd.read_csv(_grid_csv)

# DataLoader built on the monthly glacier grid.
dataloader = mbm.DataLoader(data=df_grid,
                            random_seed=SEED,
                            meta_data_columns=meta_data_columns)

### Train model on stakes:

In [None]:
# Stake measurements of the current glacier. An explicit copy is taken because a
# column is overwritten below; assigning into a filtered slice of data_glamos is
# a chained assignment (SettingWithCopyWarning) and must never touch data_glamos.
data_gl = data_glamos[data_glamos.RGIId == rgi_gl].copy()
print('Number of winter and annual samples:', len(data_gl))
print('Number of annual samples:', len(data_gl[data_gl.PERIOD == 'annual']))
print('Number of winter samples:', len(data_gl[data_gl.PERIOD == 'winter']))

# change mm w.e. to m w.e.
data_gl['POINT_BALANCE'] = data_gl['POINT_BALANCE'] / 1000

dataset_gl = mbm.Dataset(data=data_gl,
                      region_name='CH',
                      data_path=path_PMB_GLAMOS_csv)

# Stacked bar chart: number of measurements per year, split by PERIOD.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
num_gl_yr = data_gl.groupby(['YEAR', 'PERIOD']).size().unstack().reset_index()
num_gl_yr.plot(x='YEAR', kind='bar', stacked=True, ax=ax, title=f'{glacierName}')
ax.set_ylabel('Number of measurements')
ax.set_title(f'Number of measurements per year: {glacierName}', fontsize=14)
plt.tight_layout()

In [None]:
# Add climate data:
# ERA5 files that are matched against the coordinates of the stake data.
era5_climate_data = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
geopotential_data = path_ERA5_raw + 'era5_geopotential_pressure.nc'

# Attach the climate features to each stake measurement.
dataset_gl.get_climate_features(climate_data=era5_climate_data,
                             geopotential_data=geopotential_data,
                             change_units=True)

# For each record, convert to a monthly time resolution
dataset_gl.convert_to_monthly(meta_data_columns=meta_data_columns,
                           vois_climate=vois_climate,
                           vois_topographical=voi_topographical)

# Create a new DataLoader object with the monthly stake data measurements.
dataloader_gl = mbm.DataLoader(data=dataset_gl.data,
                            random_seed=SEED,
                            meta_data_columns=meta_data_columns)
TYPE_SPLIT = 'year'
if TYPE_SPLIT == 'year':
    # Hold out 20% of the years, so that no year is shared between train and test.
    train_years, test_years = train_test_split(dataset_gl.data.YEAR.unique(),
                                               test_size=0.2,
                                               random_state=SEED)

    # The indices are consumed with .iloc below, so they must be row POSITIONS.
    # Taking the frame's index labels only works while the index happens to be
    # a default RangeIndex; positions are correct for any index.
    # NOTE(review): set_custom_train_test_indices is assumed to expect
    # positional indices as well -- confirm against the mbm DataLoader.
    _row_positions = pd.RangeIndex(len(dataset_gl.data))
    train_indices = _row_positions[dataset_gl.data.YEAR.isin(
        train_years).to_numpy()]
    test_indices = _row_positions[dataset_gl.data.YEAR.isin(
        test_years).to_numpy()]

    dataloader_gl.set_custom_train_test_indices(train_indices, test_indices)

else:
    # Randomly (though does not separate meas ID)
    train_itr, test_itr = dataloader_gl.set_train_test_split(test_size=0.2,
                                                          shuffle=True)

    train_indices, test_indices = list(train_itr), list(test_itr)

# Training rows and targets, used during the cross validation.
df_X_train = dataset_gl.data.iloc[train_indices]
y_train = df_X_train['POINT_BALANCE'].values

# Get test set
df_X_test = dataset_gl.data.iloc[test_indices]
y_test = df_X_test['POINT_BALANCE'].values

# Five cross-validation folds built by the data loader (fold type 'group-meas-id').
splits = dataloader_gl.get_cv_split(n_splits=5, type_fold='group-meas-id')
test_meas_id = df_X_test['ID'].unique()

# Years in training and test set
train_years = df_X_train.YEAR.unique()
test_years = df_X_test.YEAR.unique()
print('Train years:', train_years)
print('Test years:', test_years)

# Plot splits
visualiseSplits(y_test, y_train, splits)

In [None]:
%%time
# Hyper-parameter search space for the XGBoost model.
parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1],
}

feature_columns = [
    'ALTITUDE_CLIMATE', 'ELEVATION_DIFFERENCE', 'aspect', 'fal', 'slhf',
    'slope', 'sshf', 'ssrd', 'str', 't2m', 'tp',
]

# Fixed model settings: fit on the first CUDA device with the histogram tree method.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    'random_state': SEED,
}

# Randomised search (20 sampled configurations) over the CV splits defined earlier.
custom_xgboost = mbm.models.CustomXGBoostRegressor(
    meta_data_columns=meta_data_columns,
    **param_init,
)
custom_xgboost.randomsearch(parameters=parameters,
                            n_iter=20,
                            splits=splits,
                            features=df_X_train,
                            targets=y_train,
                            num_jobs=-1,
                            random_seed=SEED)

# Persist the fitted search under a glacier-specific file name.
custom_xgboost.save_model(f'xgb_{glacierName}.pkl')

# Report the winning configuration.
_search = custom_xgboost.param_search
best_params = params = _search.best_params_
best_estimator = _search.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", _search.best_score_)

In [None]:
# Switch the best estimator to CPU before predicting.
xgb = best_estimator.set_params(device='cpu')

# Split the test frame into features and metadata, then predict per row.
features_test, metadata_test = xgb._create_features_metadata(
    df_X_test, meta_data_columns)
y_pred = xgb.predict(features_test)

# Predictions aggregated to measurement ID.
y_pred_agg = xgb.aggrPredict(metadata_test, meta_data_columns, features_test)

# Scores on the held-out years.
score = xgb.score(df_X_test, y_test)  # negative
mse, rmse, mae = xgb.evalMetrics(metadata_test, y_pred, y_test)

# One row per measurement ID: mean target, aggregated prediction and PERIOD.
df_pred = df_X_test.assign(target=y_test)
_per_id = df_pred.groupby('ID')
grouped_ids = _per_id.agg({'target': 'mean'})
# NOTE(review): if aggrPredict returns a plain array it is matched by position
# to the sorted IDs -- confirm the ordering against aggrPredict.
grouped_ids['pred'] = y_pred_agg
grouped_ids['PERIOD'] = _per_id['PERIOD'].first()

predVSTruth(grouped_ids, mae, rmse,
            title=f'XGBoost on {glacierName} (split years)')


### Make predictions for whole grid:

In [None]:
# Reload the fitted search saved for this glacier and take its best estimator.
param_init = {"random_state": SEED}

feature_columns = [
    'ALTITUDE_CLIMATE', 'ELEVATION_DIFFERENCE', 'aspect', 'fal', 'slhf',
    'slope', 'sshf', 'ssrd', 'str', 't2m', 'tp',
]

custom_xgboost = mbm.models.CustomXGBoostRegressor(
    meta_data_columns=meta_data_columns,
    **param_init,
)
clf = custom_xgboost.load_model(f'xgb_{glacierName}.pkl')
print('Params:', clf.best_params_)

# NOTE(review): the label says "test data" but df_grid is the glacier grid.
print('Shape of test data:', df_grid.shape)

# Predictions are made on the CPU.
xgb = clf.best_estimator_.set_params(device='cpu')

In [None]:
# Split the glacier grid into features and metadata, then predict for every row.
features_grid, metadata_grid = xgb._create_features_metadata(df_grid,
                                                             meta_data_columns)
y_pred_grid = xgb.predict(features_grid)

# Same predictions, aggregated to measurement ID.
y_pred_grid_agg = xgb.aggrPredict(metadata_grid,
                                  meta_data_columns,
                                  features_grid)

In [None]:
# One row per grid-point ID, holding its YEAR and its aggregated prediction.
grouped_ids = df_grid.groupby('ID').agg({'YEAR': 'mean'})
# NOTE(review): if aggrPredict returns a plain array it is matched by position
# to the sorted IDs -- confirm the ordering against aggrPredict.
grouped_ids['pred'] = y_pred_grid_agg

# Average (not sum) over all grid points of a year to get the glacier-wide SMB.
grouped_ids = grouped_ids.groupby('YEAR').mean()

# Reference glacier-wide balance from GLAMOS.
df_target = pd.read_csv(path_SMB_GLAMOS_csv + 'fix/' + f'{glacierName}_fix.csv')
df_target = transformDates(df_target)
# Remove obvious duplicates:
df_target = df_target.drop_duplicates()
df_target['YEAR'] = df_target['date1'].apply(lambda x: pd.to_datetime(x).year)
# Divide by 1000 to match the plot unit (presumably mm w.e. -> m w.e.).
df_target['Annual Balance'] = df_target['Annual Balance'] / 1000
df_target = df_target[['YEAR', 'Annual Balance']].set_index('YEAR')

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
grouped_ids.plot(y='pred', label='Predicted SMB', ax=ax)
df_target.plot(y='Annual Balance', label='GLAMOS SMB', ax=ax)

# Title follows the glacier being processed instead of a hard-coded name
# ('gries' -> 'Gries', so the rendered title is unchanged for this section).
ax.set_title(glacierName.capitalize())
ax.set_ylabel('SMB (m w.e.)')
plt.tight_layout()

In [None]:
# Plot the yearly mean of every climate input variable over all grid points,
# one panel per variable (4 x 2 layout).
fig = plt.figure(figsize=(15, 15))
for i, var in enumerate(vois_climate):
    temp = df_grid[[var, 'YEAR']].groupby('YEAR').mean()
    ax = plt.subplot(4, 2, i + 1)
    temp.plot(ax=ax)
    ax.set_title(vois_climate_long_name[var])
    ax.set_ylabel(vois_units[var])

# Title follows the glacier being processed ('gries' -> 'Gries').
plt.suptitle(f'{glacierName.capitalize()} input variables')
plt.tight_layout()

In [None]:
# Compare, feature by feature, the density of the glacier grid values with that
# of the stake rows in the test set (3 x 5 panel layout).
fig = plt.figure(figsize=(15, 10))
_sources = (
    (df_grid, 'blue', 'Grid'),
    (df_X_test, 'orange', 'Stakes'),
)
for i, feature in enumerate(feature_columns):
    ax = plt.subplot(3, 5, i + 1)
    for _frame, _colour, _label in _sources:
        sns.histplot(data=_frame[feature],
                     color=_colour,
                     alpha=0.5,
                     kde=True,
                     label=_label,
                     ax=ax,
                     stat='density')
    ax.legend()
    ax.set_title(feature)
    ax.set_xlabel('')
plt.tight_layout()

## Silvretta:

### Create grid:

In [None]:
# Glacier handled in this section and its RGI v6 identifier.
glacierName = 'silvretta'
rgi_gl = rgi_df.loc[glacierName, 'rgi_id.v6']

# Keep only the GLAMOS rows of this glacier and wrap them in an mbm Dataset.
data_gl = data_glamos.loc[data_glamos['RGIId'] == rgi_gl]
dataset_gl = mbm.Dataset(
    data=data_gl,
    region_name='CH',
    data_path=path_PMB_GLAMOS_csv,
)
dataset_gl.data.head(2)

In [None]:
# Glacier mask (plus indices and glacier directory) from the OGGM working directory.
custom_working_dir = '../../../data/OGGM/'
ds, glacier_indices, gdir = dataset_gl.get_glacier_mask(custom_working_dir)

# One panel per masked glacier attribute, drawn left to right.
fig, ax = plt.subplots(1, 4, figsize=(15, 5))
_panels = (
    ('masked_slope', 'Slope'),
    ('masked_elev', 'Elevation'),
    ('masked_aspect', 'Aspect'),
    ('masked_dis', 'Dis from border'),
)
for _axis, (_var, _title) in zip(ax, _panels):
    getattr(ds, _var).plot(ax=_axis)
    _axis.set_title(_title)
plt.tight_layout()

In [None]:
# Create pandas dataframe of glacier grid
years = data_gl['YEAR'].unique()
# unique() returns the years in order of first appearance, not sorted, so the
# reported range has to use the true minimum and maximum.
print('Number of years: {} from {} to {}'.format(len(years), min(years),
                                                 max(years)))
df_grid = dataset_gl.create_glacier_grid(custom_working_dir)
# Add metadata columns to the grid: every grid row is flagged as an annual record.
df_grid["PERIOD"] = "annual"
df_grid['GLACIER'] = glacierName
print('Length of df_grid:', len(df_grid))
df_grid.head(2)

In [None]:
# Overlay the GLAMOS stake locations on the glacier grid points.
# Only the grid rows of year 2006 are drawn.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
df_grid_one_year = df_grid[df_grid['YEAR'] == 2006]
_layers = (
    (df_grid_one_year, {'s': 1, 'label': 'OGGM grid'}),
    (data_gl, {'s': 8, 'label': 'stakes', 'marker': 'x'}),
)
for _frame, _style in _layers:
    ax.scatter(_frame.POINT_LON, _frame.POINT_LAT, **_style)
ax.legend()
ax.set_title(f'OGGM grid and GLAMOS stakes for {df_grid.GLACIER.iloc[0]}')

#### Add climate variables & convert to monthly:

In [None]:
# Short names of the climate variables attached to every record.
vois_climate = ['t2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str']
# Topographical variables in use (a longer variant would also include
# 'dis_from_border' and 'topo').
voi_topographical = ['aspect', 'slope']
# Columns handed to mbm as metadata columns.
meta_data_columns = ["RGIId", "POINT_ID", "ID", "N_MONTHS", "MONTHS", "PERIOD"]

# RUN toggles regeneration of the monthly grid CSV; here it is rebuilt and
# written to disk before being read back.
RUN = True
_grid_csv = path_glacier_grid + f'{glacierName}_grid.csv'
if RUN:
    dataset_grid = mbm.Dataset(data=df_grid,
                               region_name='CH',
                               data_path=path_PMB_GLAMOS_csv)

    # ERA5 files that are matched against the grid point coordinates.
    era5_climate_data = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
    geopotential_data = path_ERA5_raw + 'era5_geopotential_pressure.nc'

    # Attach the climate features to each grid record.
    dataset_grid.get_climate_features(climate_data=era5_climate_data,
                                      geopotential_data=geopotential_data,
                                      change_units=True)
    print('Shape after adding climate variables:', dataset_grid.data.shape)

    # Expand every record to a monthly time resolution.
    dataset_grid.convert_to_monthly(meta_data_columns=meta_data_columns,
                                    vois_climate=vois_climate,
                                    vois_topographical=voi_topographical)
    print('Shape after converting to monthly format:', dataset_grid.data.shape)

    # Cache the result on disk.
    dataset_grid.data.to_csv(_grid_csv, index=False)

# From here on df_grid is the monthly grid read from the cached CSV.
df_grid = pd.read_csv(_grid_csv)

# DataLoader built on the monthly glacier grid.
dataloader = mbm.DataLoader(data=df_grid,
                            random_seed=SEED,
                            meta_data_columns=meta_data_columns)

### Train model on stakes:

In [None]:
# Stake measurements of the current glacier. An explicit copy is taken because a
# column is overwritten below; assigning into a filtered slice of data_glamos is
# a chained assignment (SettingWithCopyWarning) and must never touch data_glamos.
data_gl = data_glamos[data_glamos.RGIId == rgi_gl].copy()
print('Number of winter and annual samples:', len(data_gl))
print('Number of annual samples:', len(data_gl[data_gl.PERIOD == 'annual']))
print('Number of winter samples:', len(data_gl[data_gl.PERIOD == 'winter']))

# change mm w.e. to m w.e.
data_gl['POINT_BALANCE'] = data_gl['POINT_BALANCE'] / 1000

dataset_gl = mbm.Dataset(data=data_gl,
                      region_name='CH',
                      data_path=path_PMB_GLAMOS_csv)

# Stacked bar chart: number of measurements per year, split by PERIOD.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
num_gl_yr = data_gl.groupby(['YEAR', 'PERIOD']).size().unstack().reset_index()
# The plot title uses the current glacier name (was a copy-pasted 'Gries').
num_gl_yr.plot(x='YEAR', kind='bar', stacked=True, ax=ax, title=f'{glacierName}')
ax.set_ylabel('Number of measurements')
ax.set_title(f'Number of measurements per year: {glacierName}', fontsize=14)
plt.tight_layout()

In [None]:
# Add climate data:
# ERA5 files that are matched against the coordinates of the stake data.
era5_climate_data = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
geopotential_data = path_ERA5_raw + 'era5_geopotential_pressure.nc'

# Attach the climate features to each stake measurement.
dataset_gl.get_climate_features(climate_data=era5_climate_data,
                             geopotential_data=geopotential_data,
                             change_units=True)

# For each record, convert to a monthly time resolution
dataset_gl.convert_to_monthly(meta_data_columns=meta_data_columns,
                           vois_climate=vois_climate,
                           vois_topographical=voi_topographical)

# Create a new DataLoader object with the monthly stake data measurements.
dataloader_gl = mbm.DataLoader(data=dataset_gl.data,
                            random_seed=SEED,
                            meta_data_columns=meta_data_columns)
TYPE_SPLIT = 'year'
if TYPE_SPLIT == 'year':
    # Hold out 20% of the years, so that no year is shared between train and test.
    train_years, test_years = train_test_split(dataset_gl.data.YEAR.unique(),
                                               test_size=0.2,
                                               random_state=SEED)

    # The indices are consumed with .iloc below, so they must be row POSITIONS.
    # Taking the frame's index labels only works while the index happens to be
    # a default RangeIndex; positions are correct for any index.
    # NOTE(review): set_custom_train_test_indices is assumed to expect
    # positional indices as well -- confirm against the mbm DataLoader.
    _row_positions = pd.RangeIndex(len(dataset_gl.data))
    train_indices = _row_positions[dataset_gl.data.YEAR.isin(
        train_years).to_numpy()]
    test_indices = _row_positions[dataset_gl.data.YEAR.isin(
        test_years).to_numpy()]

    dataloader_gl.set_custom_train_test_indices(train_indices, test_indices)

else:
    # Randomly (though does not separate meas ID)
    train_itr, test_itr = dataloader_gl.set_train_test_split(test_size=0.2,
                                                          shuffle=True)

    train_indices, test_indices = list(train_itr), list(test_itr)

# Training rows and targets, used during the cross validation.
df_X_train = dataset_gl.data.iloc[train_indices]
y_train = df_X_train['POINT_BALANCE'].values

# Get test set
df_X_test = dataset_gl.data.iloc[test_indices]
y_test = df_X_test['POINT_BALANCE'].values

# Five cross-validation folds built by the data loader (fold type 'group-meas-id').
splits = dataloader_gl.get_cv_split(n_splits=5, type_fold='group-meas-id')
test_meas_id = df_X_test['ID'].unique()

# Years in training and test set
train_years = df_X_train.YEAR.unique()
test_years = df_X_test.YEAR.unique()
print('Train years:', train_years)
print('Test years:', test_years)

# Plot splits
visualiseSplits(y_test, y_train, splits)

In [None]:
%%time
# Hyper-parameter search space for the XGBoost model.
parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1],
}

feature_columns = [
    'ALTITUDE_CLIMATE', 'ELEVATION_DIFFERENCE', 'aspect', 'fal', 'slhf',
    'slope', 'sshf', 'ssrd', 'str', 't2m', 'tp',
]

# Fixed model settings: fit on the first CUDA device with the histogram tree method.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    'random_state': SEED,
}

# Randomised search (20 sampled configurations) over the CV splits defined earlier.
custom_xgboost = mbm.models.CustomXGBoostRegressor(
    meta_data_columns=meta_data_columns,
    **param_init,
)
custom_xgboost.randomsearch(parameters=parameters,
                            n_iter=20,
                            splits=splits,
                            features=df_X_train,
                            targets=y_train,
                            num_jobs=-1,
                            random_seed=SEED)

# Persist the fitted search under a glacier-specific file name.
custom_xgboost.save_model(f'xgb_{glacierName}.pkl')

# Report the winning configuration.
_search = custom_xgboost.param_search
best_params = params = _search.best_params_
best_estimator = _search.best_estimator_
print("Best parameters:\n", best_params)
print("Best score:\n", _search.best_score_)

In [None]:
# Switch the best estimator to CPU before predicting.
xgb = best_estimator.set_params(device='cpu')

# Split the test frame into features and metadata, then predict per row.
features_test, metadata_test = xgb._create_features_metadata(
    df_X_test, meta_data_columns)
y_pred = xgb.predict(features_test)

# Predictions aggregated to measurement ID.
y_pred_agg = xgb.aggrPredict(metadata_test, meta_data_columns, features_test)

# Scores on the held-out years.
score = xgb.score(df_X_test, y_test)  # negative
mse, rmse, mae = xgb.evalMetrics(metadata_test, y_pred, y_test)

# One row per measurement ID: mean target, aggregated prediction and PERIOD.
df_pred = df_X_test.assign(target=y_test)
_per_id = df_pred.groupby('ID')
grouped_ids = _per_id.agg({'target': 'mean'})
# NOTE(review): if aggrPredict returns a plain array it is matched by position
# to the sorted IDs -- confirm the ordering against aggrPredict.
grouped_ids['pred'] = y_pred_agg
grouped_ids['PERIOD'] = _per_id['PERIOD'].first()

predVSTruth(grouped_ids, mae, rmse,
            title=f'XGBoost on {glacierName} (split years)')


### Make predictions for whole grid:

In [None]:
# Reload the fitted search saved for this glacier and take its best estimator.
param_init = {"random_state": SEED}

feature_columns = [
    'ALTITUDE_CLIMATE', 'ELEVATION_DIFFERENCE', 'aspect', 'fal', 'slhf',
    'slope', 'sshf', 'ssrd', 'str', 't2m', 'tp',
]

custom_xgboost = mbm.models.CustomXGBoostRegressor(
    meta_data_columns=meta_data_columns,
    **param_init,
)
clf = custom_xgboost.load_model(f'xgb_{glacierName}.pkl')
print('Params:', clf.best_params_)

# NOTE(review): the label says "test data" but df_grid is the glacier grid.
print('Shape of test data:', df_grid.shape)

# Predictions are made on the CPU.
xgb = clf.best_estimator_.set_params(device='cpu')

In [None]:
# Split the glacier grid into features and metadata, then predict for every row.
features_grid, metadata_grid = xgb._create_features_metadata(df_grid,
                                                             meta_data_columns)
y_pred_grid = xgb.predict(features_grid)

# Same predictions, aggregated to measurement ID.
y_pred_grid_agg = xgb.aggrPredict(metadata_grid,
                                  meta_data_columns,
                                  features_grid)

In [None]:
# One row per grid-point ID, holding its YEAR and its aggregated prediction.
grouped_ids = df_grid.groupby('ID').agg({'YEAR': 'mean'})
# NOTE(review): if aggrPredict returns a plain array it is matched by position
# to the sorted IDs -- confirm the ordering against aggrPredict.
grouped_ids['pred'] = y_pred_grid_agg

# Average over all grid points of a year to get the glacier-wide SMB.
grouped_ids = grouped_ids.groupby('YEAR').mean()

# Reference glacier-wide balance from GLAMOS, with exact duplicate rows dropped.
df_target = pd.read_csv(f'{path_SMB_GLAMOS_csv}fix/{glacierName}_fix.csv')
df_target = transformDates(df_target).drop_duplicates()
df_target['YEAR'] = df_target['date1'].apply(lambda x: pd.to_datetime(x).year)
# Divide by 1000 to match the plot unit (presumably mm w.e. -> m w.e.).
df_target['Annual Balance'] = df_target['Annual Balance'] / 1000
df_target = df_target[['YEAR', 'Annual Balance']].set_index('YEAR')

# Predicted versus reference balance on a shared axis.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
for _frame, _column, _label in (
        (grouped_ids, 'pred', 'Predicted SMB'),
        (df_target, 'Annual Balance', 'GLAMOS SMB'),
):
    _frame.plot(y=_column, label=_label, ax=ax)

ax.set_title(f'{glacierName}')
ax.set_ylabel('SMB (m w.e.)')
plt.tight_layout()