# Glacier grids from SGI or GLAMOS:

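This notebook creates the monthly grid files that the MassBalanceMachine (MBM) needs to predict point mass balance (PMB) over whole glacier grids. The grids are built from the SGI glacier grids and complemented with OGGM topographic variables. The computation takes a long time because every glacier grid has to be converted to monthly format.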
## Setting up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
import matplotlib.pyplot as plt
import geopandas as gpd
import geopandas as gpd

# scripts
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.geodata import *
from scripts.xgb_helpers import *
from scripts.config_CH import *

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython autoreload extension in mode 2: modules are re-imported before each
# cell runs, so edits to the local `scripts` helpers apply without a restart.
%load_ext autoreload
%autoreload 2

# Configuration object from massbalancemachine; later cells read `cfg.seed`,
# `cfg.metaData`, `cfg.numJobs` and `cfg.fieldsNotFeatures` from it.
cfg = mbm.SwitzerlandConfig()

In [None]:
# Seed with the value stored in the config so that runs are repeatable.
# NOTE(review): which RNGs `seed_all` covers is defined in the scripts helpers.
seed_all(cfg.seed)
free_up_cuda()  # presumably releases GPU memory left over from earlier runs - confirm in scripts helpers

# Plot styles: apply the project's matplotlib style sheet to all figures.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Names of the climate columns; the grid-creation loop passes this list to
# `convert_to_monthly(vois_climate=...)`.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Names of the topographical columns kept when the gridded data are converted
# to monthly format (note the singular `voi_` prefix of this variable).
voi_topographical = [
    "aspect", "slope", "hugonnet_dhdt", "consensus_ice_thickness", "millan_v",
    "topo"
]

In [None]:
# Entries of the GLAMOS `lv95/` topography directory; later cells compare
# these names against glacier short names.
glamos_lv95_dir = os.path.join(path_GLAMOS_topo, 'lv95/')
glaciers_glamos_dem = os.listdir(glamos_lv95_dir)

# Glacier outlines: SGI 2016 inventory shapefile and the RGI outlines.
sgi_outline_file = os.path.join(path_SGI_topo, 'inventory_sgi2016_r2020',
                                'SGI_2016_glaciers_copy.shp')
glacier_outline_sgi = gpd.read_file(sgi_outline_file)
glacier_outline_rgi = gpd.read_file(path_rgi_outlines)

# Glacier areas, looked up by glacier short name. The entry for 'claridenL'
# is duplicated under the key 'clariden'.
gl_area = get_gl_area()
gl_area['clariden'] = gl_area['claridenL']

In [None]:
# Load the glacier ID table; column names are stripped because the CSV header
# may contain stray whitespace.
rgi_df = pd.read_csv(path_glacier_ids,
                     sep=',').rename(columns=lambda x: x.strip())

# Sort by short name. The set_index/reset_index round trip leaves
# `short_name` as the first column and renumbers the rows 0..n-1.
rgi_df = rgi_df.sort_values(by='short_name').set_index(
    'short_name').reset_index()

# Load geodetic mass balance data
geodeticMB = pd.read_csv(f"{path_geodetic_MB_glamos}dV_DOI2024_allcomb.csv")

# SGI IDs of the glaciers whose short name appears in `glaciers_glamos_dem`
sgi_gl = rgi_df.loc[rgi_df.short_name.isin(
    glaciers_glamos_dem)]['sgi-id'].unique()

# Add the SGI ID(s) registered under the short name 'claridenL'
clariden_L_sgi_id = rgi_df[rgi_df.short_name == 'claridenL']['sgi-id'].unique()
sgi_gl = np.concatenate((sgi_gl, clariden_L_sgi_id))

# Filter geodeticMB for relevant SGI IDs. `.copy()` gives an independent frame
# so the column assignment below does not write into a slice of the raw table.
geodeticMB = geodeticMB[geodeticMB['SGI-ID'].isin(sgi_gl)].copy()

# Mapping SGI ID -> glacier short name
sgi_to_glacier_name = rgi_df[[
    'sgi-id', 'short_name'
]].drop_duplicates().set_index('sgi-id')['short_name'].to_dict()

# Add glacier names and standardize the naming convention in one assignment.
# The previous `geodeticMB['glacier_name'].replace(..., inplace=True)` modified
# a temporary column object, which is a silent no-op under pandas
# Copy-on-Write.
# NOTE(review): the SGI ID of 'claridenL' is added above, but only 'claridenU'
# is renamed to 'clariden' here - confirm this is intended.
geodeticMB['glacier_name'] = geodeticMB['SGI-ID'].map(
    sgi_to_glacier_name).replace({'claridenU': 'clariden'})

# Keep only glaciers present in `glaciers_glamos_dem`
geodeticMB = geodeticMB[geodeticMB.glacier_name.isin(glaciers_glamos_dem)]

# Unique values of the 'Astart' and 'A_end' columns per glacier, as
# {glacier_name: [values, ...]}
years_start_per_gl = geodeticMB.groupby(
    'glacier_name')['Astart'].unique().apply(list).to_dict()
years_end_per_gl = geodeticMB.groupby('glacier_name')['A_end'].unique().apply(
    list).to_dict()

glacier_list_geod = years_start_per_gl.keys()
years_start_per_gl, years_end_per_gl

## Regional predictions (all CH glaciers)

In [None]:
# Extract the SGI ID from every file name in the `aspect` folder: drop the
# '.grid' extension, then take the second '_'-separated token. The dot is
# escaped so that only a literal '.grid' is split on (an unescaped '.' matches
# any character).
sgi_list = [
    re.split('_',
             re.split(r'\.grid', f)[0])[1]
    for f in os.listdir(os.path.join(path_SGI_topo, 'aspect'))
]

# unique SGI IDs
sgi_list = list(set(sgi_list))
print('Number of unique SGI IDs:', len(sgi_list))

# Entries of the GLAMOS `lv95` topography directory
glaciers_glamos_dems = os.listdir(os.path.join(path_GLAMOS_topo, 'lv95'))

# Set RUN to True to (re)build the masked grids; the call is slow.
RUN = False
if RUN:
    # Create SGI topographical masks
    # Note: This function will take a while to run
    # It creates a mask for each glacier in the SGI list
    # and saves them in the specified directory.
    create_sgi_topo_masks(sgi_list,
                          type='sgi_id',
                          path_save=os.path.join(path_SGI_topo,
                                                 'xr_masked_grids_sgi/'))

In [None]:
# Quick visual check: open one masked grid and plot its `masked_aspect` field.
path = os.path.join(path_SGI_topo, 'xr_masked_grids_sgi/')
ds_example = xr.open_dataset(path + 'A10g-02.zarr')
ds_example.masked_aspect.plot()

## Data exploration:

In [None]:
# Stake measurements and a fresh copy of the glacier areas.
data_glamos = pd.read_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv')
gl_area = get_gl_area()

# Area of every glacier in the stake dataset for which an area is known.
areas_train_set = []
for gl in data_glamos['GLACIER'].unique():
    if gl in gl_area.keys():
        areas_train_set.append(gl_area[gl])

# Histogram of those areas.
plt.hist(areas_train_set, bins=50)
plt.xlabel('Area (km2)')
plt.title('Histogram of glacier areas with stakes')

In [None]:
# Read the SGI 2016 glacier outlines.
shapefile_path = os.path.join(path_SGI_topo, 'inventory_sgi2016_r2020',
                              'SGI_2016_glaciers.shp')
gdf_shapefiles = gpd.read_file(shapefile_path)

# Left panel: histogram with KDE; right panel: boxplot. Both show the polygon
# areas divided by 10**6.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
ax_hist, ax_box = axs[0], axs[1]

sns.histplot(gdf_shapefiles.area / (10**6),
             color='blue',
             kde=True,
             bins=50,
             ax=ax_hist)
sns.boxplot(x=gdf_shapefiles.area / (10**6), color='blue', ax=ax_box)

# Same x label (km2) on both panels.
for panel in (ax_hist, ax_box):
    panel.set_xlabel('Area (km2)')

plt.suptitle('Histogram and Boxplot of all glaciers in SGI 2016')

## Create grids:

In [None]:
# Build one monthly-resolution parquet file per SGI glacier for `year`.
# For every glacier the pipeline is: load the masked grid -> build the glacier
# grid -> add climate features -> assign RGI IDs -> add OGGM variables ->
# convert to monthly format -> save. A glacier that fails at any step is
# reported and skipped so that the remaining glaciers are still processed.
year = 2016
path_save_monthly = '../../../data/GLAMOS/topo/gridded_topo_inputs/SGI_regional_preds/2016/'

# Set RUN to True to regenerate the files (slow, and clears the output folder).
RUN = False
if RUN:
    emptyfolder(path_save_monthly)

    # Inputs that do not depend on the glacier are resolved once, up front.
    path_save = os.path.join(path_SGI_topo, 'xr_masked_grids_sgi/')
    era5_climate_data = os.path.join(path_ERA5_raw,
                                     'era5_monthly_averaged_data.nc')
    geopotential_data = os.path.join(path_ERA5_raw,
                                     'era5_geopotential_pressure.nc')
    # OGGM variables of interest
    voi = ["hugonnet_dhdt", "consensus_ice_thickness", "millan_v"]

    for sgi_id in tqdm(sgi_list, desc='Processing glaciers'):
        print(f"\n-----------------------------------\nProcessing {sgi_id}")

        # Load SGI masked grid (previously resampled)
        try:
            path = os.path.join(path_save, f"{sgi_id}.zarr")
            ds_coarsened = xr.open_dataset(path)
        except Exception as e:
            print(f"Error loading dataset for {sgi_id}: {e}")
            continue

        # Create glacier grid
        try:
            rgi_id = None
            df_grid = create_glacier_grid_SGI(sgi_id, year, rgi_id,
                                              ds_coarsened)
            df_grid.reset_index(drop=True, inplace=True)
            dataset_grid = mbm.Dataset(cfg=cfg,
                                       data=df_grid,
                                       region_name='CH',
                                       data_path=path_PMB_GLAMOS_csv)
        except Exception as e:
            print(f"Error creating glacier grid for {sgi_id} in {year}: {e}")
            continue

        # Add climate data
        try:
            dataset_grid.get_climate_features(
                climate_data=era5_climate_data,
                geopotential_data=geopotential_data,
                change_units=True)

            # if dataset_grid.data is empty throw error
            if dataset_grid.data.empty:
                raise ValueError(
                    f"No climate data for glacier {sgi_id} in {year}")

        except Exception as e:
            print(f"Error adding climate data for {sgi_id} in {year}: {e}")
            continue

        # Assign RGI IDs (needed for the OGGM data)
        try:
            df_y_gl = dataset_grid.data
            df_y_gl.rename(columns={'RGIId': 'RGIId_old'}, inplace=True)

            # Add RGI IDs for OGGM data through intersection with shapefiles
            df_y_gl = mbm.data_processing.utils.get_rgi(
                data=df_y_gl, glacier_outlines=glacier_outline_rgi)

            # Drop points without RGI ID (outside of RGI outlines)
            df_y_gl = df_y_gl.dropna(subset=['RGIId'])
        except Exception as e:
            # Report the real failure instead of labelling every error as a
            # missing intersection.
            print(f"Error assigning RGI IDs for {sgi_id} in {year}: {e}")
            continue

        # No grid point falls inside an RGI outline: nothing to predict on.
        if df_y_gl.empty:
            print(
                f"Error: no intersection for glacier {sgi_id} with RGI outlines (too small). Skipping..."
            )
            continue

        # Add OGGM topographic data
        try:
            df_y_gl = add_OGGM_features(df_y_gl, voi, path_OGGM)

            # Add GLWD_ID: hash of "<GLACIER>_<YEAR>" per row, stored as str
            df_y_gl['GLWD_ID'] = df_y_gl.apply(
                lambda x: get_hash(f"{x.GLACIER}_{x.YEAR}"), axis=1)
            df_y_gl['GLWD_ID'] = df_y_gl['GLWD_ID'].astype(str)

            dataset_grid = mbm.Dataset(cfg=cfg,
                                       data=df_y_gl,
                                       region_name='CH',
                                       data_path=path_PMB_GLAMOS_csv)
        except Exception as e:
            print(f"Error adding OGGM data for {sgi_id} in {year}: {e}")
            continue

        # Convert to monthly time resolution
        try:
            dataset_grid.convert_to_monthly(
                meta_data_columns=cfg.metaData,
                vois_climate=vois_climate,
                vois_topographical=voi_topographical)
        except Exception as e:
            print(
                f"Error converting to monthly resolution for {sgi_id} in {year}: {e}"
            )
            continue

        # Rename the SGI-derived aspect/slope columns to the feature names
        # used by the model (`aspect_sgi`, `slope_sgi`).
        df_oggm = dataset_grid.data
        df_oggm.rename(columns={
            'aspect': 'aspect_sgi',
            'slope': 'slope_sgi'
        },
                       inplace=True)

        # Use the grid topography as the point elevation
        df_oggm['POINT_ELEVATION'] = df_oggm['topo']

        # Save gridded dataset
        save_path = os.path.join(path_save_monthly,
                                 f"{sgi_id}_grid_{year}.parquet")
        try:
            dataset_grid.data.to_parquet(save_path,
                                         engine="pyarrow",
                                         compression="snappy")
        except Exception as e:
            print(f"Error saving dataset for {sgi_id} in {year}: {e}")

In [None]:
# Inspect the saved monthly grid of one glacier.
path_save_monthly = '../../../data/GLAMOS/topo/gridded_topo_inputs/SGI_regional_preds/2016/'
year = 2016
sgi_id = 'A50i-16'

# Read the glacier's parquet file and keep the rows whose MONTHS is 'sep'.
grid_file = os.path.join(path_save_monthly, f"{sgi_id}_grid_{year}.parquet")
df = pd.read_parquet(grid_file)
df = df.loc[df['MONTHS'] == 'sep']

# One scatter panel per OGGM variable, coloured by that variable.
voi = ['hugonnet_dhdt', 'consensus_ice_thickness', 'millan_v']
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for panel, var in zip(axs, voi):
    sns.scatterplot(df,
                    x='POINT_LON',
                    y='POINT_LAT',
                    hue=var,
                    s=5,
                    alpha=0.5,
                    palette='twilight_shifted',
                    ax=panel)

In [None]:
# Number of entries in the monthly-grid output folder (sanity check on how
# many glacier grid files exist).
len(os.listdir(path_save_monthly))

## Train ML model:

### Set up model:

In [None]:
# Climate feature names used by the model.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical feature names used by the model; aspect and slope use the
# `_sgi` suffix that the grid-creation cell gives to those columns.
vois_topographical = [
    "aspect_sgi", "slope_sgi", "hugonnet_dhdt", "consensus_ice_thickness",
    "millan_v"
]

In [None]:
# Stake measurement table.
data_glamos = pd.read_csv(path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv')

# Log at INFO level, each message prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Input/output locations handed to the monthly pre-processing helper.
era5_monthly_file = path_ERA5_raw + 'era5_monthly_averaged_data.nc'
era5_geopotential_file = path_ERA5_raw + 'era5_geopotential_pressure.nc'
paths = {
    'csv_path': path_PMB_GLAMOS_csv,
    'era5_climate_data': era5_monthly_file,
    'geopotential_data': era5_geopotential_file,
    'radiation_save_path': path_pcsr + 'zarr/',
}

# Transform the data to monthly format, or load a previous result; RUN is
# forwarded as `run_flag`.
RUN = False
dataloader_gl = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos,
    paths=paths,
    cfg=cfg,
    vois_climate=vois_climate,
    vois_topographical=vois_topographical,
)
data_monthly = dataloader_gl.data

In [None]:
# Glaciers held out for testing.
test_glaciers = [
    'tortin', 'plattalva', 'sanktanna', 'schwarzberg', 'hohlaub', 'pizol',
    'corvatsch', 'tsanfleuron', 'forno'
]


def _count_period(frame, period):
    """Return the number of rows of `frame` whose PERIOD equals `period`."""
    return len(frame[frame.PERIOD == period])


# Warn about held-out glaciers that do not occur in the dataset.
existing_glaciers = set(dataloader_gl.data.GLACIER.unique())
missing_glaciers = [g for g in test_glaciers if g not in existing_glaciers]
if missing_glaciers:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Every glacier of the dataset that is not held out is used for training.
train_glaciers = [g for g in existing_glaciers if g not in test_glaciers]

data_test = dataloader_gl.data[dataloader_gl.data.GLACIER.isin(test_glaciers)]
print(f'Size of test data: {len(data_test)}')

data_train = dataloader_gl.data[dataloader_gl.data.GLACIER.isin(
    train_glaciers)]
print(f'Size of train data: {len(data_train)}')

# Test size expressed relative to the train size (skipped when train is empty).
if not len(data_train):
    print("Warning: No training data available!")
else:
    test_perc = (len(data_test) / len(data_train)) * 100
    print(f'Percentage of test size: {test_perc:.2f}%')

# Annual versus winter rows in the training data.
print('Train:')
print(f'Number of winter and annual samples: {len(data_train)}')
print(f"Number of annual samples: {_count_period(data_train, 'annual')}")
print(f"Number of winter samples: {_count_period(data_train, 'winter')}")

# Same for the test data; the per-period subsets are kept as variables.
data_test_annual = data_test[data_test.PERIOD == 'annual']
data_test_winter = data_test[data_test.PERIOD == 'winter']

print('Test:')
print(f'Number of winter and annual samples: {len(data_test)}')
print(f'Number of annual samples: {len(data_test_annual)}')
print(f'Number of winter samples: {len(data_test_winter)}')

# Whole dataset.
print('Total:')
print(f'Number of monthly rows: {len(dataloader_gl.data)}')
print(f"Number of annual rows: {_count_period(dataloader_gl.data, 'annual')}")
print(f"Number of winter rows: {_count_period(dataloader_gl.data, 'winter')}")

In [None]:
# Split by glacier: `test_glaciers` form the test set; the helper also returns
# the cross-validation splits of the training set.
splits, test_set, train_set = get_CV_splits(
    dataloader_gl,
    test_split_on='GLACIER',
    test_splits=test_glaciers,
    random_state=cfg.seed,
)

# Summary of both sets (test size is reported relative to the train size).
test_names = test_set['splits_vals']
print(f'Test glaciers: ({len(test_names)}) {test_names}')
test_perc = (len(test_set['df_X']) / len(train_set['df_X'])) * 100
print(f'Percentage of test size: {test_perc:.2f}%')
print(f"Size of test set: {len(test_set['df_X'])}")
train_names = train_set['splits_vals']
print(f'Train glaciers: ({len(train_names)}) {train_names}')
print(f"Size of train set: {len(train_set['df_X'])}")

# Visual checks of the target and climate-input distributions.
visualiseSplits(test_set['y'], train_set['y'], splits)
visualiseInputs(train_set, test_set, vois_climate)

In [None]:
# Base XGBoost settings: train on the first CUDA device with the histogram
# tree method; seed and job count come from the config.
param_init = {
    'device': 'cuda:0',
    'tree_method': 'hist',
    "random_state": cfg.seed,
    "n_jobs": cfg.numJobs,
}

In [None]:
# Hyper-parameters layered on top of `param_init`.
custom_params = {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 800}


def _model_columns(split):
    """Return the feature and metadata columns of a split's `df_X` frame."""
    return split['df_X'][all_columns]


# Feature columns, followed by the non-feature fields from the config.
feature_columns = ['ELEVATION_DIFFERENCE', *vois_climate, *vois_topographical]
all_columns = feature_columns + cfg.fieldsNotFeatures
df_X_train_subset = _model_columns(train_set)
print('Shape of training dataset:', df_X_train_subset.shape)
print('Shape of testing dataset:', _model_columns(test_set).shape)
print('Running with features:', feature_columns)

# Merge base and custom parameters (custom values win) and build the model.
params = {**param_init, **custom_params}
print(params)
custom_model = mbm.models.CustomXGBoostRegressor(cfg, **params)

# Fit on the training data.
custom_model.fit(_model_columns(train_set), train_set['y'])

# Switch the model to CPU, then predict on the test features.
custom_model = custom_model.set_params(device='cpu')
features_test, metadata_test = custom_model._create_features_metadata(
    _model_columns(test_set))
y_pred = custom_model.predict(features_test)
print('Shape of the test:', features_test.shape)

# Predictions aggregated per measurement ID.
y_pred_agg = custom_model.aggrPredict(metadata_test, features_test)

# Score on the test set; the absolute value is printed because the returned
# score is negative.
score = custom_model.score(_model_columns(test_set), test_set['y'])
print('Overall score:', np.abs(score))

# Plot predictions against observations.
grouped_ids = getDfAggregatePred(test_set, y_pred_agg, all_columns)
PlotPredictions(grouped_ids, y_pred, metadata_test, test_set, custom_model)
plt.suptitle(f'MBM tested on {test_glaciers}', fontsize=20)
plt.tight_layout()

In [None]:
# Predict gridded mass balance for every glacier that has a pre-processed
# monthly grid file, using the model fitted above.
RUN = True

# Define paths
path_save_glw = '../../../data/GLAMOS/distributed_MB_grids/MBM/swisswide/2016/'
path_xr_grids = '../../../data/GLAMOS/topo/SGI2020/xr_masked_grids_sgi/'  # GLAMOS DEMs

path_monthly_grids = '../../../data/GLAMOS/topo/gridded_topo_inputs/SGI_regional_preds/2016/'

year = 2016

# Keep only files named "<sgi_id>_grid_<year>.parquet" and recover the ID by
# stripping that exact suffix. Other directory entries (different years, stray
# files) would otherwise yield IDs whose parquet file does not exist, and
# duplicate IDs would be processed twice. Sorted for a reproducible order.
grid_suffix = f"_grid_{year}.parquet"
sgi_id_list = sorted({
    f[:-len(grid_suffix)]
    for f in os.listdir(path_monthly_grids) if f.endswith(grid_suffix)
})

if RUN:
    # Start from an empty output folder: create it if missing, else clear it.
    if not os.path.exists(path_save_glw):
        os.makedirs(path_save_glw)
    else:
        emptyfolder(path_save_glw)

    # Feature columns (same construction as for training)
    vois_climate = [
        't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'u10', 'v10'
    ]
    feature_columns = ['ELEVATION_DIFFERENCE'
                       ] + list(vois_climate) + list(vois_topographical)
    all_columns = feature_columns + cfg.fieldsNotFeatures
    print('Running for feature columns:', all_columns)

    for sgi_id in tqdm(sgi_id_list, desc='SGI Ids'):
        print(sgi_id)
        # Load parquet input glacier grid file in monthly format (pre-processed)
        df_grid_monthly = pd.read_parquet(
            os.path.join(path_monthly_grids, f"{sgi_id}_grid_{year}.parquet"))

        df_grid_monthly.drop_duplicates(inplace=True)

        # Keep only the model columns that are present in the file.
        # NOTE(review): the full `all_columns` list is still passed to
        # `gridded_MB_pred` below - confirm it tolerates missing columns.
        df_grid_monthly = df_grid_monthly[[
            col for col in all_columns if col in df_grid_monthly.columns
        ]]

        # Create geodata object
        geoData = mbm.GeoData(df_grid_monthly)

        # Computes and saves gridded MB for a year and glacier
        path_glacier_dem = os.path.join(path_xr_grids, f"{sgi_id}.zarr")
        geoData.gridded_MB_pred(custom_model,
                                sgi_id,
                                year,
                                all_columns,
                                path_glacier_dem,
                                path_save_glw,
                                cfg,
                                save_monthly_pred=True)