## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import warnings
import massbalancemachine as mbm
import pyproj
import seaborn as sns
import matplotlib.pyplot as plt
import xarray as xr
import geopandas as gpd
from oggm import utils
from tqdm import tqdm
from cmcrameri import cm
from scripts.helpers import *
from scripts.italy_austria_preprocess import *
from scripts.config_IT_AT import *


warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

cfg = mbm.ItalyAustriaConfig(dataPath='/home/mburlet/scratch/data/DATA_MB/WGMS/IT_AT/')

In [None]:
# Seed with the configured value, then call the project's CUDA clean-up helper
seed_all(cfg.seed)
free_up_cuda()

# Matplotlib style sheet applied to every figure of this notebook
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Colour map
cmap = cm.devon

# Colours for bars and lines: first of 10 batlow samples plus two fixed hex codes
colors = get_cmap_hex(cm.batlow, 10)
color_1, color_2 = colors[0], '#c51b7d'
color_diff_xgb = '#4d4d4d'

### Load all stakes into a single DataFrame

###### The data were obtained directly from the WGMS Fluctuations of Glaciers (FoG) Database (DOI: 10.5904/wgms-fog-2025-02b).

In [None]:
# Raw WGMS FoG tables: point mass-balance measurements and glacier metadata
wgms_raw_dir = cfg.dataPath + path_PMB_WGMS_raw
df_stakes = pd.read_csv(wgms_raw_dir + 'mass_balance_point.csv')
df_it_at_RGIId = pd.read_csv(wgms_raw_dir + 'glacier.csv')

# Keep only the stakes whose country is AT or IT
df_it_at_stakes = df_stakes[df_stakes['country'].isin(['AT', 'IT'])].reset_index(drop=True)

# Map the WGMS glacier id to its rgi60_ids entry and attach it to every stake
# (the stake table references the glacier through glacier_id, not id)
id_to_rgi_map = dict(zip(df_it_at_RGIId['id'], df_it_at_RGIId['rgi60_ids']))
df_it_at_stakes['RGIId'] = df_it_at_stakes['glacier_id'].map(id_to_rgi_map)

# Show how many stakes could not be matched, and on which glaciers
missing_rgi = df_it_at_stakes['RGIId'].isna()
display(f"Number of rows with NaN RGIId: {missing_rgi.sum()}")
display(df_it_at_stakes[missing_rgi]['glacier_name'].unique())

# Only Careser glacier has NaN RGIIds, because the csv file lists only its RGI v5 id.
# Look up the matching RGI v6 entry by name in the region 11 outlines.
rgi_file = utils.get_rgi_region_file(region="11", version="6")
glacier_outline = gpd.read_file(rgi_file)

# na=False treats outlines without a name as "no match"
careser_glacier = glacier_outline[glacier_outline['Name'].str.contains('CARESER', case=False, na=False)]
display(careser_glacier[['RGIId', 'Name']])

# RGI v5 and v6 ids are the same (RGI50-11.01834 / RGI60-11.01834).
# Fill the id only on Careser rows: any other glacier without an RGI id must stay
# NaN so that it shows up in the count below instead of being mislabelled as Careser.
careser_without_rgi = (
    df_it_at_stakes['RGIId'].isna()
    & df_it_at_stakes['glacier_name'].str.contains('CARESER', case=False, na=False)
)
df_it_at_stakes.loc[careser_without_rgi, 'RGIId'] = 'RGI60-11.01834'

# Expected to be 0; a non-zero value means another glacier still needs an RGI id
display(f"Number of rows with NaN RGIId: {df_it_at_stakes['RGIId'].isna().sum()}")

In [None]:
# Distinct free-text remarks attached to the stakes
distinct_remarks = df_it_at_stakes['remarks'].unique()
display(distinct_remarks)

#### Rename columns

In [None]:
# WGMS column name -> column name used in the rest of this notebook
wgms_to_mbm_columns = {
    'point_id': 'POINT_ID',
    'latitude': 'POINT_LAT',
    'longitude': 'POINT_LON',
    'elevation': 'POINT_ELEVATION',
    'begin_date': 'FROM_DATE',
    'end_date': 'TO_DATE',
    'balance': 'POINT_BALANCE',
    'glacier_name': 'GLACIER',
    'year': 'YEAR',
    'country': 'COUNTRY',
    'balance_code': 'PERIOD',
}
df_it_at_stakes_renamed = df_it_at_stakes.rename(columns=wgms_to_mbm_columns)

# Overwrite POINT_ID with a composite id: <glacier>_<year>_<WGMS row id>_<country>
year_txt = df_it_at_stakes_renamed['YEAR'].astype(str)
wgms_id_txt = df_it_at_stakes['id'].astype(str)
df_it_at_stakes_renamed['POINT_ID'] = (
    df_it_at_stakes_renamed['GLACIER'] + '_' + year_txt + '_' + wgms_id_txt + '_' + df_it_at_stakes_renamed['COUNTRY']
)

# Columns carried forward; the *_unc columns are kept so uncertain dates can be fixed later
kept_columns = [
    'POINT_ID',
    'POINT_LAT',
    'POINT_LON',
    'POINT_ELEVATION',
    'FROM_DATE',
    'TO_DATE',
    'POINT_BALANCE',
    'GLACIER',
    'PERIOD',
    'RGIId',
    'YEAR',
    'begin_date_unc',
    'end_date_unc',
]
location_columns = ['POINT_LAT', 'POINT_LON', 'POINT_ELEVATION']

# Keep the relevant columns and drop stakes without a complete position
df_it_at_stakes_renamed = df_it_at_stakes_renamed[kept_columns].dropna(subset=location_columns)

# Convert both dates from YYYY-MM-DD to YYYYMMDD strings
for date_col in ('FROM_DATE', 'TO_DATE'):
    df_it_at_stakes_renamed[date_col] = df_it_at_stakes_renamed[date_col].astype(str).str.replace('-', '')

# Free-text column that records every manual change made to a row
df_it_at_stakes_renamed['DATA_MODIFICATION'] = ''

display(df_it_at_stakes_renamed.head(2))

###### General Fixes

In [None]:
# Rows that still contain a NaN in any column
rows_with_nan = df_it_at_stakes_renamed.isna().any(axis=1)
display(df_it_at_stakes_renamed[rows_with_nan])

# One stake has a wrong elevation of 296 instead of 2960
is_wrong_elevation_stake = df_it_at_stakes_renamed['POINT_ID'] == 'VERNAGT F._2013_15124_AT'

# Before the correction
display(df_it_at_stakes_renamed[is_wrong_elevation_stake])

df_it_at_stakes_renamed.loc[is_wrong_elevation_stake, 'POINT_ELEVATION'] = 2960
df_it_at_stakes_renamed.loc[is_wrong_elevation_stake, 'DATA_MODIFICATION'] = 'Elevation corrected from 296 to 2960 m'

# After the correction
display(df_it_at_stakes_renamed[is_wrong_elevation_stake])

##### Date Fixes

In [None]:
# Some stakes have the year 1012 instead of 2012 etc. Collect every date that cannot be
# parsed as YYYYMMDD, as (index label, raw value, column) tuples.
# The index label (not the positional counter) is recorded so that an entry can be looked
# up with .loc even though rows were dropped earlier without resetting the index.
problematic_dates = []
for date_col in ('FROM_DATE', 'TO_DATE'):
    for row_label, date in df_it_at_stakes_renamed[date_col].items():
        try:
            pd.to_datetime(str(date), format="%Y%m%d")
        except (ValueError, OverflowError):
            # ValueError covers malformed strings as well as pandas' OutOfBoundsDatetime;
            # anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            problematic_dates.append((row_label, date, date_col))

print(f"Found {len(problematic_dates)} problematic date entries")
if problematic_dates:
    print(problematic_dates)

# All stakes from same glacier MALAVALLE and date 10120508. Correct the date and
# record the change on exactly the rows that are modified.
wrong_from_date = df_it_at_stakes_renamed['FROM_DATE'] == '10120508'
df_it_at_stakes_renamed.loc[wrong_from_date, 'DATA_MODIFICATION'] = 'Date corrected from 10120508 to 20120508'
df_it_at_stakes_renamed.loc[wrong_from_date, 'FROM_DATE'] = '20120508'


In [None]:
## In the original dataset the glaciers "OE. WURTEN K.", "VERNAGT F." and "GRAND ETRET" have
## several measurements with a date uncertainty of 182 or 182.5 days.
## These dates are always entered as start of July; correct them to 30.04 and 01.10.

def _uncertain_date_rows(stakes):
    """Return a boolean mask of rows whose begin or end date uncertainty is >= 182."""
    return (stakes['begin_date_unc'] >= 182) | (stakes['end_date_unc'] >= 182)


# Affected rows before the correction
uncertain_date_mask = _uncertain_date_rows(df_it_at_stakes_renamed)
display(df_it_at_stakes_renamed[uncertain_date_mask])

# Record the change on the affected rows
df_it_at_stakes_renamed.loc[uncertain_date_mask, 'DATA_MODIFICATION'] = "Dates corrected due to high uncertainty (~= 182 days)"

# Apply the date correction with the project helper
df_it_at_stakes_renamed = fix_uncertain_dates(df_it_at_stakes_renamed)

# Same rows after the correction
display(df_it_at_stakes_renamed[_uncertain_date_rows(df_it_at_stakes_renamed)])

# The *_unc columns were only needed for the fix above; keep the final set of columns
final_columns = [
    'POINT_ID',
    'POINT_LAT',
    'POINT_LON',
    'POINT_ELEVATION',
    'FROM_DATE',
    'TO_DATE',
    'POINT_BALANCE',
    'GLACIER',
    'PERIOD',
    'RGIId',
    'YEAR',
    'DATA_MODIFICATION',
]
df_it_at_stakes_renamed = df_it_at_stakes_renamed[final_columns]


In [None]:
annual_inconsistent, winter_inconsistent = check_period_consistency(df_it_at_stakes_renamed)

# Show every inconsistent row; option_context restores the previous row limit
# even if one of the display() calls raises.
with pd.option_context('display.max_rows', None):
    display(annual_inconsistent)
    display(winter_inconsistent)

## 2 cases of inconsistent periods:
# 1. HALLSTAETTER G._2024_63282_AT has MONTH_DIFF of 1. It is unclear whether this is a date
#    error or an actual measurement; since its pmb is also lower than the other stakes of that
#    year, the stake is removed.
#    .copy() makes the filtered frame independent, so the assignments below write to it directly.
df_it_at_stakes_renamed = df_it_at_stakes_renamed.loc[
    df_it_at_stakes_renamed['POINT_ID'] != 'HALLSTAETTER G._2024_63282_AT'
].copy()

# 2. GRAND ETRET in year 2008 goes from 1999 to 2008; assumed to be a date error, the start
#    year is changed to 2007. Only rows whose FROM_DATE really starts with 1999 are touched,
#    so rows with a correct start date are neither changed nor flagged as modified.
mask = (
    (df_it_at_stakes_renamed['GLACIER'] == 'GRAND ETRET')
    & (df_it_at_stakes_renamed['YEAR'] == 2008)
    & df_it_at_stakes_renamed['FROM_DATE'].str.startswith('1999', na=False)
)
year_fix_note = 'FROM_DATE year corrected from 1999 to 2007'
# Append to an existing note instead of overwriting it, so earlier corrections stay documented
df_it_at_stakes_renamed.loc[mask, 'DATA_MODIFICATION'] = (
    df_it_at_stakes_renamed.loc[mask, 'DATA_MODIFICATION']
    .map(lambda previous: f'{previous}; {year_fix_note}' if previous else year_fix_note)
)
# Replace only the leading four-digit year; month and day are kept as they are
df_it_at_stakes_renamed.loc[mask, 'FROM_DATE'] = '2007' + df_it_at_stakes_renamed.loc[mask, 'FROM_DATE'].str[4:]

# Re-run the check on the corrected data
annual_inconsistent, winter_inconsistent = check_period_consistency(df_it_at_stakes_renamed)


#### Merge stakes that are close

In [None]:
# Apply the project helper that removes stakes lying close to each other
df_it_at_stakes_dropped_stakes = remove_close_points(df_it_at_stakes_renamed)

# List the close stakes that are still found afterwards, ordered by distance
remaining_close_stakes = find_close_stakes(df_it_at_stakes_dropped_stakes)
display(remaining_close_stakes.sort_values('DISTANCE_M'))

#### Add OGGM data

In [None]:
# Settings for the OGGM glacier directories (RGI region 11, version 6)
oggm_init_settings = dict(
    working_dir='/home/mburlet/scratch/data/DATA_MB/WGMS/OGGM/',
    rgi_region="11",
    rgi_version="6",
    base_url="https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    log_level='WARNING',
    task_list=None,
)
gdirs, rgidf = initialize_oggm_glacier_directories(**oggm_init_settings)

# RGI ids of all glaciers that have stakes
unique_rgis = df_it_at_stakes_dropped_stakes['RGIId'].unique()

# Switch: set to False to skip the grid export
run = True
if run:
    export_oggm_grids(gdirs,
                      subset_rgis=unique_rgis,
                      output_path=cfg.dataPath + path_OGGM_xrgrids)

# Merge the point mass-balance stakes with the OGGM data
df_it_at_stakes_dropped_stakes_topo = merge_pmb_with_oggm_data(
    df_pmb=df_it_at_stakes_dropped_stakes,
    gdirs=gdirs,
    rgi_region="11",
    rgi_version="6",
)

In [None]:
# Example: plot the stakes of one glacier on top of its OGGM glacier mask
glacierName = 'GOLDBERG K.'

# Stakes of that glacier (independent copy, so columns can be added below)
df_stakes = df_it_at_stakes_dropped_stakes_topo[
    df_it_at_stakes_dropped_stakes_topo['GLACIER'] == glacierName
].copy()
RGIId = df_stakes['RGIId'].unique()[0]
print(RGIId)

# Open the OGGM grid that was exported for this RGI id
ds_oggm = xr.open_dataset(f'{cfg.dataPath + path_OGGM_xrgrids}/{RGIId}.zarr')

# Transformer from WGS84 lon/lat to the projection stored on the OGGM dataset
wgs84_crs = pyproj.CRS.from_user_input("EPSG:4326")
oggm_grid_crs = pyproj.CRS.from_user_input(ds_oggm.pyproj_srs)
transf = pyproj.Transformer.from_proj(wgs84_crs, oggm_grid_crs, always_xy=True)

# Project every stake position (always_xy=True: longitude first, then latitude)
lon, lat = df_stakes["POINT_LON"].values, df_stakes["POINT_LAT"].values
x_stake, y_stake = transf.transform(lon, lat)
df_stakes['x'] = x_stake
df_stakes['y'] = y_stake

# Glacier mask with the stakes on top, coloured by the within_glacier_shape flag
fig, ax = plt.subplots(figsize=(8, 6))
ds_oggm.glacier_mask.plot(ax=ax, cmap='binary')
sns.scatterplot(df_stakes,
                x='x',
                y='y',
                hue='within_glacier_shape',
                palette=['r', 'b'],
                ax=ax)
ax.set_title(f'Stakes on {glacierName} (OGGM)')
fig.tight_layout()

In [None]:
# Keep only the stakes flagged as within the glacier shape, then drop the flag column
inside_glacier_shape = df_it_at_stakes_dropped_stakes_topo['within_glacier_shape']
df_it_at_stakes_dropped_stakes_topo = (
    df_it_at_stakes_dropped_stakes_topo[inside_glacier_shape]
    .drop(columns=['within_glacier_shape'])
)

# Sample counts: total, then one line per measurement period
print('Number of winter, summer and annual samples:', len(df_it_at_stakes_dropped_stakes_topo))
for period_name in ('annual', 'winter', 'summer'):
    period_rows = df_it_at_stakes_dropped_stakes_topo[df_it_at_stakes_dropped_stakes_topo.PERIOD == period_name]
    print(f'Number of {period_name} samples:', len(period_rows))

# Unique glaciers, sorted
glacier_list = sorted(df_it_at_stakes_dropped_stakes_topo.GLACIER.unique())
print(f"Number of glaciers: {len(glacier_list)}")
print(f"Glaciers: {glacier_list}")

In [None]:
# Rows of the final table that contain a NaN in any column
has_nan_topo = df_it_at_stakes_dropped_stakes_topo.isna().any(axis=1)
display(df_it_at_stakes_dropped_stakes_topo[has_nan_topo])

In [None]:
# Preview the final table, then write it to disk without the index column
display(df_it_at_stakes_dropped_stakes_topo.head(2))

final_dataset_csv = cfg.dataPath + path_PMB_WGMS_csv + 'IT_AT_wgms_dataset_all_oggm.csv'
df_it_at_stakes_dropped_stakes_topo.to_csv(final_dataset_csv, index=False)