## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import os
import warnings
import massbalancemachine as mbm
import pyproj
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
import geopandas as gpd
import logging
import glob
from cmcrameri import cm
from oggm import utils

from scripts.helpers import *
from scripts.iceland_preprocess import *
from scripts.config_ICE import *

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
# IPython magics: load the autoreload extension and enable mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Region configuration object; cfg.dataPath is the prefix of every data path
# built in the cells below and cfg.seed is used for seeding.
# NOTE(review): absolute, user-specific path - has to be adapted on any other machine.
cfg = mbm.IcelandConfig(dataPath='/home/mburlet/scratch/data/DATA_MB/WGMS/Iceland/')

# Module logger
# The logger name is __name__ with its last dotted component removed. For a
# name without dots (e.g. '__main__') this evaluates to the empty string.
log = logging.getLogger('.'.join(__name__.split('.')[:-1]))

In [None]:
# Seed with the value stored in the config, then call the CUDA clean-up helper
# (both come from the project's script modules).
seed_all(cfg.seed)
free_up_cuda()

# Matplotlib style sheet used for the figures of this notebook
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Colormap
cmap = cm.devon

# Colours for bars and lines
colors = get_cmap_hex(cm.batlow, 10)
color_diff_xgb = '#4d4d4d'
color_1, color_2 = colors[0], '#c51b7d'

### Load all stake csv files into 1 df

###### The data used in this notebook was scraped in the 1.0 Iceland-data-acquisition notebook in June 2025; only winter and annual measurements are used. The code might have to be adjusted if new data is added to https://joklavefsja.vedur.is/

In [None]:
# Collect all raw stake CSV files.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted: the row order and index of the combined dataframe are then identical
# on every run and every machine.
all_files = sorted(glob.glob(os.path.join(cfg.dataPath + path_PMB_WGMS_raw, "*.csv")))

# Read each CSV file into its own dataframe
dfs = [pd.read_csv(csv_path) for csv_path in all_files]

# Concatenate all dataframes into one, with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Print info
print(f"Combined {len(all_files)} CSV files into one dataframe with {len(combined_df)} rows")

# Add data modification column to keep track of manual changes
combined_df['DATA_MODIFICATION'] = ''

display(combined_df.head(2))


###### Split into annual and winter rows

In [None]:
# Project helper that produces the per-period stake rows from the combined frame
df_stakes_split = split_stake_measurements(combined_df)

# Normalise both period boundaries to 'YYYYMMDD' strings
for date_col in ('TO_DATE', 'FROM_DATE'):
    parsed_dates = pd.to_datetime(df_stakes_split[date_col])
    df_stakes_split[date_col] = parsed_dates.dt.strftime('%Y%m%d')

display(df_stakes_split)

#### Date Fixes

###### Fix NaN dates by filling in hydrological-year dates. (It would be nicer if this code first checked whether the same stake has a dated measurement from a previous year and used that date instead of the hydrological-year default.)

In [None]:
# Missing-value masks, taken before anything is filled in
from_missing = df_stakes_split['FROM_DATE'].isna()
to_missing = df_stakes_split['TO_DATE'].isna()
year_missing = df_stakes_split['YEAR'].isna()

# Show the affected rows
display(df_stakes_split[from_missing])
display(df_stakes_split[to_missing])
display(df_stakes_split[year_missing])

# Missing YEAR <- first four characters of TO_DATE (stored as float)
to_date_year = df_stakes_split.loc[year_missing, 'TO_DATE'].astype(str).str[:4]
df_stakes_split.loc[year_missing, 'YEAR'] = to_date_year.astype(float)

# Flag every row whose FROM_DATE or TO_DATE is about to be filled in
date_nan_mask = from_missing | to_missing
df_stakes_split.loc[date_nan_mask, 'DATA_MODIFICATION'] = 'Dates filled in according to hydrological year'

# Missing FROM_DATE <- 01 Oct of the previous year
previous_year = df_stakes_split.loc[from_missing, 'YEAR'].astype(int) - 1
df_stakes_split.loc[from_missing, 'FROM_DATE'] = previous_year.astype(str) + '1001'

# Missing TO_DATE <- 30 Sept of the year
# (only annual rows have NaN here, so no distinction by period is needed)
measurement_year = df_stakes_split.loc[to_missing, 'YEAR'].astype(int)
df_stakes_split.loc[to_missing, 'TO_DATE'] = measurement_year.astype(str) + '0930'


###### Check for problematic date ranges

In [None]:
# Project helper returning the annual and winter rows with suspicious period lengths
annual_inconsistent, winter_inconsistent = check_period_consistency(df_stakes_split)

# Display the inconsistent records
if len(annual_inconsistent) > 0:
    print("\nInconsistent annual periods:")
    display(annual_inconsistent)

if len(winter_inconsistent) > 0:
    print("\nInconsistent winter periods:")
    display(winter_inconsistent)

# Only one record of stake GL10a is unreasonable (-2; index 5084 in the original
# run): its FROM_DATE year is probably wrong and is changed from 1997 to 1996.
# The mask selects just the GL10a rows that carry the wrong FROM_DATE and end in
# 1997. Selecting on the stake name alone would overwrite FROM_DATE and
# DATA_MODIFICATION for every year and period measured at that stake.
gl10a_wrong_from_date = (
    (df_stakes_split['stake'] == 'GL10a')
    & (df_stakes_split['FROM_DATE'] == '19970825')
    & (df_stakes_split['TO_DATE'].astype(str).str[:4] == '1997')
)
df_stakes_split.loc[gl10a_wrong_from_date, 'FROM_DATE'] = '19960825'
df_stakes_split.loc[gl10a_wrong_from_date, 'DATA_MODIFICATION'] = 'FROM_DATE year corrected from 1997 to 1996'

# Make the effect of the manual correction visible (0 means nothing matched)
print(f"GL10a rows with corrected FROM_DATE: {int(gl10a_wrong_from_date.sum())}")

###### Rename columns and do general data cleaning. We can skip the close-stake removal because, as seen from the Leaflet map online, the stakes are spaced out.

In [None]:
# Old column name -> new column name
column_renames = {
    'lat': 'POINT_LAT',
    'lon': 'POINT_LON',
    'elevation': 'POINT_ELEVATION',
    'stake': 'ID',
}
df_stakes_renamed = df_stakes_split.rename(columns=column_renames)

In [None]:
# NaN check
display(df_stakes_renamed[df_stakes_renamed.isna().any(axis=1)])

# Remove all rows with any NaN values
df_stakes_renamed = df_stakes_renamed.dropna()

# Confirm removal - this should show 0 rows if all NaNs were removed
print(f"Rows with NaN values after removal: {len(df_stakes_renamed[df_stakes_renamed.isna().any(axis=1)])}")

##### Find RGIId

In [None]:
# Load glacier outlines
rgi_file = utils.get_rgi_region_file(region="06", version="6")
glacier_outline = gpd.read_file(rgi_file)

# Add RGI IDs through intersection
df_stakes_renamed_rgiid = mbm.data_processing.utils.get_rgi(data=df_stakes_renamed,
                                           glacier_outlines=glacier_outline)

display(df_stakes_renamed_rgiid[df_stakes_renamed_rgiid['RGIId'].isna()])
# Remove (nine) stakes without RGIId, as they wont have OGGM data anyways
df_stakes_renamed_rgiid = df_stakes_renamed_rgiid.dropna(subset=['RGIId'])

#### Add OGGM data

In [None]:
# initialize OGGM glacier directories
gdirs, rgidf = initialize_oggm_glacier_directories(
    working_dir= cfg.dataPath + path_OGGM,
    rgi_region="06", #06 iceland
    rgi_version="6",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    log_level='WARNING',
    task_list=None,
)


unique_rgis = df_stakes_renamed_rgiid['RGIId'].unique()

run = True
if run:
    export_oggm_grids(gdirs, subset_rgis=unique_rgis, output_path=cfg.dataPath + path_OGGM_xrgrids)

## Around 10% of all the measurements have no hugonnet_dhdt data, so I removed the entire variable from merge_pmb_with_oggm_data()
df_stakes_topo = merge_pmb_with_oggm_data(df_pmb=df_stakes_renamed_rgiid,
                                       gdirs=gdirs,
                                       rgi_region="06", #06 iceland
                                       rgi_version="6")
                                       


###### Get Glacier names from RGIId

In [None]:
# Create a dictionary mapping from RGIId to glacier name
rgi_to_name_dict = dict(zip(rgidf.RGIId, rgidf.Name))
df_stakes_topo['GLACIER'] = df_stakes_topo['RGIId'].map(rgi_to_name_dict)


display(df_stakes_topo[df_stakes_topo['GLACIER'].isna()])


###### Multiple RGIIds have no associated glacier name, assign the 'RGIId' as the 'GLACIER' name

In [None]:
# Rows that still have no glacier name
unnamed_rows = df_stakes_topo['GLACIER'].isna()

missing_rgi_ids = df_stakes_topo.loc[unnamed_rows, 'RGIId'].unique()
print(f"Number of unique RGI IDs without names: {len(missing_rgi_ids)}")
print("RGI IDs without names:", missing_rgi_ids)

# Use the RGIId itself as the 'GLACIER' name where the name is missing
df_stakes_topo.loc[unnamed_rows, 'GLACIER'] = df_stakes_topo.loc[unnamed_rows, 'RGIId']

In [None]:
# Example:
glacierName = 'Thjorsarjoekull (Hofsjoekull E)'
# stakes
df_stakes_topo_1 = df_stakes_topo.copy()
df_stakes_topo_1 = df_stakes_topo_1[(df_stakes_topo_1['GLACIER'] == glacierName)]
RGIId = df_stakes_topo_1['RGIId'].unique()[0]
print(RGIId)
# open OGGM xr for glacier
# Get oggm data for that RGI grid
ds_oggm = xr.open_dataset(f'{cfg.dataPath + path_OGGM_xrgrids}/{RGIId}.zarr')

# Define the coordinate transformation
transf = pyproj.Transformer.from_proj(
    pyproj.CRS.from_user_input("EPSG:4326"),  # Input CRS (WGS84)
    pyproj.CRS.from_user_input(ds_oggm.pyproj_srs),  # Output CRS from dataset
    always_xy=True)

# Transform all coordinates in the group
lon, lat = df_stakes_topo_1["POINT_LON"].values, df_stakes_topo_1["POINT_LAT"].values
x_stake, y_stake = transf.transform(lon, lat)
df_stakes_topo_1['x'] = x_stake
df_stakes_topo_1['y'] = y_stake

# plot stakes
plt.figure(figsize=(8, 6))
ds_oggm.glacier_mask.plot(cmap='binary')
sns.scatterplot(df_stakes_topo_1,
                x='x',
                y='y',
                hue='within_glacier_shape',
                palette=['r', 'b'])
plt.title(f'Stakes on {glacierName} (OGGM)')
plt.tight_layout()

In [None]:
# Restrict to within glacier shape
df_stakes_topo = df_stakes_topo[df_stakes_topo['within_glacier_shape'] == True]
df_stakes_topo = df_stakes_topo.drop(columns=['within_glacier_shape'])

# Display rows that have any NaN values
display(df_stakes_topo[df_stakes_topo.isna().any(axis=1)])

# Drop 3 rows where consensus_ice_thickness is NaN
df_stakes_topo_dropped = df_stakes_topo.dropna(subset=['consensus_ice_thickness'])

display(len(df_stakes_topo_dropped[df_stakes_topo_dropped['consensus_ice_thickness'].isna()]))


In [None]:
# Create new POINT_ID column
df_stakes_topo_dropped['POINT_ID'] = (
    df_stakes_topo_dropped['GLACIER'] + '_' + 
    df_stakes_topo_dropped['YEAR'].astype(str) + '_' + 
    df_stakes_topo_dropped['PERIOD'].astype(str) + '_' +
    df_stakes_topo_dropped['ID'].astype(str)
)

df_stakes_topo_dropped = df_stakes_topo_dropped.drop(columns=['ID'])

display(df_stakes_topo_dropped.head(2))

In [None]:
# Check for NaN
# Final check: list any row of the finished dataset that still contains a NaN
rows_with_nan = df_stakes_topo_dropped.isna().any(axis=1)
display(df_stakes_topo_dropped[rows_with_nan])

In [None]:
# Write the final stake dataset (without the dataframe index) to CSV
output_filename = 'ICE_dataset_all_oggm.csv'
output_path = cfg.dataPath + path_PMB_WGMS_csv + output_filename
df_stakes_topo_dropped.to_csv(output_path, index=False)