## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import warnings
import massbalancemachine as mbm
import pyproj
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
from cmcrameri import cm
from oggm import utils

from scripts.helpers import *
from scripts.norway_preprocess import *
from scripts.config_NOR import *

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment
# warnings) raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')
# IPython magics (not plain Python): enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

# Region configuration; dataPath is the root that all later input/output paths
# in this notebook are built from by string concatenation (hence the trailing '/').
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
cfg = mbm.NorwayConfig(dataPath='/home/mburlet/scratch/data/DATA_MB/WGMS/Norway/')

In [None]:
# Seed with the configured seed, then call the CUDA clean-up helper.
seed_all(cfg.seed)
free_up_cuda()

# Matplotlib style sheet applied to every figure created below.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Default colormap.
cmap = cm.devon

# Colours for bars and lines: ten hex colours sampled from batlow, of which
# the first is used as color_1; the remaining colours are fixed hex codes.
colors = get_cmap_hex(cm.batlow, 10)
color_1, color_2 = colors[0], '#c51b7d'
color_diff_xgb = '#4d4d4d'

### Load stakes, fill missing start dates, split into winter and annual and transform to WGMS format

###### Dataset acquired from https://doi.org/10.58059/sjse-6w92

In [None]:
# Read the raw point mass balance table and harmonise the RGI id column name.
raw_stakes_csv = cfg.dataPath + path_PMB_WGMS_raw + 'glaciological_point_mass_balance_Norway.csv'
df_stakes = pd.read_csv(raw_stakes_csv).rename(columns={'rgiid': 'RGIId'})

# Free-text column used to keep track of manual changes made further down.
df_stakes['DATA_MODIFICATION'] = ''

# FROM_DATE is missing for some glaciers despite having pmb measurements;
# fill_missing_dates fills them (with the start of the hydrological year,
# per the original author's note).
df_stakes_filled = fill_missing_dates(df_stakes)

# Raw column name -> WGMS column name.
wgms_column_names = {
    'lat': 'POINT_LAT',
    'lon': 'POINT_LON',
    'altitude': 'POINT_ELEVATION',
    'breid': 'GLACIER',
}

# Columns kept after the rename. approx_loc / approx_altitude are only needed
# to build POINT_ID later and are dropped once that is done.
wgms_columns = [
    'POINT_LAT',
    'POINT_LON',
    'POINT_ELEVATION',
    'FROM_DATE',
    'TO_DATE',
    'POINT_BALANCE',
    'PERIOD',
    'RGIId',
    'YEAR',
    'GLACIER',
    'DATA_MODIFICATION',
    'approx_loc',
    'approx_altitude',
]

# Split into winter and annual measurements, then move to the WGMS layout.
df_stakes_split = split_stake_measurements(df_stakes_filled).rename(columns=wgms_column_names)
df_stakes_split = df_stakes_split[wgms_columns]

display(df_stakes_split)



###### Convert dates from dd.mm.yyyy to yyyymmdd strings

In [None]:
# Re-encode both date columns from 'dd.mm.YYYY' text to 'YYYYmmdd' strings.
for date_col in ('FROM_DATE', 'TO_DATE'):
    parsed_dates = pd.to_datetime(df_stakes_split[date_col], format='%d.%m.%Y')
    df_stakes_split[date_col] = parsed_dates.dt.strftime('%Y%m%d')

###### Get glacier names from RGIId

In [None]:
# Initialize the OGGM glacier directories for RGI v6, region 08.
gdirs, rgidf = initialize_oggm_glacier_directories(
    working_dir=cfg.dataPath + path_OGGM,
    rgi_region="08",
    rgi_version="6",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    log_level='WARNING',
    task_list=None,
)

# Map every stake's RGIId to the glacier name listed in the RGI table.
rgi_to_name_dict = dict(zip(rgidf.RGIId, rgidf.Name))
df_stakes_split['GLACIER'] = df_stakes_split['RGIId'].map(rgi_to_name_dict)

# RGI60-08.02966 has no glacier name in the RGI map, so give it the name
# Blåbreen directly (ASCII spelling). Only that RGIId is filled: filling every
# missing name would silently label any other unmapped glacier as Blabreen.
is_unnamed = df_stakes_split['GLACIER'].isna()
is_blabreen = df_stakes_split['RGIId'] == 'RGI60-08.02966'
df_stakes_split.loc[is_unnamed & is_blabreen, 'GLACIER'] = 'Blabreen'

# Fail loudly if any other stake is left without a glacier name, since the
# POINT_ID built from GLACIER would otherwise become NaN for those rows.
still_unnamed_rgis = df_stakes_split.loc[df_stakes_split['GLACIER'].isna(), 'RGIId'].unique()
if len(still_unnamed_rgis) > 0:
    raise ValueError(
        f"No glacier name found for RGIId(s): {list(still_unnamed_rgis)}")

###### Create unique POINT_ID

In [None]:
display(df_stakes_split.head(2))

# POINT_ID is the '_'-joined combination of the fields below, in this order.
# The row index is the last component.
point_id_parts = [
    df_stakes_split['GLACIER'],
    df_stakes_split['YEAR'].astype(str),
    df_stakes_split['PERIOD'].astype(str),
    df_stakes_split['POINT_LAT'].astype(str),
    df_stakes_split['POINT_LON'].astype(str),
    df_stakes_split['approx_loc'].astype(str),
    df_stakes_split['approx_altitude'].astype(str),
    df_stakes_split.index.astype(str),
]
point_id = point_id_parts[0]
for id_part in point_id_parts[1:]:
    point_id = point_id + '_' + id_part
df_stakes_split['POINT_ID'] = point_id

# The approximate-location flags are now encoded in POINT_ID; drop them.
df_stakes_split = df_stakes_split.drop(columns=['approx_loc', 'approx_altitude'])

display(df_stakes_split.head(2))

#### Fix problematic date ranges

In [None]:
# Flag annual and winter records whose date range is inconsistent.
annual_inconsistent, winter_inconsistent = check_period_consistency(df_stakes_split)

# Show complete tables (no row or column-width truncation). These options stay
# active for the following cells until they are reset further down.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

report_columns = ['GLACIER', 'FROM_DATE', 'TO_DATE', 'MONTH_DIFF', 'PERIOD', 'YEAR', 'RGIId', 'POINT_ID']
for period_label, flagged in (('annual', annual_inconsistent), ('winter', winter_inconsistent)):
    if len(flagged) > 0:
        print(f"\nInconsistent {period_label} periods:")
        display(flagged[report_columns])

###### First fix is to switch all the months that have been wrongfully recorded as 01 instead of 10

In [None]:
# Correct the dates where 01 (Jan) has been entered as the month instead of 10 (Oct).
df_stakes_split_fixed1 = fix_january_to_october_dates(
    df_stakes_split, annual_inconsistent, winter_inconsistent)

# Re-run the consistency check on the corrected table and list what is left.
annual_inconsistent, winter_inconsistent = check_period_consistency(df_stakes_split_fixed1)

report_columns = ['GLACIER', 'FROM_DATE', 'TO_DATE', 'MONTH_DIFF', 'PERIOD', 'YEAR', 'RGIId', 'POINT_ID']
for period_label, flagged in (('annual', annual_inconsistent), ('winter', winter_inconsistent)):
    if len(flagged) > 0:
        print(f"\nInconsistent {period_label} periods:")
        display(flagged[report_columns])

###### Second fix: some outliers are corrected by hand, the rest have a wrong year

In [None]:
## Fix outliers that don't have a common explanation by hand.
# Each entry is (POINT_ID, {column: new value}). Every fix records a note in
# DATA_MODIFICATION so the manual change stays traceable in the output.
# NOTE(review): the winter fixes write YEAR as a string ('2020', ...); if YEAR
# is an integer column this leaves mixed types in it — confirm the dtype.
manual_fixes = [
    # May instead of September
    ('Svartisheibreen_1994_annual_66.55012_13.72724_N_N_883',
     {'TO_DATE': '19940915',
      'DATA_MODIFICATION': 'Changed TO_DATE month from May to September'}),
    ('Svartisheibreen_1994_annual_66.54826_13.73128_N_N_884',
     {'TO_DATE': '19940915',
      'DATA_MODIFICATION': 'Changed TO_DATE month from May to September'}),
    # TO_DATE annual wrong year
    ('Aalfotbreen_1974_annual_61.74236_5.64623_N_N_1386',
     {'TO_DATE': '19740920',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1975 to 1974'}),
    ('Aalfotbreen_1971_annual_61.75213_5.63165_N_N_1493',
     {'TO_DATE': '19711124',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1970 to 1971'}),
    ('Graafjellsbrea_2009_annual_60.06923_6.38925_N_N_3545',
     {'TO_DATE': '20091013',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 2019 to 2009'}),
    ('Bondhusbrea_1981_annual_60.03108_6.31014_N_N_3738',
     {'TO_DATE': '19810827',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1980 to 1981'}),
    # TO_DATE winter wrong year: YEAR and POINT_ID are moved to the new year too
    ('Langfjordjoekulen_2019_winter_70.12528_21.71827_N_N_4019',
     {'TO_DATE': '20200526',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 2019 to 2020',
      'YEAR': '2020',
      'POINT_ID': 'Langfjordjoekulen_2020_winter_70.12528_21.71827_N_N_4019'}),
    ('Blaaisen_1966_winter_68.33479_17.85005_N_N_4155',
     {'TO_DATE': '19670520',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1966 to 1967',
      'YEAR': '1967',
      'POINT_ID': 'Blaaisen_1967_winter_68.33479_17.85005_N_N_4155'}),
    ('Nigardsbreen_1963_winter_61.71461_7.11601_N_N_5802',
     {'TO_DATE': '19640507',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1963 to 1964',
      'YEAR': '1964',
      'POINT_ID': 'Nigardsbreen_1964_winter_61.71461_7.11601_N_N_5802'}),
    ('Vesledalsbreen_1967_winter_61.84804_7.25335_N_N_6694',
     {'TO_DATE': '19680418',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 1967 to 1968',
      'YEAR': '1968',
      'POINT_ID': 'Vesledalsbreen_1968_winter_61.84804_7.25335_N_N_6694'}),
    ('Hellstugubreen_2010_winter_61.57329_8.44438_N_N_6935',
     {'TO_DATE': '20110505',
      'DATA_MODIFICATION': 'Changed TO_DATE year from 2010 to 2011',
      'YEAR': '2011',
      'POINT_ID': 'Hellstugubreen_2011_winter_61.57329_8.44438_N_N_6935'}),
]
for point_id, updates in manual_fixes:
    is_target = df_stakes_split_fixed1['POINT_ID'] == point_id
    df_stakes_split_fixed1.loc[is_target, list(updates)] = list(updates.values())

# These stakes have nonsensical periods, remove them from the df.
stakes_to_remove = ['Austdalsbreen_2017_annual_61.81113_7.36766_Y_N_3038',
                    'Austdalsbreen_2017_annual_61.80888_7.38239_Y_N_3065',
                    'Aalfotbreen_1967_winter_61.74294_5.6365_N_N_5379',
                    'Hansebreen_2012_winter_61.74307_5.66278_N_N_5625',
                    'Austdalsbreen_2017_winter_61.81113_7.36766_Y_N_6792',
                    'Austdalsbreen_2017_winter_61.80888_7.38239_Y_N_6819']
# Explicit copy: the next cell assigns into this frame with .loc, which should
# operate on an independent DataFrame rather than on a filtered selection.
df_stakes_split_fixed1 = df_stakes_split_fixed1[
    ~df_stakes_split_fixed1['POINT_ID'].isin(stakes_to_remove)].copy()

# Re-run the consistency check and list whatever is still inconsistent.
annual_inconsistent, winter_inconsistent = check_period_consistency(df_stakes_split_fixed1)

report_columns = ['GLACIER', 'FROM_DATE', 'TO_DATE', 'MONTH_DIFF', 'PERIOD', 'YEAR', 'RGIId', 'POINT_ID']
if len(annual_inconsistent) > 0:
    print("\nInconsistent annual periods:")
    display(annual_inconsistent[report_columns])

if len(winter_inconsistent) > 0:
    print("\nInconsistent winter periods:")
    display(winter_inconsistent[report_columns])

# Restore the default display limits that were lifted for the inspection cells.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')


###### The remaining inconsistencies all have a wrong FROM_DATE year

In [None]:
# Index labels of every record that is still flagged as inconsistent.
remaining_indices = list(annual_inconsistent.index) + list(winter_inconsistent.index)

# For each remaining record, set the year of FROM_DATE to YEAR - 1 while
# keeping its month and day.
# NOTE(review): unlike the manual fixes, this change is not recorded in
# DATA_MODIFICATION — confirm whether it should be.
for idx in remaining_indices:
    previous_year = int(df_stakes_split_fixed1.loc[idx, 'YEAR']) - 1

    # FROM_DATE is a 'YYYYMMDD' string: characters 4-7 hold MMDD.
    month_day = df_stakes_split_fixed1.loc[idx, 'FROM_DATE'][4:8]

    df_stakes_split_fixed1.loc[idx, 'FROM_DATE'] = f"{previous_year}{month_day}"

annual_inconsistent, winter_inconsistent = check_period_consistency(df_stakes_split_fixed1)

# Display any records that are still inconsistent, without truncation.
# The display limits were already reset at the end of the previous cell, so
# they are lifted here only for the duration of this block; option_context
# restores the previous values on exit, which makes a trailing reset unnecessary.
report_columns = ['GLACIER', 'FROM_DATE', 'TO_DATE', 'MONTH_DIFF', 'PERIOD', 'YEAR', 'RGIId', 'POINT_ID']
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    if len(annual_inconsistent) > 0:
        print("\nInconsistent annual periods:")
        display(annual_inconsistent[report_columns])

    if len(winter_inconsistent) > 0:
        print("\nInconsistent winter periods:")
        display(winter_inconsistent[report_columns])


#### Merge close stakes

In [None]:
# Merge stakes that lie close to each other.
df_stakes_merged = remove_close_points(df_stakes_split_fixed1)

# Sanity check: list any close stake pairs still present, nearest first.
close_pairs = find_close_stakes(df_stakes_merged)
display(close_pairs.sort_values('DISTANCE_M'))

In [None]:
# Show the stake table returned by remove_close_points above.
display(df_stakes_merged)

#### Add OGGM data

In [None]:
# RGI ids of all glaciers that still have stakes after merging.
unique_rgis = df_stakes_merged['RGIId'].unique()

# Set `run` to False to skip the grid export.
run = True
if run:
    oggm_grid_dir = cfg.dataPath + path_OGGM_xrgrids
    export_oggm_grids(
        gdirs,
        subset_rgis=unique_rgis,
        output_path=oggm_grid_dir,
    )

# Attach the OGGM-derived variables to every stake measurement.
df_stakes_topo = merge_pmb_with_oggm_data(
    df_pmb=df_stakes_merged,
    gdirs=gdirs,
    rgi_region="08",
    rgi_version="6",
)

In [None]:
# Example: plot the stakes of one glacier on top of its OGGM glacier mask.
glacierName = 'Langfjordjoekulen'

# Stakes of that glacier. Filter first and copy afterwards, so the x/y columns
# added below are written to an independent DataFrame.
df_stakes_topo_1 = df_stakes_topo[df_stakes_topo['GLACIER'] == glacierName].copy()
RGIId = df_stakes_topo_1['RGIId'].unique()[0]
print(RGIId)

# Open the exported OGGM grid for that RGI id.
ds_oggm = xr.open_dataset(f'{cfg.dataPath + path_OGGM_xrgrids}/{RGIId}.zarr')

# Coordinate transformation from lon/lat to the CRS stored on the dataset.
transf = pyproj.Transformer.from_proj(
    pyproj.CRS.from_user_input("EPSG:4326"),  # Input CRS (WGS84)
    pyproj.CRS.from_user_input(ds_oggm.pyproj_srs),  # Output CRS from dataset
    always_xy=True)

# Transform all stake coordinates (always_xy=True: pass lon first, then lat).
lon, lat = df_stakes_topo_1["POINT_LON"].values, df_stakes_topo_1["POINT_LAT"].values
x_stake, y_stake = transf.transform(lon, lat)
df_stakes_topo_1['x'] = x_stake
df_stakes_topo_1['y'] = y_stake

# Plot the stakes on the glacier mask. The palette is an explicit mapping so
# the colours do not depend on which hue levels happen to be present: with a
# plain list, a glacier whose stakes are all inside would be drawn in red.
plt.figure(figsize=(8, 6))
ds_oggm.glacier_mask.plot(cmap='binary')
sns.scatterplot(df_stakes_topo_1,
                x='x',
                y='y',
                hue='within_glacier_shape',
                palette={False: 'r', True: 'b'})
plt.title(f'Stakes on {glacierName} (OGGM)')
plt.tight_layout()

###### Only keep stakes within the RGIId glacier shape and drop rows with missing consensus ice thickness

In [None]:
# Keep only the stakes flagged as lying within the glacier shape; the flag
# column is not needed afterwards.
inside_shape = df_stakes_topo['within_glacier_shape'].eq(True)
df_stakes_topo = df_stakes_topo.loc[inside_shape].drop(columns=['within_glacier_shape'])

# Drop rows without a consensus ice thickness value.
df_stakes_topo = df_stakes_topo.dropna(subset=['consensus_ice_thickness'])

# Sample counts, in total and per measurement period.
n_annual = int((df_stakes_topo['PERIOD'] == 'annual').sum())
n_winter = int((df_stakes_topo['PERIOD'] == 'winter').sum())
print('Number of winter and annual samples:', len(df_stakes_topo))
print('Number of annual samples:', n_annual)
print('Number of winter samples:', n_winter)

# Unique glaciers, sorted
glacier_list = sorted(df_stakes_topo['GLACIER'].unique())
print(f"Number of glaciers: {len(glacier_list)}")
print(f"Glaciers: {glacier_list}")


In [None]:
# Show every row that still contains a NaN in any column.
has_nan = df_stakes_topo.isna().any(axis=1)
display(df_stakes_topo[has_nan])

In [None]:
# Write the final stake table (without the index) to the WGMS csv folder.
output_csv = cfg.dataPath + path_PMB_WGMS_csv + 'Nor_dataset_all_oggm.csv'
df_stakes_topo.to_csv(output_csv, index=False)

display(df_stakes_topo.head(2))
