# Pre-processing of GLACIOCLIM MB data:

Does the pre-processing of the point MB measurements from GLACIOCLIM (French Alps).

# Point Mass Balance data:

## Setting up:

In [1]:
import pandas as pd
import os
import warnings
import re
import massbalancemachine as mbm
import geopandas as gpd
import matplotlib.pyplot as plt
import xarray as xr
import glob
import numpy as np
from cmcrameri import cm

from scripts.helpers import *
from scripts.glacioclim_preprocess import *
from scripts.config_FR import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2


cfg = mbm.FranceConfig()

In [2]:
seed_all(cfg.seed)
free_up_cuda()

# Plot styles:
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

cmap = cm.devon

# For bars and lines:
color_diff_xgb = '#4d4d4d'

colors = get_cmap_hex(cm.batlow, 10)
color_1 = colors[0]
color_2 = '#c51b7d'

## 1. Load data into dictionary of dataframes

###### unzip GLACIOCLIM Files

In [3]:
read_in = False

if read_in == True:
    extract_glacioclim_files(path_PMB_GLACIOCLIM_raw)

###### Read all csv from "unzipped" into single dictionary of dataframes

In [4]:
stakes_csv_all = {}

for path, _, files in os.walk(path_PMB_GLACIOCLIM_raw):
    # Find all .csv files in the current directory
    csv_files = glob.glob(os.path.join(path, '*.csv'))
    
    # Read each CSV file and add to dictionary
    for file in csv_files:
        try:
            # Use the filename as the key
            key = os.path.splitext(os.path.basename(file))[0]
            # Read the CSV file
            stakes_csv_all[key] = pd.read_csv(file)
        except Exception as e:
            print(f"Error reading {file}: {str(e)}")

print(f"\nTotal number of files processed: {len(stakes_csv_all)}")
display(stakes_csv_all)


Total number of files processed: 1022


{'mdg_Leschaux_winter_smb_abl_2022':   profile_name  stake_year_setup  stake_number  day_start  month_start  \
 0     Leschaux              2020             1          7           10   
 1     Leschaux              2020             2          7           10   
 2     Leschaux              2020             3          7           10   
 3     Leschaux              2020             4          7           10   
 4     Leschaux              2020             7          7           10   
 5     Leschaux              2021             5          7           10   
 6     Leschaux              2021             6          7           10   
 7     Leschaux              2021             7          7           10   
 
    year_start  day_end  month_end  year_end  x_lambert2e  y_lambert2e  \
 0        2021        6          5      2022    960340.86    108660.30   
 1        2021        6          5      2022    960459.45    108937.98   
 2        2021        6          5      2022    960387.13    1093

#### Special case Sarennes. 
###### The File is not in a typical csv structure, so cant just be read in. For this code to work, in each sheet in the B_SARENNES_94_20.xls file, make a table with start_date, spring_date, end_date, X, Y, Z, winter mb, summer mb, annual mb (requires xlrd package to read in)

In [5]:
sarennes_path = '/home/mburlet/scratch/data/DATA_MB/GLACIOCLIM/Glacier de SARENNES_discontinued/B_SARENNES_94-20.xls'

# Read all sheets into a dictionary of dfs
all_sheets = pd.read_excel(sarennes_path, sheet_name=None)
sarennes_dfs = extract_sarennes_data(all_sheets)
display(sarennes_dfs)


{'sarennes_complete_winter_1994':                           POINT_ID  x_lambert3  y_lambert3  POINT_ELEVATION  \
 0  sarennes_complete_winter_1994_1    898242.0    319367.0           2858.0   
 1  sarennes_complete_winter_1994_2    898327.0    319585.0           2912.0   
 2  sarennes_complete_winter_1994_3    898375.0    319807.0           2946.0   
 3  sarennes_complete_winter_1994_4    898375.0    320107.0           3001.0   
 4  sarennes_complete_winter_1994_5    898385.0    320340.0           3072.0   
 
   FROM_DATE   TO_DATE  POINT_BALANCE   GLACIER  PERIOD GLACIER_ZONE  
 0  19930909  19940602           2.42  sarennes  winter     complete  
 1  19930909  19940602           1.95  sarennes  winter     complete  
 2  19930909  19940602           1.92  sarennes  winter     complete  
 3  19930909  19940602           2.29  sarennes  winter     complete  
 4  19930909  19940602           2.24  sarennes  winter     complete  ,
 'sarennes_complete_summer_1994':                         

##### 2. Convert from Lambert3 / Lambert2 cooridnates to WGS84

In [6]:
## For some reason there is a 2 in front of the y_lambert2e coordinates in certain years, hardcode remove them:
stakes_csv_all['mdg_Tacul_winter_smb_abl_2007']['y_lambert2e'] = stakes_csv_all['mdg_Tacul_winter_smb_abl_2007']['y_lambert2e'].apply(lambda x: x - 2000000 if x > 2000000 else x)
stakes_csv_all['mdg_Talefre_annual_smb_abl_2006']['y_lambert2e'] = stakes_csv_all['mdg_Talefre_annual_smb_abl_2006']['y_lambert2e'].apply(lambda x: x - 2000000 if x > 2000000 else x)

stakes_csv_all = lambert_transform(stakes_csv_all)
sarennes_dfs = lambert_transform(sarennes_dfs)

##### Plot stakes over map for visual conformation of coordinate transformation

###### This needs additional folium package to be installed

In [7]:
import folium

glacier_dfs = {
    'GEBROULAZ': (stakes_csv_all['geb_annual_smb_abl_1979'], 'red'),
    'ARGENTIERE': (stakes_csv_all['Argentiere_annual_smb_accu_1995'], 'blue'),
    'SAINT_SORLIN': (stakes_csv_all['stso_annual_smb_abl_1957'], 'green'),
    'MER_DE_GLACE': (stakes_csv_all['mdg_Leschaux_winter_smb_abl_2020'], 'purple'),
    'SARENNES': (sarennes_dfs['sarennes_complete_winter_2006'], 'orange')
}

center_lat, center_lon = 45.8736, 6.8770

m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Add markers for each glacier
for glacier_name, (df, color) in glacier_dfs.items():
    fg = folium.FeatureGroup(name=glacier_name)
    
    for _, row in df.iterrows():
        if glacier_name == 'SARENNES':
            stake_id = row['POINT_ID'].split('_')[-1]
            altitude = row['POINT_ELEVATION']
        else:
            stake_id = row['stake_number']
            altitude = row['altitude']
            
        # Add circle marker with popup
        folium.CircleMarker(
            location=[row['lat'], row['lon']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            popup=f"{glacier_name} - Stake {stake_id}: {altitude}m"
        ).add_to(fg)
    
    fg.add_to(m)

# Add a legend
legend_html = '''
<div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border-radius: 5px;">
    <p><strong>Glaciers</strong></p>
    <p><span style="color: red;">●</span> GEBROULAZ</p>
    <p><span style="color: blue;">●</span> ARGENTIERE</p>
    <p><span style="color: green;">●</span> SAINT SORLIN</p>
    <p><span style="color: purple;">●</span> MER DE GLACE</p>
    <p><span style="color: orange;">●</span> SARENNES</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

m

##### 3. WGMS Mapping, appending SARENNES and conversion into single dataframe

In [8]:
stakes_csv_WGMS = {}

# Process each DataFrame in the original dictionary to WGMS format
for key, df in stakes_csv_all.items():
    try:
        required_cols = {'lat', 'lon', 'altitude', 'stake_number', 
                        'day_start', 'month_start', 'year_start',
                        'day_end', 'month_end', 'year_end'}
        if all(col in df.columns for col in required_cols):
            stakes_csv_WGMS[key] = transform_WGMS_df(df, key)
    except Exception as e:
        print(f"Error processing {key}: {str(e)}")

# Append SARENNES to the dictionary
columns = ['POINT_ID', 'POINT_LAT', 'POINT_LON', 'POINT_ELEVATION', 'FROM_DATE', 
           'TO_DATE', 'POINT_BALANCE', 'GLACIER', 'PERIOD', 'GLACIER_ZONE']
for key, df in sarennes_dfs.items():
    stakes_csv_WGMS[key] = df.rename(columns={'lat': 'POINT_LAT', 'lon': 'POINT_LON'})[columns]

###### Combine dictionary of dfs into 1 df

In [9]:
# Convert dictionary to single DataFrame
stakes_csv_WGMS_combined = pd.concat(stakes_csv_WGMS.values(), ignore_index=True)
# Sort by date and glacier
stakes_csv_WGMS_combined = stakes_csv_WGMS_combined.sort_values(['GLACIER', 'GLACIER_ZONE', 'PERIOD', 'FROM_DATE'])
# Add YEAR column to df
stakes_csv_WGMS_combined['YEAR'] = stakes_csv_WGMS_combined['TO_DATE'].astype(str).str[:4].astype(int)
# Add data modification column to keep track of mannual changes
stakes_csv_WGMS_combined['DATA_MODIFICATION'] = ''


#### 4. Merge stakes that are close

In [10]:
close_stakes_df = find_close_stakes(stakes_csv_WGMS_combined)

pd.set_option('display.max_rows', None)
display(close_stakes_df.sort_values('DISTANCE_M'))
pd.reset_option('display.max_rows')

Processing glacier-year-periods: 100%|██████████| 501/501 [00:16<00:00, 30.07it/s] 

Found 128 pairs of stakes that are 10m or closer





Unnamed: 0,GLACIER,YEAR,PERIOD,POINT_ID_1,POINT_ID_2,LAT_1,LON_1,LAT_2,LON_2,POINT_BALANCE_1,POINT_BALANCE_2,DISTANCE_M
8,Argentiere,1988,annual,Argentiere_Profils_2_4_5_7_annual_smb_abl_1988...,Argentiere_Profils_2_4_5_7_annual_smb_abl_1988...,45.953613,6.984702,45.953613,6.984702,-2.74,-2.61,0.0
7,Argentiere,1987,annual,Argentiere_Profils_2_4_5_7_annual_smb_abl_1987...,Argentiere_Profils_2_4_5_7_annual_smb_abl_1987...,45.952543,6.984088,45.952543,6.984088,-0.22,-1.53,0.0
75,mdg,1999,annual,mdg_Tacul_annual_smb_abl_1999_setup1997_20,mdg_Tacul_annual_smb_abl_1999_setup1998_20,45.888731,6.937205,45.888731,6.937205,-4.54,-4.68,0.0
76,mdg,1999,annual,mdg_Tacul_annual_smb_abl_1999_setup1997_17,mdg_Tacul_annual_smb_abl_1999_setup1998_17,45.893322,6.937385,45.893322,6.937385,-4.1,-4.25,0.0
95,mdg,2019,summer,mdg_Leschaux_summer_smb_abl_2019_setup2017_1,mdg_Leschaux_summer_smb_abl_2019_setup2017_3,45.883189,6.981982,45.883189,6.981982,-4.68,-4.55,0.0
65,mdg,1993,annual,mdg_Tacul_annual_smb_abl_1993_setup1991_12,mdg_Tacul_annual_smb_abl_1993_setup1992_12,45.893151,6.93752,45.893151,6.93752,-2.79,-3.73,0.0
70,mdg,1998,summer,mdg_Talefre_summer_smb_abl_1998_setup1995_2,mdg_Talefre_summer_smb_abl_1998_setup1998_2,45.912896,6.988742,45.912896,6.988742,-4.48,-4.36,0.0
88,mdg,2017,annual,mdg_annual_accu_2017_setup2016_4,mdg_annual_accu_2017_setup2017_4,45.859022,6.915811,45.859022,6.915811,-0.53,0.67,0.0
89,mdg,2017,annual,mdg_annual_accu_2017_setup2016_5,mdg_annual_accu_2017_setup2017_5,45.864615,6.924533,45.864615,6.924533,-0.61,0.76,0.0
90,mdg,2017,annual,mdg_annual_accu_2017_setup2016_9,mdg_annual_accu_2017_setup2017_9,45.862225,6.902224,45.862225,6.902224,0.62,0.16,0.0


###### Most of these are stakes with different setup year, merge and drop

In [11]:
stakes_csv_WGMS_combined_dropped = merge_close_stakes(stakes_csv_WGMS_combined, close_stakes_df)

print(f"Original dataframe size: {len(stakes_csv_WGMS_combined)}")
print(f"After merging close stakes: {len(stakes_csv_WGMS_combined_dropped)}")

Merging stakes within 10m: 128it [00:00, 396.17it/s]

Merged 125 pairs of close stakes
Original dataframe size: 10902
After merging close stakes: 10760





##### 5. General data cleaning

In [12]:
print("Rows with NaN values:")
display(stakes_csv_WGMS_combined_dropped[stakes_csv_WGMS_combined_dropped.isna().any(axis=1)])

print("Rows with zero values:")
display(stakes_csv_WGMS_combined_dropped[stakes_csv_WGMS_combined_dropped.eq(0).any(axis=1)])

# Saint-Sorlin has POINT_ELEVATION 0.0 on 8 stakes, also about 20 stakes have point balance 0.00, remove them all
mask_zeros = stakes_csv_WGMS_combined_dropped.eq(0).any(axis=1)
stakes_csv_WGMS_combined_dropped = stakes_csv_WGMS_combined_dropped[~mask_zeros]

Rows with NaN values:


Unnamed: 0,POINT_ID,POINT_LAT,POINT_LON,POINT_ELEVATION,FROM_DATE,TO_DATE,POINT_BALANCE,GLACIER,PERIOD,GLACIER_ZONE,YEAR,DATA_MODIFICATION


Rows with zero values:


Unnamed: 0,POINT_ID,POINT_LAT,POINT_LON,POINT_ELEVATION,FROM_DATE,TO_DATE,POINT_BALANCE,GLACIER,PERIOD,GLACIER_ZONE,YEAR,DATA_MODIFICATION
3612,Argentiere_annual_smb_accu_1997_setup1997_8,45.936506,7.014902,2800.0,19960910,19970909,-0.0,Argentiere,annual,acum,1997,
3498,Argentiere_annual_smb_accu_2012_setup2012_10,45.927993,7.026717,2935.0,20111005,20121013,-0.0,Argentiere,annual,acum,2012,
3084,Argentiere_Profils_2_4_5_7_winter_smb_abl_2018...,45.953133,6.988102,2535.05,20170929,20180620,-0.0,Argentiere,winter,profil5,2018,
7178,geb_annual_smb_accu_2002_setup2002_5,45.288793,6.632667,3005.0,20010901,20021009,-0.0,geb,annual,acum,2002,
7105,geb_annual_smb_accu_2011_setup2011_11,45.277496,6.631194,3425.0,20101006,20111005,-0.0,geb,annual,acum,2011,
188,mdg_langue_winter_smb_abl_2005_setup2004_2,45.909935,6.941207,2019.45,20041024,20050511,0.0,mdg,winter,Langue,2005,
225,mdg_langue_winter_smb_abl_2008_setup2007_3,45.91153,6.940984,1996.0,20071002,20080507,-0.0,mdg,winter,Langue,2008,
202,mdg_langue_winter_smb_abl_2014_setup2013_2,45.909891,6.941098,1981.53,20131018,20140516,-0.0,mdg,winter,Langue,2014,
2284,mdg_summer_smb_accu_1996_setup1996_1,45.876778,6.891359,3570.0,19960501,19960911,-0.0,mdg,summer,acum,1996,
8791,stso_annual_smb_abl_1962_setup1959_47,45.164251,6.167081,0.0,19610928,19621007,-2.32,stso,annual,ablation,1962,


In [13]:
annual_inconsistent, winter_inconsistent, summer_inconsistent = check_period_consistency(stakes_csv_WGMS_combined_dropped)

display(annual_inconsistent)
display(summer_inconsistent)

# 7 short summers but nothing majorly inconsistent, leaving them in
# Annuual, mdg_langue_annual_smb_abl_2008_setup2008_9 is a faulty measurement, goes from 2009 to 2008, pop it
mask = stakes_csv_WGMS_combined_dropped['POINT_ID'] != 'mdg_langue_annual_smb_abl_2008_setup2008_9'
stakes_csv_WGMS_combined_dropped = stakes_csv_WGMS_combined_dropped[mask]

Annual periods: 1 out of 4647 (0.0%) are inconsistent
Winter periods: 0 out of 3339 (0.0%) are inconsistent
Summer periods: 7 out of 2752 (0.3%) are inconsistent


Unnamed: 0,POINT_ID,POINT_LAT,POINT_LON,POINT_ELEVATION,FROM_DATE,TO_DATE,POINT_BALANCE,GLACIER,PERIOD,GLACIER_ZONE,YEAR,DATA_MODIFICATION,FROM_DATE_DT,TO_DATE_DT,MONTH_DIFF
1642,mdg_langue_annual_smb_abl_2008_setup2008_9,45.929113,6.924582,1729.46,20090903,20080930,-1.12,mdg,annual,Langue,2008,,2009-09-03,2008-09-30,-12


Unnamed: 0,POINT_ID,POINT_LAT,POINT_LON,POINT_ELEVATION,FROM_DATE,TO_DATE,POINT_BALANCE,GLACIER,PERIOD,GLACIER_ZONE,YEAR,DATA_MODIFICATION,FROM_DATE_DT,TO_DATE_DT,MONTH_DIFF
5788,Argentiere_Profils_2_4_5_7_summer_smb_abl_2018...,45.953486,6.987828,2531.67,20180620,20180815,-3.03,Argentiere,summer,profil5,2018,,2018-06-20,2018-08-15,2
7716,geb_summer_smb_accu_1995_setup1995_2,45.276201,6.636968,3440.0,19950627,19950817,-0.7,geb,summer,acum,1995,,1995-06-27,1995-08-17,2
7717,geb_summer_smb_accu_1995_setup1995_3,45.278505,6.640456,3390.0,19950627,19950817,-0.7,geb,summer,acum,1995,,1995-06-27,1995-08-17,2
7719,geb_summer_smb_accu_1995_setup1995_11,45.277496,6.631194,3425.0,19950627,19950817,-0.51,geb,summer,acum,1995,,1995-06-27,1995-08-17,2
10639,stso_summer_smb_accu_1994_setup1994_6,45.158667,6.16075,2885.0,19940613,19940818,-2.0,stso,summer,acum,1994,,1994-06-13,1994-08-18,2
10594,stso_summer_smb_accu_1995_setup1995_3,45.156892,6.15008,3230.0,19950620,19950822,-1.13,stso,summer,acum,1995,,1995-06-20,1995-08-22,2
10595,stso_summer_smb_accu_1995_setup1995_5,45.164433,6.150294,3060.0,19950620,19950822,-1.8,stso,summer,acum,1995,,1995-06-20,1995-08-22,2


#### Add RGIId and OGGM Data

In [14]:
# initialize OGGM glacier directories
gdirs, rgidf = initialize_oggm_glacier_directories(
    working_dir = path_OGGM,
    rgi_region="11",
    rgi_version="6",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    log_level='WARNING',
    task_list=None,
)

2025-06-05 09:43:12: oggm.cfg: Reading default parameters from the OGGM `params.cfg` configuration file.
2025-06-05 09:43:12: oggm.cfg: Multiprocessing switched OFF according to the parameter file.
2025-06-05 09:43:12: oggm.cfg: Multiprocessing: using all available processors (N=32)
2025-06-05 09:43:12: oggm.cfg: PARAMS['border'] changed from `80` to `10`.
2025-06-05 09:43:12: oggm.cfg: Multiprocessing switched ON after user settings.
2025-06-05 09:43:12: oggm.cfg: PARAMS['continue_on_error'] changed from `False` to `True`.
2025-06-05 09:43:12: oggm.workflow: init_glacier_directories from prepro level 3 on 3927 glaciers.
2025-06-05 09:43:12: oggm.workflow: Execute entity tasks [gdir_from_prepro] on 3927 glaciers
2025-06-05 09:43:29: oggm.workflow: Execute entity tasks [gridded_attributes] on 3927 glaciers


In [15]:
glacier_outline = gpd.read_file('/home/mburlet/OGGM/rgi/RGIV60/11_rgi60_CentralEurope/11_rgi60_CentralEurope.shp')

# Add RGI IDs through intersection
stakes_csv_WGMS_RGIID = mbm.data_processing.utils.get_rgi(data=stakes_csv_WGMS_combined_dropped,
                                           glacier_outlines=glacier_outline)

display(len(stakes_csv_WGMS_RGIID[stakes_csv_WGMS_RGIID['RGIId'].isna()]))

# Remove stakes without RGIId, as they wont have OGGM data anyways
stakes_csv_WGMS_RGIID = stakes_csv_WGMS_RGIID.dropna(subset=['RGIId'])


# Create a dictionary mapping from RGIId to glacier name
rgi_to_name_dict = dict(zip(rgidf.RGIId, rgidf.Name))
stakes_csv_WGMS_RGIID['GLACIER'] = stakes_csv_WGMS_RGIID['RGIId'].map(rgi_to_name_dict)

display(stakes_csv_WGMS_RGIID['GLACIER'].unique())



162

array(['FR4N01235A08 dArgentiere', 'FR4N01236A02 des Grands Montets',
       'FR4N01146D09+E06 Gebroulaz', 'FR4N01236A01 Mer de Glace/Geant',
       'FR4N01236A01 Leschaux', 'FR4N01236A07 de Talefre',
       'FR4N01163A02 de Sarennes 1',
       'FR4N01162B09+154D03 de Saint Sorlin'], dtype=object)

In [16]:
unique_rgis = stakes_csv_WGMS_RGIID['RGIId'].unique()

run = True
if run:
    export_oggm_grids(gdirs, subset_rgis=unique_rgis, output_path=path_OGGM_xrgrids)

stakes_csv_WGMS_RGIID_oggm = merge_pmb_with_oggm_data(df_pmb=stakes_csv_WGMS_RGIID,
                                       gdirs=gdirs,
                                       rgi_region="11",
                                       rgi_version="6")

In [17]:
# Restrict to within glacier shape and drop the column
stakes_csv_WGMS_RGIID_oggm = stakes_csv_WGMS_RGIID_oggm[stakes_csv_WGMS_RGIID_oggm['within_glacier_shape'] == True]
stakes_csv_WGMS_RGIID_oggm = stakes_csv_WGMS_RGIID_oggm.drop(columns=['within_glacier_shape'])

In [20]:
# Save the combined DataFrame to a CSV file
stakes_csv_WGMS_RGIID_oggm.to_csv(path_PMB_GLACIOCLIM_csv + 'FR_wgms_dataset_all_oggm.csv', index=False)