# Pre-processing of GLACIOCLIM MB data:

This notebook pre-processes the point mass balance (MB) measurements from GLACIOCLIM (French Alps).

# Point Mass Balance data:

## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import os
import warnings
import massbalancemachine as mbm
import geopandas as gpd
import matplotlib.pyplot as plt
import glob
from cmcrameri import cm
from oggm import utils

from scripts.helpers import *
from scripts.glacioclim_preprocess import *
from scripts.config_FR import *

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magics: enable autoreload so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2


# Configuration object for the French dataset; cfg.dataPath is the root that the
# relative path constants used further down are appended to.
# NOTE(review): hard-coded, user-specific absolute path - adjust before running elsewhere.
cfg = mbm.FranceConfig(dataPath='/home/mburlet/scratch/data/DATA_MB/GLACIOCLIM/')

In [None]:
# Seed with the value stored in the config, then call the project's CUDA
# clean-up helper, before any other work is done.
seed_all(cfg.seed)
free_up_cuda()

# --- Plot styling -----------------------------------------------------------
# Style sheet path is relative to the current working directory.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Colormap taken from cmcrameri
cmap = cm.devon

# Ten hex colours sampled from batlow; the first one is reused as color_1
colors = get_cmap_hex(cm.batlow, 10)
color_1, color_2 = colors[0], '#c51b7d'

# Colour for bars and lines
color_diff_xgb = '#4d4d4d'

## 1. Load data into dictionary of dataframes

###### unzip GLACIOCLIM Files

In [None]:
# Switch: set to True to run the extraction of the raw GLACIOCLIM files.
read_in = False

if read_in:
    extract_glacioclim_files(cfg.dataPath + path_PMB_GLACIOCLIM_raw)

###### Read all csv from "unzipped" into single dictionary of dataframes

In [None]:
# Load every GLACIOCLIM csv below the raw-data folder into one dictionary of
# DataFrames, keyed by the file name without its extension.
stakes_csv_all = {}
csv_origin = {}  # key -> path already loaded under that key, used to report clashes

for path, dirs, files in os.walk(cfg.dataPath + path_PMB_GLACIOCLIM_raw):
    # Sorting in place fixes the traversal order, so the load order (and which
    # file wins on a name clash) no longer depends on the filesystem.
    dirs.sort()

    # Filter the listing os.walk already produced instead of globbing the
    # directory again; this also keeps folder names containing glob
    # metacharacters working. Hidden files are skipped, as '*.csv' globbing did.
    csv_files = [
        os.path.join(path, fname) for fname in sorted(files)
        if fname.endswith('.csv') and not fname.startswith('.')
    ]

    # Read each CSV file and add it to the dictionary
    for file in csv_files:
        # Use the file name (without extension) as the key
        key = os.path.splitext(os.path.basename(file))[0]
        try:
            csv_df = pd.read_csv(file)
        except Exception as e:
            # Best effort on purpose: report the unreadable file and carry on.
            print(f"Error reading {file}: {str(e)}")
            continue

        # Same file name in two folders: make the overwrite visible.
        if key in csv_origin:
            print(f"Warning: '{key}' from {csv_origin[key]} is replaced by {file}")
        stakes_csv_all[key] = csv_df
        csv_origin[key] = file

print(f"\nTotal number of files processed: {len(stakes_csv_all)}")
display(stakes_csv_all)

#### Special case Sarennes. 
###### The file is not in a typical csv structure, so it can't simply be read in. For this code to work, first create a table in each sheet of the B_SARENNES_94-20.xls file with the columns start_date, spring_date, end_date, X, Y, Z, winter mb, summer mb, annual mb (reading the file requires the xlrd package).

In [None]:
# Sarennes workbook, located below the data root
sarennes_path = cfg.dataPath + 'Glacier de SARENNES_discontinued/B_SARENNES_94-20.xls'

# sheet_name=None loads every sheet and returns them as a dict of DataFrames
all_sheets = pd.read_excel(io=sarennes_path, sheet_name=None)

# Turn the raw sheets into the Sarennes dataframes and show them
sarennes_dfs = extract_sarennes_data(all_sheets)
display(sarennes_dfs)


##### 2. Convert from Lambert3 / Lambert2 coordinates to WGS84

In [None]:
# In certain years the y_lambert2e values carry an extra leading "2"
# (i.e. they are 2,000,000 too large). Remove it for the affected files,
# which are hard-coded here.
def _strip_leading_two(y):
    """Return y minus 2,000,000 when y exceeds 2,000,000, otherwise y unchanged."""
    return y - 2000000 if y > 2000000 else y


for _affected_key in ('mdg_Tacul_winter_smb_abl_2007',
                      'mdg_Talefre_annual_smb_abl_2006'):
    _affected_df = stakes_csv_all[_affected_key]
    _affected_df['y_lambert2e'] = _affected_df['y_lambert2e'].apply(_strip_leading_two)

# Run the coordinate transformation on both collections
stakes_csv_all = lambert_transform(stakes_csv_all)
sarennes_dfs = lambert_transform(sarennes_dfs)

##### Plot stakes over map for visual confirmation of coordinate transformation

###### This requires the additional folium package to be installed

In [None]:
import folium

# One dataframe per glacier, each paired with the colour of its markers
glacier_dfs = {
    'GEBROULAZ': (stakes_csv_all['geb_annual_smb_abl_1979'], 'red'),
    'ARGENTIERE': (stakes_csv_all['Argentiere_annual_smb_accu_1995'], 'blue'),
    'SAINT_SORLIN': (stakes_csv_all['stso_annual_smb_abl_1957'], 'green'),
    'MER_DE_GLACE': (stakes_csv_all['mdg_Leschaux_winter_smb_abl_2020'], 'purple'),
    'SARENNES': (sarennes_dfs['sarennes_complete_winter_2006'], 'orange')
}

# Initial map centre
center_lat, center_lon = 45.8736, 6.8770
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# One feature group per glacier, holding one circle marker per stake
for glacier_name, (stakes, color) in glacier_dfs.items():
    layer = folium.FeatureGroup(name=glacier_name)

    # The Sarennes frames use different column names than the csv-based ones
    is_sarennes = glacier_name == 'SARENNES'

    for _, stake in stakes.iterrows():
        if is_sarennes:
            stake_id = stake['POINT_ID'].split('_')[-1]
            altitude = stake['POINT_ELEVATION']
        else:
            stake_id = stake['stake_number']
            altitude = stake['altitude']

        # Circle marker whose popup shows glacier, stake and altitude
        marker = folium.CircleMarker(
            location=[stake['lat'], stake['lon']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            popup=f"{glacier_name} - Stake {stake_id}: {altitude}m"
        )
        marker.add_to(layer)

    layer.add_to(m)

# Fixed-position HTML legend listing the colour used for each glacier
legend_html = '''
<div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border-radius: 5px;">
    <p><strong>Glaciers</strong></p>
    <p><span style="color: red;">●</span> GEBROULAZ</p>
    <p><span style="color: blue;">●</span> ARGENTIERE</p>
    <p><span style="color: green;">●</span> SAINT SORLIN</p>
    <p><span style="color: purple;">●</span> MER DE GLACE</p>
    <p><span style="color: orange;">●</span> SARENNES</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Last expression of the cell: renders the map in the notebook
m

##### 3. WGMS Mapping, appending SARENNES and conversion into single dataframe

In [None]:
# Convert every stake dataframe to the WGMS layout, then append Sarennes.
stakes_csv_WGMS = {}

# Columns a csv must provide before it is handed to transform_WGMS_df
required_cols = {'lat', 'lon', 'altitude', 'stake_number',
                 'day_start', 'month_start', 'year_start',
                 'day_end', 'month_end', 'year_end'}

# key -> missing column names, so dropped files are reported instead of vanishing
skipped_missing_cols = {}

# Process each DataFrame in the original dictionary to WGMS format
for key, df in stakes_csv_all.items():
    missing_cols = required_cols.difference(df.columns)
    if missing_cols:
        skipped_missing_cols[key] = sorted(missing_cols)
        continue
    try:
        stakes_csv_WGMS[key] = transform_WGMS_df(df, key)
    except Exception as e:
        # Best effort on purpose: report the failing file and carry on.
        print(f"Error processing {key}: {str(e)}")

if skipped_missing_cols:
    print(f"Skipped {len(skipped_missing_cols)} file(s) lacking required columns:")
    for skipped_key, skipped_cols in skipped_missing_cols.items():
        print(f"  {skipped_key}: missing {skipped_cols}")

# Append SARENNES to the dictionary: rename lat/lon to the WGMS names and keep
# only the WGMS columns, in this order.
columns = ['POINT_ID', 'POINT_LAT', 'POINT_LON', 'POINT_ELEVATION', 'FROM_DATE',
           'TO_DATE', 'POINT_BALANCE', 'GLACIER', 'PERIOD', 'GLACIER_ZONE']
for key, df in sarennes_dfs.items():
    stakes_csv_WGMS[key] = df.rename(columns={'lat': 'POINT_LAT', 'lon': 'POINT_LON'})[columns]

###### Combine dictionary of dfs into 1 df

In [None]:
# Stack all WGMS dataframes into one, order the rows by glacier, zone, period
# and start date, then add two columns:
#   YEAR              - first four characters of TO_DATE, as an integer
#   DATA_MODIFICATION - empty string, used to keep track of manual changes
stakes_csv_WGMS_combined = (
    pd.concat(stakes_csv_WGMS.values(), ignore_index=True)
    .sort_values(['GLACIER', 'GLACIER_ZONE', 'PERIOD', 'FROM_DATE'])
    .assign(
        YEAR=lambda frame: frame['TO_DATE'].astype(str).str[:4].astype(int),
        DATA_MODIFICATION='',
    )
)


#### 4. Merge stakes that are close

In [None]:
# Apply the project's close-point removal to the combined dataframe
stakes_csv_WGMS_combined_dropped = remove_close_points(stakes_csv_WGMS_combined)

# Show the close stakes found in the result, ordered by ascending distance
remaining_close_stakes = find_close_stakes(stakes_csv_WGMS_combined_dropped)
display(remaining_close_stakes.sort_values('DISTANCE_M'))

##### 5. General data cleaning

In [None]:
# --- Diagnostics: show suspicious rows before anything is removed ------------
print("Rows with NaN values:")
display(stakes_csv_WGMS_combined_dropped[stakes_csv_WGMS_combined_dropped.isna().any(axis=1)])

# Deliberately checks every column, so zeros in unexpected places stay visible
print("Rows with zero values:")
display(stakes_csv_WGMS_combined_dropped[stakes_csv_WGMS_combined_dropped.eq(0).any(axis=1)])

print("Rows with extreme POINT_BALANCE values (>5 or <-15):")
display(stakes_csv_WGMS_combined_dropped[
    (stakes_csv_WGMS_combined_dropped['POINT_BALANCE'] > 5) |
    (stakes_csv_WGMS_combined_dropped['POINT_BALANCE'] < -15)
])

# --- Removal ------------------------------------------------------------------
# Saint-Sorlin has POINT_ELEVATION 0.0 on 8 stakes, also about 20 stakes have
# point balance 0.00, remove them all. The mask is limited to these two columns
# so that a zero in any other column can never drop a row by accident.
zero_check_cols = ['POINT_ELEVATION', 'POINT_BALANCE']
mask_zeros = stakes_csv_WGMS_combined_dropped[zero_check_cols].eq(0).any(axis=1)
stakes_csv_WGMS_combined_dropped = stakes_csv_WGMS_combined_dropped[~mask_zeros]

# stso_summer_smb_abl_2017_setup2015_14 has a point balance of -88, a nonsensical value: remove it
mask = stakes_csv_WGMS_combined_dropped['POINT_ID'] != 'stso_summer_smb_abl_2017_setup2015_14'
stakes_csv_WGMS_combined_dropped = stakes_csv_WGMS_combined_dropped[mask]

In [None]:
# Run the period consistency check; it returns one result each for annual,
# winter and summer.
annual_inconsistent, winter_inconsistent, summer_inconsistent = check_period_consistency(
    stakes_csv_WGMS_combined_dropped)

# Only the annual and summer results are displayed
for _inconsistent in (annual_inconsistent, summer_inconsistent):
    display(_inconsistent)

# Summer: 7 short summers but nothing majorly inconsistent, leaving them in.
# Annual: the stake with the POINT_ID below is a faulty measurement (it goes
# from 2009 to 2008), so it is removed.
mask = stakes_csv_WGMS_combined_dropped['POINT_ID'].ne(
    'mdg_langue_annual_smb_abl_2008_Langue_setup2008_9')
stakes_csv_WGMS_combined_dropped = stakes_csv_WGMS_combined_dropped.loc[mask]

#### Add RGIId and OGGM Data

In [None]:
# Settings for the OGGM glacier directories: RGI region 11, version 6, with
# the working directory located below the data root.
oggm_init_kwargs = {
    'working_dir': cfg.dataPath + path_OGGM,
    'rgi_region': "11",
    'rgi_version': "6",
    'base_url': "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    'log_level': 'WARNING',
    'task_list': None,
}

# Returns the glacier directories and the matching RGI dataframe
gdirs, rgidf = initialize_oggm_glacier_directories(**oggm_init_kwargs)

In [None]:
# Glacier outlines for RGI region 11, version 6
rgi_file = utils.get_rgi_region_file(region="11", version="6")
glacier_outline = gpd.read_file(rgi_file)

# Attach an RGIId to every stake using the glacier outlines
stakes_csv_WGMS_RGIID = mbm.data_processing.utils.get_rgi(
    data=stakes_csv_WGMS_combined_dropped,
    glacier_outlines=glacier_outline,
)

# Number of stakes that did not receive an RGIId
n_without_rgi = len(stakes_csv_WGMS_RGIID[stakes_csv_WGMS_RGIID['RGIId'].isna()])
display(n_without_rgi)

# Those stakes would not have OGGM data anyway, so drop them
stakes_csv_WGMS_RGIID = stakes_csv_WGMS_RGIID.dropna(subset=['RGIId'])

# Overwrite GLACIER with the name that rgidf lists for each RGIId
rgi_to_name_dict = dict(zip(rgidf['RGIId'], rgidf['Name']))
stakes_csv_WGMS_RGIID['GLACIER'] = stakes_csv_WGMS_RGIID['RGIId'].map(rgi_to_name_dict)

display(stakes_csv_WGMS_RGIID['GLACIER'].unique())



In [None]:
# RGI IDs that actually occur in the stake data
unique_rgis = stakes_csv_WGMS_RGIID['RGIId'].unique()

# Switch: set to False to skip the grid export
run = True
if run:
    oggm_grid_dir = cfg.dataPath + path_OGGM_xrgrids
    export_oggm_grids(gdirs, subset_rgis=unique_rgis, output_path=oggm_grid_dir)

# Merge the OGGM data into the point mass balance dataframe
stakes_csv_WGMS_RGIID_oggm = merge_pmb_with_oggm_data(
    df_pmb=stakes_csv_WGMS_RGIID,
    gdirs=gdirs,
    rgi_region="11",
    rgi_version="6",
)

In [None]:
# Keep only the rows flagged as lying within the glacier shape; the flag
# column is dropped once the filter has been applied.
inside_glacier = stakes_csv_WGMS_RGIID_oggm['within_glacier_shape'].eq(True)
stakes_csv_WGMS_RGIID_oggm = (
    stakes_csv_WGMS_RGIID_oggm.loc[inside_glacier]
    .drop(columns=['within_glacier_shape'])
)

In [None]:
# NaN check before and after removal.
# Tip: wrap the display calls in pd.set_option('display.max_rows', None) /
# pd.reset_option('display.max_rows') to list every affected row.
def _rows_with_nan(frame):
    """Return the rows of frame that contain at least one NaN."""
    return frame[frame.isna().any(axis=1)]


display(_rows_with_nan(stakes_csv_WGMS_RGIID_oggm))

# ~90 hugonnet_dhdt and 1 consensus_ice_thickness are NaN, drop them
stakes_csv_WGMS_RGIID_oggm = stakes_csv_WGMS_RGIID_oggm.dropna()

# Displayed again after the drop, using the same check
display(_rows_with_nan(stakes_csv_WGMS_RGIID_oggm))

In [None]:
# Write the final dataset (without the index) to the GLACIOCLIM csv folder
output_csv = cfg.dataPath + path_PMB_GLACIOCLIM_csv + 'FR_wgms_dataset_all_oggm.csv'
stakes_csv_WGMS_RGIID_oggm.to_csv(output_csv, index=False)

# Show the first two rows of what was written
display(stakes_csv_WGMS_RGIID_oggm.head(2))