# Compare modeled and remotely-sensed surface mass balance

Requires mass balance model outputs at each site from [PyGEM](https://github.com/PyGEM-Community/PyGEM) (Rounce et al., 2023), which can be downloaded from the Carnegie Mellon data repository.

The files downloaded for this work were:
- Monthly surface mass balance along glacier centerlines for 2000–2022: "binned", downloaded from [global_ERA5_2000_2022](https://cmu.app.box.com/s/rzk8aeasg40dd3p0xr3yngkc5c0m8kxt/folder/251139952066)
- Calibrated model parameters (degree-day factors of snow and temperature biases): "{RGI ID}_modelprms_dict.json" files downloaded from [pygem_datasets > Calibration](https://cmu.app.box.com/s/p8aiby5s9f3n6ycgmhknbgo4htk3pn9j/folder/298954564072)

All files were placed in a folder called "Rounce_et_al_2023", defined with the `model_path` variable below. 

In [None]:
import os
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import median_abs_deviation as MAD
from tqdm.auto import tqdm
import glob
import geopandas as gpd
import json
import seaborn as sns
import sys

In [None]:
# Root folders: `scm_path` holds the data, `code_path` holds the analysis repository
scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'
code_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/glacier-snow-cover-analysis'

# Input / output folders derived from the two roots
model_path = os.path.join(scm_path, 'Rounce_et_al_2023')  # PyGEM model outputs
out_path = os.path.join(scm_path, 'analysis')             # analysis products
figures_path = os.path.join(code_path, 'figures')         # saved figures

# Make the repository's utility module importable, then import it as `f`
sys.path.append(os.path.join(code_path, 'scripts'))
import utils as f

# Glacier boundaries; the 'RGIId' column is used below to iterate over sites
aois_fn = os.path.join(out_path, 'AOIs.gpkg')
aois = gpd.read_file(aois_fn)

## 1. Monthly snowline altitudes

### Remotely-sensed SLAs

In [None]:
# Build (or load from cache) the monthly time series of remotely-sensed
# snowline altitudes (SLAs) for every glacier in `aois`.
#
# Products:
#   slas_obs_xr : xarray.Dataset with variable 'SLA_obs' on (time, RGIId)
#   slas_obs    : long-format DataFrame with columns RGIId / Date / SLA_obs
#                 (plus 'num_obs' when freshly built). It is defined in BOTH
#                 branches so later cells do not fail when the cache exists.
slas_obs_fn = os.path.join(out_path, 'monthly_SLAs_observed.nc')
if not os.path.exists(slas_obs_fn):
    # One DataFrame per glacier
    slas_obs_list = []

    # Sample SLA observations within +/- 1 week of the first of each month
    tdelta = np.timedelta64(1, 'W')

    # Iterate over RGI IDs
    for rgi_id in tqdm(sorted(aois['RGIId'].drop_duplicates().values)):
        scs_fn = os.path.join(scm_path, 'study-sites', rgi_id, f'{rgi_id}_classifications.zarr')
        scs = f.load_snow_cover_stats(scs_fn)

        # Unique (calendar year, month) pairs that contain observations.
        # The calendar year is used on purpose: the ISO week-year returned by
        # `dt.isocalendar().year` differs from it for dates around New Year and
        # would shift the target date to the wrong year.
        obs_times = pd.DatetimeIndex(scs['time'].values)
        year_months = sorted(set(zip(obs_times.year, obs_times.month)))
        # NOTE(review): a month start is only a candidate when that same month has
        # observations; observations from the last week of the previous month
        # alone do not create a sample. Confirm this is intended.

        # Grab monthly snowline altitude
        dates = []
        slas = []
        nobs = []
        for year, month in year_months:
            target_time = np.datetime64(f'{year:04d}-{month:02d}-01')
            in_window = ((scs['time'] >= target_time - tdelta)
                         & (scs['time'] <= target_time + tdelta))
            # drop time steps with missing values before averaging
            scs_near = scs.sel(time=in_window).dropna(dim='time')
            n_near = len(scs_near['SLA'].values)
            if n_near > 0:
                dates.append(target_time)
                slas.append(float(scs_near['SLA'].mean().values))
                nobs.append(n_near)

        # Skip glaciers without any sample: empty frames add nothing to the
        # pivot and only blur the column dtypes during concatenation
        if dates:
            slas_obs_list.append(pd.DataFrame({'RGIId': [rgi_id] * len(dates),
                                               'Date': dates,
                                               'SLA_obs': slas,
                                               'num_obs': nobs}))

    # Combine all DataFrames
    if slas_obs_list:
        slas_obs = pd.concat(slas_obs_list)
    else:
        slas_obs = pd.DataFrame(columns=['RGIId', 'Date', 'SLA_obs', 'num_obs'])

    # Pivot to a (Date x RGIId) table
    slas_obs_pivot = slas_obs.pivot(index='Date', columns='RGIId', values='SLA_obs')
    slas_obs_pivot = slas_obs_pivot.sort_index()

    # Convert to xarray Dataset
    slas_obs_xr = xr.Dataset(
        {"SLA_obs": (['time', 'RGIId'], slas_obs_pivot.values)},
        coords={"time": slas_obs_pivot.index.values,
                "RGIId": slas_obs_pivot.columns.values}
    )

    # set attributes
    slas_obs_xr['SLA_obs'].attrs['long_name'] = 'observed snowline altitude'
    slas_obs_xr['SLA_obs'].attrs['units'] = 'meters above sea level'

    # Save to NetCDF file
    slas_obs_xr.to_netcdf(slas_obs_fn)
    print('Remotely-sensed monthly SLAs saved to file:', slas_obs_fn)

else:
    # Load from file
    slas_obs_xr = xr.load_dataset(slas_obs_fn)
    # Rebuild the long-format table from the cached dataset so that code relying
    # on `slas_obs` also works on cached runs ('num_obs' is not stored in the file)
    slas_obs = (slas_obs_xr['SLA_obs'].to_series().dropna()
                .reset_index().rename(columns={'time': 'Date'}))
    print('Remotely-sensed monthly SLAs loaded from file.')

slas_obs_xr


### Modeled SLAs and SMB at observed SLAs

In [None]:
# Build (or load from cache) the modeled monthly snowline altitudes and the
# modeled surface mass balance (SMB) evaluated at the observed snowline.
#
# For each glacier, the monthly climatic mass balance is accumulated within each
# water year (October-September); the modeled SLA is the elevation where that
# cumulative balance crosses zero.
slas_mod_fn = os.path.join(out_path, 'monthly_SLAs_modeled.nc')
if not os.path.exists(slas_mod_fn):

    # Observed SLAs are read from `slas_obs_xr`, which exists whether the
    # observations were freshly built or loaded from the cache file
    obs_da = slas_obs_xr['SLA_obs']
    obs_ids = set(obs_da['RGIId'].values)

    # Initialize a list to store DataFrames
    slas_mod_list = []

    # Iterate over sites
    for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
        # Locate the modeled monthly SMB file for this glacier
        smb_pattern = os.path.join(model_path, 'glac_SMB_binned', f"{rgi_id.split('RGI60-0')[1]}*.nc")
        smb_fns = sorted(glob.glob(smb_pattern))
        if not smb_fns:
            raise FileNotFoundError(f'No modeled SMB file found for {rgi_id}: {smb_pattern}')

        # Read only the variables needed, then release the file handle
        with xr.open_dataset(smb_fns[0]) as smb_file:
            smb = smb_file[['bin_massbalclim_monthly', 'bin_surface_h_initial']].load()

        # Water-year label: October-December belong to the water year starting
        # that calendar year, January-September to the one started the year before
        years = smb['time'].dt.year.values
        months = smb['time'].dt.month.values
        smb = smb.assign_coords({'water_year': (['time'], np.where(months >= 10, years, years - 1))})

        # Cumulative SMB within each water year
        smb['bin_massbalclim_monthly_cumsum'] = smb['bin_massbalclim_monthly'].groupby('water_year').cumsum()
        smb['time'] = smb.time.values.astype('datetime64[D]')
        smb_cumsum = smb['bin_massbalclim_monthly_cumsum']
        model_times = smb.time.data

        # Bin elevations, plus the ordering that makes them increasing
        # (np.interp requires increasing x-coordinates)
        h = smb['bin_surface_h_initial'].data.ravel()
        order = np.argsort(h)
        h_sorted = h[order]

        # Observed SLAs of this glacier keyed by timestamp
        if rgi_id in obs_ids:
            obs_series = obs_da.sel(RGIId=rgi_id).to_series().dropna()
            obs_lookup = dict(zip(obs_series.index, obs_series.values))
        else:
            obs_lookup = {}

        # Interpolate modeled SLA as where SMB = 0 and SMB at the observed SLA
        slas = np.full(len(model_times), np.nan)
        smb_at_slas = np.full(len(model_times), np.nan)
        for j, t in enumerate(model_times):
            # cumulative SMB profile of the first glacier entry at time t
            smb_time = smb_cumsum.sel(time=t).data[0]
            smb_time_sorted = smb_time[order]
            # when SMB <= 0 everywhere, set SLA to maximum glacier elevation
            if np.all(smb_time <= 0):
                slas[j] = np.max(h)
            # when SMB >= 0 everywhere, set SLA to minimum glacier elevation
            elif np.all(smb_time >= 0):
                slas[j] = np.min(h)
            # otherwise, linearly interpolate SLA
            else:
                # NOTE(review): np.interp expects its x-coordinates (the SMB
                # profile here) to be increasing; this holds only when SMB
                # increases monotonically with elevation - verify for these data.
                slas[j] = np.interp(0, smb_time_sorted, h_sorted)
            # interpolate the modeled SMB at the observed SLA (elevation-sorted)
            sla_obs = obs_lookup.get(pd.Timestamp(t))
            if sla_obs is not None:
                smb_at_slas[j] = np.interp(sla_obs, h_sorted, smb_time_sorted)

        # Save results in dataframe and add to the list
        slas_mod_list.append(pd.DataFrame({'RGIId': [rgi_id] * len(model_times),
                                           'Date': model_times,
                                           'SLA_mod': slas,
                                           'SMB_at_SLA_obs': smb_at_slas}))

    # Combine all DataFrames
    slas_mod = pd.concat(slas_mod_list)

    # Pivot to (Date x RGIId) tables, one per variable
    slas_mod_pivot = slas_mod.pivot(index='Date', columns='RGIId', values=['SLA_mod', 'SMB_at_SLA_obs'])
    slas_mod_pivot = slas_mod_pivot.sort_index()

    # Convert to xarray Dataset; the RGIId coordinate is taken from the same
    # sub-table as the values so that labels and columns cannot get out of step
    slas_mod_xr = xr.Dataset(
        {"SLA_mod": (['time', 'RGIId'], slas_mod_pivot['SLA_mod'].values),
         "SMB_at_SLA_obs": (['time', 'RGIId'], slas_mod_pivot['SMB_at_SLA_obs'].values)},
        coords={"time": slas_mod_pivot.index.values,
                "RGIId": slas_mod_pivot['SLA_mod'].columns.values}
    )

    # assign attributes
    slas_mod_xr['SLA_mod'].attrs['long_name'] = 'modeled snowline altitude'
    slas_mod_xr['SLA_mod'].attrs['units'] = 'meters above sea level'
    slas_mod_xr['SMB_at_SLA_obs'].attrs['long_name'] = 'modeled surface mass balance at observed snowline altitude'
    slas_mod_xr['SMB_at_SLA_obs'].attrs['units'] = 'meters water equivalent'

    # Save to NetCDF file
    slas_mod_xr.to_netcdf(slas_mod_fn)
    print('Modeled monthly SLAs and snowline SMB saved to file:', slas_mod_fn)

else:
    # Load from file
    slas_mod_xr = xr.load_dataset(slas_mod_fn)
    print('Modeled monthly SLAs loaded from file.')

slas_mod_xr

### Merge

In [None]:
# Merge observed and modeled monthly SLAs (plus modeled SMB at the observed
# snowline), restrict them to the period and months with observations, and
# summarize the modeled-minus-observed SLA differences.
# NOTE: a cache file written before the time trimming below was fixed still
# contains the untrimmed years; delete it to regenerate.
slas_merged_fn = os.path.join(out_path, 'monthly_SLAs_observed_modeled.nc')
if not os.path.exists(slas_merged_fn):

    # Merge modeled and remotely-sensed SLAs and modeled SMB at observed snowline
    slas_merged = xr.merge([slas_obs_xr, slas_mod_xr])

    # Remove 2000-2012 (no observed values) and 2023 (no modeled values).
    # `.sel` returns a new object, so the result must be assigned back.
    slas_merged = slas_merged.sel(time=slice("2013-01-01", "2023-01-01"))

    # Keep May - October only (no observed values outside those months)
    def filter_month_range(month):
        """Return a boolean mask that is True for months May through October."""
        return (month >= 5) & (month <= 10)
    slas_merged = slas_merged.sel(time=filter_month_range(slas_merged['time.month']))

    # Save results
    slas_merged.to_netcdf(slas_merged_fn)
    print('Merged monthly SLAs saved to file:', slas_merged_fn)

else:
    slas_merged = xr.load_dataset(slas_merged_fn)
    print('Merged monthly SLAs loaded from file.')

# Modeled minus observed snowline altitude
slas_merged['SLA_mod-obs'] = slas_merged['SLA_mod'] - slas_merged['SLA_obs']

# Plot the distribution of differences
fig, ax = plt.subplots(figsize=(6,5))
ax.hist(slas_merged['SLA_mod-obs'].values.ravel(), bins=50)
ax.set_xlabel('SLA$_{mod}$ - SLA$_{obs}$ [m]')
ax.set_ylabel('Counts')
plt.show()

# NaN-aware summary statistics (missing model/observation pairs are ignored)
print('\nDifference stats:')
print(f'Mean diff = {np.nanmean((slas_merged["SLA_mod-obs"]).values)} m')
print(f'Std. diff = {np.nanstd((slas_merged["SLA_mod-obs"]).values)} m')
print(f'Median diff = {np.nanmedian((slas_merged["SLA_mod-obs"]).values)} m')
print(f'MAD diff = {MAD((slas_merged["SLA_mod-obs"]).values.ravel(), nan_policy="omit")} m')

## 2. ELAs

In [None]:
# Annual equilibrium line altitudes (ELAs), taken as the highest monthly
# snowline altitude of each calendar year, for observations and model alike.
slas_elas_merged_fn = os.path.join(out_path, 'monthly_SLAs_annual_ELAs_observed_modeled.nc')

if os.path.exists(slas_elas_merged_fn):
    # Reuse the cached result
    slas_elas_merged = xr.load_dataset(slas_elas_merged_fn)
    print('Merged SLAs and ELAs loaded from file.')

else:
    # Work on a copy so the merged SLA dataset keeps its own set of variables
    slas_elas_merged = slas_merged.copy()

    # Observed ELA: yearly maximum of the observed SLAs ...
    slas_elas_merged['ELA_obs'] = slas_elas_merged['SLA_obs'].groupby(['time.year']).max()
    # ... with every year before 2016 blanked out
    slas_elas_merged['ELA_obs'] = xr.where(slas_elas_merged['ELA_obs'].year < 2016,
                                           np.nan,
                                           slas_elas_merged['ELA_obs'])

    # Modeled ELA: yearly maximum of the modeled SLAs
    slas_elas_merged['ELA_mod'] = slas_elas_merged['SLA_mod'].groupby(['time.year']).max()

    # Describe the new variables
    slas_elas_merged['ELA_obs'].attrs.update({'long_name': 'observed equilibrium line altitude',
                                              'units': 'meters above sea level'})
    slas_elas_merged['ELA_mod'].attrs.update({'long_name': 'modeled equilibrium line altitude',
                                              'units': 'meters above sea level'})

    # Write the result to NetCDF
    slas_elas_merged.to_netcdf(slas_elas_merged_fn)
    print('Merged SLAs and ELAs saved to file:', slas_elas_merged_fn)

slas_elas_merged

In [None]:
# Modeled minus observed ELA: histogram and NaN-aware summary statistics
slas_elas_merged['ELA_mod-obs'] = slas_elas_merged['ELA_mod'] - slas_elas_merged['ELA_obs']
ela_diff = slas_elas_merged['ELA_mod-obs'].values

plt.hist(ela_diff.ravel(), bins=50)
plt.show()

print('\nDifference stats:')
print(f"Mean diff = {np.nanmean(ela_diff)} m")
print(f"Std. diff = {np.nanstd(ela_diff)} m")
print(f"Median diff = {np.nanmedian(ela_diff)} m")
print(f"MAD diff = {MAD(ela_diff.ravel(), nan_policy='omit')} m")

## 3. Assess agreement with new PyGEM runs at the USGS Benchmark Glaciers

For remotely-sensed and modeled snowline time series, bin estimates by week of year, characterize the timing of snowline rise and fall and maximum snowline altitude. Select the PyGEM run that agrees best.  

In [None]:
# Compare the new PyGEM runs with the observed snowlines at each glacier:
# compute the snowline RMSE of every run, pick the run with the lowest RMSE,
# plot it against the run closest to the original calibration, and tabulate the
# original vs. best parameters (tbias, ddfsnow, kp).
import warnings
# NOTE: this silences ALL warnings for the remainder of the session
warnings.filterwarnings('ignore')

# Load glacier IDs from model runs (one sub-folder per glacier)
pygem_new_path = os.path.join(scm_path, 'Brandon_new_PyGEM_runs')
rgi_ids = [x for x in sorted(os.listdir(pygem_new_path)) if os.path.isdir(os.path.join(pygem_new_path, x))]
# NOTE(review): names are paired with `rgi_ids` by position, and `rgi_ids` is in
# lexicographic folder order - verify that this order matches the list below.
names = ['Gulkana', 'Wolverine', 'Lemon Creek', 'Sperry', 'South Cascade']
print('RGI IDs for glaciers with PyGEM runs:', rgi_ids)

# Iterate over RGI IDs
results_list = []  # per-glacier result tables, concatenated once after the loop
for i, rgi_id in enumerate(rgi_ids):
    name = names[i]
    print(name, rgi_id)

    # Load observed snow cover data
    scs_fn = os.path.join(scm_path, 'study-sites', f"RGI60-0{rgi_id}", f"RGI60-0{rgi_id}_classifications.zarr")
    scs = f.load_snow_cover_stats(scs_fn)

    # Get file names of runs
    run_fns = sorted(glob.glob(os.path.join(pygem_new_path, rgi_id, '*.nc')))

    # Define output file
    out_fn = os.path.join(scm_path, 'analysis', f"PyGEM_comparison_RGI60-0{rgi_id}.nc")
    if not os.path.exists(out_fn):

        # Compile modeled snowline altitudes and ELAs
        model_runs_list = []
        # iterate over model runs
        for fn in tqdm(run_fns):
            # the context manager closes each run file once its data are in memory
            with xr.open_dataset(fn) as ds:
                ds['time'] = ds.indexes['time'].to_datetimeindex()
                # model parameters are stored as a JSON string in the file attributes
                params = json.loads(ds.model_parameters)
                # keep the snowline / ELA variables together with the parameters
                run_ds = xr.Dataset({
                    'glac_snowline_monthly': ds['glac_snowline_monthly'],
                    'glac_ELA_annual': ds['glac_ELA_annual'],
                    'kp': xr.DataArray(params["kp"], dims=()),
                    'tbias': xr.DataArray(params["tbias"], dims=()),
                    'ddfsnow': xr.DataArray(params["ddfsnow"], dims=()),
                }).load()
            model_runs_list.append(run_ds)
        # combine all runs into a single dataset
        combined_ds = xr.concat(model_runs_list, dim='run')
        # trim to post-2013, May to October (no observed snowline data outside then)
        combined_ds = combined_ds.sel(time=slice('2013-01-01', None))
        combined_ds = combined_ds.sel(time=combined_ds['time.month'].isin([5, 6, 7, 8, 9, 10]))
        # add glacier ID
        combined_ds['rgi_id'] = xr.DataArray(rgi_id, dims=())

        # Merge with observed SLAs
        combined_ds = xr.merge([combined_ds, slas_obs_xr.sel(RGIId=f"RGI60-0{rgi_id}")])

        # Calculate difference. The observed variable is 'SLA_obs' in the dataset
        # built by this notebook; 'SLA_obs_m' is accepted as a fallback name.
        obs_var = 'SLA_obs' if 'SLA_obs' in combined_ds else 'SLA_obs_m'
        combined_ds['mod-obs_SLA'] = combined_ds['glac_snowline_monthly'] - combined_ds[obs_var]

        # Save to file
        combined_ds.to_netcdf(out_fn)

    else:
        # Load combined dataset from file (read fully so the file is closed again)
        combined_ds = xr.load_dataset(out_fn)

    # Load original model parameters
    # NOTE: unpickling executes code from the file - only load trusted files
    modelprms_fn = os.path.join(model_path, 'modelprms', f"{rgi_id}-modelprms_dict.pkl")
    modelprms = pd.read_pickle(modelprms_fn)

    # Calculate RMSE for each run's snowline altitudes
    diff = combined_ds['mod-obs_SLA']
    rmse_by_run = np.sqrt((diff**2).mean(dim='time'))
    combined_ds['rmse'] = rmse_by_run

    # identify parameter combinations with the lowest RMSE
    df_plot = combined_ds[['tbias', 'ddfsnow', 'kp', 'rmse', 'mod-obs_SLA']].to_dataframe().reset_index()
    df_plot = df_plot.dropna(subset=['rmse'])
    df_plot_best = df_plot.loc[df_plot['rmse'].idxmin()]

    # subset the model dataset for the original vs. best runs
    # original: the run whose (kp, tbias, ddfsnow) is closest to the original calibration
    squared_diffs = sum((combined_ds[var] - modelprms['emulator'][var][0])**2 for var in ['kp', 'tbias', 'ddfsnow'])
    original_run_idx = squared_diffs.argmin(dim="run")
    combined_ds_original = combined_ds.sel(run=original_run_idx, glac=0)
    # best: the run with the lowest RMSE
    combined_ds_best = combined_ds.sel(run=int(df_plot_best['run']), glac=0)

    # Plot RMSE as a function of tbias, ddfsnow, and kp
    fig, ax = plt.subplots(3, 1, figsize=(10, 8), gridspec_kw=dict(height_ratios=[2,2,1]))
    sns.scatterplot(df_plot, x='tbias', y='ddfsnow',
                    size='kp', hue='rmse', palette='viridis_r', sizes=(2,50),
                    ax=ax[0])
    # original parameter combinations
    ax[0].plot(modelprms['emulator']['tbias'], modelprms['emulator']['ddfsnow'], 's',
            markersize=15, markeredgecolor='m', markerfacecolor='None', markeredgewidth=2, label='Original')
    # best parameter combination
    ax[0].plot(df_plot_best['tbias'], df_plot_best['ddfsnow'],
            '*', markersize=15, markeredgecolor='m', markerfacecolor='None', markeredgewidth=2, label='Lowest RMSE')
    ax[0].set_title(f'{name} ({rgi_id})')
    ax[0].grid(True)
    # relabel the automatic legend entries with readable names
    handles, labels = ax[0].get_legend_handles_labels()
    labels = [x.replace('rmse', 'RMSE [m]').replace('kp', 'Precipitation factor') for x in labels]
    ax[0].legend(handles, labels, loc='center right', bbox_to_anchor=[1.1, 0.4, 0.2, 0.2])
    # modeled and observed snowline time series
    ax[1].plot(combined_ds_original.time, combined_ds_original.glac_snowline_monthly, '-', color='gray', label='Original model')
    ax[1].plot(combined_ds_best.time, combined_ds_best.glac_snowline_monthly, '-k', label='Best model')
    ax[1].plot(scs['time'], scs['SLA'], '.m', markersize=5, label='Observed')
    ax[1].legend(loc='center left', bbox_to_anchor=[1.0, 0.4, 0.2, 0.2])
    ax[1].set_ylabel('Snowline altitude [m.a.s.l.]')
    # modeled - observed snowline altitude
    ax[2].plot(combined_ds_original.time, combined_ds_original['mod-obs_SLA'], '.', color='gray', markersize=10, label='Original model')
    ax[2].plot(combined_ds_best.time, combined_ds_best['mod-obs_SLA'], '.k', markersize=5, label='Best model')
    ax[2].legend(loc='center left', bbox_to_anchor=[1.0, 0.4, 0.2, 0.2])
    ax[2].grid()
    ax[2].set_ylabel('Modeled $-$ observed\nsnowline altitude [m]')
    # share the time range of the difference panel with the time-series panel
    ax[1].set_xlim(ax[2].get_xlim())

    fig.tight_layout()

    # Save figure to file
    fig_fn = os.path.join(figures_path, f"{rgi_id}_PyGEM_comparison.png")
    fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
    print('Figure saved to file:', fig_fn)
    plt.close(fig)

    # Compile results in dataframe
    df_results = pd.DataFrame({'Original':
                           [modelprms['emulator']['tbias'][0],
                            modelprms['emulator']['ddfsnow'][0],
                            modelprms['emulator']['kp'][0]],
                            'Best':
                            [df_plot_best['tbias'],
                             df_plot_best['ddfsnow'],
                             df_plot_best['kp']]
                           },
                           index=['tbias', 'ddfsnow', 'kp'])
    # add site RGI ID and name columns
    df_results.index = pd.MultiIndex.from_product([[f"RGI60-0{rgi_id}"], df_results.index],
                                                names=['RGIId', 'parameter'])
    df_results['name'] = name
    # collect for the full table
    results_list.append(df_results)

# Concatenate the per-glacier tables once (avoids re-copying inside the loop)
df_results_full = pd.concat(results_list) if results_list else pd.DataFrame()

# Add Original - Best column
df_results_full['Original-Best'] = df_results_full['Original'] - df_results_full['Best']

# Save full data frame to file
df_results_full_fn = os.path.join(scm_path, 'analysis', 'PyGEM_comparison_params.csv')
df_results_full.to_csv(df_results_full_fn, index=True)
print('Results saved to file:', df_results_full_fn)
df_results_full
