# Calculate median weekly trends in snow cover for each study site

In [None]:
import os
import glob
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.stats import iqr
import seaborn as sns
import numpy as np
import xarray as xr
from scipy.stats import median_abs_deviation

## Load compiled glacier boundaries (AOIs) and climate clusters

In [None]:
# Root folder holding all snow cover mapping data
scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'

# Folder that analysis inputs are read from and outputs are written to
out_path = os.path.join(scm_path, 'analysis')

# Glacier boundaries (AOIs); the O1Region and O2Region columns are cast to integers
aois_fn = os.path.join(out_path, 'AOIs.gpkg')
aois = gpd.read_file(aois_fn)
region_cols = ['O1Region', 'O2Region']
aois[region_cols] = aois[region_cols].astype(int)
print('All glacier boundaries loaded from file.')

# Climate cluster table
clusters_fn = os.path.join(out_path, 'climate_clusters.csv')
clusters = pd.read_csv(clusters_fn)
print('Clusters loaded from file.')

## Pre-processing: Compile snow cover stats CSVs for each site

In [None]:
from shapely.geometry import LineString
from ast import literal_eval
from pyproj import Transformer
import shutil

# Columns kept, in this order, in each site's compiled snow cover stats file
out_cols = [
    'RGIId',
    'datetime',
    'source',
    'SCA_m2',
    'AAR',
    'SLA_from_AAR_m',
    'snowline_elevs_m',
    'snowline_elevs_median_m',
    'snowline_geometry',
]

# Define function to convert lists of X and Y coordinates to a LineString and transform to WGS84
def create_linestring_wgs84(x_coords, y_coords, transformer):
    """Build a snowline LineString from coordinate lists and reproject it.

    Parameters
    ----------
    x_coords, y_coords : sequence of float
        X and Y coordinates of the snowline vertices, expressed in the source
        CRS of ``transformer``. They are paired positionally; surplus values
        in the longer sequence are ignored.
    transformer : object with a ``transform(x, y)`` method
        Called once per vertex and expected to return the transformed
        ``(x, y)`` pair (a ``pyproj.Transformer`` in this notebook).

    Returns
    -------
    shapely.geometry.LineString
        The reprojected line. An empty LineString is returned when fewer than
        two vertices are available, because a line cannot be constructed from
        a single point.
    """
    # Create list of (x, y) coordinate tuples
    points = list(zip(x_coords, y_coords))
    # A one-vertex "line" makes the LineString constructor raise, so treat it
    # like the no-snowline case and return an empty geometry instead
    if len(points) < 2:
        return LineString()
    # Transform each vertex and build the line directly in the target CRS
    return LineString([transformer.transform(x, y) for x, y in points])

def _parse_coord_list(value):
    """Return the coordinate list stored in one CSV cell.

    Strings such as "[1.0, 2.0]" are parsed with ``literal_eval``; lists and
    tuples are passed through as lists. Any other value (e.g. the NaN pandas
    uses for a missing cell) becomes an empty list, because ``literal_eval``
    raises on non-string input.
    """
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# Iterate over sites
for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
    # Define output file name; sites that were already compiled are left untouched
    scs_fn = os.path.join(scm_path, 'study-sites', rgi_id, f'{rgi_id}_snow_cover_stats.csv')
    if os.path.exists(scs_fn):
        continue

    # Get snow cover stats file names, not including PlanetScope
    old_folder = os.path.join(scm_path, 'study-sites', rgi_id, 'snow_cover_stats')
    sc_fns = sorted(glob.glob(os.path.join(old_folder, '*_snow_cover_stats.csv')))
    sc_fns = [fn for fn in sc_fns if 'PlanetScope' not in fn]
    if not sc_fns:
        # Nothing to merge for this site
        continue

    # Merge files with a single concat; ignore_index gives a clean, unique index
    scs = pd.concat([pd.read_csv(fn) for fn in sc_fns], ignore_index=True)

    # Merge any redundant columns (from old versions of the code)
    cols = list(scs.keys())
    if ('site_name' in cols) and ('RGIId' in cols):
        scs['RGIId'] = scs['RGIId'].fillna(scs['site_name'])
        scs.drop(columns=['site_name'], inplace=True)
    elif 'site_name' in cols:
        scs.rename(columns={'site_name': 'RGIId'}, inplace=True)
    if ('dataset' in cols) and ('source' in cols):
        scs['source'] = scs['source'].fillna(scs['dataset'])
        scs.drop(columns=['dataset'], inplace=True)
    elif 'dataset' in cols:
        scs.rename(columns={'dataset': 'source'}, inplace=True)

    # Rename "ELA_from_AAR_m" column to "SLA_from_AAR_m" to be more representative
    scs.rename(columns={'ELA_from_AAR_m': 'SLA_from_AAR_m'}, inplace=True)

    # Ensure coordinate lists are correctly formatted (missing cells become empty lists)
    for coord_col in ('snowlines_coords_X', 'snowlines_coords_Y'):
        scs[coord_col] = scs[coord_col].apply(_parse_coord_list)

    # Get the UTM CRS for transformation
    crs_values = scs['HorizontalCRS'].dropna().unique()
    if len(crs_values) == 0:
        print(f'Skipping {rgi_id}, no HorizontalCRS found.')
        continue
    # NOTE(review): only the first CRS is used; rows stored in a different CRS
    # would be transformed incorrectly -- confirm each site has a single CRS.
    crs_utm = crs_values[0]
    transformer = Transformer.from_crs(crs_utm, "EPSG:4326", always_xy=True)

    # Create and transform snowline geometries
    scs['snowline_geometry'] = scs.apply(
        lambda row: create_linestring_wgs84(row['snowlines_coords_X'], row['snowlines_coords_Y'], transformer),
        axis=1,
    )

    # Select the relevant columns
    scs = scs[out_cols]

    # Save merged, adjusted dataframe
    scs.to_csv(scs_fn, index=False)

    # Remove old files
    # NOTE(review): this deletes the whole folder, including the PlanetScope
    # files that were NOT merged above -- confirm that is intended.
    shutil.rmtree(old_folder)


## Calculate weekly median trends for each site

For each site, run 100 Monte Carlo simulations. Each simulation draws a random 80% of the site's observations (without replacement) and computes weekly medians from that subsample.

In [None]:
# Weekly medians from Monte Carlo subsampling, cached in a NetCDF file
scs_MCs_fn = os.path.join(out_path, 'median_snow_cover_stats_MC_adjusted.nc')
if not os.path.exists(scs_MCs_fn):
    # Set up Monte Carlo parameters
    nMC = 100  # Number of Monte Carlo simulations
    sample_fraction = 0.8  # Fraction of data to sample in each simulation
    # Seeded generator so that regenerating the file reproduces the same subsamples
    mc_rng = np.random.default_rng(0)
    # Columns for which weekly medians are calculated
    stat_cols = ['transient_AAR', 'snow_area_m2', 'SLA_m']

    # Initialize list to store results for all sites
    scs_MCs_list = []

    # Iterate over study sites
    for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
        scs_fn = os.path.join(scm_path, 'study-sites', rgi_id, f'{rgi_id}_snow_cover_stats_adjusted.csv')

        if not os.path.exists(scs_fn):
            print(f'Skipping {rgi_id}, file not found.')
            continue

        # Read and preprocess data; rows with unparseable dates are dropped
        scs_site = pd.read_csv(scs_fn)
        scs_site['datetime'] = pd.to_datetime(scs_site['datetime'], errors='coerce')
        scs_site.dropna(subset=['datetime'], inplace=True)
        scs_site['WOY'] = scs_site['datetime'].dt.isocalendar().week

        # Number of samples per simulation
        nsamp = int(len(scs_site) * sample_fraction)
        if nsamp < 1:
            # Too few observations to draw even one sample
            print(f'Skipping {rgi_id}, not enough observations to sample.')
            continue

        # Monte Carlo simulations
        site_index = scs_site.index.to_numpy()
        results = []
        for i in range(nMC):
            sampled_indices = mc_rng.choice(site_index, size=nsamp, replace=False)

            # Calculate weekly medians for AAR, SCA, and SLA
            # (no sorting needed: group medians do not depend on row order)
            weekly_medians = scs_site.loc[sampled_indices].groupby('WOY')[stat_cols].median()
            weekly_medians['MC_run'] = i
            results.append(weekly_medians.reset_index())

        # Concatenate results for all Monte Carlo runs
        all_results = pd.concat(results)
        all_results['RGIId'] = rgi_id # add RGI ID

        # Convert to xarray.Dataset
        ds = all_results.set_index(['RGIId', 'MC_run', 'WOY']).to_xarray()
        scs_MCs_list.append(ds)

    if not scs_MCs_list:
        raise ValueError('No adjusted snow cover stats files were found, nothing to concatenate.')

    # Combine results into a single xarray.Dataset
    scs_MCs_ds = xr.concat(scs_MCs_list, dim='RGIId')

    # Save to file
    scs_MCs_ds.to_netcdf(scs_MCs_fn)
    print("Monte Carlo simulations completed and saved to file:", scs_MCs_fn)

else:
    scs_MCs_ds = xr.open_dataset(scs_MCs_fn)
    print('Monte Carlo simulations loaded from file:', scs_MCs_fn)

scs_MCs_ds


In [None]:
# -----Compile minimum snow cover statistics
min_aars_woys_fn = os.path.join(out_path, 'minimum_snow_cover_stats_adjusted.csv')
# check if exists in directory
if not os.path.exists(min_aars_woys_fn):
    # one dictionary of minimum snow cover stats per site
    min_rows = []
    woy_values = scs_MCs_ds.WOY.values

    # iterate over site names in median snow cover stats dataset
    for rgi_id in tqdm(scs_MCs_ds.RGIId.values):
        # Weekly median AAR for this site; selected once and reused below
        aar_site = scs_MCs_ds.sel(RGIId=rgi_id)['transient_AAR']

        # Calculate AAR median and MAD across MC simulations
        aar_mins = aar_site.min(dim='WOY')
        aar_median = float(aar_mins.median().values)
        aar_mad = float(median_abs_deviation(aar_mins.values))

        # Calculate WOY median and MAD across MC simulations
        Imins = aar_site.argmin(dim='WOY').values
        woys = woy_values[Imins]
        woy_median = int(np.nanmedian(woys))
        woy_mad = int(median_abs_deviation(woys))

        # Calculate AAR for only September (weeks 35-40)
        aar_sept = float(aar_site.sel(WOY=slice(35, 40)).min(dim='WOY').median().values)

        min_rows.append({
            'RGIId': rgi_id,
            'WOY_median': woy_median,
            'WOY_MAD': woy_mad,
            'AAR_median': aar_median,
            'AAR_MAD': aar_mad,
            'AAR_Sept': aar_sept,
        })

    # build the full dataframe in one step (unique index, no repeated concat)
    min_aars_woys = pd.DataFrame(
        min_rows,
        columns=['RGIId', 'WOY_median', 'WOY_MAD', 'AAR_median', 'AAR_MAD', 'AAR_Sept'],
    )

    # save to file
    min_aars_woys.to_csv(min_aars_woys_fn, index=False)
    print('Minimum AARs and WOYs saved to file: ', min_aars_woys_fn)

else:
    # load from file
    min_aars_woys = pd.read_csv(min_aars_woys_fn)
    print('Minimum AARs and WOYs loaded from file.')

min_aars_woys

In [None]:
# Attach each site's subregion and climate cluster name
min_aars_woys = (
    min_aars_woys
    .merge(aois[['RGIId', 'Subregion']], on='RGIId')
    .merge(clusters[['RGIId', 'clustName']], on='RGIId')
)

# Cumulative KDEs of the median minimum AAR: by subregion (top), by cluster (bottom)
fig, ax = plt.subplots(2, 1, figsize=(10, 12))
top_ax, bottom_ax = ax
sns.kdeplot(
    data=min_aars_woys,
    x='AAR_median',
    palette='mako',
    cumulative=True,
    hue='Subregion',
    ax=top_ax,
)
sns.kdeplot(
    data=min_aars_woys,
    x='AAR_median',
    cumulative=True,
    hue='clustName',
    ax=bottom_ax,
)
plt.show()

In [None]:
# Report the typical Monte Carlo spread (MAD) of the minimum AAR and of its timing
aar_mad_median = min_aars_woys['AAR_MAD'].median()
print('Median AAR MAD = ', aar_mad_median)
woy_mad_median = min_aars_woys['WOY_MAD'].median()
print('Median WOY MAD = ', woy_mad_median)

# Difference between the September AAR and the overall median minimum AAR
min_aars_woys['AAR_Sept-AAR_median'] = min_aars_woys['AAR_Sept'].sub(min_aars_woys['AAR_median'])
print('AAR September - AAR median')
min_aars_woys['AAR_Sept-AAR_median'].describe()

## Assess interannual variability in AAR magnitude and timing at each site

In [None]:
# Per-site interannual variability of the annual minimum AAR and its timing, cached as CSV
aar_var_stats_fn = os.path.join(out_path, 'minimum_snow_cover_stats_interannual_variability_2016-2023_adjusted.csv')
if os.path.exists(aar_var_stats_fn):
    aar_var_stats = pd.read_csv(aar_var_stats_fn)
    print('AAR interannual variability stats loaded from file.')

else:
    # one dictionary of summary statistics per site
    site_stats = []
    for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
        # Load snow cover stats
        scs_fn = os.path.join(scm_path, 'study-sites', rgi_id, f'{rgi_id}_snow_cover_stats.csv')
        if not os.path.exists(scs_fn):
            print(f'Skipping {rgi_id}, no snow cover stats file.')
            continue
        scs = pd.read_csv(scs_fn)
        # Add Year and WOY columns
        if 'datetime' not in scs.keys():
            print(f'Error with {rgi_id}')
            continue
        scs['datetime'] = pd.to_datetime(scs['datetime'], format='mixed')
        iso_calendar = scs['datetime'].dt.isocalendar()
        scs['Year'] = iso_calendar.year
        scs['WOY'] = iso_calendar.week

        # subset to 2016 onwards
        # NOTE(review): the output file name says 2016-2023 but no upper bound
        # is applied here -- confirm whether later years should be excluded.
        scs = scs.loc[scs['Year'] >= 2016]
        # drop rows without an AAR value: a year whose AAR is entirely missing
        # has no valid minimum and would break the lookups below
        scs = scs.dropna(subset=['AAR'])
        if scs.empty:
            print(f'Skipping {rgi_id}, no valid AAR observations since 2016.')
            continue

        # identify annual AAR magnitudes and WOY timing
        annual_mins_site = scs.groupby('Year')['AAR'].idxmin().reset_index()
        annual_mins_site.rename(columns={'AAR': 'Imin'}, inplace=True)
        annual_mins_site['AAR'] = [scs.loc[i, 'AAR'] for i in annual_mins_site['Imin'].values]
        annual_mins_site['WOY'] = [scs.loc[i, 'WOY'] for i in annual_mins_site['Imin'].values]
        site_stats.append({
            'RGIId': rgi_id,
            'AAR_min': annual_mins_site['AAR'].min(),
            'AAR_max': annual_mins_site['AAR'].max(),
            'AAR_median': annual_mins_site['AAR'].median(),
            'AAR_IQR': iqr(annual_mins_site['AAR']),
            'WOY_min': annual_mins_site['WOY'].min(),
            'WOY_max': annual_mins_site['WOY'].max(),
            'WOY_median': annual_mins_site['WOY'].median(),
            'WOY_IQR': iqr(annual_mins_site['WOY']),
        })

    # build the full dataframe in one step (unique index, no repeated concat)
    aar_var_stats = pd.DataFrame(
        site_stats,
        columns=['RGIId', 'AAR_min', 'AAR_max', 'AAR_median', 'AAR_IQR',
                 'WOY_min', 'WOY_max', 'WOY_median', 'WOY_IQR'],
    )

    # Save to file
    aar_var_stats.to_csv(aar_var_stats_fn, index=False)
    print('AAR interannual variability stats saved to file:', aar_var_stats_fn)

aar_var_stats


In [None]:
# -----Print stats
# Per-site spread between the largest and smallest annual minimum AAR
aar_var_stats['AAR_range'] = aar_var_stats['AAR_max'] - aar_var_stats['AAR_min']
# Report the median and IQR of that range across sites (same form as the WOY timing summary)
print(f"AAR range for all sites: {aar_var_stats['AAR_range'].median()} +/- {iqr(aar_var_stats['AAR_range'])}")
# print('By subregion:')
# print('Median')
# print(aar_var_stats.groupby(['Subregion'])['AAR_range'].median())
# print('\n')
# print('IQR')
# print(aar_var_stats.groupby(['Subregion'])['AAR_range'].apply(iqr))

In [None]:
# Per-site spread between the latest and earliest week of year (WOY) of the annual minimum AAR
aar_var_stats['WOY_range'] = aar_var_stats['WOY_max'].sub(aar_var_stats['WOY_min'])
woy_range = aar_var_stats['WOY_range']
# Median and IQR of that spread across all sites
print(f"AAR TIMING range for all sites: {woy_range.median()} +/- {iqr(woy_range)}\n")
# print('By subregion:')
# print('Median')
# print(aar_var_stats.groupby(['Subregion'])['WOY_range'].median())
# print('\n')
# print('IQR')
# print(aar_var_stats.groupby(['Subregion'])['WOY_range'].apply(iqr))

## Identify the approximate start and end of the melt season at each site from ERA5-Land data, then summarize by subregion

In [None]:
# Per-site melt season start/end weeks estimated from ERA5-Land weekly medians, cached as CSV
melt_season_fn = os.path.join(out_path, 'melt_season_timing.csv')

if not os.path.exists(melt_season_fn):
    # one dictionary per site
    melt_season_rows = []

    # Iterate over sites
    for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
        # Load ERA data
        era_fn = os.path.join(scm_path, 'study-sites', rgi_id, 'ERA', f'{rgi_id}_ERA5-Land_daily_means.csv')
        if not os.path.exists(era_fn):
            print(f'Skipping {rgi_id}, no ERA5-Land file.')
            continue
        era = pd.read_csv(era_fn)
        era['Date'] = pd.to_datetime(era['Date'])

        # Add WOY and year columns
        iso_calendar = era['Date'].dt.isocalendar()
        era['WOY'] = iso_calendar.week
        era['year'] = iso_calendar.year

        # Calculate weekly medians using ISO years after 2012 (no upper bound is applied)
        era = era.loc[era['year'] > 2012]
        if '.geo' in era.keys():
            era = era.drop(columns=['.geo'])
        era_weekly_median = era.groupby('WOY').median().reset_index()

        # Estimate start and end of melt seasons
        # Start = positive PDDs; week 52 is used when no week has a positive cumulative PDD
        try:
            woy_start = era_weekly_median.loc[era_weekly_median['positive_degree_days_annual_cumsum'] > 0, 'WOY'].values[0]
        except IndexError:
            woy_start = 52
        # End = after July, 0 PDDs, positive snowfall
        end_candidates = era_weekly_median.loc[(era_weekly_median['WOY'] > 30)
                                               & (era_weekly_median['positive_degree_days'] == 0)
                                               & (era_weekly_median['mean_snowfall_sum'] > 0), 'WOY'].values
        if len(end_candidates) == 0:
            # No week satisfies the end criteria; skip instead of aborting the whole loop
            print(f'Skipping {rgi_id}, no melt season end found after week 30.')
            continue
        woy_end = end_candidates[0]

        # Add to list of rows
        melt_season_rows.append({
            'RGIId': rgi_id,
            'melt_season_start_WOY': woy_start,
            'melt_season_end_WOY': woy_end,
        })

    # Build the full dataframe in one step and save to file
    melt_season_df = pd.DataFrame(
        melt_season_rows,
        columns=['RGIId', 'melt_season_start_WOY', 'melt_season_end_WOY'],
    )
    melt_season_df.to_csv(melt_season_fn, index=False)
    print('Melt season timing CSV saved to file:', melt_season_fn)

else:
    melt_season_df = pd.read_csv(melt_season_fn)
    print('Melt season timing CSV loaded from file.')

melt_season_df
        

In [None]:
# Plot some results

# Add subregion and cluster columns
if 'Subregion' not in melt_season_df.keys():
    # One lookup value per RGIId; duplicated RGIIds keep their first entry, and
    # sites missing from either table get an empty string
    subregion_by_id = aois.drop_duplicates(subset='RGIId').set_index('RGIId')['Subregion']
    cluster_by_id = clusters.drop_duplicates(subset='RGIId').set_index('RGIId')['clustName']
    melt_season_df['Subregion'] = melt_season_df['RGIId'].map(subregion_by_id).fillna('')
    melt_season_df['clustName'] = melt_season_df['RGIId'].map(cluster_by_id).fillna('')

subregions = melt_season_df['Subregion'].drop_duplicates().values
nsubregions = len(subregions)
# squeeze=False keeps the axes array two-dimensional even when there is a
# single subregion, so indexing by row always works
fig, ax = plt.subplots(nsubregions, 1, figsize=(8, nsubregions*4), squeeze=False)
ax = ax[:, 0]
for i, subregion in enumerate(subregions):
    melt_season_subregion_df = melt_season_df.loc[melt_season_df['Subregion']==subregion]
    # Histogram plus mean line for the season start (magenta) and end (blue)
    for column, color in (('melt_season_start_WOY', 'm'), ('melt_season_end_WOY', 'b')):
        ax[i].hist(melt_season_subregion_df[column], bins=20, facecolor=color, alpha=0.5)
        ax[i].axvline(melt_season_subregion_df[column].mean(), color=color, linewidth=2)
    ax[i].set_title(subregion)

plt.show()

In [None]:
# Check September PDDs across sites and climate clusters

# Iterate over sites, collecting one dictionary per site
pdd_rows = []
for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
    # get the climate cluster
    clust_name = clusters.loc[clusters['RGIId'] == rgi_id, 'clustName'].values[0]

    # Load ERA data
    era_fn = os.path.join(scm_path, 'study-sites', rgi_id, 'ERA', f'{rgi_id}_ERA5-Land_daily_means.csv')
    era = pd.read_csv(era_fn)
    era['Date'] = pd.to_datetime(era['Date'])

    # calculate average PDDs for September (month 9), over all years in the file
    era['month'] = era['Date'].dt.month
    era['year'] = era['Date'].dt.isocalendar().year
    era_sept = era.loc[era['month'].isin([9])]
    pdd_avg = era_sept['positive_degree_days'].mean()
    pdd_cumsum_avg = era_sept['positive_degree_days_annual_cumsum'].mean()

    # Add to list of rows
    pdd_rows.append({
        'RGIId': rgi_id,
        'clustName': clust_name,
        'average_PDDs_Sept': pdd_avg,
        'average_cumulative_PDDs_Sept': pdd_cumsum_avg,
    })

# Build the full dataframe in one step (unique index, no repeated concat)
average_pdds_df = pd.DataFrame(
    pdd_rows,
    columns=['RGIId', 'clustName', 'average_PDDs_Sept', 'average_cumulative_PDDs_Sept'],
)

average_pdds_df

In [None]:
import seaborn as sns 
# Box plots of the September PDD averages, grouped by climate cluster:
# mean PDDs on the left panel, mean cumulative PDDs on the right panel
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
pdd_columns = ['average_PDDs_Sept', 'average_cumulative_PDDs_Sept']
for panel, pdd_column in zip(ax, pdd_columns):
    sns.boxplot(data=average_pdds_df, x='clustName', y=pdd_column, palette='Set3', ax=panel)
plt.show()

In [None]:
# Summary statistics of the September positive degree days held in `era_sept`.
# NOTE(review): `era_sept` is reassigned on every pass of the per-site loop, so here it
# holds only the last site processed -- confirm that a single-site summary is intended.
era_sept['positive_degree_days'].describe()