# Construct training data for model development

In [None]:
import os
import glob
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot
import sys
import rioxarray as rxr
import xarray as xr
import numpy as np
from tqdm.auto import tqdm

## Define paths in directory, import functions

In [None]:
# Root folder of the study data read by this notebook
scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'
# Root folder containing the 'functions' directory with the utility module
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/'

# Make the utility functions importable, then load them under the alias `f`
functions_path = os.path.join(base_path, 'functions')
sys.path.append(functions_path)
import model_analyze_utils as f

## Load RGI glacier boundaries, ERA time series, and snowline time series for all sites

In [None]:
# -----Load RGI glacier boundaries (AOIs)
aois_fn = 'all_aois.shp'
aois_path = os.path.join(scm_path, 'all_AOIs', aois_fn)
aois = gpd.read_file(aois_path)
# store the region identifiers as integers
region_columns = ['O1Region', 'O2Region']
aois[region_columns] = aois[region_columns].astype(int)
print('All AOIs loaded from file')

# -----Load ERA data
eras_fn = 'all_era_data.csv'
eras_path = os.path.join(scm_path, 'all_ERA_data', eras_fn)
eras = pd.read_csv(eras_path)
# parse the date strings; 'mixed' infers the format of each entry individually
eras['Date'] = pd.to_datetime(eras['Date'], format='mixed')
print('All ERA data loaded from file')

# -----Load all snowlines
snowlines_fn = 'all_snowlines.csv'
snowlines_path = os.path.join(scm_path, 'all_snowlines', snowlines_fn)
snowlines = pd.read_csv(snowlines_path)
# parse the observation timestamps, again allowing a per-entry format
snowlines['datetime'] = pd.to_datetime(snowlines['datetime'], format='mixed')
print('All snowlines loaded from file')


## Add Hypsometric Index and Subregion columns

In [None]:
# -----Define some functions
# Adjust DEM data variables
def adjust_data_vars(dem_xr):
    """
    Normalize a DEM dataset so that it holds a 2-D ``elevation`` variable.

    Parameters
    ----------
    dem_xr : xarray.Dataset
        DEM dataset. A ``band_data`` variable, if present, is renamed to
        ``elevation``. If a ``band`` dimension is present, only the first
        band of ``elevation`` is kept and stored on the ``('y', 'x')`` dims.

    Returns
    -------
    xarray.Dataset
        The adjusted dataset (the input is returned unchanged when neither
        condition applies).
    """
    has_band_data = 'band_data' in dem_xr.data_vars
    if has_band_data:
        dem_xr = dem_xr.rename({'band_data': 'elevation'})

    # nothing more to do when the data are already two-dimensional
    if 'band' not in dem_xr.dims:
        return dem_xr

    # grab the first band before the band dimension (and its variables) is dropped
    first_band = dem_xr.elevation.data[0]
    flattened = dem_xr.drop_dims('band')
    flattened['elevation'] = (('y', 'x'), first_band)
    return flattened

# Calculate Hypsometric Index (HI)
# Jiskoot et al. (2009): https://doi.org/10.3189/172756410790595796
def calculate_hypsometric_index(dem_fn, aoi):
    """
    Calculate the Hypsometric Index (HI) of a glacier from a DEM clipped to its outline.

    HI = (H_max - H_med) / (H_med - H_min); if 0 < HI < 1, HI = -1 / HI
    (Jiskoot et al., 2009: https://doi.org/10.3189/172756410790595796).

    Parameters
    ----------
    dem_fn : str
        Path to the DEM raster file, opened with rioxarray.
    aoi : geopandas.GeoDataFrame
        Glacier outline(s). Its ``crs`` is the target CRS for the DEM and its
        ``geometry`` is used to clip the DEM.

    Returns
    -------
    hi : float
        Hypsometric index. NaN when it cannot be computed (e.g. the clipped
        DEM has no valid pixels or no elevation range).
    hi_category : str or float
        One of 'Very top heavy', 'Top heavy', 'Equidimensional',
        'Bottom heavy', 'Very bottom heavy', or NaN when ``hi`` is NaN.
    """
    # load DEM as DataArray
    dem = rxr.open_rasterio(dem_fn)
    # reproject DEM to AOI CRS
    dem = dem.rio.reproject('EPSG:'+str(aoi.crs.to_epsg()))
    # clip DEM to AOI
    dem_aoi = dem.rio.clip(aoi.geometry, aoi.crs)
    # convert to dataset
    dem_aoi_ds = dem_aoi.to_dataset(name='elevation')
    # adjust DEM data variables
    dem_aoi_ds = adjust_data_vars(dem_aoi_ds)
    # set no data values to NaN
    dem_aoi_ds = xr.where((dem_aoi_ds > 1e38) | (dem_aoi_ds <= -9999), np.nan, dem_aoi_ds)

    # calculate elevation statistics (flatten once, reuse for all three)
    elevations = np.ravel(dem_aoi_ds.elevation.data)
    h_max = np.nanmax(elevations)
    h_min = np.nanmin(elevations)
    h_med = np.nanmedian(elevations)

    # calculate HI, where HI = (H_max - H_med) / (H_med - H_min). If 0 < HI < 1, HI = -1/HI.
    hi = (h_max - h_med) / (h_med - h_min)
    if 0 < hi < 1:
        hi = -1 / hi

    # determine HI category
    if np.isnan(hi):
        # Every comparison with NaN is False, so without this branch no
        # category would be assigned and the return below would fail.
        hi_category = np.nan
    elif hi <= -1.5:
        hi_category = 'Very top heavy'
    elif hi <= -1.2:
        hi_category = 'Top heavy'
    elif hi <= 1.2:
        hi_category = 'Equidimensional'
    elif hi <= 1.5:
        hi_category = 'Bottom heavy'
    else:
        hi_category = 'Very bottom heavy'

    return hi, hi_category


In [None]:
# -----Define columns to save in training data for each dataset
aoi_columns = ['O1Region', 'O2Region', 'Subregion', 'Area', 'Zmed', 'Slope', 'Aspect', 'Lmax', 'TermType', 'Surging']
# NOTE(review): era_columns is defined but never applied below, so every ERA column is carried
# through the merge. Confirm whether the ERA data should be subset to these columns.
era_columns = ['Date', 'Cumulative_Precipitation_mwe', 'Cumulative_Snowfall_mwe', 
               'Cumulative_Snowmelt_mwe', 'Positive_Degree_Days', 'Cumulative_Positive_Degree_Days']
snowlines_columns = ['Date', 'site_name', 'snowline_elevs_median_m', 'SCA_m2', 'AAR', 'ELA_from_AAR_m']

# -----Collect one training data frame per site; they are concatenated once after the loop
training_data_sites = []

# -----Iterate over site names
for site_name in tqdm(aois['RGIId'].drop_duplicates().values):
    
    # subset AOIs (copy so the Subregion column added below is written to an
    # independent frame rather than to a slice of `aois`)
    aoi = aois.loc[aois['RGIId']==site_name].copy()

    # subset snowlines (copy for the same reason: a Date column is added below)
    snowlines_site = snowlines.loc[snowlines['site_name']==site_name].copy()
    # add date column: the observation datetime truncated to the day, used as the merge key
    snowlines_site['Date'] = snowlines_site['datetime'].values.astype('datetime64[D]')
    # subset columns
    snowlines_site = snowlines_site[snowlines_columns]
    
    # subset ERA data
    eras_site = eras.loc[eras['site_name']==site_name]

    # Merge snowlines and ERA time series. Both frames carry a 'site_name' column,
    # so the merge suffixes them as 'site_name_x' / 'site_name_y'.
    training_data_site = snowlines_site.merge(eras_site, how='left', on='Date')
    # Identify subregion name
    o1, o2 = aoi[['O1Region', 'O2Region']].values[0].astype(int)

    # Add AOI columns to merged snowlines and ERA dataframe
    subregion, color = f.determine_subregion_name_color(o1, o2)
    aoi['Subregion'] = subregion
    for aoi_column in aoi_columns:
        training_data_site[aoi_column] = aoi[aoi_column].values[0]
    
    # determine DEM file name; sites without a DEM are skipped entirely
    dem_fns = glob.glob(os.path.join(scm_path, 'study-sites', site_name, 'DEMs', site_name + '*.tif'))
    if len(dem_fns) < 1:
        continue
    # for ArcticDEM and USGS DEMs, use the '_geoid.tif' version of the file
    if ('ArcticDEM' in dem_fns[0]) or ('USGS' in dem_fns[0]):
        dem_fn = [x for x in dem_fns if '_geoid.tif' in x][0]
    else:
        dem_fn = dem_fns[0]

    # calculate hypsometric index using DEM and AOI
    hi, hi_category = calculate_hypsometric_index(dem_fn, aoi)
    
    # add to training data table
    training_data_site['Hypsometric_Index'] = hi
    training_data_site['Hypsometric_Index_Category'] = hi_category

    # Store site training data for concatenation after the loop
    training_data_sites.append(training_data_site)

# -----Concatenate all sites into the full training data frame
# (pd.concat raises on an empty list, so fall back to an empty frame)
if training_data_sites:
    training_data = pd.concat(training_data_sites)
else:
    training_data = pd.DataFrame()

# rename returns a new frame, so the result must be assigned back
training_data = training_data.rename(columns={'site_name_x': 'site_name'})
training_data