# Construct training data

In [1]:
import os, glob
import xarray as xr
import rioxarray as rxr
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
import xrspatial
import datetime

  _pyproj_global_context_initialize()
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## Define paths to inputs

In [None]:
# ----- Inputs -----
# Reference DEM raster
refdem_fn = '/Volumes/LaCie/raineyaberle/Research/PhD/Skysat-Stereo/study-sites/MCS/refdem/MCS_REFDEM_WGS84.tif'
# Snow depth raster
sd_fn = '/Volumes/LaCie/raineyaberle/Research/PhD/Skysat-Stereo/study-sites/MCS/lidar/20240315_MCS-snowdepth_RF_5m.tif'
# SNOTEL time series (CSV) and the matching site information table (CSV)
snotel_fn = '/Volumes/LaCie/raineyaberle/Research/PhD/Skysat-Stereo/study-sites/MCS/snotel/MCS_2020-01-01_2024-06-07_adj.csv'
snotel_info_fn = '/Volumes/LaCie/raineyaberle/Research/PhD/Skysat-Stereo/study-sites/MCS/snotel/MCS_SNOTEL_site_info.csv'

# ----- Outputs -----
# Site name and date tag, both embedded in the output file name
site_name = 'MCS'
date = '2024-03-15'
# Folder that receives the training data, and the full path of the CSV written there
out_dir = '/Volumes/LaCie/raineyaberle/Research/PhD/SnowMaL/study-sites/MCS/'
out_fn = os.path.join(out_dir, f'{site_name}_{date}_training_data.csv')

## Load inputs and plot

In [None]:
# -----Snow depth map
# Open the raster, squeeze out length-1 dimensions, and give the data variable a descriptive name
sd = xr.open_dataset(sd_fn).squeeze().rename({'band_data': 'snow_depth_m'})

# -----Reference DEM
refdem = xr.open_dataset(refdem_fn).squeeze()
# Record the EPSG code up front so it can be written back onto the final dataset below
refdem_crs = refdem.rio.crs.to_epsg()
# Values at or below -1e38 are treated as no data and replaced with NaN
refdem = xr.where(refdem <= -1e38, np.nan, refdem)
# Resample the DEM onto the snow depth map's x/y coordinates and name the band
refdem = refdem.interp(x=sd.x.data, y=sd.y.data).rename({'band_data': 'elevation'})
# Terrain derivatives computed from the resampled elevation
refdem['slope'] = xrspatial.slope(refdem.elevation)
refdem['aspect'] = xrspatial.aspect(refdem.elevation)
# Store snow depth alongside the terrain variables in the same dataset
refdem['snow_depth_m'] = sd.snow_depth_m
# Re-attach the CRS recorded above
refdem = refdem.rio.write_crs(f'EPSG:{refdem_crs}')

# -----SNOTEL
# Load the SNOTEL time series; this cell reads its 'datetime' and 'TAVG_C' columns
snotel = pd.read_csv(snotel_fn)
snotel['datetime'] = pd.to_datetime(snotel['datetime'])
# Strip any time zone info (keeping the wall-clock time) so the column compares with the naive target date
snotel['datetime'] = [x.replace(tzinfo=None) for x in snotel['datetime']]
# Calculate PDDs using average daily temperature: values that are not > 0 (including NaN) count as 0
snotel['PDD'] = snotel['TAVG_C'].where(snotel['TAVG_C'] > 0, 0)
# Calculate cumulative sum of PDDs by water year (October-December belong to the next calendar year's water year)
snotel['water_year'] = snotel['datetime'].dt.year + (snotel['datetime'].dt.month >= 10).astype(int)
snotel['PDD_cumsum'] = snotel.groupby('water_year')['PDD'].cumsum()
# Grab SNOTEL values on the snow depth date.
# `date` is the 'YYYY-MM-DD' string set in the paths cell; no variable named `sd_date` exists in this notebook.
sd_dt = datetime.datetime.strptime(date, '%Y-%m-%d')
# .copy() gives an independent frame, so the column assignments below do not trigger SettingWithCopyWarning
snotel_date = snotel.loc[snotel['datetime'] == sd_dt].copy()
if snotel_date.empty:
    # Fail here with a clear message instead of an IndexError when the values are read later
    raise ValueError(f'No SNOTEL record found for {date} in {snotel_fn}')
# Grab site coordinates: geometries are stored as WKT, read as EPSG:4326, then reprojected to the snow depth map CRS
snotel_info = pd.read_csv(snotel_info_fn)
snotel_info['geometry'] = snotel_info['geometry'].apply(wkt.loads)
snotel_info = gpd.GeoDataFrame(snotel_info, geometry='geometry', crs='EPSG:4326')
snotel_info = snotel_info.to_crs(f'EPSG:{sd.rio.crs.to_epsg()}')
# Sample terrain info at the grid cell nearest to the first site's first coordinate pair
site_xy = snotel_info.geometry.iloc[0].coords.xy
refdem_snotel = refdem.sel(x=site_xy[0][0], y=site_xy[1][0], method='nearest')
for terrain_var in ['elevation', 'slope', 'aspect']:
    snotel_date[terrain_var] = float(refdem_snotel[terrain_var].data)


# 2x2 grid of maps: one panel per variable in the dataset
fig, ax = plt.subplots(2, 2, figsize=(10,12))
ax = ax.flatten()
columns = ['snow_depth_m', 'elevation', 'slope', 'aspect']
labels = ['Snow depth [m]', 'Elevation [m]', 'Slope [degrees]', 'Aspect [degrees]']
cmaps = ['Blues', 'terrain', 'Greens', 'twilight']

# Values shared by every panel, divided by 1e3 to match the km axis labels
x_coords, y_coords = refdem.x.data, refdem.y.data
map_extent = (np.min(x_coords)/1e3, np.max(x_coords)/1e3,
              np.min(y_coords)/1e3, np.max(y_coords)/1e3)
site_coords = snotel_info['geometry'].values[0].coords.xy
site_x_km = site_coords[0][0] / 1e3
site_y_km = site_coords[1][0] / 1e3

for panel, column, label, cmap in zip(ax, columns, labels, cmaps):
    im = panel.imshow(refdem[column].data, cmap=cmap, extent=map_extent)
    fig.colorbar(im, ax=panel, label=label, orientation='horizontal', shrink=0.8)
    # Mark the SNOTEL site with a magenta star
    panel.plot(site_x_km, site_y_km, '*m')

# x labels on the bottom row (ax[2], ax[3]); y labels on the left column (ax[0], ax[2])
for axis in ax[2:]:
    axis.set_xlabel('Easting [km]')
for axis in ax[0::2]:
    axis.set_ylabel('Northing [km]')

fig.tight_layout()
plt.show()


## Construct the training data

In [None]:
# Reference DEM and snow depth data
# Flatten each variable of the dataset into its own column. Each column must be read from
# refdem[column]; reading refdem.elevation every time would fill all four columns with elevation.
raster_columns = ['elevation', 'slope', 'aspect', 'snow_depth_m']
training_data_df = pd.DataFrame({column: np.ravel(refdem[column].data) for column in raster_columns})

# SNOTEL data: the first matching record's values, broadcast as constants to every row
snotel_columns = {
    'SNOTEL_snow_depth': 'SNWD_m',
    'SNOTEL_SWE': 'SWE_m',
    'SNOTEL_pdd_cumsum': 'PDD_cumsum',
    'SNOTEL_elevation': 'elevation',
    'SNOTEL_slope': 'slope',
    'SNOTEL_aspect': 'aspect',
}
for out_column, snotel_column in snotel_columns.items():
    training_data_df[out_column] = snotel_date[snotel_column].values[0]

# Drop rows with a NaN in any column, then renumber the rows
training_data_df = training_data_df.dropna().reset_index(drop=True)

# Convert terrain values to ints to save on memory (astype(int) truncates toward zero).
# This is safe only after the dropna above, because NaN cannot be cast to int.
for col in training_data_df.columns:
    if ('elevation' in col) or ('slope' in col) or ('aspect' in col):
        training_data_df[col] = training_data_df[col].astype(int)

# Save to file; makedirs also creates missing parent folders and is a no-op if the folder exists
os.makedirs(out_dir, exist_ok=True)
training_data_df.to_csv(out_fn, index=False)
print('Training data saved to file:', out_fn)

training_data_df

## Plot pairplot of features