# Model snow depth using terrain parameters

In [None]:
import os
import glob
import sys
import xdem
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import joblib
import json

## Define input and output files

In [None]:
# Root directories: local checkout of the processing code, and the per-site data archive
code_dir = '/Users/raineyaberle/Research/PhD/SnowDEMs/skysat_stereo_snow/skysat_stereo_snow'
data_dir = '/Volumes/LaCie/raineyaberle/Research/PhD/SkySat-Stereo/study-sites'

# Inputs
site_name = "MCS"
date = "20240420"
# Snow depth map (the MCS file name carries an extra "coregAll_ba-u5m_" prefix)
post_process_dir = os.path.join(data_dir, site_name, date, 'post_process')
if site_name == 'MCS':
    sd_fn = os.path.join(post_process_dir, f"coregAll_ba-u5m_{site_name}_{date}_DEM_GCPshift_snow_depth.tif")
else:
    sd_fn = os.path.join(post_process_dir, f"{site_name}_{date}_DEM_GCPshift_snow_depth.tif")
# sd_fn = os.path.join(data_dir, site_name, 'SNEX_QSI_SD', 'SNEX20_QSI_SD_0.5M_USIDBS_20210315_20210315.tif')
# Reference DEM (JacksonPeak uses a differently named file)
refdem_dir = os.path.join(data_dir, site_name, 'refdem')
if site_name == "JacksonPeak":
    refdem_fn = os.path.join(refdem_dir, 'USGS_LPC_ID_FEMAHQ_2018_D18_merged_filtered_UTM11_filled.tif')
else:
    refdem_fn = os.path.join(refdem_dir, f"{site_name}_REFDEM_WGS84.tif")

# Determine whether to scale input features
scale_inputs = False

# Outputs
out_dir = os.path.join(data_dir, site_name, date, 'snow_depth_modeling')
# out_dir = os.path.join(os.path.dirname(sd_fn), 'snow_depth_modeling')
training_data_fn = os.path.join(out_dir, 'training_data.csv')
error_fn = os.path.join(out_dir, 'model_error.csv')
model_fn = os.path.join(out_dir, 'trained_model.joblib')

# Check that inputs exist (only warns; later cells will fail if a file is really missing)
for fn, name in [(sd_fn, 'Snow depth map'), (refdem_fn, 'Reference DEM')]:
    if not os.path.exists(fn):
        print(f"{name} not found, please correct before continuing.")
# Create output directory, including any missing parents; harmless if it already exists
os.makedirs(out_dir, exist_ok=True)

# Import processing functions
# Only extend sys.path once, so re-running this cell does not pile up duplicate entries
if code_dir not in sys.path:
    sys.path.append(code_dir)
import post_process_utils as pprocess

## Construct model training data

In [None]:
# Build the training table (terrain features + snow depth) once, then reuse the saved CSV
if not os.path.exists(training_data_fn):
    ### Load input files
    # Snow depth, resampled with res=2; its grid is the reference for all other rasters
    sd = xdem.DEM(sd_fn).reproject(res=2)
    # Terrain parameters: files named after the reference DEM with a per-variable suffix.
    # splitext only strips the final extension, unlike str.replace('.tif', ...), which
    # would also rewrite any other '.tif' occurring earlier in the path.
    refdem_base = os.path.splitext(refdem_fn)[0]
    elev = xdem.DEM(refdem_base + '_ELEVATION.tif').reproject(sd)
    slope = xdem.DEM(refdem_base + '_SLOPE.tif').reproject(sd)
    aspect = xdem.DEM(refdem_base + '_ASPECT.tif').reproject(sd)
    tpi = xdem.DEM(refdem_base + '_TPI.tif').reproject(sd)
    # The Sx file name is only partly known, so it is located by pattern.
    # Sort for a deterministic pick and fail with a clear message if nothing matches.
    sx_pattern = refdem_base + '*Sx*.tif'
    sx_fns = sorted(glob.glob(sx_pattern))
    if not sx_fns:
        raise FileNotFoundError(f"No Sx raster found matching pattern: {sx_pattern}")
    sx_fn = sx_fns[0]
    sx = xdem.DEM(sx_fn).reproject(sd)
    # Compile into pandas.DataFrame (one row per pixel, one column per variable)
    training_data = pd.DataFrame({'elevation': elev.data.ravel(),
                                  'slope': slope.data.ravel(),
                                  'aspect': aspect.data.ravel(),
                                  'topographic_position_index': tpi.data.ravel(),
                                  'Sx': sx.data.ravel(),
                                  'snow_depth': sd.data.ravel()})
    # Keep only pixels where every variable has a value
    training_data.dropna(inplace=True)
    training_data.reset_index(drop=True, inplace=True)

    # Reduce precision (don't need 10 digits of elevation, e.g.)
    training_data = training_data.round(2)

    # Save to file
    training_data.to_csv(training_data_fn, index=False)
    print('Training data saved to file:', training_data_fn)

    # Plot pairplot
    # fig_fn = os.path.join(out_dir, 'training_data_pairplot.png')
    # fig = sns.pairplot(training_data, corner=True, kind='hist', diag_kind='kde')
    # fig.savefig(fig_fn, dpi=250, bbox_inches='tight')
    # print('Pairplot saved to file:', fig_fn)
    # plt.show()

else:

    training_data = pd.read_csv(training_data_fn)
    print('Training data loaded from file.')

training_data

## Prepare the input features and targets

In [None]:
# Split the training table into model inputs (X) and the target (y)
feature_cols = ['elevation', 'slope', 'aspect', 'topographic_position_index', 'Sx']
target_cols = ['snow_depth']
X = training_data[feature_cols]
y = training_data[target_cols]

if scale_inputs:
    # Fit a standard scaler to the feature columns (or reuse one saved by an earlier run)
    scaler_fn = os.path.join(out_dir, 'feature_scaler.joblib')
    if not os.path.exists(scaler_fn):
        scaler = StandardScaler().fit(X)
        joblib.dump(scaler, scaler_fn)
        print('Feature scaler saved to file:', scaler_fn)
    else:
        scaler = joblib.load(scaler_fn)
        print('Feature scaler loaded from file.')

    # Transform the feature columns.
    # transform() returns a plain array, so re-wrap it as a DataFrame: the training
    # cell subsamples X_scaled with .iloc, and this also keeps the feature names.
    X_scaled = pd.DataFrame(scaler.transform(X), columns=feature_cols, index=X.index)


## Train, tune, and test the Random Forest model

In [None]:
model_fn = os.path.join(out_dir, 'trained_model.joblib')

### Subsample the training data: keep every nsamp-th row (~20,000 rows in total)
# This is a regular stride through the table, not a random draw.
# It is done outside the branch below so that X_sub / y_sub also exist when the model
# is loaded from file; the permutation-importance step needs them either way.
# max(1, ...) avoids a zero slice step when there are fewer than 20,000 rows.
nsamp = max(1, int(len(X) / 20e3))
if scale_inputs:
    # Wrap defensively: the scaler's transform() output may be a plain array without .iloc
    X_full = pd.DataFrame(X_scaled, columns=feature_cols)
else:
    X_full = X
X_sub = X_full.iloc[::nsamp, :].reset_index(drop=True)
y_sub = y.iloc[::nsamp, :].reset_index(drop=True)

if not os.path.exists(model_fn):

    ### Set up the hyperparameter grid
    # Number of trees in random forest
    n_estimators = [10] + [int(x) for x in np.linspace(100, 1000, num = 10)]
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4, 10]
    # Create the random grid
    param_grid = {'n_estimators': n_estimators,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf}
    print('Hyperparameter Grid:')
    print(json.dumps(param_grid, sort_keys=True, indent=4))

    ### Search for best hyperparameters
    # NOTE(review): the forest itself has no random_state, so repeated runs can pick
    # different models even though the parameter sampling is seeded.
    rf = RandomForestRegressor()
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    # Pass the single target column as a 1-D array, which is the shape fit() expects
    rf_random.fit(X_sub, y_sub.values.ravel())
    print('Best parameters:')
    print(rf_random.best_params_)

    ### Evaluate the best model
    # NOTE: this RMSE is computed on the same subsample the model was fit on, so it is
    # an in-sample (optimistic) figure; the cross-validated score is printed as well.
    cv_model = rf_random.best_estimator_
    y_pred = cv_model.predict(X_sub)
    cv_rmse = np.sqrt(mean_squared_error(y_sub, y_pred))
    print(f"Best model RMSE = {np.round(cv_rmse,2)} m")
    print(f"Best model mean cross-validated score = {np.round(rf_random.best_score_, 2)}")

    ### Save the best model
    best_model = cv_model
    joblib.dump(best_model, model_fn)
    print('Best model saved to file:', model_fn)

else:
    best_model = joblib.load(model_fn)
    print('Best model loaded from file.')

best_model


## Estimate feature importances in model

In [None]:
# Compute (or reload) two feature-importance measures for the trained model
feature_importances_fn = os.path.join(out_dir, 'feature_importances.csv')
if not os.path.exists(feature_importances_fn):
    # Feature importances via mean decrease in impurity (MDI), with the spread across trees
    mdi_std = np.std([tree.feature_importances_ for tree in best_model.estimators_], axis=0)
    mdi_importances = pd.Series(best_model.feature_importances_, index=feature_cols)

    # Feature importances via permutation, evaluated on the training subsample
    result = permutation_importance(best_model, X_sub, y_sub, n_repeats=10, random_state=42, n_jobs=2)
    perm_importances = pd.Series(result.importances_mean, index=feature_cols)

    # Save in dataframe (one row per feature)
    feature_importances = pd.DataFrame({'MDI': mdi_importances,
                                        'MDI_std': mdi_std,
                                        'permutation': perm_importances,
                                        'permutation_std': result.importances_std},
                                       index=feature_cols)
    feature_importances.to_csv(feature_importances_fn, index=True)
    print('Feature importances saved to file:', feature_importances_fn)

    # Plot
    fig, ax = plt.subplots(1, 2, figsize=(10,5))
    mdi_importances.plot.bar(yerr=mdi_std, ax=ax[0])
    ax[0].set_title("Feature importances via MDI")
    ax[0].set_ylabel("Mean decrease in impurity")
    perm_importances.plot.bar(yerr=result.importances_std, ax=ax[1])
    ax[1].set_title("Feature importances via permutation")
    # With no explicit scoring, the regressor's default score (R^2) is what gets permuted
    ax[1].set_ylabel("Mean decrease in $R^2$")
    fig.tight_layout()

    # Save figure before showing it, so the file is written from the finished figure
    fig_fn = os.path.join(out_dir, 'feature_importances.png')
    fig.savefig(fig_fn, dpi=250, bbox_inches='tight')
    print('Figure saved to file:', fig_fn)
    plt.show()

else:
    # index_col=0 restores the feature names as the index, matching the freshly computed table
    feature_importances = pd.read_csv(feature_importances_fn, index_col=0)
    print('Feature importances loaded from file.')

feature_importances

## Use best model to predict snow depth at site

In [None]:
# Check if modeled snow depth already exists
sd_pred_fn = os.path.join(out_dir, f'modeled_snow_depth_{site_name}_{date}.tif')
if not os.path.exists(sd_pred_fn):

    # Load terrain parameters (elevation resampled with res=2 defines the prediction grid)
    print('Loading input files')
    elev = xdem.DEM(refdem_fn.replace('.tif', '_ELEVATION.tif')).reproject(res=2)
    slope = xdem.DEM(refdem_fn.replace('.tif', '_SLOPE.tif')).reproject(elev)
    aspect = xdem.DEM(refdem_fn.replace('.tif', '_ASPECT.tif')).reproject(elev)
    tpi = xdem.DEM(refdem_fn.replace('.tif', '_TPI.tif')).reproject(elev)
    sx_fn = glob.glob(refdem_fn.replace('.tif', '*Sx*.tif'))[0]
    sx = xdem.DEM(sx_fn).reproject(elev)

    # Load SkySat snow depth on the same grid (used for georeferencing and comparison)
    sd = xdem.DEM(sd_fn).reproject(elev)

    # Identify real value indices (for reshaping results later)
    print('Constructing features')
    layers = {'elevation': elev,
              'slope': slope,
              'aspect': aspect,
              'topographic_position_index': tpi,
              'Sx': sx}
    # Convert each layer to a plain float array with NaN at masked (nodata) pixels.
    # Testing the masked array directly would look at the values stored under the
    # mask, so nodata pixels could slip through as "valid".
    filled = {name: np.ma.filled(raster.data.astype(float), np.nan)
              for name, raster in layers.items()}
    # A pixel is usable only if every terrain layer is finite there
    ireal = np.full(sd.shape, True)
    for values in filled.values():
        ireal = ireal & np.isfinite(values)
    # create df of raster values (one row per usable pixel)
    df = pd.DataFrame({name: values[ireal].ravel() for name, values in filled.items()})
    df = df.round(2) # round values

    # Scale the input features (re-wrapped as a DataFrame to keep the feature names)
    if scale_inputs:
        df = pd.DataFrame(scaler.transform(df), columns=df.columns)

    # Predict snow depth
    print('Modeling snow depth')
    with joblib.parallel_backend('threading', n_jobs=12):
        y_pred = best_model.predict(df)

    # Reshape into the raster shape (pixels without valid inputs stay NaN)
    snow_depth_pred = np.full(np.shape(ireal), np.nan)
    snow_depth_pred[ireal] = y_pred

    # Reformat as xdem.DEM
    sd_pred = xdem.DEM.from_array(snow_depth_pred, transform=sd.transform, crs=sd.crs, nodata=np.nan)

    # Save to file
    sd_pred.save(sd_pred_fn)
    print('Modeled snow depth saved to file:', sd_pred_fn)

    # Compare to observations
    fig, ax = plt.subplots(2, 2, figsize=(8,8))
    ax = ax.flatten()
    sd_pred.plot(cmap='Blues', vmin=0, vmax=5, ax=ax[0], add_cbar=False)
    ax[0].set_title('Modeled')
    sd.plot(cmap='Blues', vmin=0, vmax=5, ax=ax[1], cbar_title='Snow depth [m]')
    ax[1].set_title('Observed')
    diff = sd_pred - sd
    diff.plot(cmap='coolwarm_r', vmin=-5, vmax=5, ax=ax[2], cbar_title='Difference [m]')
    ax[2].set_title('Modeled - Observed')
    for axis in ax[0:3]:
        axis.set_xticks([])
        axis.set_yticks([])
    # Histogram only the unmasked, finite differences
    diff_values = np.ma.filled(diff.data.astype(float), np.nan).ravel()
    diff_values = diff_values[np.isfinite(diff_values)]
    ax[3].hist(diff_values, bins=np.arange(-5, 5, step=0.2), facecolor='skyblue', edgecolor='k', linewidth=0.5)
    ax[3].set_xlabel('Modeled - Observed [m]')
    ax[3].set_ylabel('Frequency')
    ax[3].set_yticks([])
    ax[3].axvline(0, color='k', linewidth=1)

    fig.tight_layout()

    # Save figure to file before showing it, so the file is written from the finished figure
    fig_fn = os.path.splitext(sd_pred_fn)[0] + '.png'
    fig.savefig(fig_fn, dpi=250, bbox_inches='tight')
    print('Figure saved to file:', fig_fn)
    plt.show()

else:
    print('Modeled snow depth already exists, skipping.')
