# Analyze controls on snowlines/ELAs using machine learning and ERA-derived PDDs

In [1]:
# Import packages
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
import os

### Determine settings

In [None]:
# Root folder that holds one sub-directory per study site
study_sites_path = '/Users/raineyaberle/Google Drive/My Drive/Research/CryoGARS-Glaciology/Advising/student-research/Alexandra-Friel/snow_cover_mapping_application/study-sites/'

# Study sites whose data are compiled into the training dataset
# (each name must match a sub-directory of study_sites_path)
site_names = [
    'Wolverine',
    'Gulkana',
    'LemonCreek',
    'SouthCascade',
    'Sperry',
]

### Set up training data: snowline/AAR time series and ERA-derived PDDs

In [None]:
# -----Check if training data already exist in directory
# The compiled table is cached as a CSV in the current working directory, so the
# per-site snowline, AOI, and ERA files only have to be read on the first run.
training_data_fn = 'full_snowline_timeseries_training_data.csv'
if os.path.isfile(training_data_fn):
    print('Training dataset already exists in directory, loading...')
    training_full_df = pd.read_csv(training_data_fn)

else:

    print('Constructing training dataset...')

    # Terrain attributes copied from each site's AOI shapefile onto every snowline row
    AOI_columns = ['Area', 'Zmin', 'Zmax', 'Zmed', 'Slope', 'Aspect']
    # Columns kept in the final training table
    columns = ['snowline_elevs_median_m', 'AAR', 'Cumulative_Positive_Degree_Days'] + AOI_columns

    # -----Iterate over site names, collecting one dataframe per site
    # (frames are concatenated once at the end rather than growing a dataframe in the loop)
    site_dfs = []
    for site_name in site_names:

        print(site_name)

        # Load snowlines; sort the file names so the row order is reproducible between runs
        snowline_fns = sorted(glob.glob(study_sites_path + site_name + '/imagery/snowlines/*.csv'))
        snowlines = []
        for snowline_fn in snowline_fns:
            try:
                snowlines.append(pd.read_csv(snowline_fn))
            except Exception as err:
                # best effort: skip files that cannot be read, but report them
                print('    Skipping unreadable snowline file ' + snowline_fn + ': ' + str(err))
        if not snowlines:
            print('    No readable snowline files found, skipping site.')
            continue
        snowlines_df = pd.concat(snowlines).reset_index(drop=True)
        snowlines_df['datetime'] = pd.to_datetime(snowlines_df['datetime'], format='mixed')
        # truncate timestamps to whole days so they can be matched to the ERA 'Date' column
        snowlines_df['Date'] = snowlines_df['datetime'].values.astype('datetime64[D]')

        # Load AOI (first shapefile in AOIs/ with "RGI" in its name)
        AOI_fn = glob.glob(study_sites_path + site_name + '/AOIs/*RGI*shp')[0]
        AOI = gpd.read_file(AOI_fn)
        # add terrain parameters to training df (values of the first AOI feature)
        for column in AOI_columns:
            snowlines_df[column] = AOI[column].values[0]

        # Load ERA data (first CSV in ERA/)
        era_fn = glob.glob(study_sites_path + site_name + '/ERA/*.csv')[0]
        era = pd.read_csv(era_fn)
        era.reset_index(drop=True, inplace=True)
        era['Date'] = pd.to_datetime(era['Date'])
        # merge era and snowline dates: inner join on every column name the two
        # tables share (presumably only 'Date' -- TODO confirm against the ERA files)
        snowlines_df = pd.merge(snowlines_df, era)

        # Keep only the columns needed for training
        site_dfs.append(snowlines_df[columns])

    if site_dfs:
        # -----Compile all sites into a single dataframe and save to file
        training_full_df = pd.concat(site_dfs).reset_index(drop=True)
        training_full_df.to_csv(training_data_fn, index=False)
        print('Training data saved to file: ' + training_data_fn)
    else:
        # nothing was loaded: do not write an empty cache file that later runs would pick up
        training_full_df = pd.DataFrame(columns=columns)
        print('No training data could be constructed; nothing saved to file.')

training_full_df


## Split training dataset into X (_features_) and y (_labels_)

In [None]:
# Target column (what the models predict) and input columns (what they predict from)
labels = ['snowline_elevs_median_m']
feature_columns = ['Cumulative_Positive_Degree_Days', 'Area', 'Zmin', 'Zmax', 'Zmed', 'Slope', 'Aspect']

# Discard, in place, every row containing a NaN in any column, then renumber rows from 0
training_full_df.dropna(inplace=True)
training_full_df.reset_index(drop=True, inplace=True)

# Pull the feature and label columns out as separate dataframes
X = training_full_df.loc[:, feature_columns]
y = training_full_df.loc[:, labels]

## Define supervised machine learning models to test


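See the [scikit-learn classifier comparison page](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html) for an overview of other model families. Note that the linked page compares classifiers, whereas the models tested below are their regression counterparts.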

In [None]:
# Each regression model is paired with its display name so the two lists built
# below cannot drift out of step. All models use scikit-learn's default settings.
_named_models = [
    ("Linear Regression", LinearRegression()),
    ("Random Forest Regression", RandomForestRegressor()),
    ("Decision Tree Regression", DecisionTreeRegressor()),
    ("Support Vector Regression", SVR()),
    ("Gradient Boosting Regression", GradientBoostingRegressor()),
    ("Ridge Regression", Ridge()),
]

# Display names, in the same order as the estimators
names = [label for label, _ in _named_models]

# Estimator instances (regressors, despite the variable name)
classifiers = [model for _, model in _named_models]


## Train and test machine learning models using K-folds cross-validation

In [None]:
# -----Initialize performance metrics
abs_err = np.zeros(len(names)) # mean absolute error per model [m]

# -----Set up K-Fold cross-validation
# The splitter is seeded, so every model is evaluated on the same folds.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=1)

# scikit-learn expects a 1-D target array; y is a single-column dataframe
y_full = np.ravel(y.values)

# -----Iterate over classifiers
for i, (name, clf) in enumerate(zip(names, classifiers)):

    print(name)

    abs_err_folds = np.zeros(num_folds) # mean absolute error for each fold

    # loop through fold indices
    for j, (train_ix, test_ix) in enumerate(kfold.split(X)):
        # split data into training and testing using kfold indices
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y_full[train_ix], y_full[test_ix]

        # fit model to the training fold and predict the held-out fold
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # calculate performance metrics
        abs_err_folds[j] = np.nanmean(np.abs(y_test - y_pred))

    # take average performance metrics for all folds
    abs_err[i] = np.nanmean(abs_err_folds)

    # grab feature importances from random forest and plot
    if name=='Random Forest Regression':
        # train model with full dataset
        clf.fit(X, y_full)

        # importances of the forest, with the spread across its individual trees
        importances = clf.feature_importances_
        std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
        forest_importances = pd.Series(importances, index=feature_columns)

        # plot
        fig, ax = plt.subplots()
        forest_importances.plot.bar(yerr=std, ax=ax)
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
        ax.set_title("Feature importances using MDI")
        ax.set_ylabel("Mean decrease in impurity")
        fig.tight_layout()
        plt.show()

    # display performance results
    print('    Mean absolute error = ' + str(np.round(abs_err[i])) + ' m')
print(' ')

# -----Determine best model (lowest cross-validated mean absolute error)
ibest = int(np.nanargmin(abs_err))
best_clf = classifiers[ibest]
best_clf_name = names[ibest]
# After cross-validation each model is left fitted on its last training fold only,
# so refit the selected model on the full dataset before it is used any further.
best_clf.fit(X, y_full)
print('Most accurate classifier: ' + best_clf_name)
