# Test machine learning models for predicting median snowline elevations using terrain parameters

In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
import os

In [2]:
# If using Google Colab, mount Google Drive so you can access the files in this folder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Uncomment the line below to locate the snow-cover-mapping-application folder in your Drive using os.listdir()
# os.listdir('drive/MyDrive/Research/PhD/snow_cover_mapping/snow_cover_mapping_application/snow-cover-mapping-application/')

In [4]:
# Load data

# ALEXANDRA'S PATH TO DATA
# path_to_folder = ('drive/MyDrive/snow_cover_mapping_application/snow-cover-mapping-application/') 

# RAINEY'S PATH TO DATA
path_to_folder = 'drive/MyDrive/Research/PhD/snow_cover_mapping/snow_cover_mapping_application/snow-cover-mapping-application/' 

fn = 'training_data.csv' # file name
df = pd.read_csv(path_to_folder + fn)
df.drop('Unnamed: 0', axis=1, inplace=True) # there's a weird "Unnamed" column I need to get rid of
df

Unnamed: 0,study_site,datetime,snowlines_elevs_median_m,SCA_m2,AAR,O1Region,O2Region,Area,Zmin,Zmax,Zmed,Slope,Aspect
0,Wolverine,2013-09-15 15:10:00,1217.377579,9281700.0,0.602324,1,4,16.749,426,1636,1267,11.0,188
1,Wolverine,2015-08-04 15:07:39,1130.237648,11235600.0,0.728695,1,4,16.749,426,1636,1267,11.0,188
2,Wolverine,2016-08-31 15:02:02,1128.165947,11121300.0,0.72145,1,4,16.749,426,1636,1267,11.0,188
3,Wolverine,2017-09-29 15:15:10,1234.305453,7414300.0,0.969114,1,4,16.749,426,1636,1267,11.0,188
4,Wolverine,2018-09-13 15:07:39,1259.427673,8747100.0,0.592622,1,4,16.749,426,1636,1267,11.0,188
5,Wolverine,2019-08-25 15:18:35,1262.483019,2764600.0,0.366945,1,4,16.749,426,1636,1267,11.0,188
6,Wolverine,2020-08-12 15:28:33,1254.780865,9115800.0,0.632019,1,4,16.749,426,1636,1267,11.0,188
7,Wolverine,2021-09-11 15:28:44,1267.770837,9125400.0,0.592582,1,4,16.749,426,1636,1267,11.0,188
8,Wolverine,2022-08-02 15:28:48,1173.325647,2418300.0,0.364344,1,4,16.749,426,1636,1267,11.0,188
9,Gulkana,2015-09-21 15:07:13,1971.07959,15261300.0,0.984441,1,2,17.567,1162,2438,1858,14.0,172


## Split training data into X (predictive features) and y (output labels)

In [5]:
# Define which columns to use as predictive "features" and which to use as outcome "labels"
# Feel free to adjust and play around with this
training_columns = ['Area','Zmin', 'Zmax', 'Zmed', 'Slope', 'Aspect']
labels = 'snowlines_elevs_median_m'

# Separate features and labels
X = df[training_columns]
y = df[labels]

## Define supervised machine learning models to test


Feel free to add more! See the [SciKitLearn Classifier comparison page](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html) for more models, etc.

In [6]:
# Define supervised machine learning models to test
# Adjust hyperparameters to see if it improves the model errors
# (see documentation for each model)

# Classifier names
names = [
    "Linear Regression",
    "Random Forest Regression",
    "Decision Tree Regression",
    "Support Vector Regression",
    "Gradient Boosting Regression",
    "Ridge Regression"
]

# -----Classifiers
classifiers = [
    LinearRegression(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    SVR(),
    GradientBoostingRegressor(),
    Ridge()
]


In [7]:
# Initialize performance metrics
abs_err = np.zeros(len(names)) # absolute error [m]

# Iterate over classifiers
for i, name, clf in zip(np.arange(0, len(names)), names, classifiers):

  print(name)

  # Conduct K-Fold cross-validation
  num_folds = 10
  kfold = KFold(n_splits=num_folds, shuffle=True, random_state=1)
  abs_err_folds = np.zeros(num_folds) # absolute error for all folds
  j=0 # fold counter
  # loop through fold indices
  for train_ix, test_ix in kfold.split(X):
    
    # split data into training and testing using kfold indices
    X_train, X_test = X.loc[train_ix], X.loc[test_ix]
    y_train, y_test = y[train_ix], y[test_ix]

    # fit model to X_train and y_train
    clf.fit(X_train, y_train)
        
    # predict outputs for X_test values
    y_pred = clf.predict(X_test)
    
    # calculate performance metrics
    abs_err_folds[j] = np.nanmean(np.abs(y_test - y_pred))
  
    # PLOT DECISION BOUNDARIES IF YOU WANT
  
  # take average performance metrics for all folds
  abs_err[i] = np.nanmean(abs_err_folds)

  # display performance results
  print('    Mean absolute error = '+str(np.round(abs_err[i]))+' m')
  
  print(' ')

# Select best model using performance metrics


Linear Regression
    Mean absolute error = 3.0 m
 
Random Forest Regression
    Mean absolute error = 3.0 m
 
Decision Tree Regression
    Mean absolute error = 3.0 m
 
Support Vector Regression
    Mean absolute error = 47.0 m
 
Gradient Boosting Regression
    Mean absolute error = 3.0 m
 
Ridge Regression
    Mean absolute error = 3.0 m
 
