# Analyze controls on AARs/ELAs using ERA-derived climate conditions and machine learning models

In [None]:
# Import packages
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import os
from tqdm.auto import tqdm
import sys
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

### Define paths in directory

In [None]:
# Root folder holding one sub-folder per study site (machine-specific absolute path; edit for your setup)
study_sites_path = '/Users/raineyaberle/Google Drive/My Drive/Research/CryoGARS-Glaciology/Advising/student-research/Alexandra-Friel/snow_cover_mapping_application/study-sites/'

# snow-cover-mapping-application/ is addressed relative to study-sites/ (one level up)
base_path = os.path.join(study_sites_path, '..', 'snow-cover-mapping-application')

# Output figures go to the figures/ folder inside base_path
figures_out_path = os.path.join(base_path, 'figures')

# Put the functions/ folder inside base_path on the module search path, then import the helper module
functions_path = os.path.join(base_path, 'functions')
sys.path.insert(1, functions_path)
import model_analyze_utils as f

### Construct and/or update training data

In [None]:
# Folder and file name of the full training data table
training_data_path = os.path.join(base_path, 'inputs-outputs')
training_data_fn = 'snowline_timeseries_full_training_data.csv'

# Construct and/or update the training data with the project helper
# NOTE(review): presumably reads/writes training_data_fn under training_data_path -- confirm in model_analyze_utils
training_data_df = f.construct_update_training_data(study_sites_path, training_data_path, training_data_fn)

# Drop every row containing a NaN, renumber the index from 0, then display the table
training_data_df = training_data_df.dropna().reset_index(drop=True)
training_data_df

## Subset the training data: sample conditions for median week of minimum AAR in each subregion

In [None]:
# Folder and file name of the training data subset
training_data_subset_path = os.path.join(base_path, 'inputs-outputs')
training_data_subset_fn = 'snowline_timeseries_subset_training_data.csv'

# Subset the full training data with the project helper
# NOTE(review): presumably reads/writes training_data_subset_fn under training_data_subset_path -- confirm in model_analyze_utils
training_data_subset_df = f.subset_training_data(training_data_df, training_data_subset_path, training_data_subset_fn)

# Drop every row containing a NaN, renumber the index from 0, then display the table
training_data_subset_df = training_data_subset_df.dropna().reset_index(drop=True)
training_data_subset_df

## Plot pairplot of training data

In [None]:
# plt.rcParams.update({'font.size':14, 'font.sans-serif':'Arial'})
# df = training_data_df.copy()
# # change column names for plotting
# df['$\Sigma$PDDs'] = df['Cumulative_Positive_Degree_Days']
# df['$\Sigma$Snowfall'] = df['Cumulative_Snowfall_mwe']
# df['Hyps. Index'] = df['Hypsometric_Index']
# feature_cols = ['AAR', '$\Sigma$PDDs', '$\Sigma$Snowfall', 
#                 'Area', 'Zmin', 'Zmax', 'Zmed', 'Slope', 'Aspect', 'Hyps. Index']
# fig = sns.pairplot(df, vars=feature_cols, corner=True, diag_kind='kde', hue='AAR')

# plt.show()

# # save figure
# if 'Area' in feature_cols:
#     fig_fn = os.path.join(figures_out_path, 'training_data_pairplot.png')
# else:
#     fig_fn = os.path.join(figures_out_path, 'training_data_climate_vars_pairplot.png')
# fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
# print('figure saved to file: ' + fig_fn)

## Define feature columns and labels

In [None]:
# Each entry pairs a training-data column name with the label used for it in plots, etc.
# Comment out / uncomment a single entry to drop / add that feature in both lists at once,
# so the column list and the display list cannot fall out of step.
_feature_specs = [
    # ('Cumulative_Positive_Degree_Days', r'$\Sigma$PDDs'),
    # ('Cumulative_Snowfall_mwe', r'$\Sigma$Snowfall [m.w.e.]'),
    # ('PA_Ratio', r'P / \sqrt(A)'),
    ('Hypsometric_Index', 'Hypsometric Index'),
    ('Area', 'Area'),
    # ('Zmin', 'Zmin'),
    # ('Zmax', 'Zmax'),
    ('Zmed', 'Z$_{med}$'),
    ('Slope', 'Slope'),
    ('Aspect', 'Aspect'),
]

# columns to use for prediction in training data
feature_columns = [column for column, _ in _feature_specs]
# how to display each feature column in plots, etc. (same order as feature_columns)
feature_columns_display = [display for _, display in _feature_specs]
# variable to predict
labels = ['AAR']


## Define supervised machine learning models to test


See the [scikit-learn classifier comparison page](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html) for an analogous comparison of classification models; the models tested below are regressors.

In [None]:
# Each entry pairs a model name (used for plotting, etc.) with an estimator
# instantiated with its default arguments. Keeping the pairs together guarantees
# that models[i] and model_names[i] always refer to the same estimator.
_model_specs = [
    ("Linear Regression", LinearRegression()),
    ("Random Forest Regression", RandomForestRegressor()),
    ("Decision Tree Regression", DecisionTreeRegressor()),
    ("Support Vector Regression", SVR()),
    ("Gradient Boosting Regression", GradientBoostingRegressor()),
    ("Ridge Regression", Ridge()),
]

# Models
models = [model for _, model in _model_specs]

# Model names (used for plotting, etc.)
model_names = [name for name, _ in _model_specs]


## Train and test machine learning models using K-fold cross-validation and assess permutation feature importances

### One model for all subregions

In [None]:
# importlib has to be imported before it is used; without this line the cell
# raises NameError when run top-to-bottom on a fresh kernel.
import importlib

# Re-import the helper module (bound to f) so edits to its source take effect
# without restarting the kernel.
importlib.reload(f)

In [None]:
# Shared prefix for every file produced for the single all-regions model
all_regions_prefix = 'best_model_all_regions'

# -----Determine best model
# NOTE(review): out_path / best_model_fn are presumably where the helper saves the selected model -- confirm in model_analyze_utils
out_path = os.path.join(base_path, 'inputs-outputs')
best_model_fn = f'{all_regions_prefix}.json'
best_model, X, y = f.determine_best_model(
    training_data_subset_df,
    models,
    model_names,
    feature_columns,
    labels,
    out_path,
    best_model_fn,
)

# -----Assess model feature importances
# The importances table name goes with out_path and the figure name with figures_out_path in the call below
importances_fn = f'{all_regions_prefix}_feature_importances.csv'
figure_fn = f'{all_regions_prefix}_feature_importances.png'
feature_importances = f.assess_model_feature_importances(
    best_model,
    X,
    y,
    feature_columns,
    feature_columns_display,
    out_path,
    importances_fn,
    figures_out_path,
    figure_fn,
)


In [None]:
# Re-import the helper module (imported earlier as f) so that changes made to its
# source file are picked up without restarting the kernel.
import importlib
importlib.reload(f)

### Separate model for each subregion

In [None]:
# -----Grab list of all unique regions and subregions in dataset
region_columns = ['O1Region', 'O2Region']

# Count the rows for every (O1Region, O2Region) pair and order the table by region, then subregion
unique_subregion_counts = (
    training_data_subset_df[region_columns]
    .value_counts()
    .reset_index(name='count')
    .sort_values(by=region_columns)
    .reset_index(drop=True)
)

# Array with one [O1Region, O2Region] row per unique pair, in the sorted order above
unique_subregions = unique_subregion_counts[region_columns].values
unique_subregion_counts

In [None]:
# -----Iterate over unique subregions
# Repeats the model selection and feature-importance steps once per (O1Region, O2Region) pair,
# naming each output file after the subregion.
for o1region, o2region in unique_subregions:

    # keep only the training rows that belong to this subregion
    in_subregion = ((training_data_subset_df['O1Region'] == o1region)
                    & (training_data_subset_df['O2Region'] == o2region))
    training_subregion_df = training_data_subset_df.loc[in_subregion]

    # look up the subregion's name and plotting color (the color is not used in this loop)
    subregion_name, color = f.determine_subregion_name_color(o1region, o2region)
    # strip periods and spaces so the name can be used in file names
    subregion_name_save = subregion_name.replace('.', '').replace(' ', '')

    # banner identifying the subregion in the cell output
    print('----------', subregion_name, '----------', sep='\n')

    # -----Determine best model
    out_path = os.path.join(base_path, 'inputs-outputs')
    best_model_fn = f'best_model_{subregion_name_save}.json'
    best_model, X, y = f.determine_best_model(
        training_subregion_df,
        models,
        model_names,
        feature_columns,
        labels,
        out_path,
        best_model_fn,
    )

    # -----Assess model feature importances
    importances_fn = f'best_model_{subregion_name_save}_feature_importances.csv'
    figure_fn = f'best_model_{subregion_name_save}_feature_importances.png'
    feature_importances = f.assess_model_feature_importances(
        best_model,
        X,
        y,
        feature_columns,
        feature_columns_display,
        out_path,
        importances_fn,
        figures_out_path,
        figure_fn,
    )

    # visual separator before the next subregion's output
    print(' ')

## Conduct simulations for assessing regional AAR response to changes in terrain parameters

In [None]:
# # -----Test for all regions model
# # Construct input data
# input_df = pd.DataFrame()
# for column in feature_columns:
#     input_df[column] = [training_data_df[column].drop_duplicates().mean()]
# # predict AAR using input data
# clf_fn = os.path.join(base_path, 'inputs-outputs', 'best_classifier_all_regions.joblib')
# clf = load(clf_fn)
# AAR_mean = clf.predict(input_df)[0]

# # Set up figure
# plt.rcParams.update({'font.size':14, 'font.sans-serif': 'Arial'})
# fig, ax = plt.subplots(2, 3, figsize=(16, 8))
# ax = ax.flatten()
# # define settings for each column
# columns = ['Area', 'Zmed', 'Hypsometric_Index', 'Aspect', 'Slope']
# ranges = [np.arange(0.5, 1000, step=1),    # Areas
#           np.arange(500, 3000, step=100),  # Zmeds
#           np.arange(-3, 3, step=0.2),     # HIs
#           np.arange(0, 360, step=20),      # Aspects
#           np.arange(5, 30, step=1)]        # Slopes
# units = ['km$^2$', 'm.a.s.l.', 'm/m', 'degrees', 'degrees']
# colors = ['k', 'g', 'c', 'r', 'm', 'b']
# # iterate over columns
# i=0
# for column, range, unit, color in list(zip(columns, ranges, units, colors)):
#     # initialize AARs
#     aars = np.zeros(len(range))
#     # predict AAR for each value in range
#     for j, value in enumerate(range):
#         input_adj_df = input_df.copy()
#         input_adj_df[column] = value
#         aars[j] = clf.predict(input_adj_df)[0]
#     # plot results   
#     ax[i].plot(range, aars, '-', color=color, linewidth=3)
#     ax[i].set_ylim(0.4, 0.9)
#     ax[i].grid()
#     ax[i].set_title(column.replace('_', ' '))
#     ax[i].set_xlabel('[' + unit + ']')
#     ax[i].set_ylabel('AAR')

#     i+=1

# fig.delaxes(ax[-1])
# fig.tight_layout()
# plt.show()

# # Save figure
# fig_fn = os.path.join(figures_out_path, 'model_sensitivities_terrain_parameters.png')
# fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
# print('figure saved to file: ' + fig_fn)

## Conduct simulations for assessing regional sensitivity to climate perturbations

In [None]:
# # -----Iterate over unique subregions
# perturb_df = pd.DataFrame()
# for o1region, o2region in unique_subregions:
    
#     # Subset training data to subregion
#     snowlines_subregion = training_data_df.loc[(training_data_df['O1Region']==o1region) & (training_data_df['O2Region']==o2region)]
#     subregion_name, color = f.determine_subregion_name_color(float(o1region), float(o2region))
#     print(subregion_name)

#     # Construct input data
#     input_df = pd.DataFrame()
#     for column in feature_columns:
#         input_df[column] = [snowlines_subregion[column].drop_duplicates().mean()]
#     # predict AAR using input data
#     clf_fn = os.path.join(base_path, 'inputs-outputs', 'best_classifier_' + subregion_name + '.joblib')
#     clf = load(clf_fn)
#     AAR_mean = clf.predict(input_df)[0]
    
#     # predict AAR if 150 additional PDDs
#     input_adj_df = input_df.copy()
#     input_adj_df['Cumulative_Positive_Degree_Days'] = input_adj_df['Cumulative_Positive_Degree_Days'] + 150
#     AAR_PDD_perturb = clf.predict(input_adj_df)[0]

#     # predict AAR if - 10% cumulative snow
#     input_adj_df = input_df.copy()
#     input_adj_df['Cumulative_Snowfall_mwe'] = input_adj_df['Cumulative_Snowfall_mwe'] * 0.9
#     AAR_snowfall_perturb = clf.predict(input_adj_df)[0]

#     # compile in dataframe
#     df = pd.DataFrame({'Subregion Name': [subregion_name],
#                        'Mean conditions AAR': [AAR_mean],
#                        '+150 PDDs AAR': [AAR_PDD_perturb],
#                        '+150 PDDs AAR % change': [(AAR_mean-AAR_PDD_perturb)/AAR_mean * 100],
#                        '-10% snowfall AAR': [AAR_snowfall_perturb],
#                        '-10% snowfall AAR % change': [(AAR_mean-AAR_snowfall_perturb)/AAR_mean * 100]
#                       })
#     perturb_df = pd.concat([perturb_df, df])

# perturb_df.reset_index(drop=True, inplace=True)
# perturb_df['+150 PDDs AAR % change'] = perturb_df['+150 PDDs AAR % change'].apply(np.round)
# perturb_df['-10% snowfall AAR % change'] = perturb_df['-10% snowfall AAR % change'].apply(np.round)

# perturb_df[['Subregion Name', '+150 PDDs AAR % change', '-10% snowfall AAR % change']]

In [None]:
# # Plot PDDs and AAR for one site as an example
# site_name = 'SouthCascade'
# training_site = training_data_df.loc[training_data_df['site_name']==site_name]
# training_site = training_site.sort_values(by='datetime')

# # load ERA data
# ERA_fn = glob.glob(os.path.join(study_sites_path, site_name, 'ERA', '*.csv'))[0]
# ERA = pd.read_csv(ERA_fn)
# ERA['Date'] = pd.to_datetime(ERA['Date'])

# plt.rcParams.update({'font.size':16, 'font.sans-serif':'Arial'})
# fig, ax = plt.subplots(2, 1, figsize=(12,8))
# # AAR
# ax[0].plot(training_site['datetime'], training_site['AAR'], '.k')
# ax[0].grid()
# ax[0].set_xlim(np.datetime64('2016-01-01'), np.datetime64('2023-01-01'))
# ax[0].set_ylabel('AAR')
# ax[0].set_title('South Cascade Glacier')

# # PDDs and snowfall
# ax[1].bar(ERA['Date'], ERA['Cumulative_Snowfall_mwe'], color='#4eb3d3', width=1)
# ax[1].set_ylabel('$\Sigma$ Snowfall [m.w.e.]', color='#4eb3d3')
# ax[1].set_xlim(np.datetime64('2016-01-01'), np.datetime64('2023-01-01'))
# ax[1].grid()
# ax[1].tick_params(axis='y', colors='#4eb3d3')
# ax2 = ax[1].twinx()
# ax2.plot(ERA['Date'], ERA['Cumulative_Positive_Degree_Days'], '.m')
# ax2.set_ylabel('$\Sigma$ Positive degree days', color='m')
# ax2.tick_params(axis='y', colors='m')

# plt.show()

# # save figure
# fig_fn = os.path.join(figures_out_path, 'example_time_series_SouthCascadeGlacier.png')
# fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
# print('figure saved to file: ' + fig_fn)