# Develop machine learning models for each subregion to predict the AAR

In [None]:
# Import packages
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import os
from tqdm.auto import tqdm
import sys
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

## Define paths in directory, import functions

In [None]:
# Local checkout of snow-cover-mapping-application/
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/'
# Directory where output figures are written
figures_out_path = os.path.join(base_path, 'figures')
# Root data directory; the machine_learning/ inputs and outputs are read from beneath it
scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'

# Make the repository's functions/ directory importable, then load the helper module
functions_path = os.path.join(base_path, 'functions')
sys.path.insert(1, functions_path)
import model_analyze_utils as f

## Define the feature columns and labels

In [None]:
# Columns of the training data used as predictors
feature_columns = [
    'Cumulative_Positive_Degree_Days',
    'Cumulative_Snowfall_mwe',
    'Hypsometric_Index',
    'Area',
    'Zmed',
    'Slope',
    'Aspect',
]
# How to display each feature column in plots, etc. (same order as feature_columns).
# Raw strings are used so the LaTeX backslash in "\Sigma" is taken literally:
# "\S" is not a valid escape sequence in a normal string literal.
feature_columns_display = [
    r'$\Sigma$PDDs',
    r'$\Sigma$Snowfall [m.w.e.]',
    'Hypsometric Index',
    'Area',
    'Z$_{med}$',
    'Slope',
    'Aspect',
]
# Variable to predict
labels = ['AAR']


## Plot pairplots of (un-scaled) training data

In [None]:
plt.rcParams.update({'font.size':14, 'font.sans-serif':'Arial'})

# Grab un-scaled training data file names (the '_scaled' variants are skipped)
training_data_fns = sorted(glob.glob(os.path.join(scm_path, 'machine_learning', 'training_data_*.csv')))
training_data_fns = [x for x in training_data_fns if '_scaled' not in x]

# Map each feature column to its display name; identical for every subregion,
# so it is built once outside the loop
display_names = dict(zip(feature_columns, feature_columns_display))

for training_data_fn in training_data_fns:
    # Grab subregion name from file name
    subregion_name = training_data_fn.split('training_data_')[1].split('.csv')[0]

    # Load training data
    training_data = pd.read_csv(training_data_fn)

    # Rename columns for plotting
    training_data.rename(columns=display_names, inplace=True)

    # Plot pairwise relationships between the features, coloured by AAR
    plot = sns.pairplot(training_data, vars=feature_columns_display, corner=True, diag_kind='kde', hue='AAR')
    plot.fig.suptitle(subregion_name)

    # Save the pairplot's own figure. No variable named `fig` is defined in
    # this cell, so the figure must be reached through the grid object.
    # Saving is done before plt.show() so the figure is written while it is
    # still open.
    fig_fn = os.path.join(figures_out_path, 'training_data_pairplot_' + subregion_name + '.png')
    plot.fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
    print('figure saved to file: ' + fig_fn)

    plt.show()

## Define supervised regression models to test

See the [scikit-learn Supervised Learning page](https://scikit-learn.org/stable/supervised_learning.html) for additional regression models that could be tested.

In [None]:
# Candidate regressors keyed by the name used for plotting, etc.
# Every estimator is constructed with its default arguments.
_model_registry = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regression": RandomForestRegressor(),
    "Decision Tree Regression": DecisionTreeRegressor(),
    "Support Vector Regression": SVR(),
    "Gradient Boosting Regression": GradientBoostingRegressor(),
    "Ridge Regression": Ridge(),
}

# Parallel lists: model_names[i] is the display name of models[i]
model_names = list(_model_registry.keys())
models = list(_model_registry.values())


## Iterate over subregions, apply the machine learning workflow

In [None]:
# Directory holding the scaled training data; it is also handed to the helper
# below as the output location
out_path = os.path.join(scm_path, 'machine_learning')

# Collect the scaled training data files, in sorted order
scaled_fn_pattern = os.path.join(out_path, 'training_data_*_scaled.csv')
training_data_scaled_fns = sorted(glob.glob(scaled_fn_pattern))

for training_data_scaled_fn in training_data_scaled_fns:
    # The subregion name sits between 'training_data_' and '_scaled.csv'
    name_with_suffix = training_data_scaled_fn.split('training_data_')[1]
    subregion_name = name_with_suffix.split('_scaled.csv')[0]
    print('\n' + subregion_name)

    # Read the scaled training data and parse its dates
    training_data_scaled = pd.read_csv(training_data_scaled_fn)
    training_data_scaled['Date'] = pd.to_datetime(training_data_scaled['Date'])

    # Separate the predictors (X) from the target (y)
    X, y = training_data_scaled[feature_columns], training_data_scaled[labels]

    # Hand everything to the project helper, which (per the original note)
    # picks the best model with K-folds cross-validation and saves it to file
    best_model_fn = 'best_model_' + subregion_name + '.joblib'
    best_model = f.determine_best_model(
        training_data_scaled,
        models,
        model_names,
        feature_columns,
        labels,
        out_path,
        best_model_fn,
    )



## Conduct weather sensitivity tests

In [None]:
# Set up figure: one panel per weather variable, sharing the AAR axis
plt.rcParams.update({'font.size':12, 'font.sans-serif':'Arial'})
fig, ax = plt.subplots(1, 2, figsize=(20,6), sharey=True)

# Grab model file names
model_fns = sorted(glob.glob(os.path.join(scm_path, 'machine_learning', 'best_model*')))

# Iterate over model file names
for model_fn in model_fns:
    # Grab subregion name from file name
    subregion_name = model_fn.split('best_model_')[1].split('.joblib')[0]
    print('\n' + subregion_name)

    # Load trained model
    model = load(model_fn)

    # Load (un-scaled) training data
    training_data_fn = os.path.join(scm_path, 'machine_learning', 'training_data_' + subregion_name + '.csv')
    training_data = pd.read_csv(training_data_fn)

    # Load fit scaler for training data
    scaler_fit_fn = os.path.join(scm_path, 'machine_learning', 'scaler_fit_' + subregion_name + '.gz')
    scaler_fit = load(scaler_fit_fn)

    # Grab names of columns that need to be scaled. This has to happen before
    # the mean-conditions dataframe is built, because that dataframe is
    # selected by these columns and they belong to this subregion's scaler.
    scaler_fit_columns = scaler_fit.get_feature_names_out()

    # Create single-row dataframe of mean conditions
    training_data_mean = pd.DataFrame(training_data[scaler_fit_columns].mean()).transpose()
    training_data_mean['scenario'] = 'mean'

    # Iterate over cumsum(PDDs) and cumsum(snowfall); i selects the panel
    for i, column in enumerate(['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']):
        # Define range of values spanning the observed min and max
        values_range = np.linspace(training_data[column].min(), training_data[column].max(), num=100)
        # Row 0 stays at mean conditions; rows 1+ vary only `column`
        df = training_data_mean.copy(deep=True)
        df = pd.concat([df]*(len(values_range)+1),ignore_index=True)
        df.loc[1:, column] = values_range
        df.loc[1:, 'scenario'] = np.arange(1, len(values_range)+1, step=1)

        # Scale values with the fitted scaler; df keeps the un-scaled values
        df_scaled = df.copy(deep=True)
        df_scaled.loc[:, scaler_fit_columns] = scaler_fit.transform(df_scaled[scaler_fit_columns])

        # Predict AAR from the scaled features, stored next to the un-scaled inputs
        df['AAR'] = model.predict(df_scaled[feature_columns])

        # Save results
        df_fn = os.path.join(scm_path, 'machine_learning',
                             'aar_sensitivity_tests_' + subregion_name + '_' + column + '.csv')
        df.to_csv(df_fn, index=False)
        print(column + ' sensitivity tests saved to file:', df_fn)

        # Plot results (the mean-conditions row 0 is excluded)
        ax[i].plot(df.loc[1:, column], df.loc[1:, 'AAR'], '-', label=subregion_name)

# Axis labels use raw strings so the LaTeX backslash in "\Sigma" is literal
ax[0].set_ylim(0,1)
ax[0].set_xlabel(r'$\Sigma$PDDs')
ax[0].set_ylabel('AAR')
ax[0].grid()
ax[1].set_xlabel(r'$\Sigma$Snowfall [m.w.e.]')
ax[1].grid()
ax[1].legend(bbox_to_anchor=[1.2, 0.5, 0.2, 0.2])
plt.show()