# Fit linear trendline to AARs, PDDs, and snowfall for all sites

In [None]:
import os
import glob
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import sys
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.isotonic import IsotonicRegression
from sklearn.svm import SVR
from sklearn.model_selection import KFold
import numpy as np
import seaborn as sns
from scipy.stats import iqr
from scipy.stats import median_abs_deviation as MAD
# Suppress warnings to prevent kernel crashing (future warning from pandas)
import warnings
warnings.filterwarnings("ignore")

## Define paths in directory

In [None]:
# Repository root: contains the 'functions' helper folder and the 'figures' output folder
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/'

# Root folder of the compiled input data and the results tables
# scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'
scm_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/'

# Folder where figures are written
figures_out_path = os.path.join(base_path, 'figures')

# Make the project's helper module importable, then load it
functions_path = os.path.join(base_path, 'functions')
sys.path.append(functions_path)
import model_analyze_utils as f

## Load compiled glacier boundaries and snowlines

In [None]:
# Folder holding every compiled input file read in this cell
compiled_data_path = os.path.join(scm_path, 'compiled_data')

# -----Glacier boundaries (AOIs) with climate clusters
aois_fn = os.path.join(compiled_data_path, 'all_aois_climate_cluster.shp')
aois = gpd.read_file(aois_fn)
# store the O1/O2 region codes as integers
region_cols = ['O1Region', 'O2Region']
aois[region_cols] = aois[region_cols].astype(int)
print('All AOIs with climate clusters loaded from file.')

# -----ERA data, with the date column parsed into datetimes
eras_fn = os.path.join(compiled_data_path, 'all_era_data.csv')
eras = pd.read_csv(eras_fn)
eras['Date'] = pd.to_datetime(eras['Date'])
print('All ERA data loaded from file.')

# -----Compiled snowlines; timestamps may come in more than one string format
snowlines_fn = os.path.join(compiled_data_path, 'all_snowlines.csv')
snowlines = pd.read_csv(snowlines_fn)
snowlines['datetime'] = pd.to_datetime(snowlines['datetime'], format='mixed')
print('All snowlines loaded from file.')
# snowlines

## Filter snowlines to before September, merge snowlines and ERA data

In [None]:
# Add Month column to snowlines and ERA data
snowlines['Month'] = pd.DatetimeIndex(snowlines['datetime']).month.values
eras['Month'] = pd.DatetimeIndex(eras['Date']).month.values
# Remove observations after August.
# Take explicit copies: the 'Date' columns assigned below would otherwise be
# written onto filtered slices of the original dataframes (pandas
# SettingWithCopy hazard, hidden here by the global warnings filter).
snowlines = snowlines.loc[snowlines['Month'] <= 8].copy()
eras = eras.loc[eras['Month'] <= 8].copy()
# Unify date columns for merging (truncate both to whole days)
snowlines['Date'] = snowlines['datetime'].values.astype('datetime64[D]')
eras['Date'] = eras['Date'].values.astype('datetime64[D]')
# Merge on site name and dates (default inner join: only site/date pairs
# present in both tables are kept)
merged = pd.merge(snowlines, eras, on=['site_name', 'Date'])
merged

## Add climate cluster column to merged dataframe

In [None]:
# Start from empty labels; rows whose site is not listed in `aois` keep them
merged['cluster'] = ''
merged['clustName'] = ''
# Label the rows of `merged` one climate cluster at a time
for cluster in aois['cluster'].drop_duplicates().values:
    aois_cluster = aois.loc[aois['cluster'] == cluster]
    # rows of `merged` whose site (RGI ID) belongs to this cluster
    site_names = aois_cluster['RGIId'].drop_duplicates().values
    in_cluster = merged['site_name'].isin(site_names)
    merged.loc[in_cluster, 'cluster'] = cluster
    merged.loc[in_cluster, 'clustName'] = aois_cluster['clustName'].values[0]
merged

In [None]:
# Define non-parametric fit function
def svr_fit(X, y):
    """Fit a support vector regressor with default settings.

    Parameters
    ----------
    X : array-like
        Predictor matrix passed to ``SVR.fit``.
    y : array-like
        Target values passed to ``SVR.fit``.

    Returns
    -------
    model : sklearn.svm.SVR
        The fitted regressor.
    score : float
        ``model.score`` evaluated on the same ``X`` and ``y`` used for fitting
        (i.e. a training score, not a validation score).
    """
    regressor = SVR()
    regressor.fit(X, y)
    return regressor, regressor.score(X, y)

# Define linear fit function
def linear_fit(X, y):
    """Fit an ordinary least-squares linear regression.

    Parameters
    ----------
    X : array-like
        Predictor matrix passed to ``LinearRegression.fit``.
    y : array-like
        Target values passed to ``LinearRegression.fit``.

    Returns
    -------
    model : sklearn.linear_model.LinearRegression
        The fitted regressor.
    score : float
        ``model.score`` evaluated on the same ``X`` and ``y`` used for fitting
        (i.e. a training score, not a validation score).
    """
    regressor = LinearRegression()
    regressor.fit(X, y)
    return regressor, regressor.score(X, y)

# Define function for K-folds cross-validation model fitting
def kfolds_linear_fit(X, y, n_folds=10):
    """Fit a linear model with K-folds cross-validation and summarize the folds.

    For every fold, a linear regression is fit to the training split and
    scored on the held-out split, so the reported scores measure
    out-of-sample skill rather than the fit to the training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Predictors. The coefficient of column 0 is reported as ``coef_PDD``
        and the coefficient of column 1 as ``coef_snowfall``.
    y : array-like of shape (n_samples,)
        Target values.
    n_folds : int, default 10
        Requested number of folds. It is reduced to ``n_samples`` when fewer
        samples are available, because ``KFold`` cannot create more folds
        than there are samples.

    Returns
    -------
    df : pandas.DataFrame
        One row holding the mean, standard deviation, median and median
        absolute deviation of each coefficient across folds, and the mean and
        median of the held-out scores. Every value is NaN when fewer than two
        samples are supplied (no train/test split is possible).
    """
    # Positional indexing below requires plain arrays (e.g. not a pandas Series)
    X, y = np.asarray(X), np.asarray(y)
    n_splits = min(n_folds, len(X))
    # Initialize parameters
    coefs_PDD, coefs_snowfall, scores = [], [], []
    if n_splits >= 2:
        # Define K-folds
        kf = KFold(n_splits=n_splits)
        # Iterate over fold indices
        for train_index, test_index in kf.split(X):
            # Split X and y into training and testing
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Fit model to the training split (its training score is not used)
            model, _train_score = linear_fit(X_train, y_train)
            coefs_PDD.append(model.coef_[0])
            coefs_snowfall.append(model.coef_[1])
            # Score on the held-out split; the score needs at least two samples
            if len(test_index) >= 2:
                scores.append(model.score(X_test, y_test))
            else:
                scores.append(np.nan)

    def _stat(func, values):
        # Summaries of an empty list are reported as NaN
        return func(values) if values else np.nan

    # Calculate stats, compile in dataframe
    df = pd.DataFrame({'coef_PDD_mean': [_stat(np.nanmean, coefs_PDD)],
                       'coef_PDD_std': [_stat(np.nanstd, coefs_PDD)],
                       'coef_PDD_median': [_stat(np.nanmedian, coefs_PDD)],
                       'coef_PDD_MAD': [_stat(MAD, coefs_PDD)],
                       'coef_snowfall_mean': [_stat(np.nanmean, coefs_snowfall)],
                       'coef_snowfall_std': [_stat(np.nanstd, coefs_snowfall)],
                       'coef_snowfall_median': [_stat(np.nanmedian, coefs_snowfall)],
                       'coef_snowfall_MAD': [_stat(MAD, coefs_snowfall)],
                       'score_mean': [_stat(np.nanmean, scores)],
                       'score_median': [_stat(np.nanmedian, scores)]
                      })
    return df
    
        

## Fit linear and non-parametric models to PDDs and Snowfall vs. AARs 

### For each site separately

In [None]:
# Suppress warnings to prevent kernel crashing (future warning from pandas)
import warnings
warnings.filterwarnings("ignore")

# Summary columns returned by kfolds_linear_fit; used to build an all-NaN row
# for sites that have no usable predictor data
fit_stat_cols = ['coef_PDD_mean', 'coef_PDD_std', 'coef_PDD_median', 'coef_PDD_MAD',
                 'coef_snowfall_mean', 'coef_snowfall_std', 'coef_snowfall_median',
                 'coef_snowfall_MAD', 'score_mean', 'score_median']

# Collect one single-row dataframe per site; concatenated once after the loop
fit_sites_list = []

# Iterate over site names
for site_name in tqdm(merged['site_name'].drop_duplicates().values):
    # Subset data
    merged_site = merged.loc[merged['site_name']==site_name]
    # Grab O1 and O2 regions
    o1 = aois.loc[aois['RGIId']==site_name, 'O1Region'].values[0]
    o2 = aois.loc[aois['RGIId']==site_name, 'O2Region'].values[0]
    # Grab cluster and cluster name
    cluster, cluster_name = merged_site['cluster'].values[0], merged_site['clustName'].values[0]
    # Fit linear trendline to AAR vs. cumulative PDDs and cumulative snowfall
    X = merged_site[['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']].values.reshape(-1, 2)
    y = merged_site['AAR'].values
    if pd.isnull(X).all():
        # No predictor data (empty or all missing): keep the site in the table
        # with NaN statistics instead of attempting a fit
        coefs_df = pd.DataFrame({col: [np.nan] for col in fit_stat_cols})
        print('none')
    else:
        coefs_df = kfolds_linear_fit(X, y)
    # Save site metadata in dataframe
    for col, value in zip(['site_name', 'O1Region', 'O2Region',
                           'cluster', 'clustName', 'N'],
                          [site_name, o1, o2, cluster, cluster_name, len(y)]):
        coefs_df[col] = [value]
    fit_sites_list.append(coefs_df)

# Concatenate to full dataframe
fit_sites_df = pd.concat(fit_sites_list) if fit_sites_list else pd.DataFrame()

# Save to file
fit_sites_fn = os.path.join(scm_path, 'results', 'aar_pdd_snowfall_linear_fit_sites.csv')
fit_sites_df.to_csv(fit_sites_fn, index=False)
print('Data table saved to file:', fit_sites_fn)
fit_sites_df

## Median trends by subregion and cluster based on each site's SVR model

In [None]:
# Attach the O1/O2 region codes and the subregion name to every row of each site
for site_name in tqdm(merged['site_name'].drop_duplicates().values):
    # region codes come from the site's record in the AOI table
    site_aoi = aois.loc[aois['RGIId'] == site_name]
    o1 = site_aoi['O1Region'].values[0]
    o2 = site_aoi['O2Region'].values[0]
    # the helper also returns a color, which is not needed here
    subregion_name, color = f.determine_subregion_name_color(o1, o2)
    # write the three values onto all rows of this site
    site_rows = merged['site_name'] == site_name
    for col, value in zip(['O1Region', 'O2Region', 'Subregion'],
                          [o1, o2, subregion_name]):
        merged.loc[site_rows, col] = value

In [None]:
# Climate clusters colormap
# Maps each climate cluster name (looked up with values of the 'clustName'
# column when plotting) to the hex color used for that cluster.
cluster_cmap_dict = {'W. Aleutians': '#1f78b4', 
                     'Continental': '#e31a1c',
                     'Transitional-Continental': '#fb9a99',
                     'Transitional-Temperate': '#b2df8a',
                     'Temperate': '#33a02c'}

In [None]:
# Per-site SVR results: one row per site, with array-valued cells for the
# predictors and the predicted AARs
svr_fit_sites_df = pd.DataFrame()
predictor_cols = ['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']
site_meta_cols = ['O1Region', 'O2Region', 'Subregion', 'cluster', 'clustName']

for site_name in tqdm(merged['site_name'].drop_duplicates().values):
    # Observations for this site only
    merged_site = merged.loc[merged['site_name'] == site_name]
    # Predictors (PDDs, snowfall) and target (AAR)
    X = merged_site[predictor_cols].values.reshape(-1, 2)
    y = merged_site['AAR'].values
    # Fit the SVR model to this site's data
    model, score = svr_fit(X, y)
    # Assemble the single-row record: site name, site metadata (taken from the
    # site's first row), then predictors, predictions at those predictors, and score
    record = {'site_name': [site_name]}
    record.update({col: [merged_site[col].values[0]] for col in site_meta_cols})
    record.update({'PDD': [X[:, 0]],
                   'snowfall': [X[:, 1]],
                   'AAR_pred': [model.predict(X)],
                   'score': [score]})
    # Append to the full table
    svr_fit_sites_df = pd.concat([svr_fit_sites_df, pd.DataFrame(record)])
svr_fit_sites_df

In [None]:
# Fit one SVR model per (subregion, climate cluster) pair and plot the
# predicted AAR against cumulative PDDs, one panel per subregion.
compiled_list = []  # single-row results, concatenated once after the loops
# NOTE: the 5 x 2 grid provides 10 panels, i.e. room for at most 10 subregions
fig, ax = plt.subplots(5, 2, figsize=(10,12))
ax = ax.flatten()

# Iterate over subregion
for i, subregion_name in enumerate(merged['Subregion'].drop_duplicates().values):
    merged_subregion = merged.loc[merged['Subregion']==subregion_name]
    # Iterate over the climate clusters present in this subregion
    for cluster_name in merged_subregion['clustName'].drop_duplicates().values:
        merged_subregion_cluster = merged_subregion.loc[merged_subregion['clustName']==cluster_name]
        X = merged_subregion_cluster[['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']].values.reshape(-1, 2)
        y = merged_subregion_cluster['AAR'].values
        model, score = svr_fit(X, y)
        aar_pred = model.predict(X)
        df = pd.DataFrame({'Subregion': [subregion_name],
                           'clustName': [cluster_name],
                           'PDD': [X[:,0]],
                           'snowfall': [X[:,1]],
                           'AAR_pred': [aar_pred],
                           'score': [score],
                           'N': [len(y)]
                          })
        compiled_list.append(df)
        ax[i].plot(X[:,0], aar_pred, '.', markersize=2, label=cluster_name,
                   color=cluster_cmap_dict[cluster_name])
    ax[i].set_title(subregion_name)
    ax[i].set_xlim(0, 1500)
    ax[i].set_ylim(0, 1)

compiled_df = pd.concat(compiled_list) if compiled_list else pd.DataFrame()

# Build the figure legend from every panel (first handle per cluster name), so
# that clusters which do not occur in the first subregion are still listed
legend_entries = {}
for axis in ax:
    for handle, label in zip(*axis.get_legend_handles_labels()):
        legend_entries.setdefault(label, handle)
fig.legend(list(legend_entries.values()), list(legend_entries.keys()),
           loc='upper center', ncols=4, markerscale=5)
fig.subplots_adjust(hspace=0.4)
# fig.tight_layout()
plt.show()


In [None]:
# Distribution of the SVR scores of the subregion/cluster models, on a 0-1 axis
score_fig, score_ax = plt.subplots()
score_ax.hist(compiled_df['score'], bins=10)
score_ax.set_xlim(0, 1)
plt.show()

In [None]:
# Scatter of AAR vs. cumulative PDDs for one subregion / climate cluster pair
merged_subset = merged.loc[(merged['Subregion']=='St. Elias Mtns.') & (merged['clustName']=='Transitional-Continental')]
# X = merged_subset[['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']].values.reshape(-1, 2)
# y = merged_subset['AAR'].values
# model_linear, score_linear = kfolds_linear_fit(X, y)
# y_pred_linear = model_linear.predict(X)
# model_svr, score_svr = svr_fit(X, y)
# y_pred_svr = model_svr.predict(X)
plt.plot(merged_subset['Cumulative_Positive_Degree_Days'], merged_subset['AAR'], '.')
# plt.plot(X[:,0], y_pred_linear, '.m')
# plt.plot(X[:,0], y_pred_svr, '.c')
# Raw string: '\S' is not a valid escape sequence in a regular string literal
plt.xlabel(r'$\Sigma$PDDs')
plt.ylabel('AAR')
plt.show()

### For each cluster

In [None]:
# # Initialize dataframe for storing results
# fit_clusters_df = pd.DataFrame()

# # Iterate over subregions 
# for cluster in tqdm(merged['cluster'].drop_duplicates().values):
#     # Subset merged data to subregion
#     merged_cluster = merged.loc[merged['cluster']==cluster]
#     # Grab cluster name 
#     cluster_name = merged_cluster['clustName'].values[0]
#     # Fit linear and SVR models to data
#     X = merged_cluster[['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']].values
#     y = merged_cluster['AAR'].values
#     if (np.ravel(X)=='').all():
#         model_linear, score_linear = np.nan, np.nan
#         model_svr, score_svr = np.nan, np.nan
#     else:
#         model_linear, score_linear = linear_fit(X, y)
#         model_svr, score_svr = svr_fit(X, y)
#         # plot
#         plt.figure(figsize=(8,4))
#         plt.plot(X[:,0], y, '.')
#         plt.plot(X[:,0], model_linear.predict(X), '.b', label='Linear')
#         plt.plot(X[:,0], model_svr.predict(X), '.m', label='SVR')
#         plt.legend(loc='upper right')
#         plt.xlabel('$\Sigma$PDDs')
#         plt.ylabel('AAR')
#         plt.ylim(0,1)
#         plt.title(cluster_name 
#                   + '\nLinear score = ' + str(np.round(score_linear, 4)) 
#                   + '\nSVR score = ' + str(np.round(score_svr, 4)))
#         plt.show()
#     # Save in dataframe
#     df = pd.DataFrame({'cluster': [cluster],
#                        'clustName': [cluster_name],
#                        'coef_PDD': [model_linear.coef_[0]],
#                        'coef_snowfall': [model_linear.coef_[1]],
#                        'intercept': [model_linear.intercept_],
#                        'score': [score_linear],
#                        'N': [len(y)]})
#     # Concatenate to full dataframe
#     fit_clusters_df = pd.concat([fit_clusters_df, df])

# # Sort by cluster
# fit_clusters_df.sort_values(by='cluster', inplace=True)
    
# # Save to file
# fit_clusters_fn = os.path.join(scm_path, 'results', 'aar_pdd_snowfall_linear_fit_clusters.csv')
# fit_clusters_df.to_csv(fit_clusters_fn, index=False)
# print('Data table saved to file:', fit_clusters_fn)
# fit_clusters_df

In [None]:
# Distribution of the per-site mean cross-validation scores.
# fit_sites_df stores 'score_mean' and 'score_median' (there is no 'score'
# column); NaN scores are dropped because the histogram cannot bin them.
plt.hist(fit_sites_df['score_mean'].dropna(), bins=20)
plt.show()

## Fit a linear trend to PDDs and Snowfall vs. AARs for each subregion

In [None]:
# Suppress warnings to prevent kernel crashing (future warning from pandas)
import warnings
warnings.filterwarnings("ignore")

# Collect one single-row dataframe per subregion; concatenated once after the loop
fit_subregion_list = []

# Iterate over subregions (unique O1/O2 region pairs in the AOI table)
for o1, o2 in tqdm(aois[['O1Region', 'O2Region']].drop_duplicates().values):
    # Grab subregion name
    subregion_name, color = f.determine_subregion_name_color(o1, o2)
    print(subregion_name)
    # Grab site names
    site_names = aois.loc[(aois['O1Region']==o1) & (aois['O2Region']==o2), 'RGIId'].values
    # Subset data to subregion
    merged_subregion = merged.loc[merged['site_name'].isin(site_names)]
    # Fit linear trendline to AAR vs. cumulative PDDs and cumulative snowfall
    X = merged_subregion[['Cumulative_Positive_Degree_Days', 'Cumulative_Snowfall_mwe']].values.reshape(-1, 2)
    y = merged_subregion['AAR']
    if pd.isnull(X).all():
        # No predictor data for this subregion (empty or all missing): record
        # NaN fit parameters instead of reading attributes of a missing model
        coef_PDD, coef_snowfall, intercept = np.nan, np.nan, np.nan
        score_linear = np.nan
        print('none')
    else:
        model_linear, score_linear = linear_fit(X, y)
        coef_PDD, coef_snowfall = model_linear.coef_[0], model_linear.coef_[1]
        intercept = model_linear.intercept_
        # plot observations and the fitted values against cumulative PDDs
        plt.figure(figsize=(8,4))
        plt.plot(X[:,0], y, '.')
        plt.plot(X[:,0], model_linear.predict(X), '.m', label='Linear')
        plt.legend(loc='upper right')
        # Raw string: '\S' is not a valid escape sequence in a regular string literal
        plt.xlabel(r'$\Sigma$PDDs')
        plt.ylabel('AAR')
        plt.ylim(0,1)
        plt.title(subregion_name 
                  + '\nLinear score = ' + str(np.round(score_linear, 4)))
        plt.show()
    # Save in dataframe
    df = pd.DataFrame({'Subregion': [subregion_name],
                       'O1Region': [o1],
                       'O2Region': [o2],
                       'coef_PDD': [coef_PDD],
                       'coef_snowfall': [coef_snowfall],
                       'intercept': [intercept],
                       'score': [score_linear],
                       'N': [len(y)]})
    fit_subregion_list.append(df)

# Concatenate to full dataframe
fit_subregion_df = pd.concat(fit_subregion_list) if fit_subregion_list else pd.DataFrame()

# Save to file
fit_subregion_fn = os.path.join(scm_path, 'results', 'aar_pdd_snowfall_linear_fit_subregions.csv')
fit_subregion_df.to_csv(fit_subregion_fn, index=False)
print('Data table saved to file:', fit_subregion_fn)
fit_subregion_df