# Correlation analyses

In [None]:
import glob
import os
import sys

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from shapely import wkt
from tqdm.auto import tqdm

In [None]:
# Absolute paths on the local machine; edit these before running elsewhere.
# scm_path: folder under which the compiled snowlines and AOIs are stored.
scm_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/'
# base_path: folder that contains the 'functions' directory imported below.
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/'

# Put the 'functions' directory on the import path, then load the analysis
# utilities under the short alias `f`.
sys.path += [os.path.join(base_path, 'functions')]
import model_analyze_utils as f

## Load and compile snowlines

In [None]:
# -----Load and compile snowlines
# Builds (or reloads) one table holding the snowline observations of all sites.
# NOTE: `site_names` and `study_sites_path` are not defined in this cell; they
# must already exist in the session for the compile branch to run.
snowlines_path = os.path.join(scm_path, 'all_snowlines')
snowlines_fn = 'all_snowlines.csv'
snowlines_file = os.path.join(snowlines_path, snowlines_fn)
# make sure the output folder exists (creates missing parent folders as well)
os.makedirs(snowlines_path, exist_ok=True)
# check if all snowlines CSV exists
if not os.path.exists(snowlines_file):
    # compile the snowlines CSV of every site into a single table
    site_tables = []
    for site_name in tqdm(site_names):
        snowline_path = os.path.join(study_sites_path, site_name)
        snowline_fns = glob.glob(os.path.join(snowline_path, '*_snowlines.csv'))
        if len(snowline_fns) > 0:
            # only the first matching file of each site is used
            snowline_fn = snowline_fns[0]
            snowline = pd.read_csv(snowline_fn)
            site_tables.append(snowline)
    # concatenate once at the end instead of growing a dataframe in the loop
    if site_tables:
        snowlines = pd.concat(site_tables, ignore_index=True)
    else:
        snowlines = pd.DataFrame()
    snowlines.to_csv(snowlines_file, index=False)
    print('All snowlines saved to file: ', snowlines_file)

else:
    # load from file if it already exists
    snowlines = pd.read_csv(snowlines_file)
    print('All snowlines loaded from file.')

# Parse the dates and index by them in BOTH branches, so the table has the same
# layout whether it was just compiled or loaded from disk (the time-series
# resampling further down needs a datetime index).
if 'datetime' in snowlines.columns:
    snowlines['datetime'] = pd.to_datetime(snowlines['datetime'], format='mixed')
    snowlines.index = snowlines['datetime']

snowlines

## Load and compile glacier boundaries

In [None]:
# -----Load and compile AOIs
# Builds (or reloads) one GeoDataFrame holding the glacier boundary of all sites.
# NOTE: `site_names` and `study_sites_path` are not defined in this cell; they
# must already exist in the session for the compile branch to run.
aois_path = os.path.join(scm_path, 'all_AOIs')
aois_fn = 'all_aois.shp'
aois_file = os.path.join(aois_path, aois_fn)
# make sure the output folder exists (creates missing parent folders as well)
os.makedirs(aois_path, exist_ok=True)
# check if all aois shapefile exists
if not os.path.exists(aois_file):
    # compile all RGI glacier boundaries
    site_aois = []
    for site_name in tqdm(site_names):
        aoi_path = os.path.join(study_sites_path, site_name, 'AOIs')
        aoi_fns = glob.glob(os.path.join(aoi_path, '*RGI*.shp'))
        if len(aoi_fns) > 0:
            # only the first matching shapefile of each site is used
            aoi_fn = aoi_fns[0]
            aoi = gpd.read_file(aoi_fn)
            # bring every boundary into the same CRS before stacking them
            aoi = aoi.to_crs('EPSG:4326')
            site_aois.append(aoi)
    # concatenate once at the end instead of growing a GeoDataFrame in the loop
    if site_aois:
        aois = pd.concat(site_aois, ignore_index=True)
    else:
        aois = gpd.GeoDataFrame()
    aois.to_file(aois_file, index=False)
    print('All glacier boundaries saved to file: ', aois_file)

else:
    # load from file if it already exists
    aois = gpd.read_file(aois_file)
    print('All glacier boundaries loaded from file.')
# region codes are cast to integers so they can be compared numerically
aois[['O1Region', 'O2Region']] = aois[['O1Region', 'O2Region']].astype(int)
aois

## Calculate correlation coefficients between AAR time series within subregions

In [None]:
# For every subregion: correlate the daily AAR time series of each pair of
# sites, save the coefficient matrix to CSV, and plot a heatmap plus a boxplot.
# Requires `snowlines` to be indexed by datetime and to contain the columns
# 'site_name' and 'AAR', and `aois` to contain 'O1Region', 'O2Region', 'RGIId'.
plt.rcParams.update({'font.size':12, 'font.sans-serif': 'Arial'})

# iterate over subregions
for o1region, o2region in aois[['O1Region', 'O2Region']].drop_duplicates().values:

    # identify subregion name and color for plotting (color is currently unused)
    subregion_name, color = f.determine_subregion_name_color(o1region, o2region)
    print(subregion_name)

    # subset AOIs to subregion
    aois_subregion = aois.loc[(aois['O1Region']==o1region) & (aois['O2Region']==o2region)]

    # identify the sites in the subregion
    unique_site_names = aois_subregion['RGIId'].drop_duplicates().values

    # build the daily AAR series ONCE per site rather than once per site pair
    daily_aar = {}
    for rgi_id in unique_site_names:
        site_df = snowlines.loc[snowlines['site_name']==rgi_id]
        if len(site_df) < 1:
            # no snowline observations for this site
            continue
        # remove duplicate dates, sort by date
        site_df = site_df[~site_df.index.duplicated(keep='first')].sort_index()
        # resample at a daily time interval, back-filling from the next observation
        daily_aar[rgi_id] = site_df['AAR'].resample('1D').bfill()

    # iterate over all unique pairs of sites that have observations
    sites_with_data = list(daily_aar)
    pair_records = []
    for i in tqdm(range(0, len(sites_with_data))):
        site1 = sites_with_data[i]
        for site2 in sites_with_data[i+1:]:
            # The two series are aligned on their date index; corr() only uses
            # dates where both have a value, i.e. the overlapping period.
            aar_correlation = pd.DataFrame({'Site1': daily_aar[site1],
                                            'Site2': daily_aar[site2]}).corr().iloc[0,1]
            pair_records.append({'Site1': site1,
                                 'Site2': site2,
                                 'AAR Corr. Coeff.': aar_correlation})

    # assemble the dataframe of subregion correlation coefficients in one go
    correlations_subregion = pd.DataFrame(pair_records,
                                          columns=['Site1', 'Site2', 'AAR Corr. Coeff.'])

    # coefficients are NaN for pairs without overlapping observations
    valid_coefficients = correlations_subregion['AAR Corr. Coeff.'].dropna()
    if valid_coefficients.empty:
        # nothing to pivot or plot (fewer than two sites with overlapping data)
        print('No valid correlation coefficients for subregion, skipping: ', subregion_name)
        continue

    # save CSV
    correlations_subregion_pivot = correlations_subregion.pivot_table(index='Site1',
                                                                      columns='Site2',
                                                                      values='AAR Corr. Coeff.')
    correlations_subregion_fn = 'correlation_coefficients_' + subregion_name.replace('.','').replace(' ','') + '.csv'
    # keep the index: it holds the Site1 labels that identify each matrix row
    correlations_subregion_pivot.to_csv(os.path.join(snowlines_path, correlations_subregion_fn), index=True)
    print('Correlation coefficients saved to file: ', os.path.join(snowlines_path, correlations_subregion_fn))

    # plot
    fig, ax = plt.subplots(1, 2, figsize=(10, 8), gridspec_kw={'width_ratios': [4,1]})
    # heatmap
    sns.heatmap(correlations_subregion_pivot, ax=ax[0], cmap='coolwarm', vmin=-1, vmax=1, cbar=False)
    ax[0].set_title(subregion_name)
    ax[0].set_xlabel('')
    ax[0].set_xticks([])
    ax[0].set_ylabel('')
    ax[0].set_yticks([])
    # boxplot (NaN coefficients excluded so the box statistics are defined)
    ax[1].boxplot(valid_coefficients.values)
    ax[1].set_ylim(-1,1)
    fig.tight_layout()
    plt.show()


## Calculate correlation coefficients between AAR and PDD time series