# Basin analysis
Code to calculate autocorrelations and cross-correlations for data within a specific basin or set of basins


In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import scipy
import statsmodels.api as sm

In [2]:
variables = ['e_gleam_mm_mon','glws2_gwater_mm','grace_lwe_cm','grun_ens_runoff_mm_day','prec_fldas_mm_day','smroot_gleam_mm3_mm3','smsurf_gleam_mm3_mm3','lai_gimms4g_m2_m2']
variable2s = ['e_gleam_mm_mon','glws2_gwater_mm','grace_lwe_cm','grun_ens_runoff_mm_day','prec_fldas_mm_day','smroot_gleam_mm3_mm3','smsurf_gleam_mm3_mm3','lai_gimms4g_m2_m2']
basins = ['LIMPOPO','ORANGE','NILE','JUBBA (also GENALE WENZ)']

# Number of leading entries of each correlation function that are plotted and stored.
N_LAGS = 9
GRACE = 'grace_lwe_cm'
RUNOFF = 'grun_ens_runoff_mm_day'


def _load_basin_anomalies(variable, basin):
    """Read one basin CSV and return ``(climatology, anomalies)``.

    The file is read from ``./Basin_data/<variable>_<basin>.csv``. Its first
    column must be named ``Year``; every other column is treated as one
    calendar month whose header is parsed with ``%b``
    (assumed to be 'Jan'...'Dec' -- TODO confirm against the data files).

    climatology -- column-wise mean of the month columns (a Series).
    anomalies   -- long-format frame (columns Year, Month, value) indexed by
                   a datetime 'date', holding each value minus its month's
                   climatology.
    """
    wide = pd.read_csv('./Basin_data/'+variable+'_'+basin+'.csv')
    months = wide.iloc[:,1:]
    climatology = months.mean(0)

    # Calculate anomalies by removing the monthly climatology from each month
    anomalies = pd.concat([wide.iloc[:,:1], months - climatology], axis=1)
    # reshape the data to one row per (Year, Month)
    anomalies = anomalies.melt(id_vars='Year',var_name='Month')
    # create a datetime index
    anomalies['date'] = pd.to_datetime(
        anomalies.Year.astype(int).astype(str) + anomalies['Month'], format='%Y%b')
    anomalies = anomalies.set_index('date')

    if variable == GRACE:
        # replace missing values with climatology (which is = 0 in anomaly space)
        anomalies.loc[np.isnan(anomalies['value']),'value'] = 0
        # only dates up to and including 2022 take part in any correlation
        anomalies = anomalies[anomalies.index.year <= 2022]
    return climatology, anomalies


def _standardise(values):
    """Subtract the NaN-ignoring mean from *values* and divide by the std."""
    centred = values - np.nanmean(values)
    return centred / np.std(centred)


# Make sure every output location exists before any expensive work is done.
for out_dir in ('./Figures/Climatologies/',
                './Figures/crosscorrelations_individual_vars/',
                './crosscorr_dfs/'):
    os.makedirs(out_dir, exist_ok=True)

bName=[];v1=[];v2=[];vcc=[]
for basin in basins:
    # Read and pre-process each variable once per basin instead of once per pair.
    loaded = {name: _load_basin_anomalies(name, basin)
              for name in dict.fromkeys(variables + variable2s)}

    for variable in variables:
        climatology, anomalies = loaded[variable]

        #plot climatology (once per basin/variable)
        plt.figure()
        climatology.plot(title='Climatology of '+variable+'\nin '+basin+' basin')
        plt.savefig('./Figures/Climatologies/'+variable+'_'+basin+'.png')
        plt.close()

        for variable2 in variable2s:
            anomalies2 = loaded[variable2][1]
            dtInd = anomalies.index
            dtInd2 = anomalies2.index

            if (variable == RUNOFF) and (variable2 == RUNOFF):
                #limit to satellite era to be more similar to other vars
                dtInd = dtInd[dtInd.year > 1981]
                dtInd2 = dtInd2[dtInd2.year > 1981]

            #choose only values that exist in both datasets to calculate the
            #cross correlation, ordered chronologically
            dts = sorted(set(dtInd).intersection(dtInd2))
            bDF = anomalies.loc[dts]
            bDF2 = anomalies2.loc[dts]

            # When variable == variable2 both frames hold the same data, so
            # this one call yields the autocorrelation as well.
            corr = sm.tsa.stattools.ccf(_standardise(bDF.value.values),
                                        _standardise(bDF2.value.values),
                                        adjusted=True,fft=False)
            lags = corr[:N_LAGS]

            plt.figure()
            plt.plot(lags)
            if variable == variable2:
                plt.title(variable+' autocorrelation out to 9 months')
                plt.savefig('./Figures/crosscorrelations_individual_vars/'+variable+'_'+basin+'.png')
            else:
                plt.title(variable+' lags \n'+variable2+'\ncross-correlation out to 9 months')
                plt.savefig('./Figures/crosscorrelations_individual_vars/'+variable+'_'+variable2+'_'+basin+'.png')
            plt.close()

            # Repeat the labels once per stored lag so all columns stay the
            # same length even if a series is shorter than N_LAGS.
            vcc.extend(lags)
            v1.extend([variable]*len(lags))
            v2.extend([variable2]*len(lags))
            bName.extend([basin]*len(lags))

df =pd.DataFrame(data={'basin':bName,'var1':v1,'var2':v2,'cross_corr':vcc})
path = './crosscorr_dfs/basin_cc.pkl'
df.to_pickle(path)

In [3]:

# Recompute and re-plot the lagged correlation for whichever
# (basin, variable, variable2) combination is currently held in the notebook
# state (bDF / bDF2 and the loop variables defined by an earlier cell).
anoms = bDF.value.values
anoms = anoms - np.nanmean(anoms)
std = np.std(anoms)

if variable != variable2:
    # Cross-correlation: standardise the second series the same way.
    anoms2 = bDF2.value.values
    anoms2 = anoms2 - np.nanmean(anoms2)
    std2 = np.std(anoms2)
    second = anoms2/std2
    heading = variable+' lags \n'+variable2+'\ncross-correlation out to 9 months'
    target = './Figures/crosscorrelations_individual_vars/'+variable+'_'+variable2+'_'+basin+'.png'
else:
    # Autocorrelation: the series is correlated with itself.
    second = anoms/std
    heading = variable+' autocorrelation out to 9 months'
    target = './Figures/crosscorrelations_individual_vars/'+variable+'_'+basin+'.png'

corr = sm.tsa.stattools.ccf(anoms/std,second,adjusted=True,fft=False)

# Only the first nine entries of the correlation function are drawn.
plt.figure()
plt.plot(corr[:9])
plt.title(heading)
plt.savefig(target)
plt.close()

In [4]:
# For every basin and every first variable, draw one figure that overlays the
# stored cross-correlation curves against each of the other variables, and
# save it under ./Figures/crosscorrelations_all_vars/.
plt.ioff()

# Line colour used for each second variable.
colors = {
'e_gleam_mm_mon':'limegreen',
'glws2_gwater_mm':'navy',
'grace_lwe_cm':'red',
'grun_ens_runoff_mm_day':'blue',
'prec_fldas_mm_day':'cornflowerblue',
'smroot_gleam_mm3_mm3':'saddlebrown',
'smsurf_gleam_mm3_mm3':'peru',
'lai_gimms4g_m2_m2':'green'
}

os.makedirs('./Figures/crosscorrelations_all_vars/', exist_ok=True)

for basin in basins:
    for variable in variables:
        plt.figure()
        for variable2 in variable2s:
            # the autocorrelation is not part of this figure
            if variable==variable2:continue
            plt.plot(df[(df.basin==basin)&(df.var1==variable)&(df.var2==variable2)].cross_corr.values,
                     label=str(variable2),color=colors[variable2])
        plt.legend()
        plt.title(basin+'\n'+variable+' lags [X]')
        plt.savefig('./Figures/crosscorrelations_all_vars/'+variable+'_allVars_'+basin+'.png')
        # Release the figure once it is on disk; otherwise one figure per
        # basin/variable combination stays open.
        plt.close()


In [5]:
# For every variable, draw one figure overlaying its stored autocorrelation
# curve in each basin, and save it under ./Figures/autocorrelations_all_basins/.
basins = ['LIMPOPO','ORANGE','NILE','JUBBA (also GENALE WENZ)']

# Line colour used for each basin (rebinds the name `colors`).
colors = {
'JUBBA (also GENALE WENZ)':'red',
'NILE':'cornflowerblue',
'ORANGE':'peru',
'LIMPOPO':'green'
}

os.makedirs('./Figures/autocorrelations_all_basins/', exist_ok=True)

for variable in variables:
    plt.figure()
    for basin in basins:
        plt.plot(df[(df.basin==basin)&(df.var1==variable)&(df.var2==variable)].cross_corr.values,
                 label=str(basin),color=colors[basin])
    # Decorate and write the figure once, after every basin has been drawn,
    # instead of re-saving the same file on each pass of the inner loop.
    plt.legend()
    plt.title(variable+' autocorrelation')
    plt.savefig('./Figures/autocorrelations_all_basins/'+variable+'_allbasins.png')
    # Release the figure once it is on disk.
    plt.close()