## Description
_______

This script calculates statistics for the esp simulated vs. historical simulated streamflow from the ESP outputs. This alows for a hindcast to be compared to simulated data to avoid skewing the results from model error. Statistics currently being calculated are bias for each year included in the ESP analysis, correlation coefficient and RMSE and (Huang et al. 2017). These statistics are being calculated with the mean of the ensemble. 

### Import Libraries

In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from scipy.integrate import simps
from properscoring import crps_ensemble

### Inputs

In [2]:
# Define inputs for the plot
directory_path= '../project/58213_esp_results/' # directory containing ESP outputs
start_date= '04-01'       # start date for esp analysis in %Y-%m-%d
end_date= '07-31'         # end date for esp analysis in %Y-%m-%d
output_directory= '../project/simulated_stats/'   # location for the outputs
computed_path='../0058213.txt'

In [3]:
# adding swe rank
swe_rank= None #'../swe_analysis/swerank.csv' 

### Generate Plot

In [4]:
# Extract month and day from the dates
start_month, start_day = start_date.split('-')
end_month, end_day = end_date.split('-')

# Convert dates to integers
start_month, start_day = int(start_month), int(start_day)
end_month, end_day = int(end_month), int(end_day)

In [5]:
# Initialize an empty list to store the total simulated runoff for each ensemble member
all_sum_cout_series = []

# Initialize an empty list to store the total observed runoff for each ensemble member
all_sum_rout_series = []

In [6]:
# Read the DataFrame from the computed_path
sim = pd.read_csv(computed_path, sep='\t', index_col=0)

sim = sim.drop('UNITS', axis=0)

# Convert index to datetime format
sim.index = pd.to_datetime(sim.index, errors='coerce')

# Convert 'cout' column to numeric if needed
sim['cout'] = pd.to_numeric(sim['cout'], errors='coerce')

# Extract unique years from the index of sim
unique_years = sim.index.year.unique()

# Initialize an empty list to store trimmed DataFrames
trimmed_dfs = []

# Loop through each unique year
for year in unique_years:
    # Trim the DataFrame to keep only data within the specified start and end dates for each year
    year_start_date = '{}-{}'.format(year, start_date)
    year_end_date = '{}-{}'.format(year, end_date)
    trimmed_df = sim.loc[year_start_date:year_end_date]
    trimmed_dfs.append(trimmed_df)

# Concatenate the trimmed DataFrames into a single DataFrame
sim_trimmed = pd.concat(trimmed_dfs)

# Drop all columns except for 'cout'
sim_cout = sim_trimmed[['cout']]

yearly_cout_sum = sim_cout.groupby(sim_cout.index.year)['cout'].sum()

# Group by the year and sum the 'cout' values
yearly_cout_sum = sim_cout.groupby(sim_cout.index.year)['cout'].sum().reset_index()

# set date as the index
yearly_cout_sum = yearly_cout_sum.set_index('DATE')

In [7]:
# Iterate through each .nc file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.nc'):
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)
        
        # Open the esp output file
        esp = xr.open_dataset(file_path)
        
        # Convert all data variable values to float
        esp = esp.astype(float)
        
        # Convert 'DATE' coordinate to datetime format
        esp['DATE'] = pd.to_datetime(esp['DATE'])
        
        # Extract the year from the last DATE
        last_date_year = pd.to_datetime(esp['DATE'][-1].values).year
        
        # Create start_date and end_date for the analysis period
        start_date = pd.Timestamp(year=last_date_year, month=start_month, day=start_day)
        end_date = pd.Timestamp(year=last_date_year, month=end_month, day=end_day)
        
        # Select data between start_date and end_date 
        ds_selected = esp.sel(DATE=slice(start_date, end_date))
        
        # Sum 'cout' variable for each ensemble member
        sum_cout = ds_selected['cout'].median(dim='ensemble_member') # change to .median for median stats
        
        # Convert sum_cout to pandas Series
        sum_cout_series = sum_cout.to_series()
        
        # Ensure the simulated index is a DatetimeIndex and add year of analysis to series
        sum_cout_series.index = pd.to_datetime(sum_cout_series.index)
        sum_cout_series.index = sum_cout_series.index.map(lambda x: x.replace(year=last_date_year))
        
        # Append the simulated and observed series to the list
        all_sum_cout_series.append(sum_cout_series)


In [8]:
# Concatenate all sum_cout_series into a single series
sum_cout_series_combined = pd.concat(all_sum_cout_series)

In [9]:
# Convert Series to DataFrame
sim_cout_filtered = sum_cout_series_combined.reset_index()

# set date as the index
sim_cout_filtered_df = sim_cout_filtered.set_index('DATE')

In [10]:
# sum flow in analysis period
yearly_esp_sum = sim_cout_filtered_df.groupby(sim_cout_filtered_df.index.year)['cout'].sum()

# Convert Series to DataFrame
yearly_esp_sum = yearly_esp_sum.reset_index()

# set date as the index
yearly_esp_sum = yearly_esp_sum.set_index('DATE')

In [11]:
# Filter yearly_cout_sum to include only rows where the index is in yearly_esp_sum's index
yearly_cout_sum_filtered = yearly_cout_sum[yearly_cout_sum.index.isin(yearly_esp_sum.index)]

In [12]:
# Create results dataframe
results = pd.DataFrame({
    'ESP Mean Total Flow (cms)': yearly_esp_sum['cout'],
    'Simulated Total Flow (cms)': yearly_cout_sum_filtered['cout']
})

In [13]:
results

Unnamed: 0_level_0,ESP Mean Total Flow (cms),Simulated Total Flow (cms)
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1982,4361.21,4747.285
1983,3394.556,3853.725
1984,2506.436,2984.209
1985,3583.7125,4257.218
1986,4264.438,4946.56
1987,3029.9555,3422.937
1988,2669.6325,3041.764
1989,4156.97,5321.048
1990,4491.2525,5637.072
1991,4685.1315,6087.394


#### Calculate Statistics

In [14]:
# Assuming 'results' is your DataFrame
results[results < 0] = np.nan

In [15]:
# Calculate the bias for each day
results['Bias'] = results['ESP Mean Total Flow (cms)'] - results['Simulated Total Flow (cms)']

# Group by year and calculate the mean bias for each year
mean_bias_by_year = results.groupby(results.index)['Bias'].mean()

# Group by year and calculate the mean bias for each year
mean_obs_year = results.groupby(results.index)['Simulated Total Flow (cms)'].mean()

normalized_percent_bias= (mean_bias_by_year/mean_obs_year) *100

In [16]:
# Create a DataFrame called 'statistics' from the Series with index defined
statistics = pd.DataFrame(columns=['Mean Bias (cms)'], index=mean_bias_by_year.index)

In [17]:
# Populate the 'Mean Percent Bias' column with the values from 'mean_percent_bias_by_year'
statistics['Mean Bias (cms)'] = mean_bias_by_year.values

In [18]:
statistics['Mean Normalized Percent Bias (cms)'] = normalized_percent_bias.values

In [19]:
# Initialize an empty list to store correlation coefficients
correlation_coefficients = []

# Iterate over unique years in the index
for year in results.index.unique():
    # Filter the data for the current year
    year_data = results[results.index == year]
    # Calculate the correlation coefficient for the current year
    correlation_coefficient = np.corrcoef(year_data['ESP Mean Total Flow (cms)'], year_data['Simulated Total Flow (cms)'])[0, 1]
    # Append the correlation coefficient to the list
    correlation_coefficients.append(correlation_coefficient)

statistics['Correlation Coefficient'] = correlation_coefficients
    

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [20]:
# Initialize an empty list to store RMSE values
rmse_values = []
mean_flows = []

# Iterate over unique years in the index
for year in results.index.unique():
    # Filter the data for the current year
    year_data = results[results.index == year]
    # Calculate RMSE for the current year
    rmse = np.sqrt(np.mean((year_data['Simulated Total Flow (cms)'] - year_data['ESP Mean Total Flow (cms)'])**2))
    # Append the RMSE value to the list
    rmse_values.append(rmse)
    # find mean observed flow
    obs_mean= np.mean(year_data['Simulated Total Flow (cms)'])
    # append mean flows to list                  
    mean_flows.append(obs_mean)
    
    
# Create a DataFrame called 'statistics' with 'RMSE' for each year
statistics['RMSE'] = rmse_values
statistics['NRMSE']= mean_flows
statistics['NRMSE']= statistics['RMSE'] / statistics['NRMSE']

#results['RMSE'] = rmse_values
#results['NRMSE']= mean_flows
#results['NRMSE']= statistics['RMSE'] / statistics['NRMSE']

In [21]:
results

Unnamed: 0_level_0,ESP Mean Total Flow (cms),Simulated Total Flow (cms),Bias
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982,4361.21,4747.285,-386.075
1983,3394.556,3853.725,-459.169
1984,2506.436,2984.209,-477.773
1985,3583.7125,4257.218,-673.5055
1986,4264.438,4946.56,-682.122
1987,3029.9555,3422.937,-392.9815
1988,2669.6325,3041.764,-372.1315
1989,4156.97,5321.048,-1164.078
1990,4491.2525,5637.072,-1145.8195
1991,4685.1315,6087.394,-1402.2625


SWE Rank

In [22]:
if swe_rank is not None:
    # Read CSV into a DataFrame
    swe_rank_df = pd.read_csv(swe_rank, index_col=0)
    
    # Merge 'SWE_Rank' column onto 'statistics' based on indexes
    statistics= pd.merge(statistics, swe_rank_df['SWE_Rank'], left_index=True, right_index=True)
    
    
    # Move 'SWE_Rank' column to the first position
    statistics.insert(0, 'SWE_Rank', statistics.pop('SWE_Rank'))


#### Outputs

In [23]:
# define output file
statistics_filename= '232_simulated_esp_stats.csv'
stats_output_path = output_directory + statistics_filename

In [24]:
# Convert all values in DataFrames to floats
#statistics = results.astype(float)

# Round all values to two decimal places
statistics = statistics.round(2)

In [25]:
# Save bias to CSV
statistics.to_csv(stats_output_path)

PermissionError: [Errno 13] Permission denied: '../project/simulated_stats/232_simulated_esp_stats.csv'