In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import hydrobm
import matplotlib.pyplot as plt
from hydrobm.calculate import calc_bm

### Inputs

In [2]:
# Locations of the model files used by this notebook.
# NOTE: the first four names are rebound to DataFrames once the files are read.
performance_metrics = '../../model/model_versions/v_7/v7_1/performance_metrics.csv'  # model scores per subbasin
pobs = '../../model/model_versions/v_7/v7_1/Pobs.txt'  # precipitation table
qobs = '../../model/model_versions/v_7/v7_1/Qobs.txt'  # streamflow table
geodata = '../../model/model_versions/v_7/v7_1/GeoData.txt'  # subbasin attributes

# Directory that receives the score tables, figures and summary list
output_dir = './'

In [3]:
# Calibration and validation windows, each given as an inclusive
# (start, end) pair of ISO date strings.
calibration_ranges = [
    ('1980-10-01', '1984-09-30'),
    ('1989-10-01', '1998-09-30'),
    ('2003-10-01', '2007-09-30'),
    ('2012-10-01', '2015-09-30'),
]

validation_ranges = [
    ('1984-10-01', '1989-09-30'),
    ('1998-10-01', '2003-09-30'),
    ('2007-10-01', '2012-09-30'),
]

In [4]:
# Benchmarks to evaluate. Only the streamflow-based ones are active;
# the precipitation-based options are kept below, commented out.
benchmarks = [
    # Streamflow benchmarks
    "mean_flow",
    "median_flow",
    "annual_mean_flow",
    "annual_median_flow",
    "monthly_mean_flow",
    "monthly_median_flow",
    "daily_mean_flow",
    "daily_median_flow",

    # Long-term rainfall-runoff ratio benchmarks
    # "rainfall_runoff_ratio_to_all",
    # "rainfall_runoff_ratio_to_annual",
    # "rainfall_runoff_ratio_to_monthly",
    # "rainfall_runoff_ratio_to_daily",
    # "rainfall_runoff_ratio_to_timestep",

    # Short-term rainfall-runoff ratio benchmarks
    # "monthly_rainfall_runoff_ratio_to_monthly",
    # "monthly_rainfall_runoff_ratio_to_daily",
    # "monthly_rainfall_runoff_ratio_to_timestep",

    # Schaefli & Gupta (2007) benchmarks
    # "scaled_precipitation_benchmark",  # equivalent to "rainfall_runoff_ratio_to_daily"
    # "adjusted_precipitation_benchmark",
    # "adjusted_smoothed_precipitation_benchmark",
]

# Scores computed for every benchmark (could also use 'mse', 'rmse')
metrics = ['nse', 'kge']

### Analysis

Prepare Data

In [5]:
# Load the input tables; the first column of each file becomes the index.
# Every path variable is rebound to the DataFrame read from it, so this
# cell cannot be re-run without first re-running the path definitions.
performance_metrics = pd.read_csv(performance_metrics, index_col=0)

# The three model text files are tab-separated
pobs = pd.read_csv(pobs, sep='\t', index_col=0)
qobs = pd.read_csv(qobs, sep='\t', index_col=0)
geodata = pd.read_csv(geodata, sep='\t', index_col=0)

In [6]:
# Give both time-series tables a datetime row index and integer column labels
for frame in (pobs, qobs):
    frame.index = pd.to_datetime(frame.index)
    frame.columns = frame.columns.astype(int)

# Integer index on the subbasin table, matching the integer column labels above
geodata.index = geodata.index.astype(int)

In [7]:
# Index of the performance table: one entry per evaluated subbasin
subbasin_ids = performance_metrics.index

In [8]:
# Turn every (start, end) pair into a pair of pandas Timestamps
calibration_ranges = [
    (pd.Timestamp(period_start), pd.Timestamp(period_end))
    for period_start, period_end in calibration_ranges
]
validation_ranges = [
    (pd.Timestamp(period_start), pd.Timestamp(period_end))
    for period_start, period_end in validation_ranges
]

In [9]:
# Directed river network: one edge per subbasin, pointing at the subbasin
# it drains into (the 'maindown' column of geodata).
riv_graph = nx.DiGraph()

# Register every subbasin as a node so that subbasins without any link
# (an outlet with nothing upstream) can still be queried in the graph.
riv_graph.add_nodes_from(geodata.index)

# Coerce the downstream IDs to numbers: comparing a numeric 'maindown'
# against the string '0' is always unequal, which would link every outlet
# to a phantom node 0.
downstream_ids = pd.to_numeric(geodata['maindown'], errors='coerce')

for subbasin_id, downstream_id in downstream_ids.items():
    # 0 (or a missing value) is treated as "no downstream subbasin"
    if pd.notna(downstream_id) and int(downstream_id) != 0:
        riv_graph.add_edge(subbasin_id, int(downstream_id))


In [10]:
# Treat the -9999 sentinel in the streamflow table as missing data
qobs = qobs.replace(-9999, np.nan)

In [11]:
# Convert streamflow from a rate (m3/s) to a volume per day (m3):
# 86400 is the number of seconds in a day.
# NOTE(review): presumes one row per day - confirm against Qobs.txt.
qobs = qobs.mul(86400)

In [12]:
# Make sure the area column is numeric
geodata = geodata.assign(area=pd.to_numeric(geodata['area']))

# Lookup table: subbasin ID (geodata index) -> area
area_dict = geodata['area'].to_dict()

In [13]:
# Convert precipitation depth to a volume: first mm -> m ...
pobs = pobs.div(1000)

# ... then scale each subbasin column by that subbasin's area.
# Columns without an entry in area_dict are left as depths in m.
for col in pobs.columns:
    subbasin_area = area_dict.get(col)
    if subbasin_area is not None:
        pobs[col] *= subbasin_area

### Calculate Benchmarks

In [14]:
# Helper that picks the best-scoring benchmarks for plotting
def top_n_indices_and_values(values_list, n=4):
    """Return the positions and values of the `n` largest non-NaN entries.

    Parameters
    ----------
    values_list : sequence of float
        Scores to rank; NaN entries are ignored.
    n : int, default 4
        Number of entries to keep.

    Returns
    -------
    (list of int, list of float)
        Positions in `values_list` and the matching values, ordered from
        the smallest to the largest of the selected scores. Fewer than `n`
        items are returned when there are not enough non-NaN entries.
    """
    scores_arr = np.array(values_list)
    ascending = scores_arr.argsort()  # NaNs are placed last by argsort
    ranked = ascending[~np.isnan(scores_arr[ascending])]  # drop NaN positions
    best = ranked[-n:]
    return best.tolist(), scores_arr[best].tolist()

In [16]:
# Collects one message per (subbasin, metric, benchmark) combination in
# which the model scores below the benchmark
worse_than_benchmark_list = list()

In [17]:
def _period_mask(index, ranges):
    """Return a boolean Series over `index`, True where the timestamp lies
    inside any of the inclusive (start, end) `ranges`."""
    inside = np.zeros(len(index), dtype=bool)
    for start, end in ranges:
        inside |= (index >= pd.Timestamp(start)) & (index <= pd.Timestamp(end))
    return pd.Series(inside, index=index)


# Row label in the score table -> column of performance_metrics holding
# the model's own score for that metric/period
model_score_columns = {
    'kge_cal': 'Cal KGE',
    'kge_val': 'Val KGE',
    'nse_cal': 'Cal NSE',
    'nse_val': 'Val NSE',
}

# For each evaluated subbasin: compute the benchmarks, save the score table
# and a plot of the four best benchmarks, and record every case in which
# the model scores below a benchmark.
for subbasin in performance_metrics.index:

    # All segments draining to this subbasin's outlet. nx.ancestors() does
    # not include the node itself, so the subbasin is added explicitly;
    # otherwise its own precipitation is dropped and headwater subbasins
    # end up with an all-zero precipitation series.
    contributing_segments = [subbasin, *nx.ancestors(riv_graph, subbasin)]

    # Total precipitation over the contributing segments per time step
    precipitation_sum = pobs[contributing_segments].sum(axis=1)

    # Create hydrobm input dataframe
    bm_input = pd.DataFrame({
        'streamflow': qobs[subbasin],        # Streamflow for the given subbasin
        'precipitation': precipitation_sum,  # Contributing-area precipitation
    })

    # Flag the time steps belonging to the calibration / validation periods
    bm_input['cal_mask'] = _period_mask(bm_input.index, calibration_ranges)
    bm_input['val_mask'] = _period_mask(bm_input.index, validation_ranges)

    # Calculate the benchmarks and scores
    benchmark_flows, scores = calc_bm(
        bm_input,

        # Time period selection
        bm_input['cal_mask'],
        val_mask=bm_input['val_mask'],

        # Variable names in 'data'
        precipitation="precipitation",
        streamflow="streamflow",

        # Benchmark choices
        benchmarks=benchmarks,
        metrics=metrics,
        optimization_method="brute_force",

        # Snow model inputs
        calc_snowmelt=False,
        temperature="temperature_2m_mean",
        snowmelt_threshold=0.0,
        snowmelt_rate=3.0,
    )

    # Four best validation-KGE benchmarks (ordered worst to best of the four)
    top_positions, top_kge_vals = top_n_indices_and_values(scores['kge_val'], 4)
    top_benchmarks = [scores['benchmarks'][pos] for pos in top_positions]

    # Reformat the scores for cleaner saving: one row per metric/period,
    # one column per benchmark
    col_names = scores.pop("benchmarks", None)
    df = pd.DataFrame(scores, index=col_names).T

    # Add the model_results column with performance metrics for the subbasin
    df['model_results'] = None  # Initialize the column
    for score_row, metric_column in model_score_columns.items():
        df.loc[score_row, 'model_results'] = performance_metrics.loc[subbasin, metric_column]

    # Save the DataFrame to CSV in the output directory
    df_filepath = os.path.join(output_dir, f"subbasin_{subbasin}_scores.csv")
    df.to_csv(df_filepath)

    # Plot streamflow along with the four best benchmarks
    fig, ax = plt.subplots(4, 1, figsize=(14, 14))
    for i, (bm, kge) in enumerate(zip(top_benchmarks, top_kge_vals)):
        bm_input['streamflow'].plot(ax=ax[i], linewidth=2, label='streamflow')
        benchmark_flows[f'bm_{bm}'].plot(ax=ax[i], label=bm)
        ax[i].legend(loc='upper left')
        ax[i].set_title(f"{subbasin}_{bm}_(BM Val KGE: {kge:.2f}, Model Val KGE: {df.loc['kge_val', 'model_results']:.2f})")
        ax[i].set_xlabel('')  # Drops 'Date'

    plt.tight_layout()

    # Save the figure to the output directory
    fig_filepath = os.path.join(output_dir, f"subbasin_{subbasin}_benchmark_plot.png")
    fig.savefig(fig_filepath)
    plt.close(fig)  # Close the figure after saving to free memory

    # Report every metric/benchmark pair in which the model scores lower.
    # NOTE: "lower is worse" holds for nse/kge; error metrics such as
    # mse/rmse would need the comparison reversed.
    for metric_name, row in df.iterrows():
        model_score = row['model_results']
        if pd.isna(model_score):
            # No model score for this row (None cannot be compared to a float)
            continue
        for col in df.columns:
            if col != 'model_results' and model_score < row[col]:
                message = f"{subbasin}_{metric_name} is worse than benchmark in {col}"
                print(message)  # Print the message
                worse_than_benchmark_list.append(message)  # Append the message to the list

58643_nse_val is worse than benchmark in mean_flow
58643_nse_val is worse than benchmark in median_flow
58643_nse_val is worse than benchmark in annual_mean_flow
58643_nse_val is worse than benchmark in annual_median_flow
58643_nse_val is worse than benchmark in monthly_mean_flow
58643_nse_val is worse than benchmark in monthly_median_flow
58643_nse_val is worse than benchmark in daily_mean_flow
58643_nse_val is worse than benchmark in daily_median_flow
58363_nse_val is worse than benchmark in mean_flow
58363_nse_val is worse than benchmark in median_flow
58363_nse_val is worse than benchmark in annual_mean_flow
58363_nse_val is worse than benchmark in annual_median_flow
58363_nse_val is worse than benchmark in monthly_mean_flow
58363_nse_val is worse than benchmark in monthly_median_flow
58363_nse_val is worse than benchmark in daily_mean_flow
58363_nse_val is worse than benchmark in daily_median_flow
58308_nse_val is worse than benchmark in mean_flow
58308_nse_val is worse than bench

In [20]:
# Destination of the summary list inside output_dir
output_file_path = os.path.join(output_dir, 'worse_than_benchmark_list.txt')

# Write the collected messages, one per line
with open(output_file_path, 'w') as file:
    file.writelines(f"{line}\n" for line in worse_than_benchmark_list)

print(f"Saved the list to {output_file_path}")

Saved the list to ./worse_than_benchmark_list.txt
