In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple

In [2]:
def remove_invalid_values(simulated, observed, missing_value=-9999):
    """
    Drops paired entries where either input holds the missing-data sentinel.

    Arguments:
    simulated: array-like
        Simulated values.
    observed: array-like
        Observed values, aligned element-for-element with `simulated`.
    missing_value: float, optional
        Sentinel that marks a missing entry (default -9999).

    Returns:
    (simulated, observed): tuple of np.ndarray
        The entries of each input at the positions where neither input
        equals `missing_value`. NaN entries are NOT removed here, because
        NaN never compares equal to the sentinel.
    """
    # Coerce so plain Python sequences are accepted as well as ndarrays.
    simulated = np.asarray(simulated)
    observed = np.asarray(observed)

    keep = (observed != missing_value) & (simulated != missing_value)
    return simulated[keep], observed[keep]


In [3]:
def remove_nan_rows(
    array1: np.ndarray,
    array2: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Removes the entries of two aligned arrays at positions where either is NaN.

    The test is element-wise: position i is kept only when both array1[i]
    and array2[i] are not NaN. Every position, including the first, is
    treated the same way. Empty inputs are returned as empty arrays.

    Arguments:
    array1: np.ndarray
        First input array
    array2: np.ndarray
        Second input array, same shape as array1

    Returns:
    cleaned_array1: np.ndarray
        Entries of array1 at the positions where neither array is NaN
    cleaned_array2: np.ndarray
        Entries of array2 at the positions where neither array is NaN
    """
    array1 = np.asarray(array1)
    array2 = np.asarray(array2)

    # A position survives only if it is valid in BOTH arrays, so the two
    # outputs stay aligned and have equal length.
    mask = ~(np.isnan(array1) | np.isnan(array2))
    return array1[mask], array2[mask]

In [4]:
def compute_kge(simulated_array, observed_array):
    """
    Kling-Gupta Efficiency (KGE) of a simulated series against observations.

    KGE = 1 - sqrt((r - 1)^2 + (alpha - 1)^2 + (beta - 1)^2), where
    r is the Pearson correlation between the two series,
    alpha = std(simulated) / std(observed) and
    beta = mean(simulated) / mean(observed).

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: KGE value.
    """
    # r: linear correlation between observations and simulation
    r = np.corrcoef(observed_array, simulated_array)[0, 1]

    # alpha: variability ratio (np.std with its default settings)
    alpha = np.std(simulated_array) / np.std(observed_array)

    # beta: ratio of the means
    beta = np.mean(simulated_array) / np.mean(observed_array)

    # Euclidean distance of (r, alpha, beta) from the ideal point (1, 1, 1)
    distance_from_ideal = np.sqrt((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)
    return 1 - distance_from_ideal

### Inputs

In [5]:
# Folder that holds the HYPE output files to evaluate
file_path = "../opt_hype/"

# Columns of each output file holding the simulated and the observed series
sim_column = 'cout'
obs_column = 'rout'

# (start, end) date windows whose data are pooled for the calibration KGE
calibration_ranges = [
    ('1981-01-01', '1984-12-31'),
    ('1990-01-01', '1998-12-31'),
    ('2004-01-01', '2007-12-31'),
    ('2013-01-01', '2015-12-31'),
]

# (start, end) date windows whose data are pooled for the validation KGE
validation_ranges = [
    ('1985-01-01', '1989-12-31'),
    ('1999-01-01', '2003-12-31'),
    ('2008-01-01', '2012-12-31'),
]

# Per-file accumulators: one KGE per period and the file it came from
calibration_kge, validation_kge, file_names = [], [], []

### Calculate KGE

In [6]:
# Compute the calibration- and validation-period KGE for every HYPE output file.


def _read_hype_output(path, sim_col, obs_col):
    """Load one tab-separated HYPE output file as a datetime-indexed DataFrame.

    The first row after the header is dropped, `sim_col` and `obs_col` are
    cast to float, and the -9999 sentinel is replaced with NaN.
    """
    flow = pd.read_csv(path, index_col=0, sep='\t')

    # The first row after the header is not a data record
    # (presumably a units/metadata line -- TODO confirm against the file format).
    flow = flow.drop(flow.index[0])

    flow.index = pd.to_datetime(flow.index)
    flow[sim_col] = flow[sim_col].astype(float)
    flow[obs_col] = flow[obs_col].astype(float)

    # Returns a new frame instead of mutating in place.
    return flow.replace(-9999, np.nan)


def _add_day_of_year_mean(flow, obs_col):
    """Return `flow` plus an 'obs_model' column: the mean of `obs_col` per day of year.

    The per-day-of-year means are stamped onto every calendar year present in
    `flow` and left-merged on the date index, so days without a matching
    entry get NaN in 'obs_model'.
    """
    average_annual = pd.DataFrame({
        'obs': flow[obs_col].groupby(flow.index.dayofyear).mean()
    })

    # Re-key the day-of-year numbers as 'MM-DD' strings so a year can be appended.
    average_annual.index = pd.to_datetime(average_annual.index, format='%j').strftime('%m-%d')

    # Drop the final day-of-year entry
    # (presumably the day-366 slot that only leap years fill -- TODO confirm).
    average_annual = average_annual.iloc[:-1]

    merged_dfs = []
    for year in flow.index.year.unique():
        # Day-of-year means re-dated into the current year
        annual_average_year = average_annual.copy()
        annual_average_year.index = pd.to_datetime(annual_average_year.index + f"-{year}", format='%m-%d-%Y')

        # Rows of the HYPE output that fall in the current year
        benchmark_year = flow[flow.index.year == year]

        merged_dfs.append(
            pd.merge(benchmark_year, annual_average_year, left_index=True, right_index=True, how='left')
        )

    result = pd.concat(merged_dfs).rename(columns={'obs': 'obs_model'})
    if not isinstance(result.index, pd.DatetimeIndex):
        result.index = pd.to_datetime(result.index)
    return result


def _period_kge(frame, ranges, sim_col, obs_col, filename):
    """KGE of `sim_col` against `obs_col`, pooled over all (start, end) windows in `ranges`."""
    sim_parts, obs_parts = [], []
    for start_date, end_date in ranges:
        window = frame.loc[start_date:end_date]
        sim_parts.append(window[sim_col].values.astype(float))
        obs_parts.append(window[obs_col].values.astype(float))

    simulated = np.concatenate(sim_parts)
    observed = np.concatenate(obs_parts)

    # Discard pairs flagged with the -9999 sentinel, then pairs containing NaN.
    simulated, observed = remove_invalid_values(simulated, observed)
    simulated, observed = remove_nan_rows(simulated, observed)

    if len(simulated) != len(observed):
        raise ValueError(f"Observed and simulated data arrays for file {filename} have different lengths!")

    return compute_kge(simulated, observed)


# Start from empty accumulators so re-running this cell does not duplicate rows.
calibration_kge = []
validation_kge = []
file_names = []

# os.listdir returns entries in arbitrary order; sorting keeps the results
# table in the same order on every run and every platform.
for filename in sorted(os.listdir(file_path)):
    # Only files named '00*.txt' are processed.
    if not (filename.endswith(".txt") and filename.startswith("00")):
        continue

    filepath = os.path.join(file_path, filename)

    flow = _read_hype_output(filepath, sim_column, obs_column)

    # NOTE(review): 'obs_model' is not used by the KGE calculation in this cell;
    # it is kept so result_df carries the same columns as before.
    result_df = _add_day_of_year_mean(flow, obs_column)

    # Compute both scores before appending so the three lists stay aligned
    # even if one of the periods raises.
    cal_kge = _period_kge(result_df, calibration_ranges, sim_column, obs_column, filename)
    val_kge = _period_kge(result_df, validation_ranges, sim_column, obs_column, filename)

    calibration_kge.append(cal_kge)
    validation_kge.append(val_kge)
    file_names.append(filename)

In [7]:
# Lookup table: ID string -> station name
index_id_dict = {
    '58223': 'Swiftcurrent Creek at Sherburne Reservoir',
    '58213': 'St. Mary River near Babb, MT',
    '58208': 'St. Mary River at International Boundary',
    '58408': 'Milk River at Western Crossing of International Boundary',
    '58643': 'North Fork Milk River above St Mary Canal near Browning',
    '58308': 'Milk River at Eastern Crossing',
    '58346': 'Big Sandy Creek at Mouth',
    '58425': 'Clear Creek at Mouth',
    '58356': 'Lodge Creek at International Boundary',
    '58363': 'Battle Creek at International Boundary',
    '58418': 'Peoples Creek at Mouth',
    '58290': 'Frenchman River at International Boundary',
    '58328': 'Beaver Creek Bowdoin',
    '58292': 'Rock Creek at Mouth',
}

# The same data as two parallel lists, in the dictionary's insertion order
ids = list(index_id_dict.keys())
index = list(index_id_dict.values())

In [8]:
def clean_file_name(file_name):
    """Return `file_name` without a leading "00" and without a trailing ".txt".

    Each affix is removed at most once, prefix first; a name carrying
    neither is returned unchanged.
    """
    prefix_len = 2 if file_name.startswith("00") else 0
    stem = file_name[prefix_len:]

    # The suffix test runs on the already prefix-stripped text.
    end = len(stem) - 4 if stem.endswith(".txt") else len(stem)
    return stem[:end]

In [9]:
# Reduce every processed file name to its bare ID (no "00" prefix, no ".txt" suffix)
cleaned_file_names = list(map(clean_file_name, file_names))

In [10]:
# Look up the station name for every cleaned file name.
# Exactly one entry is produced per file so the list stays aligned with
# cleaned_file_names (and with the KGE lists); an ID that is missing from
# index_id_dict yields None instead of being silently dropped.
mapped_list = [index_id_dict.get(name) for name in cleaned_file_names]

In [11]:
# One row per processed file, indexed by its cleaned ID; the single column (labelled 0) holds mapped_list
results = pd.DataFrame(data=mapped_list, index=cleaned_file_names)

In [12]:
# Give the default column label 0 the readable name 'Name'
results = results.rename({0: 'Name'}, axis='columns')

In [13]:
# Attach the per-file scores: 'Cal KGE' from calibration_kge, 'Val KGE' from validation_kge
results = results.assign(**{
    'Cal KGE': calibration_kge,
    'Val KGE': validation_kge,
})

results

Unnamed: 0,Name,Cal KGE,Val KGE
58208,St. Mary River at International Boundary,0.854694,0.863095
58213,"St. Mary River near Babb, MT",0.752084,0.799242
58223,Swiftcurrent Creek at Sherburne Reservoir,0.660619,0.715382
58290,Frenchman River at International Boundary,0.332411,0.183949
58292,Rock Creek at Mouth,-0.051166,-0.99937
58308,Milk River at Eastern Crossing,0.374327,0.641805
58328,Beaver Creek Bowdoin,0.682549,0.494172
58346,Big Sandy Creek at Mouth,0.813249,0.431143
58356,Lodge Creek at International Boundary,0.436502,-0.38076
58363,Battle Creek at International Boundary,0.50726,-0.119981
