Note: make the top line `#!/usr/bin/env python`

In [1]:
import subprocess
import os
import pandas as pd
import numpy as np
from typing import Tuple

In [2]:
# Report the working directory; the relative paths used further down are resolved against it
cwd = os.getcwd()
print('Here:', cwd)

Here: C:\Users\Paul Coderre\Documents\github\SMM_Models\hype\scripts\analysis_scripts


In [3]:
def remove_invalid_values(simulated, observed):
    """
    Drops every position where either array holds the -9999 sentinel.

    Parameters:
        simulated (numpy.ndarray): Array of simulated values.
        observed (numpy.ndarray): Array of observed values.

    Returns:
        tuple: (simulated, observed) restricted to the positions where
        neither input equals -9999.
    """
    has_sentinel = (observed == -9999) | (simulated == -9999)
    keep = ~has_sentinel
    return simulated[keep], observed[keep]


In [4]:
def remove_nan_rows(
    array1: np.ndarray,
    array2: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Removes the positions from two arrays where either array has a NaN value.

    A position is kept only when both arrays hold a non-NaN value there; the
    first position follows the same rule as every other one. Empty inputs
    are returned as empty arrays.

    Arguments:
    array1: np.ndarray
        First input array
    array2: np.ndarray
        Second input array, same shape as array1

    Returns:
    cleaned_array1: np.ndarray
        array1 without the positions where either input is NaN
    cleaned_array2: np.ndarray
        array2 without the positions where either input is NaN

    Raises:
    ValueError
        If the two inputs do not have the same shape
    """
    array1 = np.asarray(array1)
    array2 = np.asarray(array2)

    # The arrays are paired position by position, so they have to line up
    if array1.shape != array2.shape:
        raise ValueError(
            f"array1 and array2 must have the same shape, got {array1.shape} and {array2.shape}"
        )

    # One shared mask keeps the two outputs aligned with each other
    keep = ~(np.isnan(array1) | np.isnan(array2))
    return array1[keep], array2[keep]

In [5]:
def compute_kge(simulated_array, observed_array):
    """
    Computes KGE (Kling-Gupta Efficiency) of simulated values against observed values.

    Three components are combined: the Pearson correlation between the two
    series, the ratio of their standard deviations (simulated / observed)
    and the ratio of their means (simulated / observed). There is no guard
    against an observed standard deviation or mean of zero.

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: KGE value.
    """
    # Correlation term: off-diagonal entry of the 2x2 correlation matrix
    r = np.corrcoef(observed_array, simulated_array)[0, 1]

    # Variability term
    alpha = np.std(simulated_array) / np.std(observed_array)

    # Bias term
    beta = np.mean(simulated_array) / np.mean(observed_array)

    # Distance of (r, alpha, beta) from the ideal point (1, 1, 1)
    squared_distance = sum((component - 1) ** 2 for component in (r, alpha, beta))
    return 1 - np.sqrt(squared_distance)

In [6]:
def compute_bias(simulated_array, observed_array):
    """
    Computes the percent bias of simulated values relative to observed values.

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: Difference of the means (simulated - observed) expressed as a
        percentage of the observed mean.
    """
    observed_mean = np.mean(observed_array)
    mean_difference = np.mean(simulated_array) - observed_mean

    # Scale the difference by the observed mean and express it in percent
    return (mean_difference / observed_mean) * 100

In [7]:
def compute_nse(simulated_array, observed_array):
    """
    Computes NSE (Nash-Sutcliffe Efficiency) of simulated values against observed values.

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: One minus the ratio of the squared simulation error to the
        squared deviation of the observations from their own mean.
    """
    # Squared error of the simulation
    residuals = observed_array - simulated_array
    error_sum_of_squares = np.sum(residuals ** 2)

    # Squared deviation of the observations around their mean
    anomalies = observed_array - np.mean(observed_array)
    total_sum_of_squares = np.sum(anomalies ** 2)

    return 1 - (error_sum_of_squares / total_sum_of_squares)

In [8]:
# Directory where the HYPE outputs are saved; relative to the current working directory
file_path: str = "../../model/model_versions/v_6/v_6_2/results/"

In [9]:
# (start, end) date strings of the calibration periods
calibration_ranges: list = [
    ('1980-10-01', '1984-09-30'),
    ('1989-10-01', '1998-09-30'),
    ('2003-10-01', '2007-09-30'),
    ('2012-10-01', '2015-09-30'),
]

In [10]:
# (start, end) date strings of the validation periods
validation_ranges: list = [
    ('1984-10-01', '1989-09-30'),
    ('1998-10-01', '2003-09-30'),
    ('2007-10-01', '2012-09-30'),
]

In [11]:
# Calibration-period KGE, one value appended per processed file
calibration_kge: list = []

In [12]:
# Validation-period KGE, one value appended per processed file
validation_kge: list = []

In [13]:
# Calibration-period percent bias, one value appended per processed file
calibration_bias: list = []

In [14]:
# Validation-period percent bias, one value appended per processed file
validation_bias: list = []

In [15]:
# Calibration-period NSE, one value appended per processed file
calibration_nse: list = []

In [16]:
# Validation-period NSE, one value appended per processed file
validation_nse: list = []

In [17]:
# Names of the processed files, in the same order as the metric lists
file_names: list = []

In [18]:
# Iterate through files in the output directory and score each one

def _paired_flows(flows, date_ranges, period_label, source_name):
    """
    Collects the paired simulated ('cout') and observed ('rout') values of
    one file over a set of date ranges.

    Parameters:
        flows (pandas.DataFrame): Frame indexed by date with 'cout' and 'rout' columns.
        date_ranges (list): (start, end) pairs; each pair is used as a .loc label slice.
        period_label (str): Period name, only used in the error message.
        source_name (str): File name, only used in the error message.

    Returns:
        tuple: (simulated, observed) float arrays without the -9999 and NaN pairs.

    Raises:
        ValueError: If no valid pair is left for the period.
    """
    windows = [flows.loc[start_date:end_date] for start_date, end_date in date_ranges]
    simulated = np.concatenate([window['cout'].values.astype(float) for window in windows])
    observed = np.concatenate([window['rout'].values.astype(float) for window in windows])

    # Remove invalid values (-9999) after concatenating arrays
    simulated, observed = remove_invalid_values(simulated, observed)

    # Only look for NaN when something survived the -9999 filter
    if simulated.size:
        simulated, observed = remove_nan_rows(simulated, observed)

    # Metrics on an empty series are meaningless, so stop with a message that names the file
    if simulated.size == 0:
        raise ValueError(
            f"No valid simulated/observed pairs for file {source_name} in the {period_label} period!"
        )
    return simulated, observed


# Empty the accumulators first so that re-running this cell does not append
# a second copy of every file's metrics
for _accumulator in (calibration_kge, validation_kge,
                     calibration_bias, validation_bias,
                     calibration_nse, validation_nse,
                     file_names):
    _accumulator.clear()

# os.listdir returns names in arbitrary order; sorting makes the run order reproducible
for filename in sorted(os.listdir(file_path)):
    # Process files with prefix '00' and end with '.txt'
    if not (filename.endswith(".txt") and filename.startswith("00")):
        continue
    filepath = os.path.join(file_path, filename)

    # Read tab-separated file into DataFrame
    df = pd.read_csv(filepath, sep='\t', index_col=0)
    # Drop the first row (presumably a units row under the header - TODO confirm)
    df = df.iloc[1:]

    # Convert the index to datetime if it's not already in datetime format
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    # Paired, cleaned series for the calibration and validation periods
    simulated_array_cal, observed_array_cal = _paired_flows(
        df, calibration_ranges, 'calibration', filename)
    simulated_array_val, observed_array_val = _paired_flows(
        df, validation_ranges, 'validation', filename)

    # Calculate KGE, percent bias and NSE for both periods of the current file
    cal_kge = compute_kge(simulated_array_cal, observed_array_cal)
    val_kge = compute_kge(simulated_array_val, observed_array_val)
    cal_bias = compute_bias(simulated_array_cal, observed_array_cal)
    val_bias = compute_bias(simulated_array_val, observed_array_val)
    cal_nse = compute_nse(simulated_array_cal, observed_array_cal)
    val_nse = compute_nse(simulated_array_val, observed_array_val)

    # Save the metrics; every list gets exactly one entry per file, so they stay aligned
    calibration_kge.append(cal_kge)
    validation_kge.append(val_kge)
    calibration_bias.append(cal_bias)
    validation_bias.append(val_bias)
    calibration_nse.append(cal_nse)
    validation_nse.append(val_nse)
    file_names.append(filename)

In [19]:
# One row per gauge: (id, location, gauge id, drainage area in km2)
_GAUGE_ROWS = [
    ('58643', 'North Fork Milk River above St Mary Canal near Browning', 'NFKMR', 148),
    ('58425', 'Clear Creek at Mouth', 'CLCMO', 351),
    ('58418', 'Peoples Creek at Mouth', 'PPCMO', 1817),
    ('58408', 'Milk River at Western Crossing of International Boundary', 'MRWIB', 1054),
    ('58398', 'Rock Creek at Mouth', 'RKCMO', 2298),
    ('58363', 'Battle Creek at International Boundary', 'BTCIB', 2120),
    ('58356', 'Lodge Creek at International Boundary', 'LDCIB', 3897),
    ('58346', 'Big Sandy Creek at Mouth', 'BSCMO', 4403),
    ('58328', 'Beaver Creek Bowdoin', 'BCHMO', 6503),
    ('58308', 'Milk River at Eastern Crossing', 'MREIB', 3393),
    ('58290', 'Frenchman River at International Boundary', 'FRRIB', 5546),
    ('58223', 'Swiftcurrent Creek at Sherburne Reservoir', 'SWCSB', 80),
    ('58213', 'St. Mary River near Babb, MT', 'SMRBB', 711),
    ('58208', 'St. Mary River at International Boundary', 'SMRIB', 1217),
]

# Lookup from the id string to that gauge's descriptive fields
index_id_dict = {
    gauge_key: {'Location': location, 'Gauge ID': gauge_id, 'Drainage Area (km2)': area_km2}
    for gauge_key, location, gauge_id, area_km2 in _GAUGE_ROWS
}

In [20]:
def clean_file_name(file_name):
    """
    Strips a leading "00" and a trailing ".txt" from a file name.

    Each part is only removed when present; the suffix check is made on the
    name that is left after the prefix has been removed.

    Parameters:
        file_name (str): Name to clean.

    Returns:
        str: The name without the "00" prefix and the ".txt" suffix.
    """
    start = 2 if file_name.startswith("00") else 0
    trimmed = file_name[start:]
    stop = -4 if trimmed.endswith(".txt") else None
    return trimmed[:stop]

In [21]:
# Clean every collected file name, keeping the original order
cleaned_file_names = list(map(clean_file_name, file_names))

In [22]:
# Create a new list populated by the corresponding dictionary item for each string in cleaned_file_names.
# Names without an entry are reported here: skipping them silently would leave
# mapped_list shorter than cleaned_file_names and the metric lists it is combined with.
missing_ids = [name for name in cleaned_file_names if name not in index_id_dict]
if missing_ids:
    raise KeyError(f"No entry in index_id_dict for: {missing_ids}")

mapped_list = [index_id_dict[name] for name in cleaned_file_names]

In [23]:
# One row per processed file, labelled by its cleaned file name
results = pd.DataFrame(
    data=mapped_list,
    index=cleaned_file_names,
)

In [24]:
# Rename the column labelled 0 to 'Name'; rename skips labels that are absent.
# NOTE(review): the rows come from index_id_dict, whose keys are 'Location',
# 'Gauge ID' and 'Drainage Area (km2)', so this looks like a no-op - confirm and drop.
results = results.rename({0: 'Name'}, axis='columns')

In [25]:
# Attach the per-file metrics: calibration columns first, then validation.
# Each list is expected to hold one value per row of results.
metric_columns = {
    'Cal NSE': calibration_nse,
    'Cal KGE': calibration_kge,
    'Cal Bias': calibration_bias,
    'Val NSE': validation_nse,
    'Val KGE': validation_kge,
    'Val Bias': validation_bias,
}

for column_name, column_values in metric_columns.items():
    results[column_name] = column_values


In [26]:
# Convert index to integer
results.index = results.index.astype(int)

# Sort by index in descending order
results = results.sort_index(ascending=False)

# Save the metrics table in the directory the model outputs were read from.
# The path is built from file_path so the CSV follows that setting instead of
# a second hard-coded copy of the directory.
results.to_csv(os.path.join(file_path, 'performance_metrics.csv'))

In [27]:
# Show the metrics table as the cell output
results

Unnamed: 0,Location,Drainage Area (km2),Cal NSE,Cal KGE,Cal Bias,Val NSE,Val KGE,Val Bias
58643,North Fork Milk River above St Mary Canal near...,148,0.213978,0.379196,-45.294624,-0.212959,0.38371,-35.905298
58425,Clear Creek at Mouth,351,0.831686,0.765334,15.001714,0.404716,0.519064,3.564819
58418,Peoples Creek at Mouth,1817,0.213067,0.255043,12.873638,0.459852,0.39134,15.48742
58408,Milk River at Western Crossing of Internationa...,1054,0.358931,0.488669,-4.65901,0.372787,0.454257,-3.73774
58398,Rock Creek at Mouth,2298,0.148976,0.287762,-7.294314,0.069254,0.24411,7.261184
58363,Battle Creek at International Boundary,2120,0.316495,0.324967,0.901512,-0.199505,0.08083,77.018296
58356,Lodge Creek at International Boundary,3897,0.224769,0.172799,-10.42552,0.339543,-0.061602,93.226531
58346,Big Sandy Creek at Mouth,4403,0.598116,0.595735,35.478258,0.463222,0.399952,53.417107
58328,Beaver Creek Bowdoin,6503,0.592496,0.606512,-17.39697,0.076896,0.415078,-8.984903
58308,Milk River at Eastern Crossing,3393,0.162128,0.222347,-22.955302,-0.049926,0.486183,6.727465


In [28]:
# Row labels (integer index of results) of the gauges in each group
stmary_ids = [58213, 58223, 58208]
milk_ids = [58408, 58643, 58308, 58346, 58425, 58356, 58363, 58418, 58290, 58328, 58398]

# Sub-tables in the listed order; .loc raises KeyError if a label is missing
stmary = results.loc[stmary_ids]
milk = results.loc[milk_ids]

In [29]:
# Mean KGE and NSE over every row of results, calibration before validation
for metric in ('KGE', 'NSE'):
    for period, prefix in (('Calibration', 'Cal'), ('Validation', 'Val')):
        column = f'{prefix} {metric}'
        print(f'Mean Basin {period} period {metric} = {results[column].mean()}')

Mean Basin Calibration period KGE = 0.48758577280176013
Mean Basin Validation period KGE = 0.43417482885289055
Mean Basin Calibration period NSE = 0.4626253763438763
Mean Basin Validation period NSE = 0.306989727956244


In [30]:
# Mean KGE and NSE over the stmary rows, calibration before validation
for metric in ('KGE', 'NSE'):
    for period, prefix in (('Calibration', 'Cal'), ('Validation', 'Val')):
        column = f'{prefix} {metric}'
        print(f'Mean St. Mary {period} period {metric} = {stmary[column].mean()}')

Mean St. Mary Calibration period KGE = 0.7984589852746526
Mean St. Mary Validation period KGE = 0.8486994714876928
Mean St. Mary Calibration period NSE = 0.8099659483212287
Mean St. Mary Validation period NSE = 0.8500576934246961


In [31]:
# Mean KGE and NSE over the milk rows, calibration before validation
for metric in ('KGE', 'NSE'):
    for period, prefix in (('Calibration', 'Cal'), ('Validation', 'Val')):
        column = f'{prefix} {metric}'
        print(f'Mean Milk {period} period {metric} = {milk[column].mean()}')

Mean Milk Calibration period KGE = 0.40280216940006214
Mean Milk Validation period KGE = 0.32112265358885356
Mean Milk Calibration period NSE = 0.367896129440962
Mean Milk Validation period NSE = 0.1588802828284843
