Note: make top line #!/usr/bin/env python

In [1]:
import subprocess
import os
import pandas as pd
import numpy as np
from typing import Tuple

In [2]:
# Show the current working directory; the relative paths used later in this
# notebook (model output folder, results CSV) are resolved against it.
cwd = os.getcwd()
print(f'Here: {cwd}')

Here: C:\Users\Paul Coderre\Documents\github\SMM_Models\hype\scripts\analysis_scripts


In [3]:
def remove_invalid_values(simulated, observed):
    """
    Drops every position where either series carries the -9999 sentinel.

    Parameters:
        simulated (numpy.ndarray): Array of simulated values.
        observed (numpy.ndarray): Array of observed values, compared element-wise with `simulated`.

    Returns:
        tuple: (simulated, observed) restricted to the positions where neither input equals -9999.
    """
    has_sentinel = (observed == -9999) | (simulated == -9999)
    keep = ~has_sentinel
    return simulated[keep], observed[keep]


In [4]:
def remove_nan_rows(
    array1: np.ndarray,
    array2: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Removes the positions at which either of two aligned arrays holds NaN.

    A position is kept only when both arrays have a non-NaN value there, so the
    two returned arrays stay aligned with each other. The first position is
    treated like any other. Empty inputs are returned as empty arrays.

    Arguments:
    array1: np.ndarray
        First input array
    array2: np.ndarray
        Second input array, aligned element-wise with array1

    Returns:
    cleaned_array1: np.ndarray
        array1 without the positions where either input is NaN
    cleaned_array2: np.ndarray
        array2 without the positions where either input is NaN
    """
    array1 = np.asarray(array1)
    array2 = np.asarray(array2)

    # Built from whole-array operations only, so zero-length input is handled
    # without indexing element 0.
    mask = ~(np.isnan(array1) | np.isnan(array2))
    cleaned_array1 = array1[mask]
    cleaned_array2 = array2[mask]
    return cleaned_array1, cleaned_array2

In [5]:
def compute_kge(simulated_array, observed_array):
    """
    Evaluates the Kling-Gupta Efficiency (KGE) of a simulation against observations.

    The score is 1 minus the Euclidean distance from the ideal point (1, 1, 1)
    of three components: the Pearson correlation, the ratio of standard
    deviations (simulated / observed) and the ratio of means (simulated / observed).

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: KGE value.
    """
    # Linear correlation between the two series
    r = np.corrcoef(observed_array, simulated_array)[0, 1]

    # Variability component: spread of the simulation relative to the observations
    alpha = np.std(simulated_array) / np.std(observed_array)

    # Bias component: mean of the simulation relative to the observations
    beta = np.mean(simulated_array) / np.mean(observed_array)

    squared_distance = (r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2
    return 1 - np.sqrt(squared_distance)

In [6]:
def compute_bias(simulated_array, observed_array):
    """
    Reports how far the simulated mean departs from the observed mean, in percent.

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: Percent bias, i.e. 100 * (mean(simulated) - mean(observed)) / mean(observed).
    """
    obs_mean = np.mean(observed_array)

    # Signed difference of the means; positive when the simulation is higher on average
    mean_difference = np.mean(simulated_array) - obs_mean

    return (mean_difference / obs_mean) * 100

In [7]:
def compute_nse(simulated_array, observed_array):
    """
    Evaluates the Nash-Sutcliffe Efficiency (NSE) of a simulation against observations.

    Parameters:
        simulated_array (numpy.ndarray): Array of simulated values.
        observed_array (numpy.ndarray): Array of observed values.

    Returns:
        float: NSE value, 1 minus the ratio of the squared simulation error to
        the squared deviation of the observations from their own mean.
    """
    # Sum of squared differences between observed and simulated values
    residual_ss = np.sum((observed_array - simulated_array) ** 2)

    # Sum of squared differences between the observations and their mean
    total_ss = np.sum((observed_array - np.mean(observed_array)) ** 2)

    return 1 - (residual_ss / total_ss)

In [8]:
# Folder holding the HYPE output files that the file loop reads
file_path = "../../model/06_HYPE_Final/final_hds/data/"

In [9]:
# (start, end) date pairs that together make up the calibration period
calibration_ranges = [
    ('1980-10-01', '1984-09-30'),
    ('1989-10-01', '1998-09-30'),
    ('2003-10-01', '2007-09-30'),
    ('2012-10-01', '2015-09-30'),
]

In [10]:
# (start, end) date pairs that together make up the validation period
validation_ranges = [
    ('1984-10-01', '1989-09-30'),
    ('1998-10-01', '2003-09-30'),
    ('2007-10-01', '2012-09-30'),
]

In [11]:
# Calibration-period KGE scores; the file loop appends one value per processed file
calibration_kge = []

In [12]:
# Validation-period KGE scores; the file loop appends one value per processed file
validation_kge = []

In [13]:
# Calibration-period percent-bias values; the file loop appends one value per processed file
calibration_bias = []

In [14]:
# Validation-period percent-bias values; the file loop appends one value per processed file
validation_bias = []

In [15]:
# Calibration-period NSE scores; the file loop appends one value per processed file
calibration_nse = []

In [16]:
# Validation-period NSE scores; the file loop appends one value per processed file
validation_nse = []

In [17]:
# Names of the processed files, in the same order as the per-file metric lists
file_names = []

In [18]:
# Reset the accumulators so that re-running this cell does not append a second
# copy of every score (which would break the later assignment into `results`).
calibration_kge, validation_kge = [], []
calibration_bias, validation_bias = [], []
calibration_nse, validation_nse = [], []
file_names = []


def _collect_period(df, date_ranges, filename):
    """
    Gathers the simulated and observed series of one evaluation period.

    Arguments:
    df: pd.DataFrame
        One file's table, indexed by date, with 'cout' (simulated) and
        'rout' (observed) columns
    date_ranges: list of (start, end) tuples
        Date windows that together form the period
    filename: str
        Name of the file being processed, used in the error message

    Returns:
    simulated: np.ndarray
        'cout' values of all windows, without the -9999 and NaN entries
    observed: np.ndarray
        'rout' values at the same positions

    Raises:
    ValueError
        If the two cleaned series end up with different lengths
    """
    windows = [df.loc[start_date:end_date] for start_date, end_date in date_ranges]

    # Convert each window to float arrays and join the windows end to end
    simulated = np.concatenate([window['cout'].values.astype(float) for window in windows])
    observed = np.concatenate([window['rout'].values.astype(float) for window in windows])

    # Remove invalid values (-9999), then positions holding NaN
    simulated, observed = remove_invalid_values(simulated, observed)
    simulated, observed = remove_nan_rows(simulated, observed)

    # Check if both arrays have the same length
    if len(simulated) != len(observed):
        raise ValueError(f"Observed and simulated data arrays for file {filename} have different lengths!")
    return simulated, observed


# Iterate through files in the output directory. The listing is sorted because
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(file_path)):
    # Process files with prefix '00' and end with '.txt'
    if not (filename.endswith(".txt") and filename.startswith("00")):
        continue
    filepath = os.path.join(file_path, filename)

    # Read tab-separated file into DataFrame
    df = pd.read_csv(filepath, sep='\t', index_col=0)
    # Drop the first data row (presumably a units line under the header — TODO confirm)
    df = df.iloc[1:]

    # Convert the index to datetime if it's not already in datetime format
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    # Cleaned simulated/observed series for the calibration and validation periods
    simulated_array_cal, observed_array_cal = _collect_period(df, calibration_ranges, filename)
    simulated_array_val, observed_array_val = _collect_period(df, validation_ranges, filename)

    # KGE for the current file
    calibration_kge.append(compute_kge(simulated_array_cal, observed_array_cal))
    validation_kge.append(compute_kge(simulated_array_val, observed_array_val))

    # Percent bias for the current file
    calibration_bias.append(compute_bias(simulated_array_cal, observed_array_cal))
    validation_bias.append(compute_bias(simulated_array_val, observed_array_val))

    # NSE for the current file
    calibration_nse.append(compute_nse(simulated_array_cal, observed_array_cal))
    validation_nse.append(compute_nse(simulated_array_val, observed_array_val))

    file_names.append(filename)

In [19]:
# Station names keyed by station id (ids are kept as strings)
index_id_dict = {
    '58223': 'Swiftcurrent Creek at Sherburne Reservoir',
    '58213': 'St. Mary River near Babb, MT',
    '58208': 'St. Mary River at International Boundary',
    '58408': 'Milk River at Western Crossing of International Boundary',
    '58643': 'North Fork Milk River above St Mary Canal near Browning',
    '58308': 'Milk River at Eastern Crossing',
    '58346': 'Big Sandy Creek at Mouth',
    '58425': 'Clear Creek at Mouth',
    '58356': 'Lodge Creek at International Boundary',
    '58363': 'Battle Creek at International Boundary',
    '58418': 'Peoples Creek at Mouth',
    '58290': 'Frenchman River at International Boundary',
    '58328': 'Beaver Creek Bowdoin',
    '58398': 'Rock Creek at Mouth',
}

# The ids and the names as parallel lists, in the order given above
ids = list(index_id_dict.keys())
index = list(index_id_dict.values())

In [20]:
def clean_file_name(file_name):
    """
    Strips a leading "00" and a trailing ".txt" from a file name.

    Each part is removed only when present; the suffix check is made on the
    name after the prefix has been removed.
    """
    start = 2 if file_name.startswith("00") else 0
    trimmed = file_name[start:]
    if trimmed.endswith(".txt"):
        trimmed = trimmed[:-4]
    return trimmed

In [21]:
# Apply clean_file_name to every processed file name, keeping the original order
cleaned_file_names = list(map(clean_file_name, file_names))

In [22]:
# Look up the station name for every cleaned file name. A name that is missing
# from index_id_dict falls back to the cleaned name itself, so this list always
# has exactly one entry per file and stays aligned with cleaned_file_names and
# with the per-file metric lists.
mapped_list = [index_id_dict.get(name, name) for name in cleaned_file_names]

In [23]:
# One row per processed file: the station name, indexed by the cleaned file name
results = pd.DataFrame(data=mapped_list, index=cleaned_file_names)

In [24]:
# The frame was built from a plain list, so its only column is labelled 0; rename it to 'Name'
results = results.rename(columns={0: 'Name'})

In [25]:
# Attach the per-file scores as columns. Every list was filled by the file loop
# in the same order as file_names, which is also the row order of `results`.
metric_columns = {
    'Cal KGE': calibration_kge,
    'Val KGE': validation_kge,
    'Cal Bias': calibration_bias,
    'Val Bias': validation_bias,
    'Cal NSE': calibration_nse,
    'Val NSE': validation_nse,
}
for column_name, scores in metric_columns.items():
    results[column_name] = scores




In [26]:
# Convert index to integer
results.index = results.index.astype(int)

# Sort by index in descending order
results = results.sort_index(ascending=False)

# Write the sorted metrics table to a CSV file
results.to_csv('../../model/06_HYPE_Final/hds_results.csv')

In [27]:
# Display the metrics table
results

Unnamed: 0,Name,Cal KGE,Val KGE,Cal Bias,Val Bias,Cal NSE,Val NSE
58643,North Fork Milk River above St Mary Canal near...,0.315279,0.216509,-45.299314,-27.445526,-0.334341,-0.927903
58425,Clear Creek at Mouth,0.703891,0.544418,-19.40254,-12.99051,0.731967,0.29809
58418,Peoples Creek at Mouth,0.36985,0.419567,22.438494,25.5572,0.15131,0.442043
58408,Milk River at Western Crossing of Internationa...,0.527435,0.515259,11.652345,18.708989,-0.029927,0.12452
58398,Rock Creek at Mouth,0.23124,0.042789,25.228457,55.340189,-0.821227,-1.055759
58363,Battle Creek at International Boundary,0.493062,-0.490387,3.994461,115.052228,0.216848,-1.657839
58356,Lodge Creek at International Boundary,0.492969,-0.948379,4.387623,179.888203,0.259506,-0.998681
58346,Big Sandy Creek at Mouth,0.739624,0.361426,13.357942,53.826455,0.541248,0.258987
58328,Beaver Creek Bowdoin,0.746051,0.390538,1.213108,-7.629572,0.510322,0.247899
58308,Milk River at Eastern Crossing,0.423101,0.46708,-11.713524,23.589813,-0.081689,-0.083583


In [28]:
# Index labels of the rows that go into each of the two subsets
stmary_ids = [58213, 58223, 58208]
milk_ids = [58408, 58643, 58308, 58346, 58425, 58356, 58363, 58418, 58290, 58328, 58398]

# Select the rows of each subset by label, in the order listed
stmary = results.loc[stmary_ids]
milk = results.loc[milk_ids]

In [29]:
# Mean KGE over every row of the results table, for each period
basin_cal_kge = results["Cal KGE"].mean()
print(f'Mean Basin Calibration period KGE = {basin_cal_kge}')
basin_val_kge = results["Val KGE"].mean()
print(f'Mean Basin Validation period KGE = {basin_val_kge}')

Mean Basin Calibration period KGE = 0.5509257842643066
Mean Basin Validation period KGE = 0.2536709861855537


In [30]:
# Mean KGE over the rows of the `stmary` subset, for each period
stmary_cal_kge = stmary["Cal KGE"].mean()
print(f'Mean St. Mary Calibration period KGE = {stmary_cal_kge}')
stmary_val_kge = stmary["Val KGE"].mean()
print(f'Mean St. Mary Validation period KGE = {stmary_val_kge}')

Mean St. Mary Calibration period KGE = 0.7056674098715084
Mean St. Mary Validation period KGE = 0.7736563213963286


In [31]:
# Mean KGE over the rows of the `milk` subset, for each period
milk_cal_kge = milk["Cal KGE"].mean()
print(f'Mean Milk Calibration period KGE = {milk_cal_kge}')
milk_val_kge = milk["Val KGE"].mean()
print(f'Mean Milk Validation period KGE = {milk_val_kge}')

Mean Milk Calibration period KGE = 0.5087235227350697
Mean Milk Validation period KGE = 0.11185680385534237
