In [1]:
import os
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def find_non_zero_files(wb_dir, max_lines=200, subbasin_id=None):
    
    """
    Checks all HYPE water balance files for non-zero and non-NaN values. Returns a list of files with values. 
    Max_lines can be used to reduce computational demand. Can check the full file or just for a given subbasin (useful to
    confirm regionalization)

    Parameters:
    wb_dir (str): The directory path where the files are stored.
    max_lines (int, optional): The maximum number of lines to read from each file for performance optimization. Default is 200.
    subbasin_id (int or str, optional): A specific subbasin ID to check for non-zero values in the file. 
                                        If provided, the function will only check the column corresponding to this subbasin ID.

    Returns:
    list: A list of filenames that contain non-zero, non-NaN values in the specified subbasin column or the entire file.
    """
        
    non_zero_files = []
    
    
    # Iterate through each file in the directory
    for filename in os.listdir(wb_dir):
        if filename.startswith('WB') and filename.endswith('.txt'):
            file_path = os.path.join(wb_dir, filename)
            
            # Load only the first 'max_lines' lines into a DataFrame
            try:
                df = pd.read_csv(file_path, index_col=0, sep='\t', nrows=max_lines)
                
                # Convert only numeric headers to integers, ignore non-numeric ones
                try:
                    df.columns = pd.to_numeric(df.columns, errors='coerce').astype('Int64')  # 'Int64' to allow NaN for non-convertible columns
                except Exception as e:
                    print(f"Error converting headers to integers in {filename}: {e}")
                
                # If subbasin_id is provided, convert it to an integer
                if subbasin_id is not None:
                    try:
                        subbasin_id = int(subbasin_id)
                    except ValueError:
                        print(f"Subbasin ID {subbasin_id} is not a valid integer.")
                        continue
                
                # If subbasin_id is provided and found in columns, filter the DataFrame
                if subbasin_id in df.columns:
                    df = df[[subbasin_id]]
                
                # Check if there are any non-zero and non-NaN values in the DataFrame
                if ((df != 0) & df.notna()).any().any():
                    non_zero_files.append(filename)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    
    return non_zero_files

In [3]:
def find_non_matching_strings(list1, list2):
    """
    Finds and returns a list of strings that do not match between two lists.

    Parameters:
    list1 (list): The first list of strings to compare.
    list2 (list): The second list of strings to compare.

    Returns:
    list: A list of strings that are present in either list1 or list2, but not in both.
    """
    # Convert lists to sets for efficient comparison
    set1 = set(list1)
    set2 = set(list2)
    
    # Find strings that are in list1 but not in list2, and vice versa
    non_matching_from_list1 = set1 - set2
    non_matching_from_list2 = set2 - set1
    
    # Combine the non-matching strings from both lists
    non_matching = list(non_matching_from_list1.union(non_matching_from_list2))
    
    return non_matching

In [4]:
def read_non_zero_files(file_list):
    """
    Reads multiple files into an xarray DataArray from a provided list of filenames.

    Parameters:
    file_list (list): A list of file paths to read.

    Returns:
    xarray.DataArray: An xarray DataArray containing the data from all specified files.
    """
    try:
        # Read all files into a single DataArray
        data_array = xr.open_mfdataset(file_list, combine='by_coords')
        return data_array
    except Exception as e:
        print(f"Error reading files: {e}")
        return None

In [5]:
def read_wb_files(file_list, wb_dir, start_date, end_date):
    """
    Reads multiple text files into a single xarray DataArray with an additional dimension for filenames. Filter to only include specified date range and return a WBs DataArray
    with the full range to include initial conditions as well as a WBf DataArray containing the range except for the first date.

    Parameters:
    file_list (list): A list of filenames to append to the directory path.
    wb_dir (str): The base directory path.
    start_date (str): The start date for filtering the data.
    end_date (str): The end date for filtering the data.

    Returns:
    tuple: A tuple containing two xarray DataArrays:
        - filtered_wbs_components: DataArray containing data within the date range.
        - filtered_wbf_components: DataArray containing data without the first date.
    """
    # Create an empty list to store DataArrays and filenames
    ensemble_member_list = []
    filenames = []

    # Create full file paths
    full_file_paths = [os.path.join(wb_dir, filename) for filename in file_list]
    
    # Read each file and store in the ensemble_member_list
    for path in full_file_paths:
        try:
            # Determine if the filename starts with 'WBs' to skip the first line
            skip_first_line = os.path.basename(path).startswith('WBs')
            # Read the file into a pandas DataFrame normally
            df = pd.read_csv(path, sep='\t', index_col=0)  # Adjust the separator and index column as needed
            
            # Drop the first row if the filename starts with 'WBs'
            if os.path.basename(path).startswith('WBs'):
                df = df.drop(df.index[0])  # Drop the first row
            
            # Convert the index to datetime and ensure subbasin columns are integers
            df.index = pd.to_datetime(df.index)  # Convert DATE index to datetime
            df.columns = df.columns.astype(int)  # Convert subbasin columns to integers
            
            # Ensure all values are numeric (this may convert non-numeric entries to NaN)
            df = df.apply(pd.to_numeric, errors='coerce')
            
            # Create an xarray DataArray for the current file
            data_array = xr.DataArray(
                df.values,
                dims=('DATE', 'subbasin'),
                coords={'DATE': df.index,
                        'subbasin': df.columns}
            )
            
            # Assign the filename as an additional dimension
            data_array = data_array.expand_dims(wb=1)  # Create a new dimension for the filename
            data_array.attrs['filename'] = os.path.basename(path)  # Optionally store the filename in the attributes

            # Add the DataArray to the ensemble member list
            ensemble_member_list.append(data_array)
            filenames.append(os.path.basename(path).split('.')[0])  # Store the filename without extension
            
        except Exception as e:
            print(f"Error reading {path}: {e}")

    # Concatenate all DataArrays along the new 'wb' dimension
    if ensemble_member_list:
        combined_data_array = xr.concat(ensemble_member_list, dim='wb')  # Concatenate along the wb dimension
        combined_data_array.coords['wb'] = filenames  # Assign the filenames to the new dimension
        
        # Filter the DataArray to include only the specified date range
        filtered_wbs_components = combined_data_array.sel(DATE=slice(start_date, end_date))
        
        # Exclude the first date from the filtered_wbs_components
        first_date = pd.to_datetime(start_date)
        filtered_wbf_components = combined_data_array.sel(DATE=slice(first_date + pd.Timedelta(days=1), end_date))

        return filtered_wbs_components, filtered_wbf_components
    
    return None


In [6]:
def precipitation(filtered_wbf_components):
    """
    Filters the DataArray for wb entries that start with 'WBf_rain' and 'WBf_snowfall',
    sums the rainfall and snowfall components, and returns the total precipitation.

    Parameters:
    filtered_wbf_components (xarray.DataArray): The DataArray filtered by date.

    Returns:
    xarray.DataArray: Total precipitation (rainfall + snowfall) summed over all 'wb' entries.
    """
    # Filter for wb entries that start with 'WBf_rain'
    rainfall = filtered_wbf_components.sel(wb=filtered_wbf_components.wb.str.startswith('WBf_rain'))
    rainfall_sum = rainfall.sum(dim='wb')
    
    # Filter for wb entries that start with 'WBf_snowfall'
    snowfall = filtered_wbf_components.sel(wb=filtered_wbf_components.wb.str.startswith('WBf_snowfall'))
    snowfall_sum = snowfall.sum(dim='wb')
    
    # Calculate the total precipitation (rainfall + snowfall)
    total_precip = rainfall_sum + snowfall_sum
    
    rainfall_frac= rainfall_sum/total_precip
    snowfall_frac= snowfall_sum/total_precip
    
    print(f'Full Domain Rainfall Fraction: {rainfall_frac.mean()} \n'
    f'Full Domain Rainfall Fraction {snowfall_frac.mean()}')
    
    return total_precip

In [7]:
def calculate_wbf(filtered_wbf_components, subbasin, file_name):
    """
    Filters the DataArray for wb entries that start with 'WBf_satsurfaceflow_soillayer1_lstream',
    selects the specified subbasin, and returns the summed saturated surface runoff for that subbasin.

    Parameters:
    filtered_wbf_components (xarray.DataArray): The DataArray filtered by date.
    subbasin (int): The specific subbasin to select.

    Returns:
    xarray.DataArray: Summed saturated surface runoff for the specified subbasin.
    """
    # Filter for wb entries that start with requited name
    wbf = filtered_wbf_components.sel(wb=filtered_wbf_components.wb.str.startswith(file_name))
    
    # Select the specified subbasin and sum the values
    wbf = wbf.sel(subbasin=subbasin).sum()
    
    return wbf.values

In [8]:
def calculate_wbs(filtered_wbs_components, subbasin, file_name, start_date, end_date):
    """
    Filters the DataArray for wbs entries,
    selects the specified subbasin, and returns the change in storage for that subbasin.

    Parameters:
    filtered_wbs_components (xarray.DataArray): The DataArray filtered by date.
    subbasin (int): The specific subbasin to select.

    Returns:
    xarray.DataArray: Summed saturated surface runoff for the specified subbasin.
    """
    # Filter for wb entries that start with requited name
    wbs = filtered_wbs_components.sel(wb=filtered_wbf_components.wb.str.startswith(file_name))

    # extract changes in storage
    wbs= wbs.sel(subbasin=subbasin)
    wbs= wbs.sel(DATE= end_date).values - wbs.sel(DATE= start_date).values
    
    return wbs

### Inputs

In [9]:
# directory containing wb outputs
wb_dir= '../../model/model_versions/v_6/v_6_2/v6_2_waterbal/'

# date range for wb calculations, set to sometime after the first year in the model run to reduce effects of initial conditions
start_date = '1997-04-01'
end_date = '2005-09-30'

subbasin= 58363

### Find Non-Zero WB Components

In [10]:
milk_non_zero_files = find_non_zero_files(wb_dir, subbasin_id=None)

In [11]:
stm_non_zero_files= find_non_zero_files(wb_dir, subbasin_id=58208)

In [12]:
# find WB components that are present in the milk but not the St Mary
non_matching_strings = find_non_matching_strings(stm_non_zero_files, milk_non_zero_files)
print(non_matching_strings)

['WBf_precipitation__iwet.txt', 'WBf_evaporation_ilake_.txt', 'WBs_iwet.txt', 'WBf_flow_ilake_mriver.txt', 'WBf_snowmelt_via_macropore_snow_soillayer1.txt', 'WBf_precipitation__ilake.txt', 'WBf_flow_lstream_ilake.txt', 'WBs_ilake.txt', 'WBf_evaporation_iwet_.txt']


# Land Class Water Balance

In [13]:
# Read WB files into DataArray
filtered_wbs_components, filtered_wbf_components = read_wb_files(milk_non_zero_files, wb_dir, start_date, end_date)

In [14]:
print(filtered_wbf_components.wb.values)

['WBf_evaporation_ilake_' 'WBf_evaporation_iwet_' 'WBf_evaporation_snow_'
 'WBf_evaporation_soillayer1_' 'WBf_evaporation_soillayer2_'
 'WBf_flow_ilake_mriver' 'WBf_flow_lstream_ilake'
 'WBf_flow_lstream_mriver' 'WBf_flow_mriver_olake'
 'WBf_flow_olake_mriver_maindownstream'
 'WBf_percolation_soillayer1_soillayer2'
 'WBf_percolation_soillayer2_soillayer3' 'WBf_precipitation__ilake'
 'WBf_precipitation__iwet' 'WBf_rain_surfacerunoff__lstream'
 'WBf_rain_via_macropore__soillayer1' 'WBf_rain_via_macropore__soillayer3'
 'WBf_rain__soillayer1' 'WBf_satsurfaceflow_soillayer1_lstream'
 'WBf_snowfall__snow' 'WBf_snowmelt_snow_soillayer1'
 'WBf_snowmelt_surfacerunoff_snow_lstream'
 'WBf_snowmelt_via_macropore_snow_soillayer1'
 'WBf_snowmelt_via_macropore_snow_soillayer3'
 'WBf_soilrunoff_soillayer1_lstream' 'WBf_soilrunoff_soillayer2_lstream'
 'WBf_soilrunoff_soillayer3_lstream' 'WBs_ilake' 'WBs_iwet' 'WBs_lstream'
 'WBs_mriver' 'WBs_snow' 'WBs_soillayer1' 'WBs_soillayer2'
 'WBs_soillayer3']


## Components

### Partitioning

In [15]:
total_precip = precipitation(filtered_wbf_components)

Full Domain Rainfall Fraction: <xarray.DataArray ()>
array(0.6644147) 
Full Domain Rainfall Fraction <xarray.DataArray ()>
array(0.3355853)


### Land Class Fluxes

In [16]:
# find precipitation flux
precip_landclass= total_precip.sel(subbasin=subbasin).sum().values

# find runoff fluxes
sat_surface_runoff= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_satsurfaceflow_soillayer1_lstream')

# note this is considerered as part of the rain input so it will be subtracted twice to consider it as an output
rain_runoff= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_rain_surfacerunoff__lstream')

snowmelt_runoff = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_snowmelt_surfacerunoff_snow_lstream')

soil_layer1_runoff = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_soilrunoff_soillayer1_lstream')

soil_layer2_runoff = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_soilrunoff_soillayer2_lstream')

soil_layer3_runoff = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_soilrunoff_soillayer3_lstream')

# find et fluxes
et_sl1 = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_evaporation_soillayer1')

et_sl2 = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_evaporation_soillayer2')

sublimation = calculate_wbf(filtered_wbf_components, subbasin, 'WBf_evaporation_snow')

### Land Class Storages

In [17]:
# calculate changes in storage
delta_stor_sl1= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_soillayer1', start_date, end_date)

delta_stor_sl2= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_soillayer2', start_date, end_date)

delta_stor_sl3= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_soillayer3', start_date, end_date)

delta_stor_snow= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_snow', start_date, end_date)

#### Adjust Snow Storage

In [18]:
# find vertical snow fluxes within each land class 
snowmelt_macro_sl1=  calculate_wbf(filtered_wbf_components, subbasin,'WBf_snowmelt_via_macropore_snow_soillayer1')

snowmelt_macro_sl2=  calculate_wbf(filtered_wbf_components, subbasin,'WBf_snowmelt_via_macropore_snow_soillayer2')

snowmelt_macro_sl3=  calculate_wbf(filtered_wbf_components, subbasin,'WBf_snowmelt_via_macropore_snow_soillayer3')

snowmelt_infiltration_sl1=  calculate_wbf(filtered_wbf_components, subbasin,'WBf_WBf_snowmelt_snow_soillayer1.txt')

# add back vertical snow changes to only consider horizontal fluxes
delta_stor_snow_subbasin= delta_stor_snow + snowmelt_runoff + snowmelt_macro_sl1 + snowmelt_macro_sl2 + snowmelt_macro_sl3 + snowmelt_infiltration_sl1

# Results

In [19]:
# precipitation
p= precip_landclass

# evapotranspiration
e= et_sl1 + et_sl2 + sublimation

# runoff from land class 
r=  sat_surface_runoff + 2 * rain_runoff + snowmelt_runoff + soil_layer1_runoff + soil_layer2_runoff + soil_layer3_runoff

# change in storage
delta_s= delta_stor_sl1 + delta_stor_sl2 + delta_stor_sl3 + delta_stor_snow

print(f'Precipitation Flux= {p}\n'
f'Evapotranspiration Flux= {e}\n'
f'Runoff to Local Stream= {r}\n'
f'Change in Storage= {delta_s}')

Precipitation Flux= 1029198457.1181327
Evapotranspiration Flux= 1019353482.5522885
Runoff to Local Stream= 34991466.838923424
Change in Storage= [-21975882.639648]


In [20]:
# calculate the water balance
balance= p -e - r - delta_s

In [21]:
print(f'Land Class WB Error= {balance}\n'
     f'Land Class % Error= {np.abs(balance/delta_s)}')

Land Class WB Error= [-3170609.63343124]
Land Class % Error= [0.14427678]


# Subbasin Water Balance

### Internal Wetland

In [22]:
# find precipitation into iwet
precip_iwet= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_precipitation__iwet')

# find et from iwet
evap_iwet= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_evaporation_iwet')

# find change in storage from iwet
delta_stor_iwet= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_iwet', start_date, end_date)

### Internal Lakes (Potholes)

In [23]:
# find precipitation into ilake
precip_ilake= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_precipitation__ilake')

# find inflow to ilake
inflow_ilake= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_flow_lstream_ilake')

# find evap ilake
evap_ilake= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_evaporation_ilake')

# find outflow ilake
outflow_ilake= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_flow_ilake_mriver')

# find change in storage ilake
delta_stor_ilake= calculate_wbs(filtered_wbs_components, subbasin, 'WBs_ilake', start_date, end_date)

In [24]:
ilake_wb= precip_ilake + inflow_ilake - evap_ilake - outflow_ilake - delta_stor_ilake

In [25]:
print(f'ilake WB Error= {ilake_wb}')

ilake WB Error= [5.80976879]


# Runoff Ratio

In [26]:
# find outlflow from subbasin to main river
lstream_mriver= calculate_wbf(filtered_wbf_components, subbasin, 'WBf_flow_lstream_mriver')

# total subbasin runoff is sum of ilake outflow and lstream straight to mriver
subbasin_runoff= lstream_mriver + outflow_ilake

In [27]:
# calculate the runoff ratio
total_subbasin_precip= precip_landclass + precip_ilake + precip_iwet

runoff_ratio= subbasin_runoff/total_subbasin_precip * 100

In [28]:
print(f'Total Precipitation= {total_subbasin_precip}\n'
f'Total Runoff= {subbasin_runoff}\n'
f'Runoff Ratio (%) for Subbasin: {subbasin} = {runoff_ratio}')

Total Precipitation= 1148637649.7482162
Total Runoff= 14131369.238403074
Runoff Ratio (%) for Subbasin: 58363 = 1.2302721612425556


# Results Dataframe

In [68]:
# Creating data for the DataFrame
data = {
    'Precipitation': {
        'Landclass Precipitation (%)': precip_landclass/total_subbasin_precip,
        'Pothole Precipitation (%)': precip_ilake/total_subbasin_precip,
        'Total Precipitation (m3)': total_subbasin_precip,
    },
    'Evapotranspiration (fraction of landclass precip)': {
        'ET SL 1': et_sl1 / precip_landclass,
        'ET SL 2': et_sl2 / precip_landclass,
        'Sublimation': sublimation / precip_landclass,
        'Total ET Flux': (et_sl1 + et_sl2 + sublimation) / precip_landclass
    },
    'Runoff (fraction of landclass precip)': {
        'Surface Runoff': (sat_surface_runoff + rain_runoff + snowmelt_runoff) / precip_landclass,
        'SL 1 Flow': soil_layer1_runoff / precip_landclass,
        'SL 2 Flow': soil_layer2_runoff / precip_landclass,
        'SL 3 Flow': soil_layer3_runoff / precip_landclass,
        'Total Runoff Flux': (sat_surface_runoff + 2 * rain_runoff + snowmelt_runoff + soil_layer1_runoff + soil_layer2_runoff + soil_layer3_runoff) / precip_landclass
    },
    'Change in Storage (fraction of landclass precip)': {
        'Change in Storage SL1': delta_stor_sl1 / precip_landclass,
        'Change in Storage SL2': delta_stor_sl2 / precip_landclass,
        'Change in Storage SL3': delta_stor_sl3 / precip_landclass,
        'Storage Change': (delta_stor_sl1 + delta_stor_sl2 + delta_stor_sl3) / precip_landclass
    },
    'Wetlands': {
        'Inflow (fraction of landclass precip)': inflow_ilake/precip_landclass,
        'Evaporation (fraction of inputs)': evap_ilake/(inflow_ilake + precip_ilake),
        'Outflow (fraction of inputs)': outflow_ilake/(inflow_ilake + precip_ilake),
        'Change in Storage (fraction of inputs)': delta_stor_ilake/(inflow_ilake + precip_ilake)
    },
    'Prairie Potholes': {
        'Inflow (fraction of landclass precip)': inflow_ilake/precip_landclass,
        'Evaporation (fraction of inputs)': evap_ilake/(inflow_ilake + precip_ilake),
        'Outflow (fraction of inputs)': outflow_ilake/(inflow_ilake + precip_ilake),
        'Change in Storage (fraction of inputs)': delta_stor_ilake/(inflow_ilake + precip_ilake)
    },
        'Main River (fraction of total precip)': { 
        'Flow from Local Stream (fraction of total runoff)': lstream_mriver/ (lstream_mriver + outflow_ilake),
        'Flow from Prairie Potholes (fraction of total runoff)': outflow_ilake/ (lstream_mriver + outflow_ilake),  
        'Runoff Ratio': runoff_ratio,
        'Total ET': et_sl1 + et_sl2 + sublimation + evap_ilake + evap_iwet,
        'Total Change in Storage': delta_stor_sl1 + delta_stor_sl2 + delta_stor_sl3 + delta_stor_ilake + delta_stor_iwet,
        'Total Runoff (m3)': (sat_surface_runoff + rain_runoff + snowmelt_runoff)
    }
}

# Convert dictionary to multi-index DataFrame
df = pd.concat({k: pd.DataFrame(v, index=[0]) for k, v in data.items()}, axis=1)

In [69]:
# Set MultiIndex for clearer separation
df.columns = pd.MultiIndex.from_tuples(
    [(category, var) for category, variables in data.items() for var in variables.keys()]
)

In [70]:
df

Unnamed: 0_level_0,Precipitation,Precipitation,Precipitation,Evapotranspiration (fraction of landclass precip),Evapotranspiration (fraction of landclass precip),Evapotranspiration (fraction of landclass precip),Evapotranspiration (fraction of landclass precip),Runoff (fraction of landclass precip),Runoff (fraction of landclass precip),Runoff (fraction of landclass precip),...,Prairie Potholes,Prairie Potholes,Prairie Potholes,Prairie Potholes,Main River (fraction of total precip),Main River (fraction of total precip),Main River (fraction of total precip),Main River (fraction of total precip),Main River (fraction of total precip),Main River (fraction of total precip)
Unnamed: 0_level_1,Landclass Precipitation (%),Pothole Precipitation (%),Total Precipitation (m3),ET SL 1,ET SL 2,Sublimation,Total ET Flux,Surface Runoff,SL 1 Flow,SL 2 Flow,...,Inflow (fraction of landclass precip),Evaporation (fraction of inputs),Outflow (fraction of inputs),Change in Storage (fraction of inputs),Flow from Local Stream (fraction of total runoff),Flow from Prairie Potholes (fraction of total runoff),Runoff Ratio,Total ET,Total Change in Storage,Total Runoff (m3)
0,0.896017,0.103983,1148638000.0,0.632836,0.261003,0.096595,0.990434,0.009746,0.001759,8.6e-05,...,0.032045,0.948834,0.092713,-0.041547,5.799506e-09,1.0,1.230272,1163975000.0,-28306500.0,10031080.0
