# Create spreadsheet(s) for:
- Number of impaired days (Current or Scenario - Reference < 0.2)
- Percent volume of Baseline - Scenario < 0.2 mg/l
- Percent volume per run of DO < 2 mg/l
- Percent volume per run of DO < 5 mg/l
Note: the `SSMGrid2_07222022` shapefile version does not have masked regions attributed 

In [1]:
import sys
import os
sys.path.insert(1, '../scripts/')
import xarray
import openpyxl
import contextily as cx 
import yaml
import numpy as np
import pandas
import pathlib
import time
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib as mpl
# load functions from my scripts file "ssm_utils"
from ssm_utils import get_nearest_node, reshape_fvcom, calc_fvcom_stat, extract_fvcom_level

### Readme information

In [None]:
this_file = 

### Define study focus and analysis scope 
- scope: wc for water column or benthic for bottom-level

In [2]:
case = "SOG_NB"
scope= "benthic"

### Load configuration file (yaml) and shapefile

In [3]:
with open('../etc/SSM_config.yaml', 'r') as file:
    ssm = yaml.safe_load(file)
# get shapefile path    
shp = ssm['paths']['shapefile']
# load shapefile into geopandas dataframe
gdf = gpd.read_file(shp)
gdf=gdf.rename(columns={'region_inf':'Regions'})

In [4]:
np_operator='min'
model_var='DOXG'
processed_netcdf_dir = pathlib.Path(ssm['paths']['processed_output'])/model_var
# get list of run sub-directories in processed netcdf directory
dir_list = os.listdir(processed_netcdf_dir)

In [5]:
%%time
MinBottomDO_full={}
MinBottDO={}
for run_dir in dir_list:
    try: 
        run_file=processed_netcdf_dir/run_dir/'bottom'/f'daily_{np_operator}_{model_var}_bottom.nc'
        with xarray.open_dataset(run_file) as ds:
            print([*ds])
            MinBottomDO_full[run_dir]=ds[f'{model_var}_daily_{np_operator}_bottom']
            # Sub-sample basin footprint (from 16012 nodes to 7494)
            MinBottDO[run_dir]=MinBottomDO_full[run_dir][:,gdf['node_id']-1]
            print(MinBottDO[run_dir].shape)
    except FileNotFoundError:
        print(f'File Not Found: {run_file}')

['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
['DOXG_daily_min_bottom']
(361, 4144)
CPU times: user 41.2 ms, sys: 34.6 ms, total: 75.8 ms
Wall time: 341 ms


## Time series for percent volume days
I don't have this right yet

# Below threshold   
Two options:
1. Use DO standard
2. Use given number (e.g. 2 mg/l or 5 mg/l)

In [6]:
def calc_below_threshold(MinBottDO, DO_thresh, shp, dir_list, scope):
    """ 
    MinBottDO [2D]: Min daily bottom DOXG
    DO_thresh [1D or int]: DO_std or integer value
    shp [path]: shapefile path
    dir_list [list]: List of directory names for model output
    scope [string]: "benthic" or "wc" (for water column)
    """
    # Initialize
    DOXGBelowThresh={} # Boolean where DOXG<threshold
    DOXGBelowThreshDays={} # Number of days where DOXGBelowThresh = True
    DaysDOXGBelowThresh={} # Sum of days across regions
    VolumeDays={} # Percent of volume within region where DO<threshold
    PercentVolumeDays={}
    # Define dimension sizes and load shapefile
    [ndays,nnodes]=MinBottDO[dir_list[0]].shape
    gdf = gpd.read_file(shp)
    gdf=gdf.rename(columns={'region_inf':'Regions'})
    regions = gdf[['node_id','Regions']].groupby('Regions').count().index.to_list()
    regions.remove('Other') # These will be removed in future iterations
    if DO_thresh=='DO_standard':
        DO_thresh=gdf['DO_std']
    # Create array of Dissolved Oxygen threshold values 
    if type(DO_thresh)==int:
        DO_thresh2D = np.ones((nnodes,ndays))*DO_thresh
    else:
        # create array of DO_threshold values 
        DO_thresh2D = np.ones((nnodes,ndays))*np.array(DO_thresh).reshape(nnodes,1)
        # (7494,361) x (7494,1) => element-wise multiplication 
    # Calculate volume for volume days
    if scope=='benthic':
        volume = np.asarray(gdf.volume*ssm['siglev_diff'][-1]/100)
    else: # water column
        volume = np.asarray(gdf.volume)
    # Determine DOXGBelowThresh days
    for run_type in dir_list:
        print(run_type)
        # Boolean where DOXG<threshold
        DOXGBelowThresh[run_type] = MinBottDO[run_type]<=DO_thresh2D.transpose() #361x7494 (nodes x time)
        # Number of days where DOXGBelowThresh = True
        DOXGBelowThreshDays[run_type]=DOXGBelowThresh[run_type].sum(axis=0) #7494 (nodes)
        # Volume days
        VolumeDays_all=volume*DOXGBelowThreshDays[run_type]
        # Total number of days and percent volume days for each region
        DaysDOXGBelowThresh[run_type]={}
        VolumeDays[run_type]={}
        PercentVolumeDays[run_type]={}
        for region in regions:
            DaysDOXGBelowThresh[run_type][region]=np.array(DOXGBelowThreshDays[run_type])[
                (gdf['Regions']==region) &
                (gdf['included_i']==1)
            ].sum()
            VolumeDays[run_type][region]=np.array(VolumeDays_all)[
                (gdf['Regions']==region) &
                (gdf['included_i']==1)
            ].sum()
            # get regional volume
            if scope=='benthic': # take fraction for bottom-level volume
                RegionVolume = ssm['siglev_diff'][-1]/100*volume[
                    (gdf['Regions']==region) &
                    (gdf['included_i']==1)
                ].sum()
            else: # water column
                RegionVolume = volume[
                    (gdf['Regions']==region) &
                    (gdf['included_i']==1)
                ].sum()
            PercentVolumeDays[run_type][region]=100*(
                VolumeDays[run_type][region]/(RegionVolume*ndays)
            )
    # Convert to dataframe and organize information
    DaysDOXGBelowThresh_df = pandas.DataFrame(DaysDOXGBelowThresh)
    DaysDOXGBelowThresh_df = DaysDOXGBelowThresh_df.rename(columns=ssm['run_information']['run_tag'])
    DaysDOXGBelowThresh_df = DaysDOXGBelowThresh_df.reindex(columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    # Percent of volume over the year in each region where DOXG change < threshold
    VolumeDays_df = pandas.DataFrame(VolumeDays)
    VolumeDays_df = VolumeDays_df.rename(columns=ssm['run_information']['run_tag'])
    VolumeDays_df = VolumeDays_df.reindex(columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    # Percent of cumulative volume over the year in eash region where DOXG change < threshold
    PercentVolumeDays_df = pandas.DataFrame(PercentVolumeDays)
    PercentVolumeDays_df = PercentVolumeDays_df.rename(columns=ssm['run_information']['run_tag'])
    PercentVolumeDays_df = PercentVolumeDays_df.reindex(columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    return DaysDOXGBelowThresh_df,VolumeDays_df,PercentVolumeDays_df

In [7]:
H={} #DOXG below threshold
VD={} #volume days
PVD={} # percent volume days
H['DO_std'],VD['DO_std'], PVD['DO_std']=calc_below_threshold(
    MinBottDO, 'DO_standard', shp, dir_list, scope)
PVD['DO_std']

wqm_reference
1c_all_sog_riv_off
2b_sog_river_2times
wqm_baseline
2a_sog_river_0.5times
1d_small_sog_wwtp_off
1e_med_sog_wwtp_off
1b_all_sog_wwtp_off


Unnamed: 0,Present Day,Reference,1b,1c,1d,1e,2a,2b
Hood,454.063703,454.063703,454.042225,454.068476,454.063703,454.046376,454.063385,454.06941
Main,383.898909,383.898909,383.893348,383.880562,383.898909,383.895433,383.889856,383.897426
SFJAdmiralty,630.615958,630.615958,630.616514,630.617403,630.615958,630.616514,630.616917,630.612754
SOG_Bellinghham,542.118561,542.118561,542.08428,542.058918,542.115921,542.084186,542.092652,542.17995
SouthSound,288.628467,288.628467,288.587752,288.540903,288.617845,288.60109,288.573305,288.752202
Whidbey,280.255175,280.255175,280.243481,280.137338,280.251345,280.243481,280.222946,280.377901


In [8]:
DO_thresh=2
H['2'], VD['2'], PVD['2']=calc_below_threshold(MinBottDO, DO_thresh, shp, dir_list, scope)
PVD['2']

wqm_reference
1c_all_sog_riv_off
2b_sog_river_2times
wqm_baseline
2a_sog_river_0.5times
1d_small_sog_wwtp_off
1e_med_sog_wwtp_off
1b_all_sog_wwtp_off


Unnamed: 0,Present Day,Reference,1b,1c,1d,1e,2a,2b
Hood,25.883866,25.883866,25.869596,25.782393,25.879261,25.869596,25.833279,26.020818
Main,0.000923,0.000923,0.000923,0.000923,0.000923,0.000923,0.000923,0.000961
SFJAdmiralty,0.114259,0.114259,0.114259,0.113741,0.114259,0.114259,0.114203,0.114788
SOG_Bellinghham,0.082518,0.082518,0.078829,0.070233,0.082518,0.079146,0.076282,0.101802
SouthSound,0.092175,0.092175,0.092175,0.091075,0.092175,0.092175,0.092175,0.094868
Whidbey,0.517753,0.517753,0.514735,0.511743,0.517753,0.514735,0.514735,0.518408


In [9]:
DO_thresh=5
H['5'], VD['5'], PVD['5']=calc_below_threshold(MinBottDO, DO_thresh, shp, dir_list, scope)
PVD['5']

wqm_reference
1c_all_sog_riv_off
2b_sog_river_2times
wqm_baseline
2a_sog_river_0.5times
1d_small_sog_wwtp_off
1e_med_sog_wwtp_off
1b_all_sog_wwtp_off


Unnamed: 0,Present Day,Reference,1b,1c,1d,1e,2a,2b
Hood,268.021452,268.021452,268.004667,267.903235,268.021452,268.014336,267.946945,268.240618
Main,48.721489,48.721489,48.679137,48.476394,48.712006,48.691887,48.601443,48.950492
SFJAdmiralty,472.039958,472.039958,472.020053,472.032239,472.034011,472.020053,472.043236,472.070941
SOG_Bellinghham,266.285418,266.285418,266.2301,266.104544,266.272645,266.241092,266.157983,266.43683
SouthSound,29.610397,29.610397,29.593254,29.468189,29.606127,29.593254,29.527573,29.72558
Whidbey,122.744882,122.744882,122.726215,122.514715,122.738812,122.726215,122.64215,122.987787


## Write to two files, one each for: 
- Impaired
- Hypoxic

In [10]:
excel_output_path = pathlib.Path(ssm['paths']['processed_output'])/case
if os.path.exists(excel_output_path)==False:
        print(f'creating: {excel_output_path}')
        os.umask(0) #clears permissions
        os.makedirs(excel_output_path, mode=0o777,exist_ok=True)
with pandas.ExcelWriter(excel_output_path/f'{case}_{scope}_DO<DOstd.xlsx', mode='w') as writer:  
    H['DO_std'].to_excel(writer, sheet_name='Number_of_Days')
    VD['DO_std'].to_excel(writer, sheet_name='Volume_Days')
    PVD['DO_std'].to_excel(writer, sheet_name='Percent_Volume_Days')
with pandas.ExcelWriter(excel_output_path/f'{case}_{scope}_DO<2.xlsx', mode='w') as writer:  
    H['2'].to_excel(writer, sheet_name='Number_of_Days')
    VD['2'].to_excel(writer, sheet_name='Volume_Days')
    PVD['2'].to_excel(writer, sheet_name='Percent_Volume_Days')
with pandas.ExcelWriter(excel_output_path/f'{case}_{scope}_DO<5.xlsx', mode='w') as writer:  
    H['5'].to_excel(writer, sheet_name='Number_of_Days')
    VD['5'].to_excel(writer, sheet_name='Volume_Days')
    PVD['5'].to_excel(writer, sheet_name='Percent_Volume_Days')

### Impaired
As defined by the Optimization Scenario Report Appendix F, version(?) B (add link/definition)

In [21]:
def calc_impaired(shp, scope, impairment=-0.2):
    """ 
    """
    # Initialize dictionaries
    MinDO_full={} # Min, daily DO over all nodes
    MinDO={} # Min, daily DO over all nodes in shapefile
    DO_diff_lt_0p2={} # Boolean where DO<threshold
    DO_diff_lt_0p2_days={} # Number of days where DOBelowThresh = True
    DaysImpaired={} # Sum of days across regions
    VolumeDaysImpaired={} # Percent of volume within region where DO<threshold
    PercentVolumeDaysImpaired={}
     
    # Define dimension sizes and load shapefile
    gdf = gpd.read_file(shp)
    gdf = gdf.rename(columns={'region_inf':'Regions'})
    regions = gdf[['node_id','Regions']].groupby('Regions').count().index.to_list()
    regions.remove('Other')
    
    [Nnodes,nattrs]=gdf.shape
    Nlevels=10
    # Calculate volume for volume days
    if scope=='benthic':
        volume = np.asarray(gdf.volume*ssm['siglev_diff'][-1]/100) # just the bottom level
    else: # water column
        volume = np.asarray(gdf.volume)
        depth_fraction = np.array(ssm['siglev_diff'])/100
        volume2D = np.dot(volume.reshape(Nnodes,1),depth_fraction.reshape(1,Nlevels))
    
    # Get path for model output
    model_var='DOXG' 
    processed_netcdf_dir = pathlib.Path(ssm['paths']['processed_output'])/model_var
    
    # Get list of run sub-directories in processed netcdf directory
    dir_list = os.listdir(processed_netcdf_dir)
    
    # Load all runs   
    if scope=='benthic':
        for run_dir in dir_list:
            try: 
                run_file=processed_netcdf_dir/run_dir/'bottom'/f'daily_min_{model_var}_bottom.nc'
                with xarray.open_dataset(run_file) as ds:
                    print([*ds])
                    MinDO_full[run_dir]=ds[f'{model_var}_daily_min_bottom']
                    # Sub-sample nodes (from 16012 nodes to 7494)
                    MinDO[run_dir]=MinDO_full[run_dir][:,gdf['node_id']-1]
                    print(MinDO[run_dir].shape)
            except FileNotFoundError:
                print(f'File Not Found: {run_file}')
            if run_dir == dir_list[0]:
                # Get number of days and nodes
                [ndays,nnodes]=MinDO[run_dir].shape
    else: # water column (with 10 levels)
        for run_dir in dir_list:
            try: 
                run_file=processed_netcdf_dir/run_dir/f'daily_min_{model_var}.nc'
                with xarray.open_dataset(run_file) as ds:
                    print([*ds])
                    MinDO_full[run_dir]=ds[f'{model_var}_daily_min']
                    # Sub-sample nodes (from 16012 nodes to 7494)
                    MinDO[run_dir]=MinDO_full[run_dir][:,:,gdf['node_id']-1]
                    print(MinDO[run_dir].shape)
            except FileNotFoundError:
                print(f'File Not Found: {run_file}')
            if run_dir == dir_list[0]:
                # Get number of days and nodes
                [ndays,nlevels,nnodes]=MinDO[run_dir].shape

    # Define reference run
    reference = ssm['run_information']['reference']
    dir_list.remove('wqm_reference')
    
    # Loop through all non-reference runs and calculate impairment
    for run_type in dir_list:
        print(f'Calculating difference for {run_type}')
        # Create array of Dissolved Oxygen threshold values 
        DO_diff = MinDO[run_type] - MinDO[reference]
        # Boolean where DO_diff<0.2
        DO_diff_lt_0p2[run_type] = DO_diff<=impairment #361x4144 (nodes x time) or 361x10x4144
        # Number of days where DO < threshold = True
        if scope=='benthic':
            DO_diff_lt_0p2_days[run_type]=DO_diff_lt_0p2[run_type].sum(axis=0) #4144 (nodes) or 10x4144
            VolumeDays_all=volume*DO_diff_lt_0p2_days[run_type]
        else: # water column: sum over days and take max value over depth
            # First get a count of days impaired for each depth level
            DO_diff_lt_0p2_days_wc=DO_diff_lt_0p2[run_type].sum(axis=0)
            # Take max over depth to get result similar to benthic case where
            # 1-day of impairement is counted if there is one or more levels impaired
            DO_diff_lt_0p2_days[run_type]=DO_diff_lt_0p2[run_type].sum(axis=0).max(axis=0)
            # Volume days: Use days impaired for each level  and element-wise 
            # multiplication of 10x4144 * 10x4144 matrices to get volume days by level
            VolumeDays_wc=volume*DO_diff_lt_0p2_days_wc
            # Add across levels to get total VolumeDays per node
            VolumeDays_all = VolumeDays_wc.sum(axis=0)
        
        # Total number of days and percent volume days for each region
        DaysImpaired[run_type]={}
        VolumeDaysImpaired[run_type]={}
        PercentVolumeDaysImpaired[run_type]={}
        for region in regions:
            DaysImpaired[run_type][region]=np.array(DO_diff_lt_0p2_days[run_type])[
                (gdf['Regions']==region) &
                (gdf['included_i']==1)
            ].sum()
            VolumeDaysImpaired[run_type][region]=np.array(VolumeDays_all)[
                (gdf['Regions']==region) &
                (gdf['included_i']==1)
            ].sum()
            # get regional volume
            if scope=='benthic': # take fraction for bottom-level volume
                RegionVolume = ssm['siglev_diff'][-1]/100*volume[
                    (gdf['Regions']==region) &
                    (gdf['included_i']==1)
                ].sum()
            else: # water column
                RegionVolume = volume[
                    (gdf['Regions']==region) &
                    (gdf['included_i']==1)
                ].sum()
            PercentVolumeDaysImpaired[run_type][region]=100*(
                VolumeDaysImpaired[run_type][region]/(RegionVolume*ndays)
            )
    # Convert to dataframe and organize information
    DaysImpaired_df = pandas.DataFrame(DaysImpaired)
    DaysImpaired_df = DaysImpaired_df.rename(columns=ssm['run_information']['run_tag'])
    DaysImpaired_df = DaysImpaired_df.reindex(columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    # Percent of volume over the year in each region where DO change < threshold
    VolumeDaysImpaired_df = pandas.DataFrame(VolumeDaysImpaired)
    VolumeDaysImpaired_df = VolumeDaysImpaired_df.rename(
        columns=ssm['run_information']['run_tag'])
    VolumeDaysImpaired_df = VolumeDaysImpaired_df.reindex(
        columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    # Percent of cumulative volume over the year in eash region where DO change < threshold
    PercentVolumeDaysImpaired_df = pandas.DataFrame(PercentVolumeDaysImpaired)
    PercentVolumeDaysImpaired_df = PercentVolumeDaysImpaired_df.rename(
        columns=ssm['run_information']['run_tag'])
    PercentVolumeDaysImpaired_df = PercentVolumeDaysImpaired_df.reindex(
        columns=['Present Day','Reference','1b','1c','1d','1e','2a','2b'])
    
    return DaysImpaired_df,VolumeDaysImpaired_df,PercentVolumeDaysImpaired_df

In [22]:
%%time
scope = 'wc'
DaysImpaired_df,VolumeDays_df,PercentVolumeDays_df = calc_impaired(shp, scope)

['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
['DOXG_daily_min']
(361, 10, 4144)
Calculating difference for 1c_all_sog_riv_off
Calculating difference for 2b_sog_river_2times
Calculating difference for wqm_baseline
Calculating difference for 2a_sog_river_0.5times
Calculating difference for 1d_small_sog_wwtp_off
Calculating difference for 1e_med_sog_wwtp_off
Calculating difference for 1b_all_sog_wwtp_off
CPU times: user 29.8 s, sys: 17min 21s, total: 17min 51s
Wall time: 18min 1s


In [23]:
excel_output_path = pathlib.Path(ssm['paths']['processed_output'])/case   
if os.path.exists(excel_output_path)==False:
        print(f'creating: {excel_output_path}')
        os.umask(0) #clears permissions
        os.makedirs(excel_output_path, mode=0o777,exist_ok=True)
with pandas.ExcelWriter(excel_output_path/f'{case}_{scope}_impaired.xlsx', mode='w') as writer:  
    DaysImpaired_df.to_excel(writer, sheet_name='Impaired_Days')
    VolumeDays_df.to_excel(writer, sheet_name='Volume_Days')
    PercentVolumeDays_df.to_excel(writer, sheet_name='Percent_Volume_Days')

## Test water columns calculation

In [None]:
benthic_file=processed_netcdf_dir/run_dir/'bottom'/f'daily_{np_operator}_{model_var}_bottom.nc'
wc_file = processed_netcdf_dir/run_dir/f'daily_{np_operator}_{model_var}.nc'

In [None]:
with xarray.open_dataset(benthic_file) as ds:
    print([*ds])
    MinDO_full=ds[f'{model_var}_daily_{np_operator}_bottom']
    # Sub-sample basin footprint (from 16012 nodes to 7494)
    MinDO=MinDO_full[:,gdf['node_id']-1]

In [None]:
with xarray.open_dataset(wc_file) as ds:
    print([*ds])
    MinDO_wc=ds[f'{model_var}_daily_{np_operator}']
    MinDO=MinDO_wc[:,:,gdf['node_id']-1]

In [None]:
MinDO_full

In [None]:
MinDO

In [None]:
volume = np.asarray(gdf.volume)
volume.reshape(4144,1).shape

In [None]:
depth_fraction = np.array(ssm['siglev_diff'])/100
depth_fraction.reshape(1,10).shape

In [None]:
volume2D = np.dot(volume.reshape(4144,1),depth_fraction.reshape(1,10))

In [None]:
volume2D[0,:]

In [None]:
volume[0]*np.array(ssm['siglev_diff'])/100

In [None]:
np.sum(volume[0]*np.array(ssm['siglev_diff'])/100) - volume[0]

In [None]:
volume*np.array(ssm['siglev_diff'])/100