# Post-Event Dashboard 1

This dashboard uses HoloViews to link visualizations and explore the data. This example queries all reference times within a single event period and returns time series characteristics (e.g., peak and time to peak) for each individual reference time (i.e., results are grouped by reference time). The goal of this dashboard is to facilitate qualitative exploration of trends and agreement in the time series characteristics for user-selected subsets based on attributes.

### First load necessary packages 

In [1]:
import teehr.queries.duckdb as tqd
import dashboard_utils as du
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import panel as pn
import holoviews as hv
from holoviews.element import tiles
import geoviews as gv
import hvplot
import hvplot.pandas
import colorcet as cc
import pprint
pn.extension()
hv.extension('bokeh', logo=False)
gv.extension('bokeh', logo=False)

import importlib

## Then specify the event you want to evaluate (named when loading the data)

In [2]:
# Choose the config file from the working directory: a path containing a
# 'jovyan' component selects 'teehrhub_config.json', anything else the local one.
working_dir = Path().absolute()
config_file = 'teehrhub_config.json' if 'jovyan' in working_dir.parts else 'local_config.json'

# The root path is read from the config file in the sibling 'config' folder;
# the geometry subdir and the event definitions file sit under that root.
config_path = working_dir.parent / 'config' / config_file
root_dir = du.read_root_dir(config_path)
geo_dir = Path(root_dir) / 'geo'
event_defs_path = Path(root_dir) / 'events' / 'event_definitions.json'

# Offer the names of the existing event definitions in a dropdown.
existing_events = du.read_event_definitions(event_defs_path)
select_event_name = pn.widgets.Select(
    name='Select new or previously defined event:',
    options=[*existing_events.keys()],
)
pn.Row(select_event_name, pn.Column(pn.Spacer(height=30)))

In [3]:
# Resolve the specs for the event chosen in the dropdown above.
event_specs = du.get_existing_event(existing_events, select_event_name)

# Echo the specs under a bold (ANSI-escaped) heading.
pp = pprint.PrettyPrinter(depth=4)
print("\033[1mEvent Specs:\033[0m")
pp.pprint(event_specs)

# Timeseries for this event are read from <root>/events/<event name>/timeseries.
ts_dir = Path(root_dir) / 'events' / event_specs['name'] / 'timeseries'

[1mEvent Specs:[0m
{'end_date': datetime.date(2023, 8, 22),
 'huc2_list': ['15', '16', '18'],
 'lat_limits': (20, 55),
 'lon_limits': (-130, -112),
 'name': '202308_hilary',
 'start_date': datetime.date(2023, 8, 19)}


## Define the sets of parquet files for each forecast configuration and variable
We use dictionaries to define multiple sets of parquet files needed to evaluate operational NWM forecasts. These files define the source of verifying data (primary_filepath), data to evaluate (secondary_filepath), as well as the necessary geometry, crosswalk, and attributes.

In [4]:
# Short-range streamflow forecast evaluation files.
SRF_streamflow = {
    "nwm_config": "short_range",
    "variable": "streamflow",
    "primary_filepath": ts_dir / "usgs" / "*.parquet",
    "secondary_filepath": ts_dir / "short_range" / "*.parquet",
    "crosswalk_filepath": geo_dir / "usgs_nwm22_crosswalk.conus.parquet",
    "geometry_filepath": geo_dir / "usgs_point_geometry.conus.parquet",
}

# Medium-range streamflow forecast evaluation files: same variable,
# observations, crosswalk and geometry as the short-range scenario; only the
# configuration name and the forecast files differ.
MRF_streamflow = {
    **SRF_streamflow,
    "nwm_config": "medium_range_mem1",
    "secondary_filepath": ts_dir / "medium_range_mem1" / "*.parquet",
}

# Attribute tables, keyed by attribute set name.
attribute_paths = {
    "usgs_upstream_area": geo_dir / "usgs_attr_upstream_area.conus.parquet",
    "usgs_ecoregions": geo_dir / "usgs_attr_ecoregions.conus.parquet",
    "usgs_stream_order": geo_dir / "usgs_attr_stream_order.conus.parquet",
    "usgs_huc_crosswalk": geo_dir / "usgs_huc12_crosswalk.conus.parquet",
}

# Put the scenarios in a list for widget purposes and give each one a
# '<nwm_config>-<variable>' name.
scenario_definitions = [SRF_streamflow, MRF_streamflow]
for scenario_def in scenario_definitions:
    scenario_def['scenario_name'] = f"{scenario_def['nwm_config']}-{scenario_def['variable']}"

# General units ('english' or 'metric') to show in visualization.
viz_units = "metric"

# Read HUC2 geometry.
huc2_gdf = gpd.read_parquet(geo_dir / 'huc2_geometry.conus.parquet')

### Select the forecast scenario and evaluation dates (reference time range)
Run the cell below to generate the selection widgets, choose your values, and then move on to the next cell (do not rerun this cell after making selections).  
Reference times will be included from 0z on the start date through the last available reference time on the end date.  Starting values for dates are set based on the event dates.

In [5]:
# Print the date-range descriptions returned for the event dates.
date_strings = du.list_nwm_dates_for_event_dates(event_specs['start_date'], event_specs['end_date'])
for date_string in date_strings.values():
    print(date_string)
print("\n")

# Selection widgets; the date pickers and HUC2 selector start from the event specs.
scenario_names = du.get_scenario_names(scenario_definitions)
scenario_selector = du.get_scenario_selector(scenario_name_list=scenario_names)
start_picker = pn.widgets.DatePicker(name='Reference Start Date:', value=event_specs['start_date'])
end_picker = pn.widgets.DatePicker(name='Reference End Date:', value=event_specs['end_date'])
huc2_selector = du.get_huc2_selector(value=event_specs['huc2_list'])
# Not placed in the layout below; its value is read when the query is run.
metric_selector = du.get_postevent_metric_selector(variable='streamflow')

scenario_and_dates = pn.Column(scenario_selector, start_picker, end_picker)
pn.Row(scenario_and_dates, huc2_selector)
                      

Short range forecasts references times that overlap with event dates (through current): 2023-08-18 00:00:00 through 2023-08-22 23:00:00
Short range forecasts valid times that overlap with event dates (through current): 2023-08-18 00:00:00 through 2023-08-23 17:00:00
Medium range forecasts that overlap with event dates (through current): 2023-08-09 00:00:00 through 2023-08-22 18:00:00
Medium range forecasts valid times that overlap with event dates (through current): 2023-08-09 00:00:00 through 2023-08-25 14:00:00




### Read and validate the selections

In [6]:
# Re-import dashboard_utils so recent edits to the module take effect.
importlib.reload(du)

# Look up the scenario definition matching the widget selection.
scenario = du.get_scenario(
    scenario_definitions,
    scenario_name=scenario_selector.value,
    variable='streamflow',
)

# Derive the evaluation dates from the scenario's NWM configuration and the
# picked start/end dates, then show them.
nwm_config = scenario['nwm_config']
eval_dates = du.get_nwm_dates_for_event_dates(nwm_config, start_picker.value, end_picker.value)
eval_dates

{'reference_time_start': datetime.datetime(2023, 8, 18, 0, 0),
 'reference_time_end': datetime.datetime(2023, 8, 20, 23, 0),
 'value_time_start': datetime.datetime(2023, 8, 18, 0, 0),
 'value_time_end': datetime.datetime(2023, 8, 21, 17, 0)}

### Get observed and forecast timeseries characteristics and join with some attributes

In [7]:
%%time
print('Reading parquet files and calculating metrics...')
query_gdf = du.run_teehr_query(
    query_type="metrics",
    scenario=scenario,
    huc_id=huc2_selector.value,
    value_time_start=eval_dates['value_time_start'],    
    value_time_end=eval_dates['value_time_end'],    
    reference_time_start=eval_dates['reference_time_start'],
    reference_time_end=eval_dates['reference_time_end'],
    group_by=['primary_location_id','reference_time'],
    order_by=['primary_location_id','reference_time'], 
    include_metrics=metric_selector.value,
    value_min=0,
    attribute_paths=attribute_paths,
)
display(query_gdf.head())

# convert units, add attributes
query_gdf = du.convert_query_to_viz_units(query_gdf, viz_units, scenario['variable'])
attribute_df = du.combine_attributes(attribute_paths, viz_units)
gdf = du.merge_attr_to_gdf(query_gdf, attribute_df)

# replace geometry with easting and northing to facilitate linked plots
df = gdf[[c for c in gdf.columns if c not in ['geometry','measurement_unit']]].copy()
df['latitude'] = gdf.geometry.y
df['easting'] = gdf.to_crs("EPSG:3857").geometry.x
df['northing'] = gdf.to_crs("EPSG:3857").geometry.y

# calculate the peak flow percent difference and peak time difference in hours
if all(x in df.columns for x in ['max_value_delta', 'primary_maximum']):
    df['max_perc_diff'] = df['max_value_delta']/df['primary_maximum']*100
    df.loc[df['max_perc_diff'] == np.inf, 'max_perc_diff'] = np.nan
if all(x in df.columns for x in ['max_value_timedelta']):
    df['max_time_diff'] = (df['max_value_timedelta'] / np.timedelta64(1, 'h')).astype(int)

# turn the string ecoregion into unique integers to enable histograms
eco_df = pd.DataFrame(df['ecoregion_L2'].unique())
eco_df['num']=eco_df[0].str[0:4].astype('float')
eco_df = eco_df.sort_values('num').reset_index()
eco_list=list(eco_df[0])
df['ecoregion_int'] = [eco_list.index(e)+1 for e in df['ecoregion_L2']]

display(df.head())

Reading parquet files and calculating metrics...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,primary_location_id,measurement_unit,reference_time,primary_maximum,secondary_maximum,max_value_delta,primary_max_value_time,secondary_max_value_time,max_value_timedelta,geometry
0,usgs-09403600,m3/s,2023-08-18 00:00:00,0.096844,0.75,0.653156,2023-08-18 18:00:00,2023-08-18 14:00:00,-1 days +20:00:00,POINT (-112.54798 37.10054)
1,usgs-09403600,m3/s,2023-08-18 01:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 15:00:00,-1 days +20:00:00,POINT (-112.54798 37.10054)
2,usgs-09403600,m3/s,2023-08-18 02:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 15:00:00,-1 days +20:00:00,POINT (-112.54798 37.10054)
3,usgs-09403600,m3/s,2023-08-18 03:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 18:00:00,-1 days +23:00:00,POINT (-112.54798 37.10054)
4,usgs-09403600,m3/s,2023-08-18 04:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 18:00:00,-1 days +23:00:00,POINT (-112.54798 37.10054)


Unnamed: 0,primary_location_id,reference_time,primary_maximum,secondary_maximum,max_value_delta,primary_max_value_time,secondary_max_value_time,max_value_timedelta,upstream_area,ecoregion_L2,stream_order,latitude,easting,northing,max_perc_diff,max_time_diff,ecoregion_int
0,usgs-09403600,2023-08-18 00:00:00,0.096844,0.75,0.653156,2023-08-18 18:00:00,2023-08-18 14:00:00,-1 days +20:00:00,505.67766,10.1 COLD DESERTS,3,37.100541,-12528780.0,4453130.0,674.444458,-4,3
1,usgs-09403600,2023-08-18 01:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 15:00:00,-1 days +20:00:00,505.67766,10.1 COLD DESERTS,3,37.100541,-12528780.0,4453130.0,633.684265,-4,3
2,usgs-09403600,2023-08-18 02:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 15:00:00,-1 days +20:00:00,505.67766,10.1 COLD DESERTS,3,37.100541,-12528780.0,4453130.0,633.684265,-4,3
3,usgs-09403600,2023-08-18 03:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 18:00:00,-1 days +23:00:00,505.67766,10.1 COLD DESERTS,3,37.100541,-12528780.0,4453130.0,633.684265,-1,3
4,usgs-09403600,2023-08-18 04:00:00,0.102224,0.75,0.647776,2023-08-18 19:00:00,2023-08-18 18:00:00,-1 days +23:00:00,505.67766,10.1 COLD DESERTS,3,37.100541,-12528780.0,4453130.0,633.684265,-1,3


CPU times: user 4min 43s, sys: 3.42 s, total: 4min 46s
Wall time: 1min 14s


### Build an interactive dashboard to explore the data

In [8]:
# Re-import dashboard_utils so recent edits to the module take effect.
importlib.reload(du)

# --- Widgets controlling the point colors and the scatter variable ---
color_column_options = ['stream_order','ecoregion_int','upstream_area','latitude','max_perc_diff','max_time_diff']
color_variable_selector = pn.widgets.Select(
    name='Color Variable',
    options=du.get_metric_selector_dict(color_column_options, scenario_selector.value),
    value=color_column_options[0],
    width=180,
)

scatter_variable_options = ['Peak Flow', 'Peak Time']
scatter_variable_selector = pn.widgets.Select(
    name='Scatter Variable',
    options=scatter_variable_options,
    value=scatter_variable_options[0],
    width=180,
)

# --- Map and scatter plot, rebuilt whenever a bound widget value changes ---
basemap = tiles.CartoLight().redim(x='easting', y='northing')
# One row per location for the map (first reference time kept).
df_sub = df.drop_duplicates(subset=['primary_location_id'], keep='first')

# Arguments common to both plot builders; the color variable is bound to the
# widget parameter, the scenario name and units are fixed at their current values.
shared_plot_kwargs = dict(
    color_variable=color_variable_selector.param.value,
    scenario_name=scenario_selector.value,
    units=viz_units,
)
points = pn.bind(
    du.get_points,
    df=df_sub,
    opts=dict(width=500, height=400),
    **shared_plot_kwargs,
)
scatter = pn.bind(
    du.get_scatter,
    df=df,
    scatter_variable=scatter_variable_selector.param.value,
    opts=dict(width=400, height=400),
    **shared_plot_kwargs,
)

# --- Histograms ---
area_hist, peak_diff_hist, peak_timediff_hist = (
    du.get_histogram(df, column=hist_column, nbins=50)
    for hist_column in ('upstream_area', 'max_perc_diff', 'max_time_diff')
)
eco_hist = du.get_categorical_histogram(df, column='ecoregion_int', labels=eco_df['num'])
order_hist = du.get_categorical_histogram(df, column='stream_order')

for continuous_hist in (area_hist, peak_diff_hist, peak_timediff_hist):
    continuous_hist.opts(width=300, height=200)
for categorical_hist in (eco_hist, order_hist):
    categorical_hist.opts(width=250, height=200)

# --- Header ---
scenario_text = du.get_scenario_text(scenario_selector.value)
subtitle = f"Example 1: Forecast Data Exploration<br> - {scenario_text}"
header = du.get_dashboard_header(subtitle)

# --- Layout: one link_selections instance is shared by all plots ---
ls = hv.link_selections.instance()

header_panel = pn.Column(pn.Spacer(height=10), header, width=1100)
selector_panel = pn.Column(
    pn.Spacer(height=20), scatter_variable_selector, color_variable_selector, width=220
)
linked_plots = ls(hv.DynamicMap(scatter)) + basemap*ls(hv.DynamicMap(points))
linked_hists = ls(peak_diff_hist + peak_timediff_hist + order_hist + eco_hist)

pn.Column(
    header_panel,
    pn.Row(pn.Spacer(height=20), selector_panel, pn.Row(linked_plots)),
    pn.Row(linked_hists),
)


In [None]:
# Re-import dashboard_utils so recent edits to the module take effect.
importlib.reload(du)

# --- Widgets controlling the point colors and the scatter variable ---
color_column_options = ['stream_order','ecoregion_int','upstream_area','latitude','max_perc_diff','max_time_diff']
color_variable_selector = pn.widgets.Select(
    name='Color Variable',
    options=du.get_metric_selector_dict(color_column_options, scenario_selector.value),
    value=color_column_options[0],
    width=180,
)

scatter_variable_options = ['Peak Flow', 'Peak Time']
scatter_variable_selector = pn.widgets.Select(
    name='Scatter Variable',
    options=scatter_variable_options,
    value=scatter_variable_options[0],
    width=180,
)

# --- Basemap and the one-row-per-location frame used for the map ---
basemap = tiles.CartoLight().redim(x='easting', y='northing')
df_sub = df.drop_duplicates(subset=['primary_location_id'], keep='first')

# --- Histograms ---
area_hist, peak_diff_hist, peak_timediff_hist = (
    du.get_histogram(df, column=hist_column, nbins=50)
    for hist_column in ('upstream_area', 'max_perc_diff', 'max_time_diff')
)
eco_hist = du.get_categorical_histogram(df, column='ecoregion_int', labels=eco_df['num'])
order_hist = du.get_categorical_histogram(df, column='stream_order')

for continuous_hist in (area_hist, peak_diff_hist, peak_timediff_hist):
    continuous_hist.opts(width=300, height=200)
for categorical_hist in (eco_hist, order_hist):
    categorical_hist.opts(width=250, height=200)

# --- Header ---
scenario_text = du.get_scenario_text(scenario_selector.value)
subtitle = f"Example 1: Forecast Data Exploration<br> - {scenario_text}"
header = du.get_dashboard_header(subtitle)

# A single link_selections instance is created here for the plots to share.
ls = hv.link_selections.instance()

# Show only the selectors.
pn.Row(scatter_variable_selector, color_variable_selector)

In [None]:
# Build the map and scatter plot once from the current widget values; these
# are plain values, not bound parameters.
common_plot_kwargs = dict(
    color_variable=color_variable_selector.value,
    scenario_name=scenario_selector.value,
    units=viz_units,
)
points = du.get_points(
    df=df_sub,
    opts=dict(width=500, height=400),
    **common_plot_kwargs,
)
scatter = du.get_scatter(
    df=df,
    scatter_variable=scatter_variable_selector.value,
    opts=dict(width=400, height=400),
    **common_plot_kwargs,
)

# Assemble the layout: header, linked scatter and map, then linked histograms.
header_panel = pn.Column(pn.Spacer(height=10), header, width=1100)
linked_plots = ls(scatter) + basemap*ls(points)
linked_hists = ls(peak_diff_hist + peak_timediff_hist + order_hist + eco_hist)

pn.Column(
    header_panel,
    pn.Row(pn.Spacer(height=20), pn.Row(linked_plots)),
    pn.Row(linked_hists),
)