## Post Event Example - Explore Forecast Data from a Recent Flood Event

Add more text description about this use case....





### Install and Import packages

In [None]:
%%capture
!pip install spatialpandas colormap colorcet duckdb
#!pip install 'teehr @ git+https://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@github.com/RTIInternational/teehr@main'
!pip install 'teehr @ git+https://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@github.com/RTIInternational/teehr@39d6627e4f49b0bdeab3a4c4e8837e6ce5a15f78'

In [None]:
import teehr.queries.duckdb as tqd

# dashboard functions
import dashboard_utils as dbu
import importlib
# Re-import dashboard_utils so edits made to that module are picked up
# without restarting the kernel.
importlib.reload(dbu)

from datetime import timedelta
import pathlib
import geopandas as gpd
import pandas as pd
import spatialpandas as spd
import numpy as np
# NOTE(review): pathlib is already imported above; this second import is redundant.
import pathlib
from typing import List

In [None]:
import hvplot
# Importing hvplot.pandas provides the `.hvplot` accessor used on DataFrames below.
import hvplot.pandas
import holoviews as hv
from holoviews.element import tiles
import geoviews as gv
import panel as pn
import colorcet as cc
from holoviews.operation.datashader import rasterize, spread
# Select bokeh as the HoloViews plotting backend and suppress the logo output.
hv.extension('bokeh', logo=False)

### Specify the parquet files for this study
First we need to specify all the parquet files containing the data we want to evaluate, as well as some necessary associated data (geometry, crosswalks, and attributes).
These files dictate the specific study (directory name), forecast configuration, and source of verifying data used in this evaluation.

In [None]:
# Root directory of the study and the two sub-directories every file lives under.
STUDY_DIR = pathlib.Path("/home/jovyan/shared/rti-eval/post-event-example")
TIMESERIES_DIR = STUDY_DIR / "timeseries"
GEO_DIR = STUDY_DIR / "geo"

# Forcing: primary and secondary timeseries, crosswalk, and geometry.
# The primary and secondary are both HUC10, hence the huc10-to-huc10 crosswalk.
PRIMARY_FILEPATH_FORCING = TIMESERIES_DIR / "forcing_analysis_assim" / "*.parquet"
SECONDARY_FILEPATH_FORCING = TIMESERIES_DIR / "forcing_medium_range" / "*.parquet"
CROSSWALK_FILEPATH_FORCING = GEO_DIR / "huc10_huc10_crosswalk.parquet"
GEOMETRY_FILEPATH_FORCING = GEO_DIR / "huc10_geometry.parquet"

# Streamflow: primary and secondary timeseries, crosswalk, and geometry.
PRIMARY_FILEPATH_STREAMFLOW = TIMESERIES_DIR / "usgs" / "*.parquet"
SECONDARY_FILEPATH_STREAMFLOW = TIMESERIES_DIR / "medium_range_mem1" / "*.parquet"
CROSSWALK_FILEPATH_STREAMFLOW = GEO_DIR / "usgs_nwm22_crosswalk.parquet"
GEOMETRY_FILEPATH_STREAMFLOW = GEO_DIR / "usgs_geometry.parquet"

# Location attribute tables.
ATTRIBUTES_FILEPATH_UPSTREAM_AREA = GEO_DIR / "usgs_attr_upstream_area.parquet"
ATTRIBUTES_FILEPATH_ECOREGIONS = GEO_DIR / "usgs_attr_ecoregions.parquet"
# Not available yet (don't have this data yet):
# ATTRIBUTES_FILEPATH_UPSTREAM_IMPERVIOUS = GEO_DIR / "usgs_attr_upstream_imperv.parquet"

# Crosswalk between USGS locations and HUC10 units.
CROSSWALK_FILEPATH_USGS_HUC10 = GEO_DIR / "usgs_huc10_crosswalk.parquet"

#####  
### Read the associated geometry, crosswalks and attribute data
Text...

In [None]:
## specify general units (english or metric) to show in visualization
viz_units = "metric"

## Read the USGS/HUC10 crosswalk and the point geometry used for static background points
# The *_FILEPATH_* constants are already pathlib.Path objects, so they are passed as-is.
cross_df = pd.read_parquet(CROSSWALK_FILEPATH_USGS_HUC10)
points_gdf = gpd.read_parquet(GEOMETRY_FILEPATH_STREAMFLOW)

# Add easting and northing (EPSG:3857 coordinates) to the point geometry to simplify
# overlays. The reprojection is done once and reused for both columns instead of
# calling to_crs() separately for x and y.
points_mercator = points_gdf.to_crs("EPSG:3857").geometry
points_gdf['easting'] = points_mercator.x
points_gdf['northing'] = points_mercator.y

## specify list of attributes to include (from those available)
ATTRIBUTES_FILELIST_STREAMFLOW = [
    ATTRIBUTES_FILEPATH_UPSTREAM_AREA,
    ATTRIBUTES_FILEPATH_ECOREGIONS,
    # ATTRIBUTES_FILEPATH_UPSTREAM_IMPERVIOUS  (data not available yet)
]

# Build a single attribute table from the listed files via the dashboard helper.
attr_df = dbu.combine_attributes(ATTRIBUTES_FILELIST_STREAMFLOW, viz_units)

#####
### Check the dates of available data and select the event period to evaluate
Next we will check the dates available in the parquet files, and use a slider to select all or a portion of the total available period to evaluate.   
  (ToDo: create utility to check that data are complete for all of the above defined timeseries files between the min/max dates).

In [None]:
# Timeseries sources handed to the date-range helper.
timeseries_sources = [
    PRIMARY_FILEPATH_FORCING,
    SECONDARY_FILEPATH_FORCING,
    PRIMARY_FILEPATH_STREAMFLOW,
    SECONDARY_FILEPATH_STREAMFLOW,
]
min_date, max_date = dbu.get_parquet_date_range_across_sources(timeseries_sources)

# Both the slider and the text start one hour before the earliest available date.
slider_start = min_date - timedelta(hours=1)

slider_instructions = "Adjust the start and end dates on the slider below to define overall desired event period:"
event_dates_slider = dbu.get_event_date_range_slider(slider_start, max_date, {"width": 800})
event_text = dbu.get_event_dates_text(slider_start, max_date)

instructions_style = {'font-size': '18px', 'font-weight': 'bold'}
pn.Column(
    pn.Spacer(background='white', height=20),
    pn.pane.HTML(slider_instructions, style=instructions_style),
    event_text,
    event_dates_slider,
    pn.Spacer(background='white', height=20),
)

### Select a specific reference time to explore within the event period

For an initial example, we will select a single reference time to explore the comparison between forecast and observed data.  Later we will use this widget more interactively in a dashboard.

In [None]:
# Player widget limited to the event period chosen on the slider above
# (the start is moved back by one hour).
reference_time_player = dbu.get_reference_time_player_selected_dates(
    start=event_dates_slider.value_start - timedelta(hours=1),
    end=event_dates_slider.value_end,
)

# Fixed typo in the user-facing instruction ("User the slider" -> "Use the slider").
player_instructions = "Use the slider or forward arrow (arrow with line) to select a reference time:"

# Bound to the player's value so the displayed reference time follows the widget.
current_ref_time = pn.bind(dbu.get_reference_time, reference_time=reference_time_player.param.value)

pn.Column(
    pn.Spacer(background='white', height=20),
    pn.pane.HTML(player_instructions, style={'font-size': '18px', 'font-weight': 'bold'}),
    current_ref_time,
    reference_time_player,
    pn.Spacer(background='white', height=20),
)

##  Now we'll get some data to evaluate... 
###  We will run an example TEEHR query that will 
1) Read the forecast timeseries from the parquet cache for the defined configuration (secondary_filepath) and reference time (selected above)
2) Read the 'observed' timeseries from the cache for the defined verifying data source (primary_filepath)
3) Join the primary to the secondary timeseries, aligning the data by value_time
4) Calculate and return some basic timeseries and comparison metrics

In [None]:
# Metrics to keep in the results:
#   - Note that a single forecast contains insufficient data for statistical comparisons
#   - TODO: describe all available metrics somewhere
#   - a wrapper is used to generate the query filter later in the notebook
#   - other candidates: 'primary_max', 'primary_min', 'secondary_max', 'secondary_min'
# NOTE(review): the plots below label these values as "Peak Flow" although the
# metrics kept here are averages — confirm which is intended.
metric_list = ['primary_average', 'secondary_average']

# Query joined primary/secondary streamflow for the selected reference time only,
# keeping rows where both values are non-negative.
gdf = tqd.get_metrics(
    primary_filepath=PRIMARY_FILEPATH_STREAMFLOW,
    secondary_filepath=SECONDARY_FILEPATH_STREAMFLOW,
    crosswalk_filepath=CROSSWALK_FILEPATH_STREAMFLOW,
    group_by=["reference_time", "primary_location_id", "measurement_unit"],
    order_by=["reference_time", "primary_location_id"],
    filters=[
        {
            "column": "reference_time",
            "operator": "=",
            "value": f"{reference_time_player.value}"
        },
        {
            "column": "primary_value",
            "operator": ">=",
            "value": 0
        },
        {
            "column": "secondary_value",
            "operator": ">=",
            "value": 0
        },
    ],
    return_query=False,
    geometry_filepath=GEOMETRY_FILEPATH_STREAMFLOW,
    include_geometry=True,
)

# Reduce columns and get the difference.
# .copy() makes the column subset an independent frame, so adding 'perc_diff'
# below cannot trigger pandas' SettingWithCopyWarning.
keep_columns = ["reference_time", "primary_location_id", "measurement_unit", "geometry"] + metric_list
gdf = gdf[keep_columns].copy()

# NOTE(review): the filter above admits primary values of 0, so primary_average may
# be 0 and this division may yield inf/NaN; later cells drop non-positive averages.
gdf['perc_diff'] = (gdf['secondary_average'] - gdf['primary_average']) / gdf['primary_average'] * 100

# convert units if needed
gdf = dbu.convert_metrics_to_viz_units(gdf, viz_units)

# check it out
gdf.head(10)

### Now we'll create a simple map of the % difference in peak flow between forecast and observed using hvplot

In [None]:
pn.extension(sizing_mode='scale_both')

# Reproject the metrics to EPSG:3857 and wrap them in a spatialpandas GeoDataFrame.
gdf = gdf.to_crs("EPSG:3857")
sdf = spd.GeoDataFrame(gdf)
title = f"Reference Time: {reference_time_player.value}"

# Points colored by percent difference, color range clipped to +/-100 %.
diff_map_opts = dict(
    c='perc_diff',
    cmap=cc.CET_D1A[::-1],
    clim=(-100, 100),
    width=800,
    height=400,
    clabel="% Difference Peak Flow",
    title=title,
    size=5,
    xaxis=None,
    yaxis=None,
    tiles='OSM',
)
diff_map = sdf.hvplot.points(**diff_map_opts)
diff_map

#### Try changing the reference time selected above and rerunning the query and map a few times  
&nbsp;  

### Create other basic plots to explore the data further, and link their selections

In [None]:

# A regular dataframe works better for the scatter plot and histogram.
plot_columns = ['primary_location_id', 'primary_average', 'secondary_average', 'perc_diff']
df = gdf[plot_columns].copy()
df['easting'] = gdf.geometry.x
df['northing'] = gdf.geometry.y

# Get rid of the huge values that make the plots hard to read: keep rows whose
# averages are strictly between 0 and 5000 for both forecast and observed.
forecast_in_range = (df['secondary_average'] < 5000) & (df['secondary_average'] > 0)
observed_in_range = (df['primary_average'] < 5000) & (df['primary_average'] > 0)
df = df.loc[forecast_in_range & observed_in_range]

diff_hist = df.hvplot.hist(
    y='perc_diff',
    bins=100,
    bin_range=(-100, 1000),
    height=300,
    width=700,
    xlabel='% Difference Peak Flow',
)
diff_scat = df.hvplot.scatter(
    x='secondary_average',
    y='primary_average',
    height=300,
    width=400,
    xlabel='Forecast Peak',
    ylabel='Observed Peak',
)

# Link selections between the two plots.
ls = hv.link_selections.instance()
ls(diff_scat + diff_hist)

### Link them up with the map...

In [None]:

measure = 'perc_diff'
width = 700

# OSM tiles with their x/y dimensions renamed to easting/northing, the same names
# used as key dimensions of the points (alternative: gv.tile_sources.CartoLight).
basemap = osm2 = tiles.OSM().redim(x='easting', y='northing')

# Map points colored by the selected measure.
points_hv = hv.Points(
    df,
    kdims=['easting', 'northing'],
    vdims=[measure, 'secondary_average', 'primary_average'],
)
points_opts = dict(
    width=width,
    height=400,
    color=hv.dim(measure),
    clim=(-100, 100),
    cmap=cc.CET_D1A[::-1],
    size=5,
    xaxis=None,
    yaxis=None,
    colorbar=True,
)
points_hv.opts(**points_opts)

diff_hist = df.hvplot.hist(
    y=measure,
    width=width,
    bins=100,
    bin_range=(-100, 1000),
    height=200,
    xlabel='% Difference Peak Flow',
)

# Scatter built directly with HoloViews; the coordinates and the measure are
# carried along as value dimensions.
diff_scat = hv.Scatter(
    df,
    kdims=['secondary_average'],
    vdims=['primary_average', 'easting', 'northing', measure],
)
diff_scat.opts(alpha=0.2, width=400, height=400, xlabel='Forecast Peak', ylabel='Observed Peak')

# Link selections across the map, scatter and histogram; lay out in two columns.
ls = hv.link_selections.instance()
linked_plots = ls(basemap * points_hv + diff_scat + diff_hist)
linked_plots.cols(2)

### Add some additional attributes and generate different plots

**TODO:** normalize flows, and add a linked histogram of upstream area and/or ecoregion.

In [None]:
# Combine the metrics GeoDataFrame with the attribute table via the dashboard helper
# (presumably a join on location id — confirm in dashboard_utils), then preview it.
gdf_merge = dbu.merge_attr_to_gdf(gdf, attr_df)
gdf_merge.head()

In [None]:
# normalize flows to upstream area for comparability


In [None]:
##  work on 3 way map.... add precip... add timeseries

pn.extension(sizing_mode='scale_both')

# Options shared by the observed (primary) and forecast (secondary) maps.
# NOTE(review): the colorbar label says cfs while viz_units is set to "metric"
# earlier in the notebook — confirm the label matches the converted units.
flow_map_opts = dict(
    cmap=cc.CET_L8[::-1],
    cnorm='eq_hist',
    clim=(0, 15000),
    width=400,
    clabel="Peak Flow (cfs)",
    title=title,
    size=5,
    xaxis=None,
    yaxis=None,
    tiles='CartoLight',
)
prim_map = sdf.hvplot.points(c='primary_average', **flow_map_opts)
sec_map = sdf.hvplot.points(c='secondary_average', **flow_map_opts)

# Third panel: the percent-difference points over the basemap, resized to fit.
diff_panel = basemap * points_hv.opts(width=400, height=300)
prim_map + sec_map + diff_panel

**To do:**

- Build up the 3-column explorer layout: add precipitation and the timeseries from the prior notebook.
- Turn it into a dashboard at the end.
- Try other scatter layouts and find the best one for the alternate dashboard; decide between these two for the demo (there is probably only time for one post-event example).


In [None]:
# Reload dashboard_utils so recent edits to the module are picked up.
importlib.reload(dbu)

# metric query wrapper
# Called with the same streamflow files as the direct tqd.get_metrics call above,
# the reference time currently selected on the player, and a minimum query value of 0.
gdf = dbu.get_comparison_metrics(
    primary_filepath=PRIMARY_FILEPATH_STREAMFLOW,
    secondary_filepath=SECONDARY_FILEPATH_STREAMFLOW,
    crosswalk_filepath=CROSSWALK_FILEPATH_STREAMFLOW,
    geometry_filepath=GEOMETRY_FILEPATH_STREAMFLOW,      
    single_reference_time=reference_time_player.value,    
    query_value_min=0,
)
# Preview the first rows of the result.
gdf.head()

In [None]:
#  will need another wrapper around the above to generate the holoviews element, like in prior notebooks, but with new query wrapper structure

In [None]:
##  stuff below is remnants - reworking dashboards based on above...

In [None]:

## add normalized flow
# max_gdf['primary_ave_norm'] = merge_gdf['primary_average'] / merge_gdf['attribute_value'] * 3600 * 12
# max_gdf['secondary_ave_norm'] = merge_gdf['secondary_average'] / merge_gdf['attribute_value'] * 3600 * 12

# # subset data based on requested min/max (if any defined, e.g., only > 0 or other threshold)
# if measure_min_requested:
#     data_gdf = data_gdf[data_gdf[measure] >= measure_min_requested]
# if measure_max_requested:
#     data_gdf = data_gdf[data_gdf[measure] <= measure_max_requested]

In [None]:
######### General plotting options

# Measure shown on the flow maps, and the titles for the flow and precipitation maps.
flow_measure = "max_in_hr"
flow_map_title = "Normalized Event Peak Flow (in/hr)"  
precip_map_title = "Total Precipitation (in)"

# Choose the point colormap options based on the selected flow measure.
# NOTE(review): `dbs` is not imported anywhere in the visible notebook (only
# `dashboard_utils as dbu` is), so this branch would raise NameError if
# flow_measure were "max_recurr_int" — confirm the intended module.
if flow_measure == "max_recurr_int":
    points_cmap_opts = dict(cmap=dbs.get_recurr_colormap(), legend_position='bottom_right')
else:
    points_cmap_opts = dict(cmap=cc.CET_L8[::-1], cnorm='eq_hist', colorbar=True) 
    
# Shared options: maps are drawn without grid or axes; curves show only a hover
# tool, with no toolbar and no title.
map_opts = dict(show_grid=False, xaxis = None, yaxis = None)
curve_opts = dict(toolbar = None, tools=["hover"], show_title = False)

In [None]:
######### Build components for the dashboard

# Build background (static) map Elements - background tiles and all gage points 
# for reference on rasterized catchments DynamicMap
# The CartoLight tiles take the shared map_opts, with the toolbar placed on the right.
tiles_background = gv.tile_sources.CartoLight.opts(**map_opts, toolbar = 'right')

In [None]:
#points_background 
# points_background = hv.Points(points_gdf, kdims = ['easting','northing'], vdims = ['id']).opts(color='lightgray', size=2, toolbar='right')
# points = spread(rasterize(points_background), px=4, shape='circle').opts(cmap=["lightgray"]) #, responsive=True)

In [None]:
pn.extension(sizing_mode='stretch_width')

# Two rows of three placeholder map panels, each showing only the background tiles.
# (Once the `points` overlay is restored, use `tiles_background * points` instead.)
placeholder_map_rows = [
    pn.Row(*[pn.panel(tiles_background, margin=0) for _ in range(3)])
    for _ in range(2)
]

# Reference-time controls on top, the map rows, then two colored placeholder spacers.
layout = pn.Column(
    pn.Column(current_ref_time, reference_time_player),
    *placeholder_map_rows,
    pn.Spacer(background='green', height=150, margin=0),
    pn.Spacer(background='red', height=150, margin=0),
)

In [None]:
# gspec = pn.GridSpec(sizing_mode='stretch_width', width_policy='max', height=900)

# gspec[0,:] = pn.Column(current_ref_time, reference_time_player, margin=5)
# gspec[1:4,0] = pn.panel(tiles_background, margin=0)
# gspec[1:4,1] = pn.panel(tiles_background, margin=0)
# gspec[1:4,2] = pn.panel(tiles_background, margin=0)
# gspec[4:7,0] = pn.panel(tiles_background, margin=0)
# gspec[4:7,1] = pn.panel(tiles_background, margin=0)
# gspec[4:7,2] = pn.panel(tiles_background, margin=0)
# gspec[7,:] = pn.Spacer(background='green',  margin=0)
# gspec[8,:] = pn.Spacer(background='red',  margin=0)

# gspec