## Example 1 - Querying the Data

Add more text description about this ....

First, we will introduce the TEEHR query system so you get a sense of what is going on behind the scenes of the visualizations in later examples.

In brief, TEEHR queries provide a concise, rapid way to subset the timeseries you want to evaluate and return either the raw data or summary/comparison metrics.  
They are efficient enough to enable querying the data 'on the fly' within interactive visualizations.



### Install and Import packages

In [None]:
%%capture
# The %%capture cell magic above hides the output of this cell (pip is verbose).
# Install third-party dependencies for this notebook.
!pip install spatialpandas colormap colorcet duckdb
# Alternative: install teehr directly from GitHub, either from the main branch
# or pinned to a specific commit.
# NOTE(review): the "[]" looks like a placeholder for an access token - confirm before enabling.
#!pip install 'teehr @ git+https://[]@github.com/RTIInternational/teehr@main'
#!pip install 'teehr @ git+https://[]@github.com/RTIInternational/teehr@39d6627e4f49b0bdeab3a4c4e8837e6ce5a15f78'

In [None]:
import teehr.queries.duckdb as tqd

# dashboard functions
import dashboard_utils as du
import importlib

from datetime import timedelta
from pathlib import Path
import geopandas as gpd
import pandas as pd
import spatialpandas as spd
import numpy as np
import pathlib
from typing import List
import duckdb as ddb

import hvplot
import hvplot.pandas
import holoviews as hv
from holoviews.element import tiles
import geoviews as gv
import panel as pn
import colorcet as cc
from holoviews.operation.datashader import rasterize, spread
hv.extension('bokeh', logo=False)

## Point to the data that will be used for the evaluation

Note: these settings could eventually be stored in a separate configuration file.

Note: this example assumes the data cache has already been generated and does not focus on that step (the teehr package will include examples and instructions).

These are the evaluation scenario definitions - specific variables and configurations to be compared within the overall study.
We need to specify all the parquet files containing the data we want to evaluate, as well as some necessary associated data (geometry, crosswalks, and attributes).
These files dictate the specific study (directory name), forecast configuration, and source of verifying data used in this evaluation.



In [None]:
# Overall study directory; every evaluation file referenced below lives under it.
STUDY_DIR = Path("/home", "jovyan", "shared", "rti-eval", "post-event-example")

# Each scenario dict below describes one (forecast configuration, variable) pair:
#   scenario_name      - label of the forecast configuration
#   variable           - variable being evaluated
#   primary_filepath   - glob of parquet files holding the primary timeseries
#   secondary_filepath - glob of parquet files holding the secondary (forecast) timeseries
#   crosswalk_filepath - parquet file linking primary and secondary location ids
#   geometry_filepath  - parquet file with the geometry of the evaluated locations

# medium range streamflow forecast evaluation files
MRF_streamflow = dict(
    scenario_name="medium_range",
    variable="streamflow",
    primary_filepath=Path(STUDY_DIR, "timeseries", "usgs", "*.parquet"),
    secondary_filepath=Path(STUDY_DIR, "timeseries", "medium_range_mem1", "*.parquet"),
    crosswalk_filepath=Path(STUDY_DIR, "geo", "usgs_nwm22_crosswalk.parquet"),
    geometry_filepath=Path(STUDY_DIR, "geo", "usgs_geometry.parquet"),
)

# medium range precip forecast evaluation files
MRF_forcing = dict(
    scenario_name="medium_range",
    variable="precipitation",
    primary_filepath=Path(STUDY_DIR, "timeseries", "forcing_analysis_assim", "*.parquet"),
    secondary_filepath=Path(STUDY_DIR, "timeseries", "forcing_medium_range", "*.parquet"),
    crosswalk_filepath=Path(STUDY_DIR, "geo", "huc10_huc10_crosswalk.parquet"),  # the primary and secondary are both HUC10
    geometry_filepath=Path(STUDY_DIR, "geo", "huc10_geometry.parquet"),
)

# short range streamflow forecast evaluation files
# (same primary data, crosswalk and geometry as the medium range streamflow scenario)
SRF_streamflow = dict(
    scenario_name="short_range",
    variable="streamflow",
    primary_filepath=MRF_streamflow["primary_filepath"],
    secondary_filepath=Path(STUDY_DIR, "timeseries", "short_range", "*.parquet"),
    crosswalk_filepath=MRF_streamflow["crosswalk_filepath"],
    geometry_filepath=MRF_streamflow["geometry_filepath"],
)

# short range precip forecast evaluation files
# The precipitation timeseries are HUC10-based, so the crosswalk and geometry
# must be the HUC10 files from MRF_forcing, not the USGS gage files used by
# the streamflow scenarios.
SRF_forcing = dict(
    scenario_name="short_range",
    variable="precipitation",
    primary_filepath=MRF_forcing["primary_filepath"],
    secondary_filepath=Path(STUDY_DIR, "timeseries", "forcing_short_range", "*.parquet"),
    crosswalk_filepath=MRF_forcing["crosswalk_filepath"],
    geometry_filepath=MRF_forcing["geometry_filepath"],
)

# All scenarios offered by the scenario/variable selector widgets.
eval_scenarios = [MRF_streamflow, MRF_forcing, SRF_streamflow, SRF_forcing]

# Auxiliary attribute / crosswalk tables passed to the queries.
attribute_paths = dict(
    usgs_upstream_area=Path(STUDY_DIR, "geo", "usgs_attr_upstream_area.parquet"),
    usgs_ecoregions=Path(STUDY_DIR, "geo", "usgs_attr_ecoregions.parquet"),
    usgs_stream_order=Path(STUDY_DIR, "geo", "usgs_attr_stream_order.parquet"),
    usgs_huc_crosswalk=Path(STUDY_DIR, "geo", "usgs_huc12_crosswalk.parquet"),
    nwm22_huc_crosswalk=Path(STUDY_DIR, "geo", "nwm22_huc12_crosswalk.parquet"),
    #UPSTREAM_IMPERVIOUS = Path(STUDY_DIR, "geo", "usgs_attr_upstream_imperv.parquet")    # don't have this data yet
)

## Select the scenario and variable for evaluation:
We will use some panel widgets to make this easier:

In [None]:
# Reload dashboard_utils so that any edits to it are picked up.
importlib.reload(du)

# Scenario names are offered in sorted order.
available_scenario_names = sorted(du.get_scenario_names(eval_scenarios))
scenario_selector = du.get_scenario_selector(
    scenario_name_list=available_scenario_names,
)

# Variables are offered in the order returned by the helper.
available_variables = du.get_scenario_variables(eval_scenarios)
variable_selector = du.get_variable_selector(
    variable_list=available_variables,
)

# Last expression of the cell: renders both selectors side by side.
pn.Row(scenario_selector, variable_selector)

## Filter the data to the region, time period, stream size, threshold (etc.) of interest:

In [None]:
# User-facing instruction strings.
instructions_2 = "Filter the data to the subset of interest:"
instructions_3 = "Run the query in the cell below, then experiment with the selections to see how results and execution times change"

# Reload dashboard_utils so that any edits to it are picked up.
importlib.reload(du)

# Scenario dict matching the current scenario/variable widget selections.
scenario = du.get_scenario(
    eval_scenarios,
    scenario_selector.value,
    variable_selector.value,
)

# Slider step sizes.
# NOTE(review): presumably milliseconds (3600000 ms = 1 hour) - confirm against dashboard_utils.
hourly_step = 3600000
six_hourly_step = 3600000 * 6

# Date-range sliders built from the primary and secondary timeseries files.
value_time_slider = du.get_date_range_slider_with_range_as_title(
    pathlist=[scenario[key] for key in ("primary_filepath", "secondary_filepath")],
    date_type="value_time",
    opts={"width": 700, "bar_color": "green", "step": hourly_step},
)
reference_time_slider = du.get_date_range_slider_with_range_as_title(
    pathlist=[scenario[key] for key in ("primary_filepath", "secondary_filepath")],
    date_type="reference_time",
    opts={"width": 700, "bar_color": "red", "step": six_hourly_step},
)

lead_time_selector = du.get_lead_time_selector()

huc2_selector = du.get_huc2_selector()
order_limit_selector = du.get_order_limit_selector()

# Threshold and metric choices depend on the selected variable.
selected_variable = variable_selector.value
threshold_selector = du.get_threshold_selector(selected_variable)
metric_selector = du.get_metric_selector(selected_variable)

# A "Run TEEHR Query" button was tried but abandoned: the query output
# (display(gdf)) could not be made to show up BELOW the widgets rather than above.
#query_button = pn.widgets.Button(name='Run TEEHR Query', button_type='primary')
#query_button.on_click(button_callback)

# Layout: attribute filters on the left, time filters on the right.
attribute_filter_column = pn.Column(
    huc2_selector,
    order_limit_selector,
    threshold_selector,
    metric_selector,
)
column_gap = pn.Spacer(width=50)
time_filter_column = pn.Column(
    pn.Spacer(height=10),
    value_time_slider,
    pn.Spacer(height=10),
    reference_time_slider,
    pn.Spacer(height=5),
    lead_time_selector,
)

# Last expression of the cell: renders the full filter panel.
pn.Row(attribute_filter_column, column_gap, time_filter_column)

## Make selections above and run the query in the cell below
### Experiment with the filter selections...

In [None]:
%%time
importlib.reload(du)
metrics_gdf = du.run_teehr_query(
    query_type="metrics",
    scenario=scenario,
    huc_id=huc2_selector.value,
    order_limit=order_limit_selector.value,
    value_time_start=value_time_slider[1].value_start,    
    value_time_end=value_time_slider[1].value_end,    
    reference_time_start=reference_time_slider[1].value_start,    
    reference_time_end=reference_time_slider[1].value_end,
    value_min=threshold_selector.value,    
    include_metrics=metric_selector.value,
    attribute_paths=attribute_paths,
    return_query=False,
)
display(metrics_gdf)

In [None]:
# Reload dashboard_utils so that any edits to it are picked up.
importlib.reload(du)

# Collect the current widget selections as keyword arguments for the query.
timeseries_query_kwargs = {
    "query_type": "timeseries",
    "scenario": scenario,
    "huc_id": huc2_selector.value,
    "order_limit": order_limit_selector.value,
    "value_time_start": value_time_slider[1].value_start,
    "value_time_end": value_time_slider[1].value_end,
    "reference_time_start": reference_time_slider[1].value_start,
    "reference_time_end": reference_time_slider[1].value_end,
    "value_min": threshold_selector.value,
    "attribute_paths": attribute_paths,
    "return_query": False,
}

# Run the timeseries query and show the result.
ts_df = du.run_teehr_query(**timeseries_query_kwargs)
display(ts_df)