## Exploring TEEHR's Additional Features

---

#### Possible things to cover
- Attributes
- Event Detection
- Ensemble
- Bootstrapping
- Cloning and reading from S3
- Adding calculated fields
  - Event detection
  - Row-calculated fields
- Fetching gridded data?

#### Future work: EVAAS?

In [None]:
from pathlib import Path
import os

import teehr
from utils import teehr_ngiab
from teehr.evaluation.utils import print_tree

# Enable use of accessor: direct Bokeh output to the notebook so that
# Bokeh-backed plots render inline in the cells below.
from bokeh.io import output_notebook
output_notebook()

In [None]:
# Resolve the NGIAB output directory from the environment. Fail early with a
# clear message when the variable is missing or empty, instead of letting
# Path(None) raise an opaque TypeError (or Path("") silently resolve to ".").
ngiab_output_dir = os.environ.get("NGIAB_OUTPUT_DIR")
if not ngiab_output_dir:
    raise RuntimeError(
        "The NGIAB_OUTPUT_DIR environment variable is not set; "
        "point it at the mounted NGIAB output directory."
    )
MOUNTED_DATA_DIR = Path(ngiab_output_dir)
# Derive a configuration name from the final component of the directory path.
configuration_name = teehr_ngiab.sanitize_string(MOUNTED_DATA_DIR.name)
print(f"NGIAB output directory: {MOUNTED_DATA_DIR}")

#### Initialize the Evaluation object

In [None]:
# Directory that holds the TEEHR evaluation used throughout this notebook.
TEEHR_EVALUATION_DIR = Path("/app/data/teehr")

# Initialize an Evaluation object from the directory; `ev` is the entry point
# for every table and metric query in the cells below.
ev = teehr.Evaluation(dir_path=TEEHR_EVALUATION_DIR)

In [None]:
# Show the evaluation's configurations table as a pandas DataFrame.
ev.configurations.to_pandas()

In [None]:
# Import the Spark SQL functions under a namespace. Importing `min` and `max`
# by name would shadow the Python builtins for the rest of the notebook.
from pyspark.sql import functions as F

# Show the earliest and latest `value_time` in the primary timeseries.
ev.primary_timeseries.to_sdf().select(F.min("value_time"), F.max("value_time")).show()

In [None]:
# Importing hvplot.pandas registers the `.hvplot` accessor on pandas/GeoPandas
# objects. Import it here so this cell does not depend on a later cell having
# been run first.
import hvplot.pandas  # noqa: F401

# Load the evaluation locations, report how many there are, and map them.
locations_gdf = ev.locations.to_geopandas()
print(f"Number of sites: {locations_gdf.index.size}")
locations_gdf.hvplot.points(geo=True, tiles=True)

#### Location Attributes

In [None]:
# Load the location attributes table as a GeoDataFrame and display it.
location_attributes_gdf = ev.location_attributes.to_geopandas()
location_attributes_gdf

##### List the unique location attributes

In [None]:
# Distinct values of the `attribute_name` column.
location_attributes_gdf.attribute_name.unique()

In [None]:
# Preview the first rows of the location attributes table.
location_attributes_gdf.head()

##### The location attributes have been added to the `joined_timeseries` table

In [None]:
# List the fields available in the `joined_timeseries` table.
ev.joined_timeseries.fields()

#### Now we can make use of the location attributes in our metric calculations

In [None]:
# Import geoviews dependencies
import holoviews as hv
# Imported for its side effect: registers the `.hvplot` accessor on pandas objects.
import hvplot.pandas
from holoviews import opts
import geoviews as gv
import geoviews.tile_sources as gts
import panel as pn
# Select the Bokeh plotting backend for both HoloViews and GeoViews (logo suppressed).
hv.extension('bokeh', logo=False)
gv.extension('bokeh', logo=False)

In [None]:
# Reproject the GeoDataFrame to EPSG:4326, modifying it in place.
location_attributes_gdf.to_crs("EPSG:4326", inplace=True)

In [None]:
# Distinct attribute names again, this time as a plain Python list.
location_attributes_gdf.attribute_name.unique().tolist()

In [None]:

# Keep only the "stream_order" attribute rows and map them, colored by value.
subset_gdf = location_attributes_gdf.loc[
    location_attributes_gdf["attribute_name"] == "stream_order"
]
subset_gdf.hvplot.points(c="value", geo=True, tiles=True)

In [None]:
# Create metrics_df
metrics_df = ev.metrics.query(
    group_by=["configuration_name", "primary_location_id"],
    include_metrics=[
        teehr.DeterministicMetrics.NashSutcliffeEfficiency(),
        teehr.DeterministicMetrics.KlingGuptaEfficiency(),
        teehr.DeterministicMetrics.RelativeBias(),
        teehr.DeterministicMetrics.AnnualPeakRelativeBias(),
        teehr.DeterministicMetrics.RootMeanSquareError()
    ],
    order_by=["primary_location_id"]
).to_pandas()

In [None]:
# Display the metrics table (one row per configuration and primary location).
metrics_df

#### Let's look at the best-performing model configurations across all locations

In [None]:
# Average NSE and KGE over all locations for each configuration,
# ranked from highest to lowest mean KGE.
(
    metrics_df.loc[
        :,
        [
            "configuration_name",
            "nash_sutcliffe_efficiency",
            "kling_gupta_efficiency",
        ],
    ]
    .groupby(["configuration_name"])
    .mean()
    .sort_values(by="kling_gupta_efficiency", ascending=False)
)

In [None]:
# Average the bias and error metrics over all locations for each
# configuration, ranked from lowest to highest mean RMSE.
(
    metrics_df.loc[
        :,
        [
            "configuration_name",
            "relative_bias",
            "annual_peak_flow_bias",
            "root_mean_square_error",
        ],
    ]
    .groupby(["configuration_name"])
    .mean()
    .sort_values(by="root_mean_square_error", ascending=True)
)

#### Event Detection

In [None]:
# Add timeseries-aware row calculated field for Percentile Event Detection (in-memory)
sdf = ev.joined_timeseries.add_calculated_fields([
    teehr.TimeseriesAwareCalculatedFields.PercentileEventDetection()
]).to_sdf()
sdf.show(5)

In [None]:
# Create dataframe from sdf and filter. USGS gage selected that had high overall NSE.
pdf = sdf.filter((sdf.primary_location_id == 'usgs-14301000') & (sdf.event == 'true') & (sdf.configuration_name == 'nwm30_retrospective')).toPandas()
pdf.head()

In [None]:
# plot events
event_plot = pdf.hvplot.points(x='value_time', y='primary_value', color='event_id')
event_plot.opts(width=1200, height=600)