## Exploring TEEHR's Additional Features

---

#### Possible things to cover
- Attributes
- Event Detection
- Ensemble
- Bootstrapping
- Cloning and reading from S3
- Adding calculated fields
  - Event detection
  - Row-calculated fields
- Fetching gridded data?

In [None]:
from pathlib import Path
import os
import shutil

import teehr
from utils import teehr_ngiab
from teehr.evaluation.utils import print_tree
from pyspark.sql.functions import min, max
import hvplot.pandas  # noqa


# Configure Bokeh to display plot output inline in the notebook.
# NOTE(review): the earlier comment here ("Enable use of accessor") seems to
# describe the `import hvplot.pandas` line above rather than this call - confirm.
from bokeh.io import output_notebook
output_notebook()

In [None]:
# Resolve the NGIAB output directory from the environment.
# Fail early with an explicit message when the variable is missing or empty;
# otherwise Path(None) raises an opaque TypeError that does not name the
# variable the user needs to set.
ngiab_output_dir = os.environ.get("NGIAB_OUTPUT_DIR")
if not ngiab_output_dir:
    raise RuntimeError(
        "The NGIAB_OUTPUT_DIR environment variable is not set; "
        "it must point to the NGIAB output directory."
    )
MOUNTED_DATA_DIR = Path(ngiab_output_dir)

# The directory name, passed through sanitize_string, is used as the
# configuration name.
configuration_name = teehr_ngiab.sanitize_string(MOUNTED_DATA_DIR.name)
print(f"NGIAB output directory: {MOUNTED_DATA_DIR}")

#### Initialize the Evaluation object

In [None]:
# Directory holding the existing TEEHR Evaluation.
TEEHR_EVALUATION_DIR = Path(
    "/app/data/teehr"
)

# Open the Evaluation stored in that directory.
ev = teehr.Evaluation(
    dir_path=TEEHR_EVALUATION_DIR,
)

In [None]:
# Display the Evaluation's configurations table as a pandas DataFrame.
configurations_table = ev.configurations
configurations_table.to_pandas()

In [None]:
# Show the earliest and latest value_time in the primary timeseries.
# `min` / `max` here are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
primary_sdf = ev.primary_timeseries.to_sdf()
value_time_range = primary_sdf.select(min("value_time"), max("value_time"))
value_time_range.show()

In [None]:
# Load the locations as a GeoDataFrame, report how many there are,
# and plot them as points on a tiled map.
locations_gdf = ev.locations.to_geopandas()
site_count = locations_gdf.index.size
print(f"Number of sites: {site_count}")
site_map = locations_gdf.hvplot.points(geo=True, tiles=True)
site_map.opts(width=800, height=400)

#### Location Attributes

In [None]:
# Load the location attributes table as a GeoDataFrame and display it.
location_attributes_table = ev.location_attributes
location_attributes_gdf = location_attributes_table.to_geopandas()
location_attributes_gdf

##### List the unique location attributes

In [None]:
# Distinct attribute names present in the location attributes table.
location_attributes_gdf["attribute_name"].unique()

##### The location attributes have been added to the `joined_timeseries` table

In [None]:
# List the fields available on the joined_timeseries table.
joined_timeseries_table = ev.joined_timeseries
joined_timeseries_table.fields()

#### Now we can make use of the location attributes in our metric calculations

##### Let's take a look at stream order

In [None]:
# Reproject the attributes GeoDataFrame in place to EPSG:4326 before mapping.
location_attributes_gdf.to_crs("EPSG:4326", inplace=True)

# Keep only the stream_order rows and color each point by its attribute value.
is_stream_order = location_attributes_gdf["attribute_name"] == "stream_order"
subset_gdf = location_attributes_gdf[is_stream_order]
stream_order_map = subset_gdf.hvplot.points(geo=True, tiles=True, c="value")
stream_order_map.opts(width=1200, height=600)

In [None]:
# Compute a set of deterministic metrics for every
# (configuration, stream order) group and collect them in metrics_df.
metric_group_fields = ["configuration_name", "stream_order"]
deterministic_metrics = [
    teehr.DeterministicMetrics.NashSutcliffeEfficiency(),
    teehr.DeterministicMetrics.KlingGuptaEfficiency(),
    teehr.DeterministicMetrics.RelativeBias(),
    teehr.DeterministicMetrics.AnnualPeakRelativeBias(),
    teehr.DeterministicMetrics.RootMeanSquareError(),
]
metrics_table = ev.metrics.query(
    group_by=metric_group_fields,
    include_metrics=deterministic_metrics,
    order_by=["configuration_name", "nash_sutcliffe_efficiency"],
)
metrics_df = metrics_table.to_pandas()
metrics_df

#### Let's look at the best performing model configuration, averaged across all stream orders

In [None]:
# Mean NSE and KGE per configuration, highest KGE first.
efficiency_columns = [
    "configuration_name",
    "nash_sutcliffe_efficiency",
    "kling_gupta_efficiency",
]
mean_efficiency = metrics_df[efficiency_columns].groupby(["configuration_name"]).mean()
mean_efficiency.sort_values(by="kling_gupta_efficiency", ascending=False)

In [None]:
# Mean bias and error metrics per configuration, lowest RMSE first.
error_columns = [
    "configuration_name",
    "relative_bias",
    "annual_peak_flow_bias",
    "root_mean_square_error",
]
mean_error = metrics_df[error_columns].groupby(["configuration_name"]).mean()
mean_error.sort_values(by="root_mean_square_error", ascending=True)

In [None]:
# Mean NSE and KGE per (configuration, stream order), highest KGE first.
stream_order_columns = [
    "configuration_name",
    "stream_order",
    "nash_sutcliffe_efficiency",
    "kling_gupta_efficiency",
]
stream_order_groups = metrics_df[stream_order_columns].groupby(
    ["configuration_name", "stream_order"]
)
stream_order_groups.mean().sort_values(by="kling_gupta_efficiency", ascending=False)

#### Event Detection

In [None]:
# Attach the Percentile Event Detection timeseries-aware calculated field to
# the joined timeseries (in-memory) and preview the first rows.
event_detection_field = teehr.TimeseriesAwareCalculatedFields.PercentileEventDetection()
joined_with_events = ev.joined_timeseries.add_calculated_fields(
    [event_detection_field]
)
sdf = joined_with_events.to_sdf()
sdf.show(5)

In [None]:
# Collect the event rows for one gage and one configuration into pandas.
# The USGS gage was selected because it had a high overall NSE.
is_selected_gage = sdf.primary_location_id == 'usgs-14301000'
is_event_row = sdf.event == 'true'
is_selected_configuration = sdf.configuration_name == 'nwm30_retrospective'
pdf = sdf.filter(
    is_selected_gage & is_event_row & is_selected_configuration
).toPandas()

In [None]:
# Plot the primary values of the selected event rows over time,
# colored by event id.
event_plot = pdf.hvplot.points(
    x='value_time',
    y='primary_value',
    color='event_id',
)
event_plot.opts(width=1200, height=600)

#### Calculate metrics for events only

In [None]:
# Compute Nash-Sutcliffe Efficiency for one gage using only the rows flagged
# as events by Percentile Event Detection.
#
# The DataFrame is stored first and displayed in a separate statement:
# DataFrame.show() returns None, so chaining it into the assignment would
# leave `metrics_sdf` bound to None instead of the query result.
metrics_sdf = (
    ev.metrics.add_calculated_fields([
        teehr.TimeseriesAwareCalculatedFields.PercentileEventDetection()
    ]).query(
        group_by=['configuration_name', 'primary_location_id'],
        filters=[
            "primary_location_id = 'usgs-10011500'",
            "event = true",
        ],
        include_metrics=[
            teehr.DeterministicMetrics.NashSutcliffeEfficiency()
        ]
    ).to_sdf()
)
metrics_sdf.show()

#### Calculate relative bias in event peaks

In [None]:
# Two-stage query for one gage, restricted to rows flagged as events:
#   1. take the maximum primary and secondary value within each group;
#   2. compute the relative bias between those maxima per configuration
#      and location.
#
# NOTE(review): stage 1 groups by 'water_year' and 'event'. Because the filter
# keeps only event = true, this gives one maximum per water year, not one per
# event. If per-event peaks are intended, grouping by 'event_id' may be what
# is wanted - confirm.
#
# The DataFrame is stored first and displayed in a separate statement:
# DataFrame.show() returns None, so chaining it into the assignment would
# leave `metrics_sdf` bound to None instead of the query result.
metrics_sdf = (
    ev.metrics.add_calculated_fields([
        teehr.TimeseriesAwareCalculatedFields.PercentileEventDetection()
    ]).query(
        group_by=['configuration_name', 'primary_location_id', 'water_year', 'event'],
        filters=[
            "primary_location_id = 'usgs-10011500'",
            "event = true",
        ],
        include_metrics=[
            teehr.SignatureMetrics.Maximum(
                input_field_names=['primary_value'],
                output_field_name='max_primary_value'
            ),
            teehr.SignatureMetrics.Maximum(
                input_field_names=['secondary_value'],
                output_field_name='max_secondary_value'
            )
        ]
    )
    .query(
        group_by=['configuration_name', 'primary_location_id'],
        include_metrics=[
            teehr.DeterministicMetrics.RelativeBias(
                input_field_names=['max_primary_value', 'max_secondary_value'],
                output_field_name='annual_max_relative_bias'
            )
        ]
    ).to_sdf()
)
metrics_sdf.show()

#### Representing metric uncertainty through bootstrapping

In [None]:
# TODO?

#### Cloning from S3

In [None]:
# Local directory for the cloned Evaluation, under the user's home directory.
# Any existing copy is removed first; errors during removal are ignored.
test_eval_dir = Path.home() / "temp" / "05_clone_from_s3"
shutil.rmtree(test_eval_dir, ignore_errors=True)

# Create the directory and open a new Evaluation in it.
ev = teehr.Evaluation(
    dir_path=test_eval_dir,
    create_dir=True,
)

In [None]:
# Show which evaluations are available in the S3 bucket.
available_s3_evaluations = ev.list_s3_evaluations()
available_s3_evaluations

In [None]:
# Clone the named example evaluation from the S3 bucket into the local
# Evaluation.
s3_evaluation_name = "e0_2_location_example"
ev.clone_from_s3(s3_evaluation_name)

In [None]:
# Map the locations of the cloned Evaluation.
cloned_locations_table = ev.locations
locations_gdf = cloned_locations_table.to_geopandas()
locations_gdf.teehr.locations_map()

In [None]:
# Load the cloned primary timeseries into pandas and preview the first rows.
primary_timeseries_table = ev.primary_timeseries
pt_df = primary_timeseries_table.to_pandas()
pt_df.head()

#### Reading from S3

In [None]:
from teehr.loading.s3.clone_from_s3 import list_s3_evaluations

# Show the "url" value of each evaluation listed in the S3 bucket.
s3_evaluation_urls = list_s3_evaluations()["url"]
s3_evaluation_urls.values

In [None]:
# Open an Evaluation directly at its S3 location instead of a local directory.
s3_evaluation_uri = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e0_2_location_example"
ev = teehr.Evaluation(s3_evaluation_uri)

In [None]:
# Map the locations read from the S3-backed Evaluation.
s3_locations_table = ev.locations
locations_gdf = s3_locations_table.to_geopandas()
locations_gdf.teehr.locations_map()