# 03 - TEEHR Queries
The TEEHR library has tools to explore the cached timeseries data, join "observed" timeseries to "simulated" timeseries, and generate statistical metrics while maintaining access to the source data for exploration.

TEEHR currently has four main query functions:
* `Timeseries` This is the most basic query; it fetches timeseries based on user-defined criteria and returns them as a dataframe.
* `Timeseries Characteristics` This query returns the characteristics of a timeseries. 
* `Joined Timeseries` This query joins the primary ("observed") and secondary ("simulated") timeseries together based on location and time and returns the result as a dataframe.
* `Metrics` This query utilizes the joined timeseries, groups the values according to user-defined criteria and generates metrics, which are returned as a dataframe.

In [None]:
import duckdb
from pathlib import Path
import teehr.queries.duckdb as tqd
import pandas as pd
import geopandas as gpd
import hvplot.pandas

In [None]:
# Root of the shared cache, located under the current user's home directory.
CACHE_DIR = Path.home() / "shared" / "rti-eval"
# Every input used in this notebook lives under this one study directory.
STUDY_DIR = CACHE_DIR / "post-event-example"

# Wildcard paths covering the parquet files of each timeseries source.
USGS = STUDY_DIR / "timeseries/usgs/*.parquet"
MEDIUM_RANGE_MEM1 = STUDY_DIR / "timeseries/medium_range_mem1/*.parquet"
SHORT_RANGE = STUDY_DIR / "timeseries/short_range/*.parquet"

# Single-file geospatial inputs: the location crosswalk and the geometry.
CROSSWALK = STUDY_DIR / "geo/usgs_nwm22_crosswalk.parquet"
GEOMETRY = STUDY_DIR / "geo/usgs_geometry.parquet"

## Timeseries

In [None]:
# ?tqd.get_timeseries

In [None]:
"""
tqd.get_timeseries(
    timeseries_filepath: str,
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
# Each filter is a column/operator/value dict; here the query is narrowed to
# a single location and a single reference time.
ts_filters = [
    {"column": "location_id", "operator": "=", "value": "nwm22-8941685"},
    {
        "column": "reference_time",
        "operator": "=",
        "value": "2023-01-02 16:00:00",
    },
]

ts_df = tqd.get_timeseries(
    timeseries_filepath=SHORT_RANGE,
    order_by=["value_time"],
    filters=ts_filters,
)
# Leave the dataframe as the final expression so the notebook renders it.
ts_df

## Timeseries Characteristics

In [None]:
# ?tqd.get_timeseries_chars

In [None]:
"""
tqd.get_timeseries_chars(
    timeseries_filepath: str,
    group_by: list[str],
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]

usgs-05129290 -> nwm22-7152082
usgs-05129515 -> nwm22-7163988
"""
# Select three locations (via the "in" operator) at one reference time.
ts_chars_filters = [
    {
        "column": "location_id",
        "operator": "in",
        "value": ["nwm22-8941685", "nwm22-7152082", "nwm22-7163988"],
    },
    {
        "column": "reference_time",
        "operator": "=",
        "value": "2023-01-02 16:00:00",
    },
]

ts_chars_df = tqd.get_timeseries_chars(
    timeseries_filepath=SHORT_RANGE,
    group_by=["location_id", "reference_time"],
    order_by=["location_id"],
    filters=ts_chars_filters,
)
# Transpose the result for display; it stays the final expression so the
# notebook renders it.
ts_chars_df.transpose()

## Joined Timeseries

In [None]:
# ?tqd.get_joined_timeseries

In [None]:
"""
tqd.get_joined_timeseries(
    primary_filepath: str,
    secondary_filepath: str,
    crosswalk_filepath: str,
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
    geometry_filepath: Optional[str] = None,
    include_geometry: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
# Narrow the join to one primary location and one reference time.
joined_filters = [
    {
        "column": "primary_location_id",
        "operator": "=",
        "value": "usgs-10336676",
    },
    {
        "column": "reference_time",
        "operator": "=",
        "value": "2023-01-02 16:00:00",
    },
]

# USGS is passed as the primary timeseries and SHORT_RANGE as the secondary;
# CROSSWALK is supplied as the crosswalk file.
joined_df = tqd.get_joined_timeseries(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    order_by=["value_time"],
    filters=joined_filters,
)
# Leave the dataframe as the final expression so the notebook renders it.
joined_df

In [None]:
# Plot the primary and secondary values against value_time.
joined_df.hvplot(
    x="value_time",
    y=["primary_value", "secondary_value"],
)

## Timeseries Metrics

In [None]:
# Uncomment to show the IPython help for get_metrics. It is commented out,
# like the other help lookups in this notebook, so a full run does not open
# the help pager.
# ?tqd.get_metrics

In [None]:
"""
tqd.get_metrics(
    primary_filepath: str,
    secondary_filepath: str,
    crosswalk_filepath: str,
    group_by: List[str],
    order_by: List[str],
    include_metrics: Union[List[teehr.models.queries.MetricEnum], ForwardRef('all')],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
    geometry_filepath: Optional[str] = None,
    include_geometry: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
# Select three primary locations (via the "in" operator) at one reference time.
metrics_filters = [
    {
        "column": "primary_location_id",
        "operator": "in",
        "value": ["usgs-10336676", "usgs-05129290", "usgs-05129515"],
    },
    {
        "column": "reference_time",
        "operator": "=",
        "value": "2023-01-02 16:00:00",
    },
]

# include_metrics="all" requests every available metric rather than a named
# subset.
metrics_df = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id", "reference_time"],
    order_by=["primary_location_id"],
    include_metrics="all",
    filters=metrics_filters,
)
# Transpose the result for display; it stays the final expression so the
# notebook renders it.
metrics_df.transpose()

## Want to know what query is being executed? Try: `return_query=True`

In [None]:
# Narrow the query to one primary location and one reference time.
query_filters = [
    {
        "column": "primary_location_id",
        "operator": "=",
        "value": "usgs-10336676",
    },
    {
        "column": "reference_time",
        "operator": "=",
        "value": "2023-01-02 16:00:00",
    },
]

# return_query=True is used here to get the query being executed, which is
# then printed for inspection.
qry = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id", "reference_time"],
    order_by=["primary_location_id"],
    include_metrics=["bias", "nash_sutcliffe_efficiency"],
    filters=query_filters,
    return_query=True,
)
print(qry)