# 03 - TEEHR Queries
The TEEHR library has tools to explore the cached timeseries data, join "observed" timeseries to "simulated" timeseries, and generate statistical metrics while maintaining access to the source data for exploration.

TEEHR currently has four main query functions:
* `Timeseries` This is the most basic query; it fetches timeseries based on user-defined criteria and returns them as a dataframe.
* `Timeseries Characteristics` This query returns the characteristics of a timeseries. 
* `Joined Timeseries` This query joins the primary ("observed") and secondary ("simulated") timeseries together and returns the result as a dataframe.
* `Metrics` This query utilizes the joined timeseries, groups the values according to user-defined criteria and generates metrics which are returned as a dataframe.

In [3]:
import duckdb
from pathlib import Path
import teehr.queries.duckdb as tqd
import pandas as pd
import geopandas as gpd
import hvplot.pandas

In [4]:
# Root of the shared cache and the example study directory inside it.
CACHE_DIR = Path.home() / "shared" / "rti-eval"
STUDY_DIR = CACHE_DIR / "post-event-example"

# Timeseries inputs: each is a glob pattern matching every parquet file
# in that dataset's directory.
USGS = STUDY_DIR / "timeseries" / "usgs" / "*.parquet"
MEDIUM_RANGE_MEM1 = STUDY_DIR / "timeseries" / "medium_range_mem1" / "*.parquet"
SHORT_RANGE = STUDY_DIR / "timeseries" / "short_range" / "*.parquet"

# Geo inputs: the location crosswalk file and the geometry file.
CROSSWALK = STUDY_DIR / "geo" / "usgs_nwm22_crosswalk.parquet"
GEOMETRY = STUDY_DIR / "geo" / "usgs_geometry.parquet"

## Timeseries

In [3]:
# Uncomment the next line to display the IPython help (docstring) for get_timeseries.
# ?tqd.get_timeseries

In [9]:
%%time
"""
tqd.get_timeseries(
    timeseries_filepath: str,
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
ts_df = tqd.get_timeseries(
    timeseries_filepath=SHORT_RANGE,
    order_by=["value_time"],
    filters=[
        {
            "column":  "location_id",
            "operator": "=",
            "value": "nwm22-8941685"
        },
        {
            "column":  "reference_time",
            "operator": "=",
            "value": "2023-01-02 17:00:00"
        }
    ]
)
ts_df

CPU times: user 176 ms, sys: 126 ms, total: 303 ms
Wall time: 281 ms


Unnamed: 0,reference_time,value_time,location_id,value,configuration,measurement_unit,variable_name
0,2023-01-02 17:00:00,2023-01-02 18:00:00,nwm22-8941685,1.47,short_range,m3/s,streamflow
1,2023-01-02 17:00:00,2023-01-02 19:00:00,nwm22-8941685,1.33,short_range,m3/s,streamflow
2,2023-01-02 17:00:00,2023-01-02 20:00:00,nwm22-8941685,1.16,short_range,m3/s,streamflow
3,2023-01-02 17:00:00,2023-01-02 21:00:00,nwm22-8941685,0.98,short_range,m3/s,streamflow
4,2023-01-02 17:00:00,2023-01-02 22:00:00,nwm22-8941685,0.8,short_range,m3/s,streamflow
5,2023-01-02 17:00:00,2023-01-02 23:00:00,nwm22-8941685,0.63,short_range,m3/s,streamflow
6,2023-01-02 17:00:00,2023-01-03 00:00:00,nwm22-8941685,0.49,short_range,m3/s,streamflow
7,2023-01-02 17:00:00,2023-01-03 01:00:00,nwm22-8941685,0.37,short_range,m3/s,streamflow
8,2023-01-02 17:00:00,2023-01-03 02:00:00,nwm22-8941685,0.28,short_range,m3/s,streamflow
9,2023-01-02 17:00:00,2023-01-03 03:00:00,nwm22-8941685,0.21,short_range,m3/s,streamflow


## Timeseries Characteristics

In [5]:
# Uncomment the next line to display the IPython help (docstring) for get_timeseries_chars.
# ?tqd.get_timeseries_chars

In [12]:
%%time
"""
tqd.get_timeseries_chars(
    timeseries_filepath: str,
    group_by: list[str],
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
ts_chars_df = tqd.get_timeseries_chars(
    timeseries_filepath=SHORT_RANGE,
    group_by=["location_id", "reference_time"],
    order_by=["location_id"],
    filters=[
        {
            "column":  "location_id",
            "operator": "in",
            "value": ["nwm22-8941685", "nwm22-7152082", "nwm22-7163988"]
        },
        {
            "column":  "reference_time",
            "operator": "=",
            "value": "2023-01-02 16:00:00"
        }
    ]
)
ts_chars_df.transpose()

CPU times: user 413 ms, sys: 306 ms, total: 719 ms
Wall time: 475 ms


Unnamed: 0,0,1,2
location_id,nwm22-7152082,nwm22-7163988,nwm22-8941685
reference_time,2023-01-02 16:00:00,2023-01-02 16:00:00,2023-01-02 16:00:00
count,18,18,18
min,1.4,156.099997,0.04
max,3.59,279.109994,1.51
average,2.587778,218.041662,0.477222
sum,46.579999,3924.749912,8.59
variance,0.549717,1771.759401,0.229987
max_value_time,2023-01-02 17:00:00,2023-01-02 17:00:00,2023-01-02 17:00:00


## Joined Timeseries

In [7]:
# Uncomment the next line to display the IPython help (docstring) for get_joined_timeseries.
# ?tqd.get_joined_timeseries

In [21]:
%%time
"""
tqd.get_joined_timeseries(
    primary_filepath: str,
    secondary_filepath: str,
    crosswalk_filepath: str,
    order_by: List[str],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
    geometry_filepath: Optional[str] = None,
    include_geometry: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
joined_df = tqd.get_joined_timeseries(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    order_by=["value_time"],
    filters=[
        {
            "column":  "primary_location_id",
            "operator": "=",
            "value": "usgs-07289000"
        },
        {
            "column":  "reference_time",
            "operator": "=",
            "value": "2023-01-14 16:00:00"
        }
    ]
)
joined_df

CPU times: user 627 ms, sys: 195 ms, total: 821 ms
Wall time: 373 ms


Unnamed: 0,reference_time,value_time,secondary_location_id,secondary_value,configuration,measurement_unit,variable_name,primary_value,primary_location_id,lead_time
0,2023-01-14 16:00:00,2023-01-14 17:00:00,nwm22-19266232,20484.869542,short_range,m3/s,streamflow,20473.080078,usgs-07289000,0 days 01:00:00
1,2023-01-14 16:00:00,2023-01-14 18:00:00,nwm22-19266232,20497.749542,short_range,m3/s,streamflow,20501.396484,usgs-07289000,0 days 02:00:00
2,2023-01-14 16:00:00,2023-01-14 19:00:00,nwm22-19266232,20517.899541,short_range,m3/s,streamflow,20586.347656,usgs-07289000,0 days 03:00:00
3,2023-01-14 16:00:00,2023-01-14 20:00:00,nwm22-19266232,20547.259541,short_range,m3/s,streamflow,20529.712891,usgs-07289000,0 days 04:00:00
4,2023-01-14 16:00:00,2023-01-14 21:00:00,nwm22-19266232,20587.16954,short_range,m3/s,streamflow,20558.03125,usgs-07289000,0 days 05:00:00
5,2023-01-14 16:00:00,2023-01-14 22:00:00,nwm22-19266232,20637.999539,short_range,m3/s,streamflow,20558.03125,usgs-07289000,0 days 06:00:00
6,2023-01-14 16:00:00,2023-01-15 00:00:00,nwm22-19266232,20767.959536,short_range,m3/s,streamflow,20558.03125,usgs-07289000,0 days 08:00:00
7,2023-01-14 16:00:00,2023-01-15 00:00:00,nwm22-19266232,20767.959536,short_range,m3/s,streamflow,20558.03125,usgs-07289000,0 days 08:00:00
8,2023-01-14 16:00:00,2023-01-15 01:00:00,nwm22-19266232,20842.689534,short_range,m3/s,streamflow,20529.712891,usgs-07289000,0 days 09:00:00
9,2023-01-14 16:00:00,2023-01-15 02:00:00,nwm22-19266232,20920.389532,short_range,m3/s,streamflow,20558.03125,usgs-07289000,0 days 10:00:00


In [22]:
# Overlay the primary ("observed") and secondary ("simulated") values of the
# joined timeseries against value_time. A title and explicit axis labels are
# set so the figure can be understood on its own when the notebook is skimmed.
joined_df.hvplot(
    x="value_time",
    y=["primary_value", "secondary_value"],
    xlabel="Value time",
    ylabel="Streamflow (m3/s)",
    title="Primary (observed) vs. secondary (simulated) streamflow",
)

## Timeseries Metrics

In [10]:
# Uncomment the next line to display the IPython help (docstring) for get_metrics.
# ?tqd.get_metrics

In [34]:
%%time
"""
tqd.get_metrics(
    primary_filepath: str,
    secondary_filepath: str,
    crosswalk_filepath: str,
    group_by: List[str],
    order_by: List[str],
    include_metrics: Union[List[teehr.models.queries.MetricEnum], ForwardRef('all')],
    filters: Optional[List[dict]] = None,
    return_query: bool = False,
    geometry_filepath: Optional[str] = None,
    include_geometry: bool = False,
) -> Union[str, pandas.core.frame.DataFrame, geopandas.geodataframe.GeoDataFrame]
"""
metrics_df = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id", "reference_time"],
    order_by=["primary_location_id"],
    include_metrics=["bias"],
    filters=[
        {
            "column":  "primary_location_id",
            "operator": "in",
            "value": ["usgs-10336676","usgs-05129290", "usgs-05129515"]
        },
        {
            "column":  "reference_time",
            "operator": "=",
            "value": "2023-01-02 16:00:00"
        },
    ],
    geometry_filepath=GEOMETRY,
    include_geometry=True
)
metrics_df.transpose()

CPU times: user 760 ms, sys: 186 ms, total: 946 ms
Wall time: 349 ms


Unnamed: 0,0,1,2
primary_location_id,usgs-05129290,usgs-05129515,usgs-10336676
reference_time,2023-01-02 16:00:00,2023-01-02 16:00:00,2023-01-02 16:00:00
bias,1.030491,50.142329,0.792218
geometry,POINT (-93.0748688 48.5243635),POINT (-93.4466667 48.5922222),POINT (-120.1576913 39.1321292)


## Want to know what query is being executed? Try: `return_query=True`

In [24]:
# Same kind of metrics request, but with return_query=True so that the query
# text is returned and can be printed for inspection.
query_filters = [
    {"column": "primary_location_id", "operator": "=", "value": "usgs-10336676"},
    {"column": "reference_time", "operator": "=", "value": "2023-01-02 16:00:00"},
]
qry = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=SHORT_RANGE,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id", "reference_time"],
    order_by=["primary_location_id"],
    include_metrics="all",
    filters=query_filters,
    return_query=True,
)
print(qry)

        WITH joined as (
            SELECT
                sf.reference_time
                , sf.value_time as value_time
                , sf.location_id as secondary_location_id
                , sf.value as secondary_value
                , sf.configuration
                , sf.measurement_unit
                , sf.variable_name
                , pf.value as primary_value
                , pf.location_id as primary_location_id
                , sf.value_time - sf.reference_time as lead_time
                , abs(pf.value - sf.value) as absolute_difference
            FROM read_parquet('/home/jovyan/shared/rti-eval/post-event-example/timeseries/short_range/*.parquet') sf
            JOIN read_parquet('/home/jovyan/shared/rti-eval/post-event-example/geo/usgs_nwm22_crosswalk.parquet') cf
                on cf.secondary_location_id = sf.location_id
            JOIN read_parquet('/home/jovyan/shared/rti-eval/post-event-example/timeseries/usgs/*.parquet') pf
                on cf.primar