# 03 - Queries
The TEEHR library has tools to join "observed" timeseries to "simulated" timeseries and generate statistical metrics while maintaining access to the source data for exploration.

In [1]:
import re
from pathlib import Path

import duckdb

import teehr.queries.duckdb as tqd

In [2]:
# Study data lives under <home>/shared/rti-eval/post-event-example.
CACHE_DIR = Path.home() / "shared" / "rti-eval"
STUDY_DIR = CACHE_DIR / "post-event-example"

# Timeseries inputs: glob patterns covering every parquet file in each folder.
USGS = STUDY_DIR / "timeseries" / "usgs" / "*.parquet"
MEDIUM_RANGE_MEM1 = STUDY_DIR / "timeseries" / "medium_range_mem1" / "*.parquet"

# Single-file inputs kept in the study's "geo" folder.
CROSSWALK = STUDY_DIR / "geo" / "usgs_nwm22_crosswalk.parquet"
GEOMETRY = STUDY_DIR / "geo" / "usgs_geometry.parquet"

In [3]:
# IPython help syntax: the leading "?" displays the signature and docstring of tqd.get_metrics.
?tqd.get_metrics

Signature:
tqd.get_metrics(
    primary_filepath: str,
    secondary_filepath: str,
    crosswalk_filepath: str,
    group_by: List[str],
    order_by: List[str],
    include_metrics: Union[List[teehr.models.queries.MetricEnum], ForwardRef('all')],
    filters: Optional[List[dic

In [None]:
%%time
query_df = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=MEDIUM_RANGE_MEM1,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id", "reference_time"],
    order_by=["primary_location_id"],
    include_metrics=["bias"],
    # filters=[]
    return_query=False,
    geometry_filepath=GEOMETRY,
    include_geometry=False,
)

In [None]:
%%time
import re
query_str = tqd.get_metrics(
    primary_filepath=USGS,
    secondary_filepath=MEDIUM_RANGE_MEM1,
    crosswalk_filepath=CROSSWALK,
    group_by=["primary_location_id"],
    order_by=["primary_location_id"],
    include_metrics=["bias"],
    # filters=[]
    return_query=True,
    geometry_filepath=GEOMETRY,
    include_geometry=False,
)
print(re.sub(r"\n+", "\n", query_str))

In [None]:
%%time
duckdb.query(f"""
WITH joined as (
            SELECT
                sf.reference_time
                , sf.value_time as value_time
                , sf.location_id as secondary_location_id
                , sf.value as secondary_value
                , sf.configuration
                , sf.measurement_unit
                , sf.variable_name
                , pf.value as primary_value
                , pf.location_id as primary_location_id
                , sf.value_time - sf.reference_time as lead_time
                , abs(pf.value - sf.value) as absolute_difference
            FROM read_parquet('/home/jovyan/shared/rti-eval/post-event-example/timeseries/medium_range_mem1/*.parquet', union_by_name=True) sf
            JOIN read_parquet('/home/jovyan/shared/rti-eval/post-event-example/geo/usgs_nwm22_crosswalk.parquet', union_by_name=True) cf
                on cf.secondary_location_id = sf.location_id
            JOIN read_parquet('/home/jovyan/shared/rti-eval/post-event-example/timeseries/usgs/*.parquet') pf
                on cf.primary_location_id = pf.location_id
                and sf.value_time = pf.value_time
                and sf.measurement_unit = pf.measurement_unit
                and sf.variable_name = pf.variable_name
            --no where clause
        )
        , metrics AS (
            SELECT
                joined.primary_location_id
                , sum(primary_value - secondary_value)/count(*) as bias
            FROM
                joined
            GROUP BY
                joined.primary_location_id
            ORDER BY
                joined.primary_location_id
        )
        SELECT
            metrics.*
        FROM metrics  
    ;
""").df()