In [None]:
%matplotlib inline

# adding project dirs to path so code may be referenced from the notebook
import sys
# Put the project directories at the front of the import search path.
# Any copy left by a previous run of this cell is removed first, so running
# the cell repeatedly does not pile up duplicate entries. The resulting order
# matches two plain insert(0, ...) calls: 'queries' ends up ahead of
# 'evaluation'.
for _project_dir in ('../../evaluation', '../../evaluation/queries'):
    while _project_dir in sys.path:
        sys.path.remove(_project_dir)
    sys.path.insert(0, _project_dir)

In [None]:
import os
from pathlib import Path

# Root of the on-disk cache. This one stays a Path object; every constant
# derived from it below is a plain string.
CACHE_DIR = Path("/home", "jovyan", "cache")

# --- per-source cache directories -------------------------------------------
NWM_CACHE_DIR = str(CACHE_DIR / "nwm")
USGS_CACHE_DIR = str(CACHE_DIR / "usgs")
GEO_CACHE_DIR = str(CACHE_DIR / "geo")

# --- NWM client cache file ---------------------------------------------------
NWM_CACHE_H5 = str(Path(NWM_CACHE_DIR) / "gcp_client.h5")

# --- parquet datasets ---------------------------------------------------------
PARQUET_CACHE_DIR = str(CACHE_DIR / "parquet")
MEDIUM_RANGE_FORCING_PARQUET = str(Path(PARQUET_CACHE_DIR) / "forcing_medium_range")
FORCING_ANALYSIS_ASSIM_PARQUET = str(Path(PARQUET_CACHE_DIR) / "forcing_analysis_assim")
MEDIUM_RANGE_1_PARQUET = str(Path(PARQUET_CACHE_DIR) / "medium_range_mem1")
USGS_PARQUET = str(Path(PARQUET_CACHE_DIR) / "usgs")

# --- HUC10 geometry files and derived weights --------------------------------
HUC10_SHP_FILEPATH = str(Path(GEO_CACHE_DIR) / "wbdhu10_conus.shp")
HUC10_PARQUET_FILEPATH = str(Path(GEO_CACHE_DIR) / "wbdhu10_conus.parquet")
HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = str(Path(GEO_CACHE_DIR) / "wbdhu10_medium_range_weights.pkl")

# --- route link files ---------------------------------------------------------
ROUTE_LINK_FILE = str(Path(NWM_CACHE_DIR) / "RouteLink_CONUS.nc")
ROUTE_LINK_PARQUET = str(Path(NWM_CACHE_DIR) / "route_link_conus.parquet")

In [None]:
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster with default settings and connect a client to
# it; work submitted by later cells goes through this client.
cluster = LocalCluster()
client = Client(address=cluster)

# Leave the cluster as the cell's last expression so its summary is displayed.
cluster

In [None]:
import dask.dataframe as dd
import pandas as pd

In [None]:
# Open the forecast and observed forcing parquet datasets as Dask dataframes
# (forecast first, observed second).
ddf_forecast, ddf_observed = map(
    dd.read_parquet,
    (MEDIUM_RANGE_FORCING_PARQUET, FORCING_ANALYSIS_ASSIM_PARQUET),
)

In [None]:
# Join forecast rows to observed rows on catchment and value time (default
# join type). Non-key columns present in both inputs are disambiguated with
# the "_forecast" / "_observed" suffixes.
# NOTE(review): a later cell filters on an unsuffixed "reference_time" column,
# which presumably exists only in the forecast frame -- confirm against the
# parquet schemas.
ddf_joined = ddf_forecast.merge(
    ddf_observed,
    on=["catchment_id", "value_time"],
    suffixes=["_forecast", "_observed"],
)

# Show the (lazy) joined frame's structure.
ddf_joined

In [None]:
%%time
# Keep rows whose catchment_id starts with "18" and whose reference_time is
# later than 2023-01-01, then count the non-null values in each
# (catchment_id, reference_time) group.
id_starts_with_18 = ddf_joined["catchment_id"].str.startswith("18")
after_2023_start = ddf_joined["reference_time"] > pd.Timestamp(2023, 1, 1)
ddf = ddf_joined[id_starts_with_18 & after_2023_start]

# .compute() triggers the actual Dask execution; the result is displayed.
ddf.groupby(["catchment_id", "reference_time"]).count().compute()

In [None]:
%%capture
!pip install duckdb

In [None]:
import duckdb

In [None]:
%%time
# Select every column from the medium-range forcing parquet files whose
# names start with "2023", keeping rows whose catchment_id begins with "18".
query = f"""SELECT * from '{MEDIUM_RANGE_FORCING_PARQUET}/2023*.parquet' WHERE catchment_id LIKE '18%'"""

# Run the statement with DuckDB and materialize the result as a pandas
# DataFrame, which is then displayed.
relation = duckdb.query(query)
df = relation.to_df()
df