In [None]:
from pathlib import Path

import duckdb
import xarray as xr
from dask.distributed import Client, LocalCluster
from dask_gateway import Gateway
import dask
import numpy as np
import pandas as pd

import teehr.queries.duckdb as tqd

from const import LOCAL_ZARR_JOINED_NWM20_FILEPATH, LOCAL_ZARR_JOINED_NWM21_FILEPATH, LOCAL_JOINED_NWM20_FILEPATH

from zarr_metric_funcs import get_zarr_metrics  # r_squared, relative_bias, root_mean_squared_error, kling_gupta_efficiency

Goal: Compare the performance of pulling data from Zarr (with xarray) versus Parquet (with DuckDB) and calculating simple metrics for a **single configuration**.

Note: Combining multiple configurations into a single Zarr store
is challenging when the number of `location_id` values differs between configurations.



In [None]:
# Start a Dask cluster on this machine using LocalCluster's default settings.
cluster = LocalCluster()

# Attach a distributed client to that cluster; presumably the dask-backed
# xarray work in later cells is scheduled through it (confirm if the
# scheduler is configured elsewhere).
client = Client(address=cluster)

# Leave the client as the cell's last expression so the notebook renders it.
client

In [None]:
%%time
# Compute metrics from each joined Zarr store, one configuration at a time,
# then stack the per-configuration results into a single frame.
config_stores = (
    ("nwm20_retrospective", LOCAL_ZARR_JOINED_NWM20_FILEPATH),
    ("nwm21_retrospective", LOCAL_ZARR_JOINED_NWM21_FILEPATH),
)

per_config_frames = []
for config_name, store_path in config_stores:
    # Open the store and split the streamflow variable into its primary and
    # secondary series along the `timeseries_name` dimension.
    zarr_ds = xr.open_zarr(store_path)
    primary_da = zarr_ds.streamflow.sel(timeseries_name="primary_value")
    secondary_da = zarr_ds.streamflow.sel(timeseries_name="secondary_value")
    per_config_frames.append(
        get_zarr_metrics(primary_da, secondary_da, configuration=config_name)
    )

# Keep the per-configuration frames available under their own names.
df_nwm20, df_nwm21 = per_config_frames

df = pd.concat([df_nwm20, df_nwm21])

# Display-only: sort_values returns a new frame, so `df` itself remains in
# concatenation order.
# NOTE(review): if the stored/saved frame is meant to be sorted, the result
# needs to be assigned back to `df` - confirm intent.
df.sort_values(["primary_location_id", "configuration"])

In [None]:
# Write the combined metrics frame to a Parquet file at a fixed results path.
results_parquet_path = "/data/benchmarks/teehr-benchmark-202404/results/zarr_local_joined_results.parquet"
df.to_parquet(results_parquet_path)

In [None]:
# Show the concatenated metrics frame as this cell's output.
df