# Query Approaches Experiment
Experiment to compare the performance of several different ways to query data and compute statistics from populations of forecast and observed data pairs from parquet files.  This includes duckdb, pandas and dask dataframes, as well as a hybrid approach.

In [1]:
%matplotlib inline

# adding project dirs to path so code may be referenced from the notebook
import sys
sys.path.insert(0, '../../evaluation')
sys.path.insert(0, '../../evaluation/queries')

# DuckDB
This approach is a straight DuckDB approach where timeseries are queried and metrics calculated in the SQL query.

In [2]:
%%capture
!pip install duckdb

In [3]:
import config
import queries
import duckdb

In [4]:
%%time
query = queries.calculate_catchment_metrics(
    config.MEDIUM_RANGE_FORCING_PARQUET,
    config.FORCING_ANALYSIS_ASSIM_PARQUET,
    group_by=["catchment_id"],
    order_by=["observed_average"],
    filters=[
        {
            "column": "catchment_id",
            "operator": "like",
            "value": "18%"
        },
        {
            "column": "reference_time",
            "operator": "=",
            "value": "2022-12-25 00:00:00"
        },
    ]
)
df = duckdb.query(query).to_df()
df

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

CPU times: user 13.9 s, sys: 38.6 s, total: 52.4 s
Wall time: 5.9 s


Unnamed: 0,catchment_id,intercept,covariance,corr,r_squared,forecast_count,observed_count,forecast_average,observed_average,forecast_variance,observed_variance,max_forecast_delta,bias
0,1809020107,0.000032,-1.157402e-11,-0.024768,0.000613,240,240,0.000032,5.115208e-07,2.735229e-08,7.983299e-12,0.001094,-0.000031
1,1809020401,0.000040,-4.399643e-11,-0.018183,0.000331,240,240,0.000040,1.767954e-06,3.032814e-08,1.930490e-10,0.000920,-0.000038
2,1809020105,0.000031,-7.116862e-11,-0.030987,0.000960,240,240,0.000030,2.620372e-06,2.479689e-08,2.127328e-10,0.000916,-0.000028
3,1809020104,0.000033,-7.525106e-11,-0.033564,0.001127,240,240,0.000031,3.003525e-06,2.547968e-08,1.972769e-10,0.000960,-0.000028
4,1809020307,0.000034,-4.283208e-11,-0.015387,0.000237,240,240,0.000033,3.506734e-06,1.670182e-08,4.639595e-10,0.000737,-0.000030
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,1804001003,0.000172,1.500176e-07,0.543907,0.295835,240,240,0.000338,3.888092e-04,2.168355e-07,3.508359e-07,-0.000263,0.000050
1005,1802012804,0.000215,1.675463e-07,0.490106,0.240204,240,240,0.000356,3.946540e-04,2.491197e-07,4.691175e-07,-0.002140,0.000039
1006,1804001201,0.000186,1.569458e-07,0.524898,0.275518,240,240,0.000352,3.968691e-04,2.371285e-07,3.770207e-07,-0.000212,0.000045
1007,1804000904,0.000172,1.794513e-07,0.592151,0.350643,240,240,0.000366,3.991244e-04,2.488538e-07,3.690485e-07,-0.000183,0.000033


# Pandas
Using this approach we open the parquet files using pandas and calculate the metrics using pandas groupby and aggregate functionality. We only calculate two simple metrics because even with just those the performance was not very good. More metrics would only make it worse.

In [5]:
import pandas as pd

In [6]:
%%time
# load forecast data
df_forecast = pd.read_parquet(config.MEDIUM_RANGE_FORCING_PARQUET)
df_forecast = df_forecast[
        (df_forecast["catchment_id"].str.startswith("18")) & 
        (df_forecast["reference_time"] == pd.Timestamp(2022,12,25,0,0,0))
     ]

# load obersved data
df_observed = pd.read_parquet(config.FORCING_ANALYSIS_ASSIM_PARQUET)
df_observed = df_observed[
        (df_observed["catchment_id"].str.startswith("18"))
     ]

# join forecast and observed
df_joined = pd.merge(
    df_forecast,
    df_observed,
    on=["catchment_id","value_time"],
    suffixes=["_forecast","_observed"],
    how="inner"
)[["catchment_id","reference_time","value_time","value_forecast", "value_observed"]]

# groupby and aggregate
df_joined.groupby("catchment_id")[["value_forecast","value_observed"]].agg(
        average_forecast = ("value_forecast", "mean"),
        average_observed = ("value_observed", "mean")
    )

CPU times: user 1min 10s, sys: 1min 22s, total: 2min 32s
Wall time: 39.8 s


Unnamed: 0_level_0,average_forecast,average_observed
catchment_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0101000201,,
0101000202,,
0101000203,,
0101000204,,
0101000205,,
...,...,...
1810020412,0.000006,0.000015
1810020413,0.000005,0.000014
1810020414,0.000003,0.000007
1810020415,0.000008,0.000018


# Dask
This approach is very similar to the Pandas approach but uses a dask dataframe.  Performance is slightly better, but not much.

In [7]:
# Start a local Dask cluster with default settings and connect a client to it.
# NOTE(review): the dask dataframe cell below presumably runs on this cluster
# because the Client registers itself as the default scheduler - confirm
# against the dask.distributed docs.
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()
client = Client(cluster)
# Display the cluster summary (workers, threads, memory, dashboard link).
cluster

0,1
Dashboard: /user/mgdenno/proxy/8787/status,Workers: 4
Total threads: 16,Total memory: 58.87 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33821,Workers: 4
Dashboard: /user/mgdenno/proxy/8787/status,Total threads: 16
Started: Just now,Total memory: 58.87 GiB

0,1
Comm: tcp://127.0.0.1:43057,Total threads: 4
Dashboard: /user/mgdenno/proxy/45731/status,Memory: 14.72 GiB
Nanny: tcp://127.0.0.1:33171,
Local directory: /tmp/dask-worker-space/worker-p9_oi41l,Local directory: /tmp/dask-worker-space/worker-p9_oi41l

0,1
Comm: tcp://127.0.0.1:39943,Total threads: 4
Dashboard: /user/mgdenno/proxy/38429/status,Memory: 14.72 GiB
Nanny: tcp://127.0.0.1:44843,
Local directory: /tmp/dask-worker-space/worker-3npu6mv0,Local directory: /tmp/dask-worker-space/worker-3npu6mv0

0,1
Comm: tcp://127.0.0.1:38847,Total threads: 4
Dashboard: /user/mgdenno/proxy/44033/status,Memory: 14.72 GiB
Nanny: tcp://127.0.0.1:44097,
Local directory: /tmp/dask-worker-space/worker-3014nqdg,Local directory: /tmp/dask-worker-space/worker-3014nqdg

0,1
Comm: tcp://127.0.0.1:37573,Total threads: 4
Dashboard: /user/mgdenno/proxy/46549/status,Memory: 14.72 GiB
Nanny: tcp://127.0.0.1:44183,
Local directory: /tmp/dask-worker-space/worker-ssr7a8_c,Local directory: /tmp/dask-worker-space/worker-ssr7a8_c


In [8]:
import dask.dataframe as dd

In [9]:
%%time
# load forecast data
ddf_forecast = dd.read_parquet(config.MEDIUM_RANGE_FORCING_PARQUET)
ddf_forecast = ddf_forecast[
        (ddf_forecast["catchment_id"].str.startswith("18")) & 
        (ddf_forecast["reference_time"] == pd.Timestamp(2022,12,25,0,0,0))
     ]

# load obersved data
ddf_observed = dd.read_parquet(config.FORCING_ANALYSIS_ASSIM_PARQUET)
ddf_observed = ddf_observed[
        (ddf_observed["catchment_id"].str.startswith("18"))
     ]

# join forecast and observed
ddf_joined = dd.merge(
    ddf_forecast,
    ddf_observed,
    on=["catchment_id","value_time"],
    suffixes=["_forecast","_observed"],
    how="inner"
)

# groupby and aggregate
ddf_joined.groupby("catchment_id")[["value_forecast","value_observed"]].agg(
        average_forecast = ("value_forecast", "mean"),
        average_observed = ("value_observed", "mean")
    ).compute()

CPU times: user 10.2 s, sys: 1.39 s, total: 11.5 s
Wall time: 33 s


Unnamed: 0_level_0,average_forecast,average_observed
catchment_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1810020304,0.000050,0.000064
1809020321,0.000040,0.000012
1807010202,0.000073,0.000052
1802000214,0.000055,0.000039
1804001308,0.000148,0.000164
...,...,...
1810020203,0.000009,0.000011
1810020303,0.000032,0.000032
1810020307,0.000008,0.000017
1810020402,0.000009,0.000009


# Hybrid
The hybrid approach uses DuckDB to query out timeseries pairs and then uses pandas to calculate some statistics. This approach is likely good for smaller datasets, such as forecasts at a single location, where you want to calculate non-standard metrics that are difficult to calculate in SQL.

In [14]:
%%time
query = queries.get_joined_catchment_timeseries(
    config.MEDIUM_RANGE_FORCING_PARQUET,
    config.FORCING_ANALYSIS_ASSIM_PARQUET,
    filters=[
        {
            "column": "catchment_id",
            "operator": "==",
            "value": "1801010101"
        }
    ]
)
df = duckdb.query(query).to_df()
df.groupby(["catchment_id","lead_time"])[["forecast_value","observed_value"]].agg(
        average_forecast = ("forecast_value", "mean"),
        average_observed = ("observed_value", "mean")
    )

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

CPU times: user 38.6 s, sys: 1.45 s, total: 40.1 s
Wall time: 2.84 s


Unnamed: 0_level_0,Unnamed: 1_level_0,average_forecast,average_observed
catchment_id,lead_time,Unnamed: 2_level_1,Unnamed: 3_level_1
1801010101,0 days 01:00:00,0.000207,0.000295
1801010101,0 days 02:00:00,0.000234,0.000313
1801010101,0 days 03:00:00,0.000223,0.000335
1801010101,0 days 04:00:00,0.000226,0.000303
1801010101,0 days 05:00:00,0.000240,0.000290
1801010101,...,...,...
1801010101,9 days 20:00:00,0.000231,0.000252
1801010101,9 days 21:00:00,0.000231,0.000280
1801010101,9 days 22:00:00,0.000221,0.000250
1801010101,9 days 23:00:00,0.000221,0.000232


In [18]:
%%time
query = queries.calculate_catchment_metrics(
    config.MEDIUM_RANGE_FORCING_PARQUET,
    config.FORCING_ANALYSIS_ASSIM_PARQUET,
    group_by=["catchment_id","lead_time"],
    order_by=["catchment_id","lead_time"],
    filters=[
         {
            "column": "catchment_id",
            "operator": "==",
            "value": "1801010101"
        }
    ]
)
df = duckdb.query(query).to_df()
df[["catchment_id","lead_time","forecast_average","observed_average"]]

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

CPU times: user 36.2 s, sys: 1.31 s, total: 37.5 s
Wall time: 2.57 s


Unnamed: 0,catchment_id,lead_time,forecast_average,observed_average
0,1801010101,0 days 01:00:00,0.000207,0.000295
1,1801010101,0 days 02:00:00,0.000234,0.000313
2,1801010101,0 days 03:00:00,0.000223,0.000335
3,1801010101,0 days 04:00:00,0.000226,0.000303
4,1801010101,0 days 05:00:00,0.000240,0.000290
...,...,...,...,...
235,1801010101,9 days 20:00:00,0.000231,0.000252
236,1801010101,9 days 21:00:00,0.000231,0.000280
237,1801010101,9 days 22:00:00,0.000221,0.000250
238,1801010101,9 days 23:00:00,0.000221,0.000232


# Conclusion
DuckDB seems to be the fastest way to query and compute metrics and statistics across large populations of data. For smaller datasets, say just a few locations, pulling the timeseries out and working in Pandas can work, and has the benefit of giving you the power of Pandas to resample, slice and dice the data in ways that may be difficult in DuckDB.