In [None]:
from dask.distributed import Client, LocalCluster
from dask_gateway import Gateway
import dask.dataframe as dd
import dask
import numpy as np
import pandas as pd

from const import LOCAL_JOINED_FILEPATH
from dask_metric_funcs import root_mean_squared_error, relative_bias, r_squared, kling_gupta_efficiency

# Columns whose unique combinations define the groups that metrics are computed over.
GROUPBY_FIELDS = [
    "primary_location_id",
    "configuration",
]

**Local cluster**

In [None]:
# Create a LocalCluster with its default settings and connect a Client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression so the notebook displays the client as the cell output.
client

**Gateway cluster**

In [None]:
# Create the Dask Gateway handle used below to configure and launch a cluster.
# NOTE(review): no address/auth is passed here, so these presumably come from
# the dask-gateway configuration of the environment — confirm.
gateway = Gateway()

In [None]:
# Fetch the cluster options object from the gateway and override the worker sizing.
options = gateway.cluster_options()
options.worker_cores = 1
# NOTE(review): the unit of worker_memory is defined by the gateway's option
# schema (presumably GiB) — confirm.
options.worker_memory = 4
# options  # should show interactive widget to select cores, etc

# Launch a new cluster with those options and get a client connected to it.
cluster = gateway.new_cluster(options)
client = cluster.get_client()
# Bare last expression so the notebook displays the client as the cell output.
client

In [None]:
# Ask the cluster to scale to 8 workers.
cluster.scale(8)

In [None]:
# Tear down in dependency order: disconnect the client first, then close the
# cluster. Closing the cluster while the client is still connected leaves the
# client attached to a scheduler that has already gone away.
client.close()
cluster.close()

NOTE: Using a distributed cluster did not improve performance (it may actually have been slower), probably because the dataset is not large enough to offset the overhead of distributing the work.

**Calculate metrics**

In [None]:
# Open the joined parquet dataset as a Dask DataFrame, reading only the columns
# listed below.
# Input path options: S3_JOINED_FILEPATH, LOCAL_JOINED_FILEPATH
ddf = dd.read_parquet(
    LOCAL_JOINED_FILEPATH,
    columns=[
        "primary_value",
        "secondary_value",
        "primary_location_id",
        "configuration",
        "absolute_difference",
    ],
)

In [None]:
# Group by the GROUPBY_FIELDS columns and keep only the two value columns for
# the metric calculations. The groupby is built directly on `ddf`: a Dask
# DataFrame copy is only a shallow copy of the task graph, and building a
# groupby does not modify `ddf`, so the intermediate `ddf.copy()` was redundant.
grouped = ddf.groupby(GROUPBY_FIELDS)[["primary_value", "secondary_value"]]

In [None]:
%%time
rmse_srs = grouped.apply(root_mean_squared_error, meta=pd.Series(dtype=np.float32, name='root_mean_squared_error')).compute()
bias_srs = grouped.apply(relative_bias, meta=pd.Series(dtype=np.float32, name='relative_bias')).compute()
rsquared_srs = grouped.apply(r_squared, meta=pd.Series(dtype=np.float32, name='r_squared')).compute()
kge_srs = grouped.apply(kling_gupta_efficiency, meta=pd.Series(dtype=np.float32, name='kling_gupta_efficiency')).compute()

In [None]:
# Place the four metric Series side by side as columns (aligned on their
# index), then move that index back into ordinary columns.
metrics_df = pd.concat(
    [rmse_srs, bias_srs, rsquared_srs, kge_srs],
    axis="columns",
).reset_index()
# Bare last expression so the notebook displays the resulting table.
metrics_df

In [None]:
# Write the metrics table to the benchmark results location as parquet.
RESULTS_FILEPATH = "/data/benchmarks/teehr-benchmark-202404/results/dask_local_joined_results.parquet"
metrics_df.to_parquet(RESULTS_FILEPATH)

In [None]:
# %%time
# rmse_srs = root_mean_squared_error(ddf, groupby_fields=GROUPBY_FIELDS)
# bias_srs = relative_bias(ddf, groupby_fields=GROUPBY_FIELDS)
# r2_srs = r_squared(ddf, groupby_fields=GROUPBY_FIELDS)
# kge_srs = kling_gupta_efficiency(ddf, groupby_fields=GROUPBY_FIELDS)

# metrics_df = pd.concat([rmse_srs, bias_srs, r2_srs, kge_srs], axis=1)
# metrics_df.reset_index(inplace=True)
# metrics_df