In [None]:
# 02 Spark vs. DuckDB

In [None]:
import duckdb
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import pandas_udf

from teehr.classes.duckdb_joined_parquet import DuckDBJoinedParquet

In [None]:
# Get (or create) a local session and stop it immediately -- presumably so
# that a session left over in this kernel cannot shadow the config below.
SparkSession.builder.master("local[*]").getOrCreate().stop()

# Session settings: S3A filesystem with anonymous credentials, plus
# Arrow-based transfer between Spark and pandas.
spark_settings = [
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
]
conf = SparkConf().setAppName('TEEHR').setAll(spark_settings)

# Start the Spark session used by every Spark cell in this notebook.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
from pandas_udfs_complete import (
    kling_gupta_efficiency,
    nash_sutcliffe_efficiency,
    relative_bias,
)

# Register each metric under its own name so the Spark SQL cells can call it.
spark.udf.register("nash_sutcliffe_efficiency", nash_sutcliffe_efficiency)
spark.udf.register("kling_gupta_efficiency", kling_gupta_efficiency)
spark.udf.register("relative_bias", relative_bias)

In [None]:
# Root directory of the first joined parquet dataset to benchmark.
# NOTE(review): JOINED_TABLE is reassigned further down for other datasets, so
# the cells that read it must be run in notebook order.
JOINED_TABLE = "/data/playground/mdenno/40-yr-retrospective/dataset/joined"

In [None]:
# Load every parquet file under JOINED_TABLE and expose it to Spark SQL as
# the temporary view `joined_temp`.
# NOTE(review): recursiveFileLookup=true disables Spark's partition inference,
# so hive-style `key=value` directory names are NOT added as columns. The
# queries presumably rely on those columns being stored in the files -- confirm.
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
# createOrReplaceTempView keeps this cell re-runnable; createTempView raises
# when `joined_temp` is already registered in the session.
joined.createOrReplaceTempView("joined_temp")

In [None]:
%%time
# Compute relative bias, KGE and NSE per (primary_location_id, configuration)
# by calling the registered Python UDFs as aggregates over `joined_temp`,
# then display the leading rows of the ordered result.
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , relative_bias(joined.primary_value, joined.secondary_value) as relative_bias
        , kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , nash_sutcliffe_efficiency(joined.primary_value, joined.secondary_value) as nash_sutcliffe_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
   --WHERE primary_location_id IN ('usgs-01010070', 'usgs-01105500')
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
# show() is the action that triggers execution, so %%time covers the full query.
sdf.show()

In [None]:
# Point the DuckDB-side reader at the same dataset via a recursive parquet glob.
joined_parquet_glob = f"{JOINED_TABLE}/**/*.parquet"
jp = DuckDBJoinedParquet(joined_parquet_glob)

In [None]:
%%time
df = jp.get_metrics(
    group_by=["primary_location_id", "configuration"],
    order_by=["primary_location_id", "configuration"],
    include_metrics=["relative_bias"]
)
df

In [None]:
%%time
qry = jp.get_metrics(
    group_by=["primary_location_id", "configuration"],
    order_by=["primary_location_id", "configuration"],
    include_metrics=["relative_bias"],
    return_query=True
)
print(qry)

In [None]:
%%time
# Relative bias using only built-in Spark SQL aggregates (no Python UDFs):
# sum(secondary_value - primary_value) / sum(primary_value) for each
# (primary_location_id, configuration) group of `joined_temp`.
sdf = spark.sql("""
WITH joined as (
            SELECT
        *
    FROM joined_temp jt
        )
        , metrics AS (
            SELECT
                joined.primary_location_id,joined.configuration
                , sum(secondary_value - primary_value) / sum(primary_value) AS relative_bias
            FROM
                joined
            GROUP BY
                joined.primary_location_id,joined.configuration
        )
        SELECT
            metrics.primary_location_id,metrics.configuration
            , relative_bias
        FROM metrics
        ORDER BY
            metrics.primary_location_id,metrics.configuration
    ;
""")
# show() is the action that triggers execution of the query.
sdf.show()

In [None]:
%%time
# NSE and KGE using only built-in Spark SQL aggregates (no Python UDFs).
# The `nse` CTE computes avg(primary_value) per (primary_location_id,
# configuration); it is joined back onto every row so the NSE denominator
# sum((primary_value - avg_primary_value)^2) can be evaluated in the same
# GROUP BY as the numerator. KGE is built from corr, stddev and avg ratios.
sdf = spark.sql("""
WITH joined as (
            SELECT
        *
    FROM joined_temp jt
        )
        ,nse AS (
            SELECT
                primary_location_id,configuration
                ,avg(primary_value) AS avg_primary_value
            FROM
                joined
            GROUP BY
                primary_location_id,configuration
        )
        , metrics AS (
            SELECT
                joined.primary_location_id,joined.configuration
                , 1 - (
            sum(pow(joined.primary_value - joined.secondary_value, 2))
            / sum(pow(joined.primary_value - nse.avg_primary_value, 2))
        ) as nash_sutcliffe_efficiency
                , 1 - sqrt(
            pow(corr(secondary_value, primary_value) - 1, 2)
            + pow(stddev(secondary_value) / stddev(primary_value) - 1, 2)
            + pow(avg(secondary_value) / avg(primary_value) - 1, 2)
        ) as kling_gupta_efficiency
            FROM
                joined
        INNER JOIN nse
        ON nse.primary_location_id = joined.primary_location_id AND nse.configuration = joined.configuration
            GROUP BY
                joined.primary_location_id,joined.configuration
        )
        SELECT
            metrics.primary_location_id,metrics.configuration
            , kling_gupta_efficiency,nash_sutcliffe_efficiency
        FROM metrics
        ORDER BY
            metrics.primary_location_id,metrics.configuration
    ;

""")
# show() is the action that triggers execution of the query.
sdf.show()

In [None]:
# Switch the benchmark to the p1 daily streamflow dataset. This overwrites
# JOINED_TABLE, so cells that read it use whichever assignment ran last.
JOINED_TABLE = "/data/protocols/p1_daily_streamflow_sim/teehr_database/joined"

In [None]:
# Load every parquet file under the current JOINED_TABLE and re-point the
# `joined_temp` view at it.
# NOTE(review): recursiveFileLookup=true disables Spark's partition inference,
# so hive-style `key=value` directory names are NOT added as columns -- confirm
# the needed columns are stored in the parquet files.
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
# A single createOrReplaceTempView replaces the drop-then-create pair, so a
# failure between the two steps cannot leave `joined_temp` missing.
joined.createOrReplaceTempView("joined_temp")

In [None]:
%%time
# Compute KGE and NSE per (primary_location_id, configuration) by calling the
# registered Python UDFs as aggregates over `joined_temp`, then display the
# leading rows of the ordered result.
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , nash_sutcliffe_efficiency(joined.primary_value, joined.secondary_value) as nash_sutcliffe_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
# show() is the action that triggers execution of the query.
sdf.show()

In [None]:
# Re-create the DuckDB-side reader for the current JOINED_TABLE dataset.
joined_parquet_glob = f"{JOINED_TABLE}/**/*.parquet"
jp = DuckDBJoinedParquet(joined_parquet_glob)

In [None]:
%%time
df = jp.get_metrics(
    group_by=["primary_location_id", "configuration"],
    order_by=["primary_location_id", "configuration"],
    include_metrics=["kling_gupta_efficiency", "nash_sutcliffe_efficiency"],
)
df

In [None]:
# Switch the benchmark to the p2 hourly streamflow dataset. This overwrites
# JOINED_TABLE, so cells that read it use whichever assignment ran last.
JOINED_TABLE = "/data/protocols/p2_hourly_streamflow_sim/teehr_database/joined"

In [None]:
# Load every parquet file under the current JOINED_TABLE and re-point the
# `joined_temp` view at it.
# NOTE(review): recursiveFileLookup=true disables Spark's partition inference,
# so hive-style `key=value` directory names are NOT added as columns -- confirm
# the needed columns are stored in the parquet files.
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
# A single createOrReplaceTempView replaces the drop-then-create pair, so a
# failure between the two steps cannot leave `joined_temp` missing.
joined.createOrReplaceTempView("joined_temp")

In [None]:
%%time
# Compute KGE and NSE per (primary_location_id, configuration) by calling the
# registered Python UDFs as aggregates over `joined_temp`, then display the
# leading rows of the ordered result.
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , nash_sutcliffe_efficiency(joined.primary_value, joined.secondary_value) as nash_sutcliffe_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
# show() is the action that triggers execution of the query.
sdf.show()

In [None]:
# Re-create the DuckDB-side reader for the current JOINED_TABLE dataset.
joined_parquet_glob = f"{JOINED_TABLE}/**/*.parquet"
jp = DuckDBJoinedParquet(joined_parquet_glob)

In [None]:
%%time
df = jp.get_metrics(
    group_by=["primary_location_id", "configuration"],
    order_by=["primary_location_id", "configuration"],
    include_metrics=["kling_gupta_efficiency", "nash_sutcliffe_efficiency"],
)
df

In [None]:
# Shut down the Spark session now that all benchmarks have run.
spark.stop()