In [None]:
!pip install numba

In [None]:
!pip install arch

In [None]:
import duckdb
import numpy as np
import pandas as pd
from arch.bootstrap import StationaryBootstrap, CircularBlockBootstrap
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

In [None]:
# Let's pull out a single timeseries from the joined parquets

In [None]:
def kling_gupta_efficiency(p: pd.Series, s: pd.Series) -> float:
    """Compute the Kling-Gupta efficiency (KGE) of ``s`` measured against ``p``.

    The score is ``1 - sqrt((r - 1)^2 + (alpha - 1)^2 + (beta - 1)^2)`` where
    ``r`` is the Pearson correlation of the two series, ``alpha`` is
    ``std(s) / std(p)`` and ``beta`` is ``mean(s) / mean(p)``.

    Parameters
    ----------
    p : pd.Series
        Reference series; its standard deviation and mean are the denominators.
    s : pd.Series
        Series being scored against ``p``.

    Returns
    -------
    float
        The KGE value, or ``np.nan`` when either series is empty, when either
        standard deviation is zero, or when the mean of ``p`` is zero.
    """
    # Both series must be non-empty; the statistics below are undefined otherwise.
    if len(p) == 0 or len(s) == 0:
        return np.nan

    std_p = np.std(p)
    mean_p = np.mean(p)
    std_s = np.std(s)

    # std_p and mean_p are used as divisors below, and a zero standard
    # deviation on either side makes the correlation undefined.
    if std_p == 0 or mean_p == 0 or std_s == 0:
        return np.nan

    # Pearson correlation coefficient
    linear_correlation = np.corrcoef(s, p)[0, 1]

    # Relative variability
    relative_variability = std_s / std_p

    # Relative mean
    relative_mean = np.mean(s) / mean_p

    # Euclidean distance from the ideal point (1, 1, 1); all three terms carry
    # a unit scaling factor.
    euclidean_distance = np.sqrt(
        (linear_correlation - 1.0) ** 2.0
        + (relative_variability - 1.0) ** 2.0
        + (relative_mean - 1.0) ** 2.0
    )

    return 1.0 - euclidean_distance

In [None]:
# Get (or create) a local session and stop it immediately -- presumably so that a
# session left over from an earlier run does not mask the configuration below
# (TODO confirm).
SparkSession.builder.master("local[*]").getOrCreate().stop()

# Build the configuration one setting at a time.
conf = SparkConf()
conf.setAppName('TEEHR')
# S3A filesystem implementation with the anonymous credentials provider.
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
# Turn on the Arrow setting for PySpark execution.
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Start the Spark session with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Uncomment to inspect the effective Spark configuration:
# spark.sparkContext.getConf().getAll()

In [None]:
from pyspark.sql.types import ArrayType, FloatType, StringType, MapType, StructType, StructField

In [None]:
@pandas_udf(MapType(StringType(), FloatType()))
def bs_kling_gupta_efficiency(p: pd.Series, s: pd.Series) -> float:
    """Bootstrap the KGE of ``s`` against ``p`` and summarise it by quantiles.

    Draws 1000 circular-block bootstrap replications (block length 365, fixed
    seed 1234) of the paired series, evaluates ``kling_gupta_efficiency`` on
    each one, and returns the 5%, 50% and 95% quantiles of the scores as a
    mapping keyed ``"KGE_0.05"``, ``"KGE_0.5"`` and ``"KGE_0.95"``.

    NOTE(review): the ``-> float`` hint does not match the returned dict; it
    appears to be what lets ``pandas_udf`` treat this as a Series-to-scalar
    (grouped aggregate) UDF -- confirm before changing it.
    NOTE(review): a block bootstrap depends on row order; nothing here sorts
    ``p``/``s`` by time -- confirm the rows arrive in time order.
    """
    bootstrap = CircularBlockBootstrap(365, p, s, seed=1234)
    kge_samples = bootstrap.apply(kling_gupta_efficiency, 1000)

    levels = (0.05, 0.50, 0.95)
    kge_at_levels = np.quantile(kge_samples, levels)

    return {
        f"KGE_{str(level)}": value
        for level, value in zip(levels, kge_at_levels)
    }

In [None]:
# Register the pandas UDF under the same name so it can be called from Spark SQL queries.
spark.udf.register("bs_kling_gupta_efficiency", bs_kling_gupta_efficiency)

In [None]:
# Root folder of the joined parquet dataset to load (daily streamflow protocol, going by the path name).
JOINED_TABLE = "/data/protocols/p1_daily_streamflow_sim/teehr_database/joined"

In [None]:
# read joined from hive partition folder format
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
spark.catalog.dropTempView("joined_temp")
joined.createTempView("joined_temp")

In [None]:
%%time
# Calculate a few basic metrics for python UFDs
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , bs_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as bs_kling_gupta_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
   --WHERE primary_location_id IN ('usgs-01010070', 'usgs-01105500')
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
sdf.show(20, False)

In [None]:
# import pyspark.sql.functions as F
# keys_df = sdf.select(F.explode(F.map_keys(F.col("bs_kling_gupta_efficiency")))).distinct()
# keys = list(map(lambda row: row[0], keys_df.collect()))
# key_cols = list(map(lambda f: F.col("bs_kling_gupta_efficiency").getItem(f).alias(str(f)), keys))
# sdf.select(key_cols).show()

In [None]:
# Flatten the map returned by the UDF into one column per quantile key.
# `F` is pyspark.sql.functions, imported in the notebook's import cell.
cols = [
    F.col("bs_kling_gupta_efficiency").getItem(key).alias(key)
    for key in ["KGE_0.05", "KGE_0.5", "KGE_0.95"]
]
sdf.select(cols).show()

In [None]:
@pandas_udf( MapType(StringType(), FloatType()) )
def bs_kling_gupta_efficiency(p: pd.Series, s: pd.Series) -> float:
    """Bootstrap the KGE of ``s`` against ``p`` using 365*24-step blocks.

    Draws 1000 circular-block bootstrap replications (block length 365*24,
    fixed seed 1234) of the paired series, evaluates
    ``kling_gupta_efficiency`` on each one, and returns the 5%, 50% and 95%
    quantiles of the scores as a mapping keyed ``"KGE_0.05"``, ``"KGE_0.5"``
    and ``"KGE_0.95"``.

    NOTE(review): the ``-> float`` hint does not match the returned dict; it
    appears to be what lets ``pandas_udf`` treat this as a Series-to-scalar
    (grouped aggregate) UDF -- confirm before changing it.
    """
    bs = CircularBlockBootstrap(365*24, p, s, seed=1234)
    results = bs.apply(kling_gupta_efficiency, 1000)
    quantiles = (0.05, 0.50, 0.95)
    values = np.quantile(results, quantiles)
    quantiles = [f"KGE_{str(i)}" for i in quantiles]
    d = dict(zip(quantiles,values))
    return d


# Redefining the Python function does not change what Spark SQL calls: the SQL
# function name stays bound to the object that was registered earlier.
# Register again so the queries below use this 365*24 block length.
spark.udf.register("bs_kling_gupta_efficiency", bs_kling_gupta_efficiency)

In [None]:
# Root folder of the joined parquet dataset to load (hourly streamflow protocol, going by the path name).
JOINED_TABLE = "/data/protocols/p2_hourly_streamflow_sim/teehr_database/joined"

In [None]:
# read joined from hive partition folder format
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
spark.catalog.dropTempView("joined_temp")
joined.createTempView("joined_temp")

In [None]:
%%time
# Calculate a few basic metrics for python UFDs
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , bs_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as bs_kling_gupta_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
   --WHERE primary_location_id IN ('usgs-01010070', 'usgs-01105500')
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
sdf.show()

In [None]:
# Root folder of the joined parquet dataset to load (40-year retrospective dataset, going by the path name).
JOINED_TABLE = "/data/playground/mdenno/40-yr-retrospective/dataset/joined"

In [None]:
# read joined from hive partition folder format
joined = spark.read.format("parquet").option("recursiveFileLookup", "true").load(JOINED_TABLE)
spark.catalog.dropTempView("joined_temp")
joined.createTempView("joined_temp")

In [None]:
%%time
# Calculate a few basic metrics for python UFDs
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , bs_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as bs_kling_gupta_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
   WHERE primary_location_id IN ('usgs-01010070', 'usgs-01105500')
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
sdf.show()

In [None]:
%%time
# Calculate a few basic metrics for python UFDs
sdf = spark.sql("""
WITH joined as (
    SELECT
        *
    FROM joined_temp jt
)
, metrics AS (
    SELECT
        joined.primary_location_id
        , joined.configuration
        , bs_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as bs_kling_gupta_efficiency
    FROM
        joined
    GROUP BY
        joined.primary_location_id
        , joined.configuration
)
SELECT
    *
FROM metrics
   --WHERE primary_location_id IN ('usgs-01010070', 'usgs-01105500')
ORDER BY
    metrics.primary_location_id
    , metrics.configuration
""")
sdf.show()

In [None]:
# spark.stop()