### TEEHR with Spark and Iceberg

In [1]:
import os
import duckdb
import numpy as np
from pyspark.sql.functions import pandas_udf
import pandas as pd
from urllib.request import urlretrieve
import gc

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Spark properties applied to every session built by get_spark_session().
# All values are strings, as passed to SparkConf.set().
config = {
    # --- Kubernetes scheduling -------------------------------------------
    "spark.kubernetes.authenticate.driver.serviceAccountName": "jupyter",
    "spark.kubernetes.namespace": "teehr-spark-default",
    # Executor image comes from the environment; a missing variable raises
    # KeyError here rather than failing later at executor start-up.
    "spark.kubernetes.container.image": os.environ["TEEHR_WORKER_IMAGE"],
    "spark.kubernetes.executor.node.selector.dedicated": "worker",
    "spark.kubernetes.executor.podTemplateFile": "/home/spark/pod-template.yaml",
    # --- JVM options ------------------------------------------------------
    # The property name is `spark.{executor,driver}.extraJavaOptions`; the
    # `-Daws.region=...` system-property flag is the *value*. (Previously the
    # flag was embedded in the key, which is not a valid Spark property name.)
    "spark.executor.extraJavaOptions": "-Daws.region=us-east-1",
    "spark.driver.extraJavaOptions": "-Daws.region=us-east-1",
    # --- Executor sizing --------------------------------------------------
    "spark.executor.instances": "6",
    "spark.executor.memory": "16g",
    "spark.executor.cores": "2",
    # --- Driver networking (fixed ports/host so executors can call back) ---
    "spark.driver.blockManager.port": "7777",
    "spark.driver.port": "2222",
    "spark.driver.host": "jupyter.teehr-spark-default.svc.cluster.local",
    "spark.driver.bindAddress": "0.0.0.0",
    # --- S3 access --------------------------------------------------------
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    # NOTE(review): credentials are hard-coded in the notebook; prefer loading
    # them from the environment or a Kubernetes secret. These keys configure a
    # catalog named `demo`, while the queries in this notebook read `proto.*`
    # tables -- confirm the catalog name is the intended one.
    "spark.sql.catalog.demo.s3.access-key-id": "minio",
    "spark.sql.catalog.demo.s3.secret-access-key": "password123",
    # --- SQL engine -------------------------------------------------------
    "spark.sql.parquet.enableVectorizedReader": "false",
}

def get_spark_session(app_name: str, conf: SparkConf):
    """Build (or reuse) a SparkSession that targets the in-cluster Kubernetes API.

    Args:
        app_name: Application name handed to the session builder.
        conf: SparkConf to populate. It is modified in place: the master URL
            is set and every entry of the module-level ``config`` dict is
            copied onto it.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local")
    # Copy the shared notebook settings onto the caller's conf in one call.
    conf.setAll(list(config.items()))
    builder = SparkSession.builder.appName(app_name).config(conf=conf)
    return builder.getOrCreate()

In [4]:
# Start the shared Spark session used by every cell below.
spark = get_spark_session(app_name="teehr-workers", conf=SparkConf())
# To inspect the effective settings, run: spark.sparkContext.getConf().getAll()

24/05/13 15:50:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [10]:
%%time
spark.sql("SELECT value_time, value FROM proto.obs WHERE location_id = 'usgs-01116905' ORDER BY value_time;").show()



+-------------------+------------------+
|         value_time|             value|
+-------------------+------------------+
|2007-04-28 04:00:00| 4.049308776855469|
|2007-04-28 05:00:00| 3.992675304412842|
|2007-04-28 06:00:00|3.9077248573303223|
|2007-04-28 07:00:00|3.8794078826904297|
|2007-04-28 08:00:00|  3.79445743560791|
|2007-04-28 09:00:00| 3.737823724746704|
|2007-04-28 10:00:00| 3.681190013885498|
|2007-04-28 11:00:00|3.5962395668029785|
|2007-04-28 12:00:00|3.5396058559417725|
|2007-04-28 13:00:00|3.4829721450805664|
|2007-04-28 14:00:00|3.3980214595794678|
|2007-04-28 15:00:00|3.3130710124969482|
|2007-04-28 16:00:00|3.2281205654144287|
|2007-04-28 17:00:00|3.2281205654144287|
|2007-04-28 18:00:00| 3.199803590774536|
|2007-04-28 19:00:00|3.1148531436920166|
|2007-04-28 20:00:00|3.1714868545532227|
|2007-04-28 21:00:00|3.0582194328308105|
|2007-04-28 22:00:00|3.0015857219696045|
|2007-04-28 23:00:00| 2.973268747329712|
+-------------------+------------------+
only showing top

                                                                                

In [11]:
# Import only the UDFs this notebook uses (instead of `import *`) so it is
# clear where each name comes from and nothing else leaks into the namespace.
from pandas_udfs import (
    teehr_kling_gupta_efficiency,
    teehr_r_squared,
    teehr_relative_bias,
    teehr_root_mean_squared_error,
)

# SQL function name -> UDF object. The SQL names are the ones referenced by
# the metric queries later in the notebook.
METRIC_UDFS = {
    "teehr_kling_gupta_efficiency": teehr_kling_gupta_efficiency,
    "teehr_root_mean_squared_error": teehr_root_mean_squared_error,
    "teehr_relative_bias": teehr_relative_bias,
    "teehr_r_squared": teehr_r_squared,
}

# Registering in a loop (a statement, not a trailing expression) also keeps
# the cell from echoing a UserDefinedFunction repr as its output.
for sql_name, metric_udf in METRIC_UDFS.items():
    spark.udf.register(sql_name, metric_udf)

<pyspark.sql.udf.UserDefinedFunction at 0x7f75ba99bc70>

In [12]:
%%time
sdf = spark.sql("""
WITH metrics AS (
    SELECT
        joined.primary_location_id
        , teehr_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , teehr_root_mean_squared_error(joined.primary_value, joined.secondary_value) as root_mean_squared_error
        , teehr_relative_bias(joined.primary_value, joined.secondary_value) as relative_bias
        , teehr_r_squared(joined.primary_value, joined.secondary_value) as r_squared
    FROM
        proto.joined as joined
    GROUP BY
        joined.primary_location_id
)
SELECT
    metrics.primary_location_id
    , kling_gupta_efficiency
    , root_mean_squared_error
    , relative_bias
    , r_squared
FROM metrics
ORDER BY
    metrics.primary_location_id;
""")
sdf.show()



+-------------------+----------------------+-----------------------+-------------+-----------+
|primary_location_id|kling_gupta_efficiency|root_mean_squared_error|relative_bias|  r_squared|
+-------------------+----------------------+-----------------------+-------------+-----------+
|      usgs-01010000|             0.7197487|              93.688675|    0.1379642|  0.5716339|
|      usgs-01010070|            0.64957094|              14.360057|  -0.07752601| 0.44042128|
|      usgs-01010500|            0.74391365|              145.41783|  0.036977984| 0.70574176|
|      usgs-01011000|             0.6479533|               49.79671|   0.05642169| 0.74898195|
|      usgs-01013500|            0.39131734|               42.23165| -0.042349346|  0.6445334|
|      usgs-01014000|            0.66906583|              248.39078|-0.0023849339|  0.7511065|
|      usgs-01015800|              0.651026|              40.082417|  -0.09978306|  0.7559192|
|      usgs-01017000|             0.6505159|      

                                                                                

In [13]:
%%time
sdf = spark.sql("""
WITH metrics AS (
    SELECT
        joined.primary_location_id
        , teehr_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , teehr_root_mean_squared_error(joined.primary_value, joined.secondary_value) as root_mean_squared_error
        , teehr_relative_bias(joined.primary_value, joined.secondary_value) as relative_bias
        , teehr_r_squared(joined.primary_value, joined.secondary_value) as r_squared
    FROM
        proto.joined as joined
    GROUP BY
        joined.primary_location_id
)
SELECT
    metrics.primary_location_id
    , kling_gupta_efficiency
    , root_mean_squared_error
    , relative_bias
    , r_squared
FROM metrics
WHERE
    primary_location_id='usgs-01021480'
ORDER BY
    metrics.primary_location_id;
""")
sdf.show()



+-------------------+----------------------+-----------------------+-------------+---------+
|primary_location_id|kling_gupta_efficiency|root_mean_squared_error|relative_bias|r_squared|
+-------------------+----------------------+-----------------------+-------------+---------+
|      usgs-01021480|             0.8161562|              1.2657453|  -0.06696106|0.7241807|
+-------------------+----------------------+-----------------------+-------------+---------+

CPU times: user 11.8 ms, sys: 14.8 ms, total: 26.6 ms
Wall time: 8.32 s


                                                                                

In [14]:
# Shut down the Spark session once the analysis is finished; cells that use
# `spark` cannot be re-run after this without creating a new session.
spark.stop()

24/05/13 16:41:57 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
