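### TEEHR with Spark and Iceberg

This notebook starts a Spark session on Kubernetes, queries the `science_eval` tables with Spark SQL, and computes per-location evaluation metrics using pandas UDFs.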

In [1]:
import os
import duckdb
import numpy as np
from pyspark.sql.functions import pandas_udf
import pandas as pd
from urllib.request import urlretrieve
import gc

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Spark settings copied onto the SparkConf by get_spark_session().
# Keys are Spark property names; all values are strings.
config = {
    # Kubernetes: service account, namespace and executor image.
    "spark.kubernetes.authenticate.driver.serviceAccountName": "jupyter",
    "spark.kubernetes.namespace": "teehr-spark-default",
    # Required: a KeyError is raised here if TEEHR_WORKER_IMAGE is not set.
    "spark.kubernetes.container.image": os.environ["TEEHR_WORKER_IMAGE"],
    # JVM system properties go in the *value* of extraJavaOptions; the
    # property name itself must not contain the "-D..." part.
    "spark.executor.extraJavaOptions": "-Daws.region=us-east-1",
    "spark.driver.extraJavaOptions": "-Daws.region=us-east-1",
    # Executor sizing.
    "spark.executor.instances": "6",
    "spark.executor.memory": "16g",
    "spark.executor.cores": "2",
    # Fixed driver ports/host so executors can connect back to the driver.
    "spark.driver.blockManager.port": "7777",
    "spark.driver.port": "2222",
    "spark.driver.host": "jupyter.teehr-spark-default.svc.cluster.local",
    "spark.driver.bindAddress": "0.0.0.0",
    # S3A file system with anonymous credentials.
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    # Credentials for the `demo` catalog. They can be overridden through the
    # environment so real secrets never need to be written into the notebook;
    # the fallbacks are the original development defaults.
    "spark.sql.catalog.demo.s3.access-key-id": os.environ.get("TEEHR_S3_ACCESS_KEY_ID", "minio"),
    "spark.sql.catalog.demo.s3.secret-access-key": os.environ.get("TEEHR_S3_SECRET_ACCESS_KEY", "password123"),
    "spark.sql.parquet.enableVectorizedReader": "false",
    # Executor pod placement and pod template.
    "spark.kubernetes.executor.node.selector.dedicated": "worker",
    "spark.kubernetes.executor.podTemplateFile": "/home/spark/pod-template.yaml",
}

def get_spark_session(app_name: str, conf: SparkConf):
    """Build a SparkSession named ``app_name`` against the in-cluster Kubernetes master.

    ``conf`` is modified in place: the master URL is set on it and every
    entry of the module-level ``config`` dict is copied onto it before the
    session is obtained with ``getOrCreate()``.

    Args:
        app_name: Application name given to the session builder.
        conf: SparkConf to populate and hand to the builder.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    k8s_master = "k8s://https://kubernetes.default.svc.cluster.local"
    # Both setters return the SparkConf itself, so the calls can be chained.
    conf.setMaster(k8s_master).setAll(list(config.items()))
    builder = SparkSession.builder.appName(app_name).config(conf=conf)
    return builder.getOrCreate()

In [4]:
# Create the Spark session used by the query cells that follow.
spark = get_spark_session(app_name="teehr-workers", conf=SparkConf())
# Debugging aid: uncomment to list the configuration the session received.
# spark.sparkContext.getConf().getAll()

24/05/19 17:56:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/19 17:56:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
%%time
spark.sql("SELECT value_time, value FROM science_eval.primary WHERE location_id = 'usgs-01116905' ORDER BY value_time;").show()

                                                                                

+-------------------+------------------+
|         value_time|             value|
+-------------------+------------------+
|2016-01-01 00:00:00|1.4215056896209717|
|2016-01-01 01:00:00|1.4215056896209717|
|2016-01-01 02:00:00|1.3903571367263794|
|2016-01-01 03:00:00|1.3903571367263794|
|2016-01-01 04:00:00| 1.359208583831787|
|2016-01-01 05:00:00| 1.359208583831787|
|2016-01-01 06:00:00|1.3280601501464844|
|2016-01-01 07:00:00|1.3280601501464844|
|2016-01-01 08:00:00|1.2969114780426025|
|2016-01-01 09:00:00|1.2969114780426025|
|2016-01-01 10:00:00|1.2657630443572998|
|2016-01-01 11:00:00|1.2657630443572998|
|2016-01-01 12:00:00|1.2346144914627075|
|2016-01-01 13:00:00|1.2346144914627075|
|2016-01-01 14:00:00|1.2062976360321045|
|2016-01-01 15:00:00|1.2062976360321045|
|2016-01-01 16:00:00|1.2062976360321045|
|2016-01-01 17:00:00|1.1751490831375122|
|2016-01-01 18:00:00|1.1751490831375122|
|2016-01-01 19:00:00|1.1751490831375122|
+-------------------+------------------+
only showing top 20 rows

In [6]:
# Explicit imports (rather than a wildcard) make it clear where each UDF
# comes from and keep unrelated names out of the notebook namespace.
from pandas_udfs import (
    teehr_kling_gupta_efficiency,
    teehr_r_squared,
    teehr_relative_bias,
    teehr_root_mean_squared_error,
)

# Register each UDF with Spark SQL under the same name it has in Python,
# so the queries below can call them directly.
teehr_metric_udfs = {
    "teehr_kling_gupta_efficiency": teehr_kling_gupta_efficiency,
    "teehr_root_mean_squared_error": teehr_root_mean_squared_error,
    "teehr_relative_bias": teehr_relative_bias,
    "teehr_r_squared": teehr_r_squared,
}
for sql_name, metric_udf in teehr_metric_udfs.items():
    spark.udf.register(sql_name, metric_udf)

<pyspark.sql.udf.UserDefinedFunction at 0x7f0575e024d0>

In [7]:
%%time
sdf = spark.sql("""
WITH metrics AS (
    SELECT
        joined.primary_location_id
        , teehr_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , teehr_root_mean_squared_error(joined.primary_value, joined.secondary_value) as root_mean_squared_error
        , teehr_relative_bias(joined.primary_value, joined.secondary_value) as relative_bias
        , teehr_r_squared(joined.primary_value, joined.secondary_value) as r_squared
    FROM
        science_eval.joined as joined
    GROUP BY
        joined.primary_location_id
)
SELECT
    metrics.primary_location_id
    , kling_gupta_efficiency
    , root_mean_squared_error
    , relative_bias
    , r_squared
FROM metrics
ORDER BY
    metrics.primary_location_id;
""")
sdf.show()



+-------------------+----------------------+-----------------------+-------------+-----------+
|primary_location_id|kling_gupta_efficiency|root_mean_squared_error|relative_bias|  r_squared|
+-------------------+----------------------+-----------------------+-------------+-----------+
|      usgs-01010000|            0.42462006|               97.13046|  -0.21570142|  0.7913283|
|      usgs-01010070|            0.44942167|              14.033999|  -0.28686345|  0.6007537|
|      usgs-01010500|            0.27008578|              221.87845|   -0.3975725|  0.8074171|
|      usgs-01011000|             0.6334492|              57.011288|  -0.19466422|  0.7688364|
|      usgs-01013500|            0.54683834|              41.360138|  -0.23086037| 0.82660383|
|      usgs-01014000|            0.37036315|              354.70465|  -0.35600448| 0.86370957|
|      usgs-01015800|            0.72119784|              31.359137|  -0.14541015| 0.90401065|
|      usgs-01017000|            0.76449686|      

                                                                                

In [8]:
%%time
sdf = spark.sql("""
WITH metrics AS (
    SELECT
        joined.primary_location_id
        , teehr_kling_gupta_efficiency(joined.primary_value, joined.secondary_value) as kling_gupta_efficiency
        , teehr_root_mean_squared_error(joined.primary_value, joined.secondary_value) as root_mean_squared_error
        , teehr_relative_bias(joined.primary_value, joined.secondary_value) as relative_bias
        , teehr_r_squared(joined.primary_value, joined.secondary_value) as r_squared
    FROM
        science_eval.joined as joined
    GROUP BY
        joined.primary_location_id
)
SELECT
    metrics.primary_location_id
    , kling_gupta_efficiency
    , root_mean_squared_error
    , relative_bias
    , r_squared
FROM metrics
WHERE
    primary_location_id='usgs-01021480'
ORDER BY
    metrics.primary_location_id;
""")
sdf.show()



+-------------------+----------------------+-----------------------+-------------+----------+
|primary_location_id|kling_gupta_efficiency|root_mean_squared_error|relative_bias| r_squared|
+-------------------+----------------------+-----------------------+-------------+----------+
|      usgs-01021480|             0.6604273|              1.1185906|   -0.1908394|0.73515147|
+-------------------+----------------------+-----------------------+-------------+----------+

CPU times: user 10 ms, sys: 3.27 ms, total: 13.3 ms
Wall time: 4.56 s


                                                                                

In [9]:
# Shut down the Spark session now that all queries have run.
spark.stop()

24/05/19 17:57:20 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
