### TEEHR with Spark and Iceberg

In [1]:
import os
import duckdb
import numpy as np
from pyspark.sql.functions import pandas_udf
import pandas as pd
from urllib.request import urlretrieve
import gc

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [3]:
# Spark properties that get_spark_session() copies onto its SparkConf.
# The worker image is read from the TEEHR_WORKER_IMAGE environment variable;
# if that variable is unset, this cell fails with KeyError.
config = {
    # Kubernetes: service account, namespace and container image for the pods.
    "spark.kubernetes.authenticate.driver.serviceAccountName": "jupyter",
    "spark.kubernetes.namespace": "teehr-spark-default",
    "spark.kubernetes.container.image": os.environ["TEEHR_WORKER_IMAGE"],
    # JVM system properties go in the *value* of extraJavaOptions; the property
    # name itself must be exactly spark.{executor,driver}.extraJavaOptions.
    "spark.executor.extraJavaOptions": "-Daws.region=us-east-1",
    "spark.driver.extraJavaOptions": "-Daws.region=us-east-1",
    # Executor sizing.
    "spark.executor.instances": "6",
    "spark.executor.memory": "16g",
    "spark.executor.cores": "2",
    # Fixed driver ports and host name that executors use to reach the driver.
    "spark.driver.blockManager.port": "7777",
    "spark.driver.port": "2222",
    "spark.driver.host": "jupyter.teehr-spark-default.svc.cluster.local",
    "spark.driver.bindAddress": "0.0.0.0",
    # s3a:// access without credentials (anonymous provider).
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    # NOTE(review): hard-coded key pair for the "demo" catalog's S3 endpoint.
    # Move these to environment variables or a Kubernetes secret before sharing.
    "spark.sql.catalog.demo.s3.access-key-id": "minio",
    "spark.sql.catalog.demo.s3.secret-access-key": "password123",
    "spark.sql.parquet.enableVectorizedReader": "false",
    # Executor pod placement and pod template.
    "spark.kubernetes.executor.node.selector.dedicated": "worker",
    "spark.kubernetes.executor.podTemplateFile": "/home/spark/pod-template.yaml",
}

def get_spark_session(app_name: str, conf: SparkConf):
    """Return a SparkSession that targets the in-cluster Kubernetes master.

    Parameters
    ----------
    app_name : str
        Application name given to the session builder.
    conf : SparkConf
        Configuration object to build from. It is modified in place: the
        master URL is set and every entry of the module-level ``config``
        dict is copied onto it.

    Returns
    -------
    The result of ``SparkSession.builder...getOrCreate()``.
    """
    conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local")
    # Copy all notebook-level properties in one call.
    conf.setAll(list(config.items()))
    builder = SparkSession.builder.appName(app_name).config(conf=conf)
    return builder.getOrCreate()

In [4]:
# Create the session used by every cell below, starting from an empty SparkConf.
spark = get_spark_session(app_name="teehr-workers", conf=SparkConf())
# Debug aid: uncomment to list the configuration the session actually got.
# spark.sparkContext.getConf().getAll()

24/05/19 17:13:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/19 17:13:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Source for the "primary" table: all usgs*.parquet files under the
# science-eval timeseries prefix (read through s3a with anonymous access).
obs = spark.read.parquet(
    "s3a://ciroh-rti-public-data/teehr/protocols/science-eval/timeseries/usgs*.parquet"
)

24/05/19 17:16:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [6]:
# Row count of the source files; the table count after the write below
# should equal this number.
obs.count()

                                                                                

176063841

In [8]:
# Nothing else in this notebook creates the science_eval namespace, and
# saveAsTable does not create it on its own. Creating it here (a no-op when
# it already exists) keeps the notebook runnable from a fresh kernel.
spark.sql("CREATE DATABASE IF NOT EXISTS science_eval")
# NOTE(review): mode("append") adds these rows again each time the cell is
# re-run; confirm that is intended, or drop the table before a full re-run.
obs.write.mode("append").saveAsTable("science_eval.primary")

                                                                                

In [9]:
# Verify the write: the table's row count should equal obs.count() above.
spark.sql("SELECT count(*) FROM science_eval.primary;").show()

+---------+
| count(1)|
+---------+
|176063841|
+---------+



In [17]:
# Source for the "secondary" table: all nwm2*.parquet files under the same
# science-eval timeseries prefix.
sim = spark.read.parquet(
    "s3a://ciroh-rti-public-data/teehr/protocols/science-eval/timeseries/nwm2*.parquet"
)

In [18]:
# Row count of the source files; the table count after the write below
# should equal this number.
sim.count()

                                                                                

399505152

In [19]:
# Persist the nwm2* rows as the "secondary" table.
# NOTE(review): append mode adds the rows again if this cell is re-run;
# confirm that is intended.
sim.write.mode("append").saveAsTable("science_eval.secondary")

                                                                                

In [20]:
# Verify the write: the table's row count should equal sim.count() above.
spark.sql("SELECT count(*) FROM science_eval.secondary;").show()

+---------+
| count(1)|
+---------+
|399505152|
+---------+



In [21]:
# Crosswalk file(s) that map primary location ids to secondary location ids
# (the join further down relies on those two columns).
xw = spark.read.parquet(
    "s3a://ciroh-rti-public-data/teehr/common/geo/usgs_nwm2*_crosswalk.conus.parquet"
)

In [22]:
# Row count of the crosswalk source; the table count after the write below
# should equal this number.
xw.count()

22835

In [24]:
# Store the crosswalk without its feature_id column; the join below only
# uses primary_location_id and secondary_location_id.
# NOTE(review): append mode adds the rows again if this cell is re-run;
# confirm that is intended.
xw.drop("feature_id").write.mode("append").saveAsTable("science_eval.crosswalk")

In [25]:
# Verify the write: the table's row count should equal xw.count() above.
spark.sql("SELECT count(*) FROM science_eval.crosswalk;").show()

+--------+
|count(1)|
+--------+
|   22835|
+--------+



In [26]:
# Join the three science_eval tables written above into paired rows:
#   secondary -> crosswalk  on the secondary location id
#   crosswalk -> primary    on the primary location id
# A pair is kept only when value_time, measurement_unit and variable_name are
# identical on both sides. Both joins are inner joins, so rows with no match
# are dropped. reference_time and configuration come from the secondary table.
sdf = spark.sql("""
SELECT
        sf.reference_time
        , sf.configuration
        , sf.measurement_unit
        , sf.variable_name
        , sf.value_time as value_time
        , sf.location_id as secondary_location_id
        , sf.value as secondary_value
        , pf.location_id as primary_location_id
        , pf.value as primary_value
    FROM science_eval.secondary sf
    JOIN science_eval.crosswalk cf
        on cf.secondary_location_id = sf.location_id
    JOIN science_eval.primary pf
        on cf.primary_location_id = pf.location_id
        and sf.value_time = pf.value_time
        and sf.measurement_unit = pf.measurement_unit
        and sf.variable_name = pf.variable_name
""")

In [27]:
%%time
# First action on the join query, so the wall time covers running the join.
sdf.count()



CPU times: user 125 ms, sys: 11.6 ms, total: 137 ms
Wall time: 1min 32s


                                                                                

351877563

In [28]:
%%time
# Materialise the joined rows as a table. sdf is not cached anywhere in this
# notebook, so the timing here includes evaluating the join again.
# NOTE(review): append mode adds the rows again if this cell is re-run;
# confirm that is intended.
sdf.write.mode("append").saveAsTable("science_eval.joined")



CPU times: user 195 ms, sys: 35.1 ms, total: 230 ms
Wall time: 2min 54s


                                                                                

In [29]:
# Verify the write: the table's row count should equal sdf.count() above.
spark.sql("SELECT count(*) FROM science_eval.joined;").show()

+---------+
| count(1)|
+---------+
|351877563|
+---------+



In [30]:
# Stop the session once the tables are written so it no longer holds the
# executors requested in config.
spark.stop()

24/05/19 17:41:37 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
