### TEEHR with Spark and Iceberg

In [None]:
import os
import duckdb
import numpy as np
from pyspark.sql.functions import pandas_udf
import pandas as pd
from urllib.request import urlretrieve
import gc

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [None]:
# Spark settings that get_spark_session() copies onto the SparkConf it is given.
config = {
    # --- Kubernetes scheduling -------------------------------------------
    "spark.kubernetes.authenticate.driver.serviceAccountName": "jupyter",
    "spark.kubernetes.namespace": "teehr-spark-default",
    # Raises KeyError at definition time if TEEHR_WORKER_IMAGE is not set.
    "spark.kubernetes.container.image": os.environ["TEEHR_WORKER_IMAGE"],
    # --- JVM options -----------------------------------------------------
    # The option string ("-Daws.region=us-east-1") is the *value* of the
    # extraJavaOptions setting.  Putting "=-Daws.region" inside the key makes
    # Spark store an unknown key and the JVM never receives the property.
    "spark.executor.extraJavaOptions": "-Daws.region=us-east-1",
    "spark.driver.extraJavaOptions": "-Daws.region=us-east-1",
    # --- Executor sizing -------------------------------------------------
    "spark.executor.instances": "6",
    "spark.executor.memory": "16g",
    "spark.executor.cores": "2",
    # --- Driver networking (fixed ports / host name) ---------------------
    "spark.driver.blockManager.port": "7777",
    "spark.driver.port": "2222",
    "spark.driver.host": "jupyter.teehr-spark-default.svc.cluster.local",
    "spark.driver.bindAddress": "0.0.0.0",
    # --- S3 access -------------------------------------------------------
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment or a Kubernetes secret instead.
    "spark.sql.catalog.demo.s3.access-key-id": "minio",
    "spark.sql.catalog.demo.s3.secret-access-key": "password123",
    # --- Misc ------------------------------------------------------------
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.kubernetes.executor.node.selector.dedicated": "worker",
    "spark.kubernetes.executor.podTemplateFile": "/home/spark/pod-template.yaml",
}

def get_spark_session(app_name: str, conf: SparkConf):
    """Return a SparkSession for ``app_name`` running against Kubernetes.

    The supplied ``conf`` is modified in place: its master is set to the
    in-cluster Kubernetes API endpoint and every entry of the module-level
    ``config`` dict is applied to it.  The session is then obtained through
    ``getOrCreate()``.

    Args:
        app_name: Application name given to the session builder.
        conf: SparkConf to populate and hand to the builder.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local")
    # Apply all shared settings in one call; insertion order is preserved.
    conf.setAll(list(config.items()))
    builder = SparkSession.builder.appName(app_name)
    return builder.config(conf=conf).getOrCreate()

In [None]:
# Create (or reuse) the session used by every cell below, starting from an
# empty SparkConf that get_spark_session() fills in.
spark = get_spark_session("teehr-workers", SparkConf())
# Debugging aid: uncomment to list the configuration the session ended up with.
# spark.sparkContext.getConf().getAll()

In [None]:
# Read every parquet file under the usgs_streamflow observations prefix.
obs = spark.read.parquet("s3a://ciroh-rti-public-data/teehr-data-warehouse/common/observations/usgs_streamflow/*.parquet")

In [None]:
# Row count of the source files, for comparison with the table count below.
obs.count()

In [None]:
# Drop the "__index_level_0__" column (presumably a leftover pandas index --
# TODO confirm) and append the remaining columns to the proto.obs table.
# NOTE(review): mode("append") adds the rows again each time this cell is run.
obs.drop("__index_level_0__").write.mode("append").saveAsTable("proto.obs")

In [None]:
# Show the number of rows now stored in proto.obs.
spark.sql("SELECT count(*) FROM proto.obs;").show()

In [None]:
sim = spark.read.parquet("s3a://ciroh-rti-public-data/teehr-data-warehouse/common/baselines/nwm30_retrospective_conus/*.parquet")

In [None]:
sim.count()

In [None]:
sim.write.mode("append").saveAsTable("proto.sim").show()

In [None]:
spark.sql("SELECT count(*) FROM proto.sim;").show()

In [None]:
# Read the usgs_nwm30 crosswalk parquet file; the join query further down uses
# its primary_location_id and secondary_location_id columns.
xw = spark.read.parquet("s3a://ciroh-rti-public-data/teehr-data-warehouse/common/crosswalks/usgs_nwm30_crosswalk.conus.parquet")

In [None]:
# Row count of the source file, for comparison with the table count below.
xw.count()

In [None]:
# Append the rows to the proto.xw table.
# NOTE(review): mode("append") adds the rows again each time this cell is run.
xw.write.mode("append").saveAsTable("proto.xw")

In [None]:
# Show the number of rows now stored in proto.xw.
spark.sql("SELECT count(*) FROM proto.xw;").show()

In [None]:
# Join from Iceberg
# Pairs each proto.sim row (alias sf, "secondary") with the proto.obs row
# (alias pf, "primary") for the same site and time:
#   * proto.xw (alias cf) links sf.location_id to pf.location_id;
#   * rows must also agree on value_time, measurement_unit and variable_name.
# Both joins are inner joins, so rows without a crosswalk entry or without a
# matching observation are left out of the result.
# This cell only defines the DataFrame; the query runs when an action
# (count / write) is called on sdf.
sdf = spark.sql("""
SELECT
        sf.reference_time
        , sf.configuration
        , sf.measurement_unit
        , sf.variable_name
        , sf.value_time as value_time
        , sf.location_id as secondary_location_id
        , sf.value as secondary_value
        , pf.location_id as primary_location_id
        , pf.value as primary_value
    FROM proto.sim sf
    JOIN proto.xw cf
        on cf.secondary_location_id = sf.location_id
    JOIN proto.obs pf
        on cf.primary_location_id = pf.location_id
        and sf.value_time = pf.value_time
        and sf.measurement_unit = pf.measurement_unit
        and sf.variable_name = pf.variable_name
""")

In [None]:
%%time
# Time a full evaluation of the join by counting its rows.
# (%%time has to stay on the first line of the cell.)
sdf.count()

In [None]:
%%time
# Time writing the join result to proto.joined.  sdf is not cached in this
# notebook, so the join is evaluated again here rather than reused from the
# count above.
# NOTE(review): mode("append") adds the rows again each time this cell is run.
sdf.write.mode("append").saveAsTable("proto.joined")

In [None]:
# Show the number of rows now stored in proto.joined.
spark.sql("SELECT count(*) FROM proto.joined;").show()

In [None]:
# Shut the session down; `spark` cannot be used after this call.
spark.stop()