In [1]:
from pathlib import Path
import teehr
import shutil

In [2]:
# Filesystem locations, built with pathlib and stored as plain strings.
evaluation_path = str(Path.home().joinpath("temp", "iceberg", "evaluation"))
warehouse_path = str(Path(evaluation_path).joinpath("spark-warehouse"))

# Catalog and schema names used by the Spark catalog settings and the schema migrations.
catalog_name, schema_name = "local", "db"

In [3]:
# Import only the name this cell needs; a wildcard import would dump every
# public sedona.spark name into the notebook namespace.
from sedona.spark import SedonaContext

# Build the Spark session with Sedona and Iceberg support.
# Despite its name, `config` holds the session returned by getOrCreate().
config = (
    SedonaContext.builder()
    # JVM packages resolved at session start-up (Sedona, Iceberg runtime, GeoTools wrapper).
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-3.5_2.12:1.7.1,"
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0,"
        "org.datasyslab:geotools-wrapper:1.7.1-28.5"
    )
    # Additional remote repository searched when resolving the packages above.
    .config(
        "spark.jars.repositories",
        "https://artifacts.unidata.ucar.edu/repository/unidata-all",
    )
    # Enable Iceberg's SQL extensions.
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
    )
    # Register an Iceberg catalog named `catalog_name`, of type "hadoop",
    # with its warehouse under `warehouse_path`.
    .config(
        f"spark.sql.catalog.{catalog_name}",
        "org.apache.iceberg.spark.SparkCatalog"
    )
    .config(
        f"spark.sql.catalog.{catalog_name}.type", "hadoop"
    )
    .config(
        f"spark.sql.catalog.{catalog_name}.warehouse",
        f"{warehouse_path}/{catalog_name}"
    )
    # S3A filesystem with anonymous credentials (no AWS keys are configured here).
    .config(
        "spark.hadoop.fs.s3a.impl",
        "org.apache.hadoop.fs.s3a.S3AFileSystem"
    )
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
    )
    # Session behaviour: Arrow-based pandas conversion and UTC timestamps.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.session.timeZone", "UTC")
    # Local driver settings.
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

# Wrap the session in a Sedona context; later cells pass `sedona` to teehr and the migrations.
sedona = SedonaContext.create(config)

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /Users/mdenno/.ivy2/cache
The jars for the packages stored in: /Users/mdenno/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7e833e56-93a3-46db-bcc3-69f2b8e7aae1;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-3.5_2.12;1.7.1 in central
	found org.apache.sedona#sedona-common;1.7.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central


:: loading settings :: url = jar:file:/Users/mdenno/repos/teehr/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.locationtech.jts#jts-core;1.20.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.errorprone#error_prone_annotations;2.5.1 in central
	found org.apache.sedona#sedona-spark-common-3.5_2.12;1.7.1 in central
	found org.apache.sedona#shade-proto;1.7.1 in central
	foun

In [4]:
# Start from a clean slate (handy while testing). Because warehouse_path lives
# inside evaluation_path, this also removes any existing Iceberg warehouse.
# A missing directory is not an error (ignore_errors=True).
shutil.rmtree(evaluation_path, ignore_errors=True)

# Create the Evaluation, letting teehr create the directory, and hand it the
# Sedona-enabled Spark session.
ev = teehr.Evaluation(dir_path=evaluation_path, create_dir=True, spark=sedona)

In [5]:
# List the example evaluations available on S3 (name, description, url).
ev.list_s3_evaluations()

Unnamed: 0,name,description,url
0,e0_2_location_example,Example evaluation datsets with 2 USGS gages,s3a://ciroh-rti-public-data/teehr-data-warehou...
1,e1_camels_daily_streamflow,Daily average streamflow at ther Camels basins,s3a://ciroh-rti-public-data/teehr-data-warehou...
2,e2_camels_hourly_streamflow,Hourly instantaneous streamflow at ther Camels...,s3a://ciroh-rti-public-data/teehr-data-warehou...
3,e3_usgs_hourly_streamflow,Hourly instantaneous streamflow at USGS CONUS ...,s3a://ciroh-rti-public-data/teehr-data-warehou...
4,e4_nwm_operational,Empty template to load and evaluate NWM operat...,s3a://ciroh-rti-public-data/teehr-data-warehou...


In [6]:
# Clone the e1_camels_daily_streamflow evaluation from the S3 bucket
ev.clone_from_s3("e1_camels_daily_streamflow")
# Alternative: clone the e0_2_location_example evaluation instead.
# ev.clone_from_s3("e0_2_location_example")

25/06/03 21:55:43 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/06/03 21:56:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
import importlib

import apply_migrations

# Reload so that edits made to apply_migrations since its first import take
# effect without restarting the kernel.
importlib.reload(apply_migrations)

# Apply the schema migrations to <catalog_name>.<schema_name>.
apply_migrations.evolve_catalog_schema(sedona, catalog_name, schema_name)

print(f"Schema evolution completed for {catalog_name}.")

Applying schema version 1 to local.db


                                                                                

Applying schema version 2 to local.db
Applying schema version 3 to local.db
Applying schema version 4 to local.db
Schema evolution completed for local.


In [8]:
# Append the evaluation's units to the Iceberg `units` table.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
# append() needs an existing table — presumably created by the migration cell.
units_sdf = ev.units.to_sdf()
units_sdf.writeTo(f"{catalog_name}.{schema_name}.units").append()

In [9]:
# Append the evaluation's configurations to the Iceberg `configurations` table.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
configuration_sdf = ev.configurations.to_sdf()
configuration_sdf.writeTo(f"{catalog_name}.{schema_name}.configurations").append()

In [10]:
# Append the evaluation's variables to the Iceberg `variables` table.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
variables_sdf = ev.variables.to_sdf()
variables_sdf.writeTo(f"{catalog_name}.{schema_name}.variables").append()

In [11]:
# Append the evaluation's attributes to the Iceberg `attributes` table.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
attributes_sdf = ev.attributes.to_sdf()
attributes_sdf.writeTo(f"{catalog_name}.{schema_name}.attributes").append()

In [12]:
# Append the evaluation's locations to the Iceberg `locations` table.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
locations_sdf = ev.locations.to_sdf()
locations_sdf.writeTo(f"{catalog_name}.{schema_name}.locations").append()

In [13]:
# Append the evaluation's location attributes to the Iceberg
# `location_attributes` table, then preview the first rows.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
location_attrs_sdf = ev.location_attributes.to_sdf()
location_attrs_sdf.writeTo(f"{catalog_name}.{schema_name}.location_attributes").append()
location_attrs_sdf.show()

+-------------+--------------+-------------------+
|  location_id|attribute_name|              value|
+-------------+--------------+-------------------+
|usgs-01013500|        q_mean| 44.467109455834866|
|usgs-01022500|        q_mean| 14.786380055715862|
|usgs-01030500|        q_mean|  77.36721025688733|
|usgs-01031500|        q_mean|  18.13589110971241|
|usgs-01047000|        q_mean|  23.09950986229729|
|usgs-01052500|        q_mean| 10.959371324254077|
|usgs-01054200|        q_mean|   5.70061348678967|
|usgs-01055000|        q_mean|  6.623313284003387|
|usgs-01057000|        q_mean|   4.01856522562012|
|usgs-01073000|        q_mean| 0.6170135822904296|
|usgs-01078000|        q_mean|  4.515503797876271|
|usgs-01123000|        q_mean| 1.5623952797949385|
|usgs-01134500|        q_mean|  4.684171708725575|
|usgs-01137500|        q_mean|  6.248845351029678|
|usgs-01139000|        q_mean|  5.007937386091005|
|usgs-01139800|        q_mean|0.48647037402160476|
|usgs-01142500|        q_mean| 

In [14]:
# Append the evaluation's primary timeseries to the Iceberg
# `primary_timeseries` table, then preview the first rows.
# The frame has its own name so it does not overwrite `location_attrs_sdf`.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
primary_timeseries_sdf = ev.primary_timeseries.to_sdf()
primary_timeseries_sdf.writeTo(f"{catalog_name}.{schema_name}.primary_timeseries").append()
primary_timeseries_sdf.show()

                                                                                

+--------------+-------------------+----------+---------+-------------+------------------+--------------------+
|reference_time|         value_time|     value|unit_name|  location_id|configuration_name|       variable_name|
+--------------+-------------------+----------+---------+-------------+------------------+--------------------+
|          NULL|1992-10-30 00:00:00|0.44481045|    m^3/s|usgs-07196900| usgs_observations|streamflow_daily_...|
|          NULL|1992-10-31 00:00:00| 0.2881239|    m^3/s|usgs-07196900| usgs_observations|streamflow_daily_...|
|          NULL|1992-11-01 00:00:00|0.67913234|    m^3/s|usgs-07196900| usgs_observations|streamflow_daily_...|
|          NULL|1992-11-02 00:00:00|0.56987655|    m^3/s|usgs-07196900| usgs_observations|streamflow_daily_...|
|          NULL|1992-11-03 00:00:00|0.33697048|    m^3/s|usgs-07196900| usgs_observations|streamflow_daily_...|
|          NULL|1992-11-04 00:00:00|0.24730046|    m^3/s|usgs-07196900| usgs_observations|streamflow_dai

In [15]:
# Append the evaluation's secondary timeseries to the Iceberg
# `secondary_timeseries` table, then preview the first rows.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
secondary_timeseries_sdf = ev.secondary_timeseries.to_sdf()
secondary_timeseries_sdf.writeTo(f"{catalog_name}.{schema_name}.secondary_timeseries").append()
secondary_timeseries_sdf.show()



+-------------------+----------+---------+--------------+------+-------------------+--------------------+--------------+
|         value_time|     value|unit_name|   location_id|member| configuration_name|       variable_name|reference_time|
+-------------------+----------+---------+--------------+------+-------------------+--------------------+--------------+
|1989-01-01 00:00:00|0.19999999|    m^3/s|nwm30-10025746|  NULL|nwm30_retrospective|streamflow_daily_...|          NULL|
|1989-01-02 00:00:00|0.19999999|    m^3/s|nwm30-10025746|  NULL|nwm30_retrospective|streamflow_daily_...|          NULL|
|1989-01-03 00:00:00|0.19999999|    m^3/s|nwm30-10025746|  NULL|nwm30_retrospective|streamflow_daily_...|          NULL|
|1989-01-04 00:00:00|0.19999999|    m^3/s|nwm30-10025746|  NULL|nwm30_retrospective|streamflow_daily_...|          NULL|
|1989-01-05 00:00:00|0.19874999|    m^3/s|nwm30-10025746|  NULL|nwm30_retrospective|streamflow_daily_...|          NULL|
|1989-01-06 00:00:00|      0.19|

                                                                                

In [16]:
# Append the evaluation's location crosswalks (primary -> secondary location
# ids) to the Iceberg `location_crosswalks` table, then preview the first rows.
# The identifier is built from catalog_name/schema_name so it stays in sync
# with the catalog configuration and the schema migrations.
location_crosswalk_sdf = ev.location_crosswalks.to_sdf()
location_crosswalk_sdf.writeTo(f"{catalog_name}.{schema_name}.location_crosswalks").append()
location_crosswalk_sdf.show()

+-------------------+---------------------+
|primary_location_id|secondary_location_id|
+-------------------+---------------------+
|      usgs-01030500|           nwm30-3923|
|      usgs-04057510|         nwm30-272589|
|      usgs-11299600|         nwm30-348419|
|      usgs-07195800|         nwm30-399452|
|      usgs-07197000|         nwm30-400496|
|      usgs-07196900|         nwm30-400822|
|      usgs-03213700|         nwm30-435154|
|      usgs-03281500|         nwm30-503758|
|      usgs-07335700|         nwm30-588170|
|      usgs-01013500|         nwm30-724696|
|      usgs-07083000|         nwm30-916821|
|      usgs-07346045|        nwm30-1017865|
|      usgs-02221525|        nwm30-1056599|
|      usgs-10234500|        nwm30-1215135|
|      usgs-09035900|        nwm30-1238533|
|      usgs-08050800|        nwm30-1275870|
|      usgs-09047700|        nwm30-1314083|
|      usgs-09066300|        nwm30-1319214|
|      usgs-09066200|        nwm30-1320244|
|      usgs-09065500|        nwm

In [17]:
# Stop the Spark session created in the session-setup cell.
sedona.stop()

- Creating the joined timeseries needs some filters so that it can be built incrementally.
- The joined timeseries table should maybe contain only the joined timeseries. That join does not parallelize well; the other joins do.