In [1]:
import os
from pathlib import Path

import teehr

from teehr.evaluation.spark_session_utils import create_spark_session

In [2]:
# Display the installed teehr version so the results below can be tied to it.
teehr.__version__

'0.6.0dev2'

#### Test 1. Starting up a Spark cluster and reading from S3

In [None]:
%%time
spark = create_spark_session(
    start_spark_cluster=True
)

In [None]:
# Attach the session to a local evaluation directory.
# NOTE(review): `check_evaluation_version=False` presumably skips the
# version check of the evaluation directory - confirm against teehr docs.
ev = teehr.Evaluation(
    spark=spark,
    dir_path="/data/temp_warehouse",
    check_evaluation_version=False,
)

In [5]:
# Read every `usgs*.parquet` file under the public bucket prefix through the
# session created above, using the s3a:// filesystem scheme.
obs_sdf = spark.read.parquet(
    "s3a://ciroh-rti-public-data/teehr/protocols/science-eval/timeseries/usgs*.parquet"
)

#### Test 2. Connecting to the teehr-data-warehouse in s3. We need to pass in credentials to be able to access the s3 data warehouse.
Anonymous credentials do not work.

We need to bypass the minio catalog config if we want to interact with the s3 warehouse

In [2]:
# Set IN_CLUSTER to "false" before the session is created; per the note above
# this is intended to bypass the MinIO catalog configuration.
os.environ["IN_CLUSTER"] = "false"
# Echo the normalised value back as a sanity check (cell output).
os.getenv("IN_CLUSTER", "false").lower()

'false'

**NOTE:** For now, personal AWS access keys must be passed in to access the data warehouse in S3.

In [None]:
%%time
spark = create_spark_session(
    remote_catalog_uri="http://dev-teehr-sys-iceberg-alb-2105268770.us-east-2.elb.amazonaws.com",
    remote_warehouse_dir="s3://dev-teehr-sys-iceberg-warehouse/warehouse/",
    aws_access_key_id="",
    aws_secret_access_key="", 
)

In [None]:
# Bind the remote-capable session to the local evaluation directory,
# asking for the directory to be created (`create_dir=True`).
ev = teehr.Evaluation(
    spark=spark,
    dir_path="/data/temp_warehouse",
    create_dir=True,
)

In [5]:
# Show the remote catalog settings held by the evaluation (warehouse dir,
# catalog name/type, namespace and URI - see the repr in the output).
ev.remote_catalog

RemoteCatalog(warehouse_dir='s3://dev-teehr-sys-iceberg-warehouse/warehouse/', catalog_name='iceberg', namespace_name='teehr', catalog_type='rest', catalog_uri='http://dev-teehr-sys-iceberg-alb-2105268770.us-east-2.elb.amazonaws.com')

In [None]:
# Make "remote" the active catalog.
# NOTE(review): subsequent table reads are presumably served from the remote
# catalog after this call - confirm.
ev.set_active_catalog("remote")

In [None]:
# Row count of the locations table, obtained through `to_sdf()`.
ev.locations.to_sdf().count()

In [None]:
# Fetch the locations table via `to_geopandas()` and display the result.
ev.locations.to_geopandas()

#### Test 3. Minimal setup: Provide local Evaluation directory. Try reading from s3 using PySpark

In [6]:
# Create a session with all defaults; the log output below reports a local
# Spark configuration and anonymous AWS credentials for S3 access.
spark = create_spark_session()

INFO:session_utils:🚀 Creating Spark session: TEEHR Evaluation
INFO:session_utils:✅ Spark local configuration successful!
INFO:session_utils:🔑 Using anonymous AWS credentials for S3 access
INFO:session_utils:Configuring Iceberg catalogs...
INFO:session_utils:⚠️  Configuring remote catalog for MinIO access
INFO:session_utils:Spark session created for TEEHR Evaluation.
INFO:session_utils:🎉 Spark session created successfully!


In [7]:
%%time
ev = teehr.Evaluation(
    spark=spark,
    dir_path="/data/temp_warehouse",
    create_dir=True
)

INFO:teehr.evaluation.evaluation:Directory /data/temp_warehouse already exists. Not creating it again.
INFO:teehr.evaluation.evaluation:Using provided Spark session.
INFO:teehr.evaluation.evaluation:Active catalog set to local.


CPU times: user 3.75 ms, sys: 589 μs, total: 4.34 ms
Wall time: 5.49 ms


In [8]:
# CSV reader options: the files carry a header row, and files that go
# missing between listing and reading are ignored instead of failing.
options = dict(header="true", ignoreMissingFiles="true")

# Build the Spark schema for the read from the attributes table's own schema.
tbl = ev.attributes()
schema = tbl.schema_func().to_structtype()

s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/attributes/"

# Read the attribute CSVs from the public S3 prefix with that schema.
attrs_sdf = (
    spark.read.format("csv")
    .options(**options)
    .load(s3_dirpath, schema=schema)
)

#### Test 4. Pass in extra config and packages

In [3]:
%%time
spark = create_spark_session(
    extra_configs={"spark.sql.shuffle.partitions": "200"},
    # extra_packages=["/path/to/my_custom_udfs.jar"]  # this will fail since it's fake, but this is the pattern
)

ev = teehr.Evaluation(
    spark=spark,
    dir_path="/data/temp_warehouse",
    create_dir=True
)

INFO:session_utils:🚀 Creating Spark session: TEEHR Evaluation
INFO:session_utils:✅ Spark local configuration successful!
INFO:session_utils:🔑 Using anonymous AWS credentials for S3 access
INFO:session_utils:Configuring Iceberg catalogs...
INFO:session_utils:⚠️  Configuring remote catalog for MinIO access
INFO:session_utils:Spark session created for TEEHR Evaluation.
INFO:session_utils:🎉 Spark session created successfully!
INFO:teehr.evaluation.evaluation:Directory /data/temp_warehouse already exists. Not creating it again.
INFO:teehr.evaluation.evaluation:Using provided Spark session.
INFO:teehr.evaluation.evaluation:Active catalog set to local.


CPU times: user 6.91 ms, sys: 45.9 ms, total: 52.8 ms
Wall time: 12.1 s


In [4]:
# Log the final Spark configuration (emitted at INFO level, see output) so
# the extra config passed above can be checked.
ev.log_spark_config()

INFO:teehr.evaluation.spark_session_utils:Final Spark configuration:
INFO:teehr.evaluation.spark_session_utils: spark.app.id: local-1761249102607
INFO:teehr.evaluation.spark_session_utils: spark.app.initial.file.urls: 
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.apache.hadoop_hadoop-aws-3.4.1.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.slf4j_slf4j-api-2.0.17.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.antlr_antlr-runtime-3.5.3.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.datasyslab_geotools-wrapper-1.8.0-33.1.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.antlr_ST4-4.3.4.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/com.ibm.icu_icu4j-72.1.jar
INFO:teehr.evaluation.spark_session_utils:    file:///home/jovyan/.ivy2.5.2/jars/org.antlr_an