In [1]:
import os
from pathlib import Path
import shutil

import teehr

# Display the installed TEEHR version so the notebook output records which
# release produced these results.
teehr.__version__

'0.6.0dev4'

In [2]:
from teehr.evaluation.spark_session_utils import create_spark_session

In [3]:
# Build the Spark session used for every read and write in this notebook.
#
# The object-store credentials are taken from the environment when available
# so that real secrets never have to be committed with the notebook. The
# fallbacks are the values that were previously hardcoded here.
# NOTE(review): the variable names assume a MinIO-backed warehouse (the
# defaults look like MinIO's stock root credentials) -- confirm against the
# deployment before relying on them.
spark = create_spark_session(
    aws_access_key_id=os.environ.get("MINIO_ROOT_USER", "minioadmin"),
    aws_secret_access_key=os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin123"),
    update_configs={
        # NOTE(review): this selects the anonymous s3a credentials provider
        # even though explicit keys are passed above; presumably this is so
        # the public source bucket can be read without signing -- confirm.
        "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
    }
)

INFO:teehr.evaluation.spark_session_utils:🚀 Creating Spark session: TEEHR Evaluation
INFO:teehr.evaluation.spark_session_utils:✅ Spark local configuration successful!
INFO:teehr.evaluation.spark_session_utils:Setting Hadoop's default AWS credentials provider and AWS region
INFO:teehr.evaluation.spark_session_utils:🔑 Using user-provided AWS credentials
INFO:teehr.evaluation.spark_session_utils:Configuring Iceberg catalogs...
INFO:teehr.evaluation.spark_session_utils:⚙️ All settings applied. Creating Spark session...
INFO:teehr.evaluation.spark_session_utils:🎉 Spark session created successfully!


In [4]:
%%time
# Directory that holds the local evaluation for this run.
dir_path = "/data/temp_warehouse"

# Start from a clean slate: remove whatever a previous run left behind.
# Removal errors (for example, the directory not existing yet) are ignored.
shutil.rmtree(path=dir_path, ignore_errors=True)

# Create the evaluation on the existing Spark session, letting TEEHR create
# the directory that was just removed.
ev = teehr.Evaluation(
    dir_path=dir_path,
    create_dir=True,
    spark=spark,
)

INFO:teehr.evaluation.evaluation:Creating directory /data/temp_warehouse.
INFO:teehr.evaluation.evaluation:Using provided Spark session.
INFO:teehr.evaluation.evaluation:Active catalog set to local.


CPU times: user 5.19 ms, sys: 10.9 ms, total: 16.1 ms
Wall time: 305 ms


In [5]:
# Copy the evaluation template shipped with the teehr package into the
# evaluation directory. In the recorded run this also created the
# schema_evolution bookkeeping and applied schema version 1 to local.teehr.
ev.clone_template()

INFO:teehr.evaluation.evaluation:Copying template from /srv/conda/envs/notebook/lib/python3.12/site-packages/teehr/template to /data/temp_warehouse/local
INFO:teehr.utilities.apply_migrations:✅ Created schema: local.schema_evolution
INFO:teehr.utilities.apply_migrations:✅ Created table: local.schema_evolution.schema_version_history
INFO:teehr.utilities.apply_migrations:Applying schema version 1 to local.teehr


In [6]:
# Apply the TEEHR schema migrations. In the recorded run this copied the
# migration scripts into the evaluation directory and applied schema
# version 1 to iceberg.teehr.
ev.apply_schema_migration()

INFO:teehr.evaluation.evaluation:Copying migration scripts to evaluation directory.
INFO:teehr.utilities.apply_migrations:✅ Created schema: iceberg.schema_evolution
INFO:teehr.utilities.apply_migrations:✅ Created table: iceberg.schema_evolution.schema_version_history
INFO:teehr.utilities.apply_migrations:Applying schema version 1 to iceberg.teehr
INFO:teehr.evaluation.evaluation:Schema evolution completed for iceberg.


In [7]:
# Make "remote" the active catalog before loading any data.
# NOTE(review): presumably every later warehouse read/write goes through the
# active catalog -- the later log lines reference iceberg.teehr, which matches.
ev.set_active_catalog("remote")

# Echo the active catalog so its connection settings appear in the output.
ev.active_catalog

INFO:teehr.evaluation.evaluation:Active catalog set to remote.


RemoteCatalog(warehouse_dir='s3://warehouse/', catalog_name='iceberg', namespace_name='teehr', catalog_type='rest', catalog_uri='http://iceberg-rest:8181')

#### Attributes

In [8]:
# Reader options reused by every load in this notebook.
options = {"header": "true", "ignoreMissingFiles": "true"}

# Read with the schema declared by the warehouse table instead of letting
# Spark infer column types from the files.
tbl = ev.attributes()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/attributes/"

sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [9]:
# Write the attribute rows read above into the "attributes" table.
ev.write.to_warehouse(
    source_data=sdf,
    table_name="attributes",
)

#### Locations

In [10]:
# Read the source locations using the schema declared by the locations table.
tbl = ev.locations()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/locations/"

sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [11]:
# Collect the locations into a pandas DataFrame on the driver.
# NOTE(review): this pulls every source location into memory even though only
# a handful of gages are kept afterwards; filtering with Spark before
# converting (as the later cells do) would avoid the full collect.
df = sdf.toPandas()

In [None]:
# Handpicked sites that seemed interesting.
# Location ids are the "usgs-" prefix followed by the gage number.
usgs_gages = [
    f"usgs-{gage_number}"
    for gage_number in (
        "02424000",
        "03068800",
        "01570500",
        "01347000",
        "05443500",
        "06770500",
        "08313000",
        "11421000",
        "14319500",
    )
]

In [13]:
# Keep only the rows whose id is one of the handpicked gages.
locs_df = df.loc[df["id"].isin(usgs_gages)]

In [14]:
# Write the selected gage locations into the "locations" table.
ev.write.to_warehouse(
    source_data=locs_df,
    table_name="locations",
)

#### Location attributes

In [15]:
# Read the source location attributes using the schema declared by the
# location_attributes table.
tbl = ev.location_attributes()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/location_attributes/"

sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [16]:
# Keep only the attribute rows that belong to the handpicked gages.
sdf = sdf.where(sdf["location_id"].isin(usgs_gages))

In [17]:
# Write the filtered rows into the "location_attributes" table.
ev.write.to_warehouse(
    source_data=sdf,
    table_name="location_attributes",
)

#### Location crosswalks

In [18]:
# Read the source crosswalks using the schema declared by the
# location_crosswalks table.
tbl = ev.location_crosswalks()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/location_crosswalks/"

sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [19]:
# Keep only the crosswalk rows whose primary location is a handpicked gage.
sdf = sdf.where(sdf["primary_location_id"].isin(usgs_gages))

In [20]:
# Write the filtered rows into the "location_crosswalks" table.
ev.write.to_warehouse(
    source_data=sdf,
    table_name="location_crosswalks",
)

#### Units

In [21]:
# Load the units CSV files with the schema declared by the units table and
# write them to the warehouse unfiltered.
tbl = ev.units()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/units/"

sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

ev.write.to_warehouse(
    source_data=sdf,
    table_name="units",
)

#### Variables

In [22]:
# Load the variables CSV files with the schema declared by the variables
# table and write them to the warehouse unfiltered.
tbl = ev.variables()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/variables/"

sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

ev.write.to_warehouse(
    source_data=sdf,
    table_name="variables",
)

#### Configurations

In [23]:
# Load the configurations CSV files with the schema declared by the
# configurations table and write them to the warehouse unfiltered.
tbl = ev.configurations()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/configurations/"

sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

ev.write.to_warehouse(
    source_data=sdf,
    table_name="configurations",
)

#### Primary Timeseries

In [24]:
# Load the primary timeseries parquet files with the schema declared by the
# primary_timeseries table.
tbl = ev.primary_timeseries()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/primary_timeseries/"

sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Restrict to the handpicked gages before writing to the warehouse.
sdf = sdf.where(sdf["location_id"].isin(usgs_gages))

ev.write.to_warehouse(
    source_data=sdf,
    table_name="primary_timeseries",
)

#### Secondary Timeseries

In [25]:
%%time
# Load the secondary timeseries parquet files with the schema declared by the
# secondary_timeseries table.
tbl = ev.secondary_timeseries()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/secondary_timeseries/"
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# The secondary location ids to keep come from the crosswalk rows already
# stored in the warehouse.
xwalk_df = ev.location_crosswalks.to_pandas()

sdf = sdf.where(
    sdf["location_id"].isin(xwalk_df["secondary_location_id"].tolist())
)

ev.write.to_warehouse(
    source_data=sdf,
    table_name="secondary_timeseries",
)

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.location_crosswalks.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.location_crosswalks.


CPU times: user 35.3 ms, sys: 5.83 ms, total: 41.2 ms
Wall time: 1min 5s


In [26]:
# Count the rows now stored in secondary_timeseries as a quick check that the
# write above landed.
ev.secondary_timeseries.to_sdf().count()

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.secondary_timeseries.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.secondary_timeseries.


3319920

In [27]:
# List the distinct configuration_name values present in secondary_timeseries.
ev.secondary_timeseries.distinct_values("configuration_name")

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.secondary_timeseries.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.secondary_timeseries.


['nwm30_retrospective']

In [None]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()