In [1]:
import xarray as xr
import sqlite3
import geopandas as gpd
import pandas as pd
import boto3

from datetime import datetime
import os
from pathlib import Path
import logging

import teehr
from teehr.evaluation.spark_session_utils import create_spark_session

# Display the imported TEEHR version so the notebook output records it.
teehr.__version__

'0.6.0dev4'

In [2]:
# Prefix substituted for "wb" in hydrofabric flowpath ids to build the
# secondary location ids of the crosswalk.
LOCATION_ID_PREFIX = "nrds22"
# Name under which the configuration is added to the TEEHR configurations table.
CONFIGURATION_NAME = "nrds_v22_cfenom_short_range"
# Run directory handed to get_gages_from_hydrofabric(); its "config"
# subfolder is searched for the hydrofabric .gpkg file.
NGEN_OUTPUT_DIR = "/data/datastream_output/vpu16/ngen-run"

### Create the crosswalk where USGS gages exist

In [3]:
def get_gages_from_hydrofabric(folder_to_eval):
    """
    Get the (flowpath id, gage id) pairs from the hydrofabric geopackage.

    The ``config`` subfolder of ``folder_to_eval`` is searched recursively
    and the first ``.gpkg`` file found (directories and file names visited
    in sorted order, so the choice is deterministic) is opened as a SQLite
    database. The hydrofabric version is detected from ``gpkg_contents``:
    v2.2 registers a ``pois`` table, v20.1 does not.

    Ref: https://github.com/JoshCu/ngiab_eval/blob/2e8fd96b21a369bb93b2a491b0c303a4018a290e/ngiab_eval/core.py

    Parameters
    ----------
    folder_to_eval : str or os.PathLike
        Run folder that contains a ``config`` directory holding the
        hydrofabric geopackage.

    Returns
    -------
    list of tuple
        ``(id, gage)`` rows for every flowpath with a non-null gage. For
        v20.1 hydrofabrics only the first entry of the comma-separated
        ``rl_gages`` value is kept.

    Raises
    ------
    FileNotFoundError
        If no ``.gpkg`` file exists under ``<folder_to_eval>/config``.
    """
    config_dir = os.path.join(folder_to_eval, "config")

    # Stop at the first geopackage found. Sorting the directory and file
    # names makes the pick independent of filesystem listing order, and the
    # single `break` leaves the walk entirely (no later match can override).
    gpkg_file = None
    for root, dirs, files in os.walk(config_dir):
        dirs.sort()
        candidates = sorted(name for name in files if name.endswith(".gpkg"))
        if candidates:
            gpkg_file = os.path.join(root, candidates[0])
            break

    if gpkg_file is None:
        raise FileNotFoundError(f"No subset.gpkg file found in folder: {folder_to_eval}")

    # sqlite3's connection context manager only commits/rolls back; close the
    # connection explicitly so the file handle is not leaked.
    conn = sqlite3.connect(gpkg_file)
    try:
        # figure out if the hf is v20.1 or v2.2
        # 2.2 has a pois table, 20.1 does not
        pois_count = conn.execute(
            "SELECT count(*) FROM gpkg_contents WHERE table_name = 'pois'"
        ).fetchall()[0][0]

        if pois_count != 0:
            return conn.execute(
                "SELECT id, gage FROM 'flowpath-attributes' WHERE gage IS NOT NULL"
            ).fetchall()

        rows = conn.execute(
            "SELECT id, rl_gages FROM flowpath_attributes WHERE rl_gages IS NOT NULL"
        ).fetchall()
    finally:
        conn.close()

    # FIXME: take only the first gage if several are listed for one flowpath.
    # Should be fixed upstream in hydrofabric with only error handling here.
    return [(flowpath_id, gages.split(",")[0]) for flowpath_id, gages in rows]

In [4]:
%%time
# Read the (flowpath id, gage id) rows from the hydrofabric geopackage under
# NGEN_OUTPUT_DIR/config. The result is a list of tuples.
gage_list = get_gages_from_hydrofabric(NGEN_OUTPUT_DIR)

CPU times: user 239 ms, sys: 496 ms, total: 735 ms
Wall time: 16.1 s


In [5]:
# Build the crosswalk frame in one chain: the gage id gets a "usgs-" prefix
# (primary id) and the "wb" in the flowpath id is swapped for
# LOCATION_ID_PREFIX (secondary id).
ngen_gages_df = pd.DataFrame(
    gage_list,
    columns=["secondary_location_id", "primary_location_id"],
).assign(
    primary_location_id=lambda frame: "usgs-" + frame["primary_location_id"].astype(str),
    secondary_location_id=lambda frame: frame["secondary_location_id"].str.replace(
        "wb", LOCATION_ID_PREFIX
    ),
)

In [6]:
# Preview the first rows of the crosswalk to check the id formatting.
ngen_gages_df.head()

Unnamed: 0,secondary_location_id,primary_location_id
0,nrds22-2877056,usgs-10251980
1,nrds22-2877002,usgs-10251890
2,nrds22-2875761,usgs-10247200
3,nrds22-2880088,usgs-10249190
4,nrds22-2888411,usgs-10245800


### Add configuration name and append crosswalk to remote warehouse

In [7]:
# Resolve AWS credentials through boto3's default provider chain and export
# them as environment variables for code that reads credentials from the
# environment instead of from boto3.
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when no provider yields credentials.
    raise RuntimeError(
        "boto3 found no AWS credentials; configure them (e.g. ~/.aws/credentials "
        "or environment variables) before running this cell."
    )

# Read key, secret and token from one frozen snapshot so they always belong
# together, even if the underlying credentials are refreshable.
frozen_credentials = credentials.get_frozen_credentials()
aws_access_key_id = frozen_credentials.access_key
aws_secret_access_key = frozen_credentials.secret_key
aws_session_token = frozen_credentials.token

# Fall back to an already exported AWS_REGION when the session has no region.
aws_region = session.region_name or os.environ.get("AWS_REGION")
if not aws_region:
    raise RuntimeError(
        "No AWS region configured; set one in the AWS profile or via the "
        "AWS_REGION / AWS_DEFAULT_REGION environment variables."
    )

os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
if aws_session_token:
    # Temporary credentials are only valid together with their session token.
    os.environ["AWS_SESSION_TOKEN"] = aws_session_token
else:
    # Drop any stale token that does not belong to the keys exported above.
    os.environ.pop("AWS_SESSION_TOKEN", None)
os.environ["AWS_REGION"] = aws_region

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [8]:
%%time
# Start the Spark session with TEEHR's helper (imported from
# teehr.evaluation.spark_session_utils).
spark = create_spark_session()

# Directory handed to the Evaluation as its dir_path.
dir_path = "/data/temp_warehouse"

# Open the evaluation on that directory with the Spark session created above.
# NOTE(review): create_dir=False presumably means the directory must already
# exist — confirm against the TEEHR docs.
ev = teehr.Evaluation(
    spark=spark,
    dir_path=dir_path,
    create_dir=False
)

INFO:teehr.evaluation.spark_session_utils:🚀 Creating Spark session: TEEHR Evaluation
INFO:teehr.evaluation.spark_session_utils:✅ Spark local configuration successful!
INFO:teehr.evaluation.spark_session_utils:Setting Hadoop's default AWS credentials provider and AWS region
INFO:botocore.credentials:Found credentials in environment variables.
INFO:teehr.evaluation.spark_session_utils:🔑 Using AWS credentials from boto3
INFO:teehr.evaluation.spark_session_utils:Configuring Iceberg catalogs...
INFO:teehr.evaluation.spark_session_utils:⚙️ All settings applied. Creating Spark session...
INFO:teehr.evaluation.spark_session_utils:🎉 Spark session created successfully!
INFO:teehr.evaluation.evaluation:Using provided Spark session.
INFO:teehr.evaluation.evaluation:Active catalog set to local.
INFO:teehr.evaluation.evaluation:Found evaluation version 0.6.0 in /data/temp_warehouse/local.


CPU times: user 41.2 ms, sys: 26.9 ms, total: 68 ms
Wall time: 11.5 s


In [9]:
# Make "remote" the active catalog, then echo the active catalog to confirm
# the switch.
ev.set_active_catalog("remote")
ev.active_catalog

INFO:teehr.evaluation.evaluation:Active catalog set to remote.


RemoteCatalog(warehouse_dir='s3://dev-teehr-iceberg-warehouse/', catalog_name='iceberg', namespace_name='teehr', catalog_type='rest', catalog_uri='http://iceberg-rest:8181')

In [23]:
# Add CONFIGURATION_NAME to the evaluation's configurations table as a
# "secondary" configuration.
# NOTE(review): this presumably writes to whichever catalog is currently
# active — confirm the active catalog, and how TEEHR treats an already
# existing name, before re-running.
ev.configurations.add(
    [
        teehr.Configuration(
            name=CONFIGURATION_NAME,
            type="secondary",
            description="POC version of DataStream forecasts, hydrofabric v.2.2, CFE-NOM",
        )
    ]
)

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.configurations.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.configurations.
INFO:teehr.evaluation.tables.domain_table:Validating 1 objects before adding to configurations table
INFO:teehr.evaluation.validate:Validating DataFrame against schema.
INFO:teehr.evaluation.tables.domain_table:Adding 1 objects to configurations table
INFO:teehr.evaluation.tables.domain_table:Validating configurations table after adding 1 objects
INFO:teehr.evaluation.validate:Validating DataFrame against schema.


In [11]:
# Pull the evaluation's locations table into a pandas DataFrame; its "id"
# column is what the crosswalk's primary ids are checked against.
teehr_locations = ev.locations.to_pandas()

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.locations.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.locations.


In [13]:
# Keep only the crosswalk rows whose primary id is present in the "id" column
# of the locations table.
ngen_gages_df_clip = ngen_gages_df.loc[
    ngen_gages_df["primary_location_id"].isin(teehr_locations["id"])
]

In [20]:
# Write the filtered crosswalk to parquet. After the boolean filter the frame
# no longer has a plain RangeIndex, so without index=False pandas would store
# the leftover row labels as an extra column in the file.
ngen_gages_df_clip.to_parquet(
    "/data/datastream_output/vpu16/nrds_usgs_xwalk.parquet",
    index=False,
)

In [23]:
%%time
# Load the crosswalk parquet file into the evaluation's location_crosswalks
# table in "append" mode.
# NOTE(review): re-running this cell may append the same rows again — confirm
# how TEEHR handles duplicates for write_mode="append".
ev.location_crosswalks.load_parquet(in_path="/data/datastream_output/vpu16/nrds_usgs_xwalk.parquet", write_mode="append")

INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.location_crosswalks.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.location_crosswalks.
INFO:teehr.loading.location_crosswalks:Converting location crosswalks data from: /data/datastream_output/vpu16/nrds_usgs_xwalk.parquet
INFO:teehr.evaluation.validate:Validating DataFrame against schema.
INFO:teehr.evaluation.extract:Converted 1 files.
INFO:teehr.evaluation.read:Reading files from /data/temp_warehouse/local/cache/loading/location_crosswalks.
INFO:teehr.evaluation.validate:Enforcing warehouse schema.
INFO:teehr.evaluation.validate:Enforcing foreign key constraints.
INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.locations.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.locations.
INFO:teehr.evaluation.tables.base_table:Loading files from iceberg.teehr.location_crosswalks.
INFO:teehr.evaluation.read:Reading files from iceberg.teehr.location_crosswalks.


CPU times: user 49.7 ms, sys: 14.9 ms, total: 64.6 ms
Wall time: 3.16 s
