In [1]:
from teehr import Evaluation
from pathlib import Path
import shutil
import xarray as xr
import geopandas as gpd
from teehr.models.tables import (
    Attribute,
    Configuration,
    Variable
)
import duckdb

In [2]:
# Directory where this evaluation will be created.
TEST_STUDY_DIR = Path("/data/v0_4_protocols/p2_camels_hourly_streamflow")
# Start from a clean slate: delete whatever an earlier run left behind
# (deletion errors are ignored), then make sure the directory exists.
# WARNING: this removes everything under TEST_STUDY_DIR on every run.
shutil.rmtree(TEST_STUDY_DIR, ignore_errors=True)
TEST_STUDY_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Root of the pre-built source dataset; every input below lives under it.
TEST_DATA_DIR = Path("/data/protocols/p2_hourly_streamflow_sim/teehr_database")

# Single parquet file holding the location point geometries.
LOCATIONS_FILEPATH = TEST_DATA_DIR / "geometry" / "usgs_point_geometry.p2.parquet"

# Timeseries inputs (no file extension, so presumably directories of
# parquet files -- confirm against the dataset layout).
PRIMARY_TIMESERIES_FILEPATH = TEST_DATA_DIR / "primary"
SECONDARY_TIMESERIES_FILEPATH = TEST_DATA_DIR / "secondary"

# Crosswalk and location attribute inputs.
CROSSWALK_FILEPATH = TEST_DATA_DIR / "crosswalks"
ATTR_FILEPATH = TEST_DATA_DIR / "attributes"

In [4]:
# Create an Evaluation object rooted at the (freshly emptied) study directory
ev = Evaluation(dir_path=TEST_STUDY_DIR)

# Enable logging
ev.enable_logging()

# Clone the template
# (presumably populates TEST_STUDY_DIR with TEEHR's default evaluation
# layout -- confirm against the TEEHR docs)
ev.clone_template()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/29 18:18:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/29 18:18:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Load the location geometries (the points where observations exist)
# from the USGS point-geometry parquet file into the evaluation.
ev.locations.load_spatial(in_path=LOCATIONS_FILEPATH)

<teehr.evaluation.tables.LocationTable at 0x7fd9aa82e8a0>

In [6]:
# Sanity check: preview the first few loaded locations.
ev.locations.to_geopandas().head()

Unnamed: 0,id,name,geometry
0,usgs-01013500,"Fish River near Fort Kent, Maine",POINT (-68.58278 47.23750)
1,usgs-01022500,"Narraguagus River at Cherryfield, Maine",POINT (-67.93528 44.60806)
2,usgs-01030500,"Mattawamkeag River near Mattawamkeag, Maine",POINT (-68.30583 45.50111)
3,usgs-01031500,"Piscataquis River near Dover-Foxcroft, Maine",POINT (-69.31472 45.17500)
4,usgs-01047000,"Carrabassett River near North Anson, Maine",POINT (-69.95500 44.86917)


In [7]:
# How the fields of the primary parquet files line up with the TEEHR
# timeseries fields (presumably source column -> table field; confirm
# against the TEEHR docs).
primary_field_mapping = {
    "reference_time": "reference_time",
    "value_time": "value_time",
    "configuration": "configuration_name",
    "measurement_unit": "unit_name",
    "variable_name": "variable_name",
    "value": "value",
    "location_id": "location_id",
}

# Fields given one constant value for the whole load.
primary_constants = {
    "unit_name": "m^3/s",
    "configuration_name": "usgs_observations",
}

# Load the primary timeseries using the mapping and constants above.
ev.primary_timeseries.load_parquet(
    in_path=PRIMARY_TIMESERIES_FILEPATH,
    field_mapping=primary_field_mapping,
    constant_field_values=primary_constants,
)

                                                                                

<teehr.evaluation.tables.PrimaryTimeseriesTable at 0x7fd8b84a4560>

In [8]:
# Load the crosswalk data
# (presumably links primary location ids to the ids used by the secondary
# timeseries -- confirm against the TEEHR docs)
ev.location_crosswalks.load_parquet(
    in_path=CROSSWALK_FILEPATH
)

<teehr.evaluation.tables.LocationCrosswalkTable at 0x7fd8b84a6900>

In [9]:
# How the fields of the secondary parquet files line up with the TEEHR
# timeseries fields (presumably source column -> table field; confirm
# against the TEEHR docs).
secondary_field_mapping = {
    "reference_time": "reference_time",
    "value_time": "value_time",
    "configuration": "configuration_name",
    "measurement_unit": "unit_name",
    "variable_name": "variable_name",
    "value": "value",
    "location_id": "location_id",
}

# Fields given one constant value for the whole load.
secondary_constants = {
    "unit_name": "m^3/s",
    "configuration_name": "nwm30_retrospective",
}

# Load the secondary timeseries using the mapping and constants above.
ev.secondary_timeseries.load_parquet(
    in_path=SECONDARY_TIMESERIES_FILEPATH,
    field_mapping=secondary_field_mapping,
    constant_field_values=secondary_constants,
)

                                                                                

<teehr.evaluation.tables.SecondaryTimeseriesTable at 0x7fd88a7b0b90>

In [10]:
# Register the marrmot_37_hbv_obj1 configuration with the evaluation
# (presumably required before timeseries tagged with it can be loaded --
# confirm against the TEEHR docs).
marrmot_config = Configuration(
    name="marrmot_37_hbv_obj1",
    type="secondary",
    description="marrmot_37_hbv_obj1",
)
ev.configurations.add(configuration=marrmot_config)

In [11]:
# Single parquet file holding the marrmot_37_hbv_obj1 simulation.
# NOTE(review): this file lives under p1_daily_streamflow_sim
# (variable_name=streamflow_daily_mean), while every other input in this
# notebook comes from p2_hourly_streamflow_sim -- confirm that mixing the
# two datasets is intended.
MARRMOT_TIMESERIES_FILEPATH = (
    "/data/protocols/p1_daily_streamflow_sim/teehr_database/secondary/"
    "configuration=marrmot_37_hbv_obj1/variable_name=streamflow_daily_mean/"
    "marrmot_37_hbv_obj1.parquet"
)

# Load the marrmot_37_hbv_obj1 simulation as a second set of secondary
# timeseries. Only the unit is set as a constant here; the configuration
# name is not, so it presumably comes from the file itself.
ev.secondary_timeseries.load_parquet(
    in_path=MARRMOT_TIMESERIES_FILEPATH,
    field_mapping={
        "reference_time": "reference_time",
        "value_time": "value_time",
        "configuration": "configuration_name",
        "measurement_unit": "unit_name",
        "variable_name": "variable_name",
        "value": "value",
        "location_id": "location_id"
    },
    constant_field_values={
        "unit_name": "m^3/s",
    }
)

                                                                                

<teehr.evaluation.tables.SecondaryTimeseriesTable at 0x7fd88a7470b0>

In [12]:
# Find every distinct attribute name present in the attribute parquet
# files (searched recursively under ATTR_FILEPATH).
attribute_names_sql = (
    f"SELECT distinct(attribute_name) FROM read_parquet('{ATTR_FILEPATH}/**/*.parquet');"
)
df = duckdb.query(attribute_names_sql).to_df()

# Build one Attribute per name; the name doubles as the description.
# NOTE(review): every attribute is declared "categorical" -- confirm that
# none of them should be typed differently.
attrs_list = [
    Attribute(name=attr_name, type="categorical", description=attr_name)
    for attr_name in df["attribute_name"]
]

In [13]:
# Register every attribute built from the attribute files with the
# evaluation (presumably required before location attributes that use
# these names can be loaded -- confirm against the TEEHR docs).
ev.attributes.add(attrs_list)

In [14]:
# Load the location attribute data found under ATTR_FILEPATH.
# The mapping presumably renames the source "attribute_value" column to the
# table's "value" field -- confirm against the TEEHR docs.
ev.location_attributes.load_parquet(
    in_path=ATTR_FILEPATH,
    field_mapping={"attribute_value": "value"},
)

<teehr.evaluation.tables.LocationAttributeTable at 0x7fd88a2524b0>

In [15]:
# Create the joined timeseries from the data loaded so far.
# execute_udf=True presumably also runs the evaluation's user-defined
# functions while building the table -- confirm against the TEEHR docs.
ev.joined_timeseries.create(execute_udf=True)

24/10/29 18:22:11 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

<teehr.evaluation.tables.JoinedTimeseriesTable at 0x7fd88a22d6d0>

In [16]:
# Stop the evaluation's Spark session now that the evaluation is built
# (presumably releases the resources it holds).
ev.spark.stop()