In [1]:
from teehr import Evaluation
from pathlib import Path
import shutil
import pandas as pd
import geopandas as gpd

In [2]:
# Set a path to the directory where the evaluation will be created.
# WARNING: any existing directory tree at this path is deleted below so that
# every run of the notebook starts from an empty evaluation directory.
TEST_STUDY_DIR = Path("/data/v0_4_protocols/p0_2_location_example")
# ignore_errors=True: errors while deleting (e.g. the directory does not exist
# yet) are ignored rather than raised.
shutil.rmtree(TEST_STUDY_DIR, ignore_errors=True)
# Recreate the directory, creating any missing parent directories as well.
TEST_STUDY_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Input files are expected to sit directly in the current user's home directory.
TEST_DATA = Path.home()

# Parquet inputs used by the loading cells further down.
LOCATIONS = TEST_DATA / "two_locations.parquet"
XWALKS = TEST_DATA / "two_crosswalks.parquet"
LOCATION_ATTRS = TEST_DATA / "two_location_attributes.parquet"

In [4]:
# Create an Evaluation object rooted at the (freshly emptied) study directory.
# `ev` is the handle every later cell uses to load, fetch and query data.
ev = Evaluation(dir_path=TEST_STUDY_DIR)

# Enable logging so the INFO/DEBUG messages from teehr appear under each cell.
ev.enable_logging()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/30 20:26:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/30 20:26:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Clone the evaluation template into the evaluation directory.
# NOTE(review): presumably this lays down the empty dataset/cache folder
# structure that the loading cells below write into — confirm in teehr docs.
# Future: clone an entire evaluation dataset as a starting point.
ev.clone_template()

In [6]:
# Load the location data (the observation/gage locations) from the
# two_locations.parquet file into the evaluation's locations table.
ev.locations.load_spatial(in_path=LOCATIONS)

<teehr.evaluation.tables.LocationTable at 0x7fc5a06f3bf0>

In [7]:
# Read the loaded locations back as a GeoDataFrame. The result is only stored
# in `gdf`; add a bare `gdf` line at the end of the cell to display it.
# Optional filter suggested for inspecting a single gage:
#   .filter("id = 'usgs-14138800'")
gdf = ev.locations.to_geopandas()

In [8]:
# Fetch and insert USGS observation data for gage locations.
# The date range below is reused for the NWM retrospective fetch later on, so
# both timeseries cover the same period.
ev.fetch.usgs_streamflow(
    start_date="2000-10-01",
    end_date="2012-09-30"
)

INFO:teehr.fetching.usgs.usgs:Fetching USGS streamflow data.
DEBUG:teehr.fetching.utils:Creating periods based on chunk_by.
DEBUG:teehr.fetching.usgs.usgs:Fetching USGS streamflow data from NWIS.
DEBUG:teehr.fetching.usgs.usgs:Formatting column names.
DEBUG:teehr.fetching.utils:Formatting timeseries data types.
DEBUG:teehr.fetching.usgs.usgs:Filtering to hourly data.
DEBUG:teehr.fetching.usgs.usgs:Filtering out no data values.
DEBUG:teehr.fetching.usgs.usgs:Converting to SI units.
DEBUG:teehr.fetching.usgs.usgs:Formatting output filename.
DEBUG:teehr.fetching.utils:Writing parquet file: /data/v0_4_protocols/p0_2_location_example/cache/fetching/usgs/usgs_observations/streamflow_hourly_inst/2000-10-01_2012-09-30.parquet
INFO:teehr.loading.timeseries:Validating and inserting timeseries data from /data/v0_4_protocols/p0_2_location_example/cache/fetching/usgs
                                                                                

In [9]:
# Query the primary (observed) timeseries for a single timestamp and show the
# result as a pandas DataFrame; the parenthesised chain is the cell's output.
(
    ev.primary_timeseries
    # Optional: uncomment the next line to restrict the result to one gage.
    # .filter("location_id = 'usgs-14138800'")
    .filter("value_time = '2000-10-01 00:00:00'")
    .to_pandas()
)

INFO:teehr.evaluation.tables:Setting filter <class 'filter'>.
DEBUG:teehr.querying.filter_format:Filter value_time = '2000-10-01 00:00:00' is already string.  Applying as is.


Unnamed: 0,reference_time,value_time,value,unit_name,location_id,configuration_name,variable_name
0,NaT,2000-10-01,3.341388,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
1,NaT,2000-10-01,1.132674,m^3/s,usgs-14316700,usgs_observations,streamflow_hourly_inst


In [10]:
# Load the crosswalk data, which pairs each primary (USGS) location ID with a
# secondary (NWM) location ID, from the two_crosswalks.parquet file.
ev.location_crosswalks.load_parquet(
    in_path=XWALKS
)

INFO:teehr.loading.location_crosswalks:Converting crosswalks data: /home/jovyan/two_crosswalks.parquet
DEBUG:teehr.loading.location_crosswalks:Using default field mapping.
INFO:teehr.loading.location_crosswalks:Converting location crosswalks data from: /home/jovyan/two_crosswalks.parquet
INFO:teehr.loading.location_crosswalks:Converted 1 files.
INFO:teehr.loading.location_crosswalks:Validating and inserting location crosswalks data from /data/v0_4_protocols/p0_2_location_example/cache/loading/location_crosswalks


<teehr.evaluation.tables.LocationCrosswalkTable at 0x7fc5a06f33b0>

In [11]:
# Display the loaded crosswalk table to confirm the primary/secondary pairs.
ev.location_crosswalks.to_pandas()

Unnamed: 0,primary_location_id,secondary_location_id
0,usgs-14316700,nwm30-23894572
1,usgs-14138800,nwm30-23736071


In [12]:
# Fetch NWM v3.0 retrospective streamflow for the same period as the USGS
# observations. Per the log output, the locations fetched are the crosswalk's
# secondary location IDs matching 'nwm30-%', so the crosswalk must be loaded
# before this cell runs.
ev.fetch.nwm_retrospective_points(
    nwm_version="nwm30",
    variable_name="streamflow",
    start_date="2000-10-01",
    end_date="2012-09-30"
)

INFO:teehr.evaluation.utils:Getting schema variable name for streamflow.
INFO:teehr.evaluation.fetch:Getting secondary location IDs.
INFO:teehr.evaluation.tables:Performing the query.
DEBUG:teehr.querying.filter_format:Filter is not a list.  Making a list.
DEBUG:teehr.querying.filter_format:Validating and applying {'column': 'secondary_location_id', 'operator': 'like', 'value': 'nwm30-%'}
DEBUG:teehr.querying.filter_format:Filter: {"column":"secondary_location_id","operator":"like","value":"nwm30-%"}
INFO:teehr.fetching.nwm.retrospective_points:Fetching NWM retrospective point data, version: nwm30.
DEBUG:teehr.fetching.nwm.retrospective_points:Chunking data by time.
DEBUG:teehr.fetching.utils:Creating periods based on chunk_by.
DEBUG:teehr.fetching.nwm.retrospective_points:Fetching point data for 2000-10-01 00:00:00 to 2012-09-30 23:59:59.999999999.
DEBUG:teehr.fetching.nwm.retrospective_points:Converting DataArray to a formatted DataFrame.
DEBUG:teehr.fetching.utils:Formatting timeser

In [13]:
# Display the secondary (NWM retrospective) timeseries. This converts the
# whole table to pandas; the notebook display truncates the middle rows.
ev.secondary_timeseries.to_pandas()

Unnamed: 0,reference_time,value_time,value,unit_name,location_id,configuration_name,variable_name
0,NaT,2000-10-01 00:00:00,0.38,m^3/s,nwm30-23894572,nwm30_retrospective,streamflow_hourly_inst
1,NaT,2000-10-01 00:00:00,0.06,m^3/s,nwm30-23736071,nwm30_retrospective,streamflow_hourly_inst
2,NaT,2000-10-01 01:00:00,0.38,m^3/s,nwm30-23894572,nwm30_retrospective,streamflow_hourly_inst
3,NaT,2000-10-01 01:00:00,0.06,m^3/s,nwm30-23736071,nwm30_retrospective,streamflow_hourly_inst
4,NaT,2000-10-01 02:00:00,0.38,m^3/s,nwm30-23894572,nwm30_retrospective,streamflow_hourly_inst
...,...,...,...,...,...,...,...
210379,NaT,2012-09-30 21:00:00,0.07,m^3/s,nwm30-23736071,nwm30_retrospective,streamflow_hourly_inst
210380,NaT,2012-09-30 22:00:00,0.52,m^3/s,nwm30-23894572,nwm30_retrospective,streamflow_hourly_inst
210381,NaT,2012-09-30 22:00:00,0.07,m^3/s,nwm30-23736071,nwm30_retrospective,streamflow_hourly_inst
210382,NaT,2012-09-30 23:00:00,0.52,m^3/s,nwm30-23894572,nwm30_retrospective,streamflow_hourly_inst


In [14]:
from teehr.models.tables import (
    Attribute,
)
import duckdb

In [15]:
# Collect the distinct attribute names present in the location-attributes
# parquet file so that each one can be registered with the evaluation.
df = duckdb.query(
    f"SELECT distinct(attribute_name) FROM read_parquet('{LOCATION_ATTRS}');"
).to_df()

# Build one Attribute per name; the name doubles as the description.
# NOTE(review): every attribute is registered as "categorical", including
# numeric-looking ones such as drainage_area — confirm this is intended.
attrs_list = [
    Attribute(name=attr_name, type="categorical", description=attr_name)
    for attr_name in df["attribute_name"].tolist()
]

In [16]:
# Register the attribute definitions built above with the evaluation. This is
# done before loading the location attribute values in the next cell.
ev.attributes.add(attrs_list)

DEBUG:teehr.loading.add_domains:Adding attribute to /data/v0_4_protocols/p0_2_location_example/dataset/attributes/attributes.csv


In [17]:
# Load the per-location attribute values from parquet.
# NOTE(review): field_mapping presumably renames the file's "attribute_value"
# column to the table's "value" column (the table shown below has a "value"
# column) — confirm the mapping direction in the teehr docs.
ev.location_attributes.load_parquet(LOCATION_ATTRS, field_mapping={"attribute_value": "value"})

INFO:teehr.loading.location_attributes:Converting attributes data: /home/jovyan/two_location_attributes.parquet
DEBUG:teehr.loading.location_attributes:Merging user field_mapping with default field mapping.
INFO:teehr.loading.location_attributes:Converting location attributes data from: /home/jovyan/two_location_attributes.parquet
INFO:teehr.loading.location_attributes:Converted 1 files.
INFO:teehr.loading.location_attributes:Validating and inserting location attributes data from /data/v0_4_protocols/p0_2_location_example/cache/loading/location_attributes


<teehr.evaluation.tables.LocationAttributeTable at 0x7fc56e648170>

In [18]:
# Spot-check the loaded attributes by displaying the rows for a single gage.
ev.location_attributes.filter("location_id = 'usgs-14138800'").to_pandas()

INFO:teehr.evaluation.tables:Setting filter <class 'filter'>.
DEBUG:teehr.querying.filter_format:Filter location_id = 'usgs-14138800' is already string.  Applying as is.


Unnamed: 0,location_id,attribute_name,value
0,usgs-14138800,aridity,0.270945979187767
1,usgs-14138800,dom_land_cover,Evergreen Needleleaf Forest
2,usgs-14138800,dom_land_cover_frac,1.0
3,usgs-14138800,drainage_area,21.2
4,usgs-14138800,elev_mean,821.62
5,usgs-14138800,forest_frac,1.0
6,usgs-14138800,frac_snow,0.317266212149897
7,usgs-14138800,frac_urban,0.0
8,usgs-14138800,high_prec_freq,12.55
9,usgs-14138800,p_mean,7.72975085557837


In [19]:
# Create the joined timeseries: per the log output this joins the primary and
# secondary timeseries, adds the location attributes and writes the result to
# disk.
# NOTE(review): execute_udf=True presumably runs the user-defined functions
# that add derived columns (e.g. month, year, water_year seen below) — confirm.
ev.joined_timeseries.create(execute_udf=True)

INFO:teehr.loading.joined_timeseries:Joining primary and secondary timeseries
INFO:teehr.loading.joined_timeseries:Adding attributes
24/10/30 20:27:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
INFO:teehr.loading.joined_timeseries:Writing joined timeseries to disk
                                                                                

<teehr.evaluation.tables.JoinedTimeseriesTable at 0x7fc56e2a6bd0>

In [20]:
# Preview the first rows of the joined table. The whole table is converted to
# pandas first and only then trimmed with .head().
ev.joined_timeseries.to_pandas().head()

                                                                                

Unnamed: 0,reference_time,value_time,primary_location_id,secondary_location_id,primary_value,secondary_value,unit_name,location_id,frac_snow,frac_urban,...,q_mean,baseflow_index,river_forecast_center,month,year,water_year,primary_normalized_flow,secondary_normalized_flow,configuration_name,variable_name
0,NaT,2000-10-01 00:00:00,usgs-14316700,nwm30-23894572,1.132674,0.38,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.90923953325952,0.508616082222394,NWRFC,10,2000,2001,0.001927,0.000646,nwm30_retrospective,streamflow_hourly_inst
1,NaT,2000-10-01 00:00:00,usgs-14138800,nwm30-23736071,3.341388,0.06,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787263,0.457869583655904,NWRFC,10,2000,2001,0.157613,0.00283,nwm30_retrospective,streamflow_hourly_inst
2,NaT,2000-10-01 01:00:00,usgs-14316700,nwm30-23894572,1.132674,0.38,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.90923953325952,0.508616082222394,NWRFC,10,2000,2001,0.001927,0.000646,nwm30_retrospective,streamflow_hourly_inst
3,NaT,2000-10-01 01:00:00,usgs-14138800,nwm30-23736071,3.992675,0.06,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787263,0.457869583655904,NWRFC,10,2000,2001,0.188334,0.00283,nwm30_retrospective,streamflow_hourly_inst
4,NaT,2000-10-01 02:00:00,usgs-14316700,nwm30-23894572,1.132674,0.38,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.90923953325952,0.508616082222394,NWRFC,10,2000,2001,0.001927,0.000646,nwm30_retrospective,streamflow_hourly_inst


In [21]:
# Stop the Spark session held by the evaluation now that the workflow is done;
# cells that use `ev` will not work after this until a new Evaluation is made.
ev.spark.stop()