In [1]:
from teehr import Evaluation
from pathlib import Path
import shutil

In [2]:
# Directory that will hold the evaluation: ~/temp/real_study.
# Any previous copy is removed first so each run starts from an empty folder.
TEST_STUDY_DIR = Path.home() / "temp" / "real_study"
shutil.rmtree(TEST_STUDY_DIR, ignore_errors=True)
TEST_STUDY_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Folder holding the two-location sample data from a TEEHR repo checkout.
# The prefix is taken from the current user's home directory instead of a
# hard-coded "/home/<user>" so the notebook is not tied to one account; the
# repo is still expected at ~/repos/teehr.
# TEST_DATA stays a plain str with a trailing slash, as it was originally.
TEST_DATA = f"{Path.home()}/repos/teehr/tests/data/two_locations/"

# Input files used by the load cells
LOCATIONS = Path(TEST_DATA, "two_locations.parquet")
XWALKS = Path(TEST_DATA, "two_crosswalks.parquet")
LOCATION_ATTRS = Path(TEST_DATA, "two_location_attributes.parquet")

In [4]:
# Create an Evaluation object rooted at the study directory (TEST_STUDY_DIR).
# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# because the following cells all reference `eval`.
eval = Evaluation(dir_path=TEST_STUDY_DIR)

# Enable logging (presumably the source of the INFO/DEBUG lines shown under
# later cells)
eval.enable_logging()

24/09/10 15:26:38 WARN Utils: Your hostname, Dev-VM-Ubuntu-22 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/09/10 15:26:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/10 15:26:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Clone the template for this evaluation.
# NOTE(review): presumably this creates the expected folder layout under
# TEST_STUDY_DIR — confirm against the TEEHR docs.
eval.clone_template()

In [6]:
# Load the location data (the observation sites) from the LOCATIONS parquet file
eval.load.import_locations(in_path=LOCATIONS)

In [7]:
# Display the loaded locations (id, name, geometry) to confirm the import
eval.locations.to_geopandas()

                                                                                

Unnamed: 0,id,name,geometry
0,usgs-14316700,"STEAMBOAT CREEK NEAR GLIDE, OR",POINT (-122.72894 43.34984)
1,usgs-14138800,"BLAZED ALDER CREEK NEAR RHODODENDRON, OR",POINT (-121.89147 45.45262)


In [8]:
# Fetch USGS streamflow for 2010-10-01 through 2012-09-30.
# No location IDs are passed; presumably the locations already loaded into the
# evaluation are used — confirm against the TEEHR docs.
eval.fetch.usgs_streamflow(
    start_date="2010-10-01",
    end_date="2012-09-30"
)

INFO:teehr.fetching.usgs.usgs:Fetching USGS streamflow data.
DEBUG:teehr.fetching.utils:Creating periods based on chunk_by.
DEBUG:teehr.fetching.usgs.usgs:Fetching USGS streamflow data from NWIS.
DEBUG:teehr.fetching.usgs.usgs:Formatting column names.
DEBUG:teehr.fetching.utils:Formatting timeseries data types.
DEBUG:teehr.fetching.usgs.usgs:Filtering to hourly data.
DEBUG:teehr.fetching.usgs.usgs:Filtering out no data values.
DEBUG:teehr.fetching.usgs.usgs:Converting to SI units.
DEBUG:teehr.fetching.usgs.usgs:Formatting output filename.
DEBUG:teehr.fetching.utils:Writing parquet file: /home/slandsteiner/temp/real_study/cache/fetching/usgs/usgs_observations/streamflow_hourly_inst/2010-10-01_2012-09-30.parquet
INFO:teehr.loading.timeseries:Validating and inserting timeseries data from /home/slandsteiner/temp/real_study/cache/fetching/usgs
INFO:teehr.loading.utils:Creating database tables.
INFO:teehr.loading.utils:Inserting units from dataset to database.
DEBUG:teehr.loading.utils:Inser

In [9]:
# Display the primary timeseries (the USGS observations fetched above)
eval.primary_timeseries.to_pandas()

Unnamed: 0,reference_time,value_time,configuration_name,unit_name,variable_name,value,location_id
0,2010-10-01 00:00:00,2010-10-01 00:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,0.129691,usgs-14138800
1,2010-10-01 01:00:00,2010-10-01 01:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,0.122329,usgs-14138800
2,2010-10-01 02:00:00,2010-10-01 02:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,0.122329,usgs-14138800
3,2010-10-01 03:00:00,2010-10-01 03:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,0.122329,usgs-14138800
4,2010-10-01 04:00:00,2010-10-01 04:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,0.122329,usgs-14138800
...,...,...,...,...,...,...,...
34133,2012-09-29 20:00:00,2012-09-29 20:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,1.033565,usgs-14316700
34134,2012-09-29 21:00:00,2012-09-29 21:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,1.033565,usgs-14316700
34135,2012-09-29 22:00:00,2012-09-29 22:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,1.033565,usgs-14316700
34136,2012-09-29 23:00:00,2012-09-29 23:00:00,usgs_observations,m^3/s,streamflow_hourly_inst,1.033565,usgs-14316700


In [10]:
# Load the crosswalk data from the XWALKS parquet file.
# NOTE(review): presumably this pairs each primary (usgs-*) location ID with
# its secondary (nwm30-*) ID — confirm against the file contents.
eval.load.import_location_crosswalks(
    in_path=XWALKS
)

INFO:teehr.loading.location_crosswalks:Converting crosswalks data: /home/slandsteiner/repos/teehr/tests/data/two_locations/two_crosswalks.parquet
DEBUG:teehr.loading.location_crosswalks:Using default field mapping.
INFO:teehr.loading.location_crosswalks:Converting location crosswalks data from: /home/slandsteiner/repos/teehr/tests/data/two_locations/two_crosswalks.parquet
INFO:teehr.loading.location_crosswalks:Converted 1 files.
INFO:teehr.loading.location_crosswalks:Validating and inserting location crosswalks data from /home/slandsteiner/temp/real_study/cache/loading/location_crosswalks
INFO:teehr.loading.utils:Creating database tables.
DEBUG:teehr.loading.utils:Inserting locations from /home/slandsteiner/temp/real_study/dataset/locations**/*.parquet.
INFO:teehr.loading.location_crosswalks:Recursively validating and inserting all files in: /home/slandsteiner/temp/real_study/cache/loading/location_crosswalks/**/*.parquet
INFO:teehr.loading.location_crosswalks:Validating and inserting 

In [11]:
# Fetch NWM v3.0 ("nwm30") retrospective streamflow at points for
# 2010-10-01 through 2012-09-30, the same window as the USGS fetch.
# Per the log output, the secondary location IDs matching 'nwm30-%' are
# looked up first.
eval.fetch.nwm_retrospective_points(
    nwm_version="nwm30",
    variable_name="streamflow",
    start_date="2010-10-01",
    end_date="2012-09-30"
)

INFO:teehr.evaluation.utils:Getting schema variable name for streamflow.
INFO:teehr.evaluation.fetch:Getting secondary location IDs.
DEBUG:teehr.querying.filter_format:Filter is not a list.  Making a list.
DEBUG:teehr.querying.filter_format:Validating and applying {'column': 'secondary_location_id', 'operator': 'like', 'value': 'nwm30-%'}
DEBUG:teehr.querying.filter_format:Filter: {"column":"secondary_location_id","operator":"like","value":"nwm30-%"}
INFO:teehr.fetching.nwm.retrospective_points:Fetching NWM retrospective point data, version: nwm30.
DEBUG:teehr.fetching.nwm.retrospective_points:Chunking data by time.
DEBUG:teehr.fetching.utils:Creating periods based on chunk_by.
DEBUG:teehr.fetching.nwm.retrospective_points:Fetching point data for 2010-10-01 00:00:00 to 2012-09-30 23:59:59.999999999.
DEBUG:teehr.fetching.nwm.retrospective_points:Converting DataArray to a formatted DataFrame.
DEBUG:teehr.fetching.utils:Formatting timeseries data types.
DEBUG:teehr.fetching.utils:Writing 

In [12]:
# Display the secondary timeseries (the NWM retrospective data fetched above)
eval.secondary_timeseries.to_pandas()

Unnamed: 0,reference_time,value_time,configuration_name,unit_name,variable_name,value,location_id
0,2010-10-01 00:00:00,2010-10-01 00:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,1.29,nwm30-23894572
1,2010-10-01 00:00:00,2010-10-01 00:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.07,nwm30-23736071
2,2010-10-01 01:00:00,2010-10-01 01:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,1.30,nwm30-23894572
3,2010-10-01 01:00:00,2010-10-01 01:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.07,nwm30-23736071
4,2010-10-01 02:00:00,2010-10-01 02:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,1.30,nwm30-23894572
...,...,...,...,...,...,...,...
35083,2012-09-30 21:00:00,2012-09-30 21:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.07,nwm30-23736071
35084,2012-09-30 22:00:00,2012-09-30 22:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.52,nwm30-23894572
35085,2012-09-30 22:00:00,2012-09-30 22:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.07,nwm30-23736071
35086,2012-09-30 23:00:00,2012-09-30 23:00:00,nwm30_retrospective,m^3/s,streamflow_hourly_inst,0.52,nwm30-23894572


In [13]:
from teehr.models.tables import (
    Attribute,
)
import duckdb

In [14]:
# Collect the distinct attribute names present in the location-attributes file
df = duckdb.query(
    f"SELECT distinct(attribute_name) FROM read_parquet('{LOCATION_ATTRS}');"
).to_df()

# Build one Attribute per name, reusing the name as its description.
# NOTE(review): "catagorical" looks like a misspelling of "categorical", and
# every attribute gets the same type regardless of its values. The string is
# left unchanged here; confirm the accepted values before editing it.
attrs_list = [
    Attribute(name=attr_name, type="catagorical", description=attr_name)
    for attr_name in list(df["attribute_name"])
]

In [15]:
# Add the attributes in attrs_list to the evaluation (the log output shows
# them being written to dataset/attributes/attributes.csv)
eval.load.add_attribute(attrs_list)

DEBUG:teehr.loading.add_domains:Adding attribute to /home/slandsteiner/temp/real_study/dataset/attributes/attributes.csv
INFO:teehr.loading.utils:Creating database tables.


In [16]:
# Load the per-location attribute values from the LOCATION_ATTRS parquet file.
# field_mapping pairs "attribute_value" with "value" (presumably file column ->
# TEEHR field name; confirm against the TEEHR docs).
eval.load.import_location_attributes(
    LOCATION_ATTRS,
    field_mapping={"attribute_value": "value"},
)

INFO:teehr.loading.location_attributes:Converting attributes data: /home/slandsteiner/repos/teehr/tests/data/two_locations/two_location_attributes.parquet
DEBUG:teehr.loading.location_attributes:Merging user field_mapping with default field mapping.
INFO:teehr.loading.location_attributes:Converting location attributes data from: /home/slandsteiner/repos/teehr/tests/data/two_locations/two_location_attributes.parquet
INFO:teehr.loading.location_attributes:Converted 1 files.
INFO:teehr.loading.location_attributes:Validating and inserting location attributes data from /home/slandsteiner/temp/real_study/cache/loading/location_attributes
INFO:teehr.loading.utils:Creating database tables.
DEBUG:teehr.loading.utils:Inserting locations from /home/slandsteiner/temp/real_study/dataset/locations**/*.parquet.
INFO:teehr.loading.utils:Inserting attributes from dataset to database.
DEBUG:teehr.loading.utils:Inserting attributes from /home/slandsteiner/temp/real_study/dataset/attributes/attributes.csv

In [17]:
# Create the joined timeseries. Per the log output this joins the primary and
# secondary timeseries, adds the attributes, and writes the result to disk.
# NOTE(review): execute_udf=True presumably also runs user-defined functions
# that add derived columns — confirm against the TEEHR docs.
eval.create_joined_timeseries(execute_udf=True)

INFO:teehr.loading.joined_timeseries:Joining primary and secondary timeseries
INFO:teehr.loading.joined_timeseries:Adding attributes
24/09/10 15:35:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
INFO:teehr.loading.joined_timeseries:Writing joined timeseries to disk
                                                                                

In [18]:
# Display the joined timeseries table (primary and secondary values side by
# side with the attribute columns)
eval.joined_timeseries.to_pandas()

                                                                                

Unnamed: 0,reference_time,value_time,primary_location_id,secondary_location_id,primary_value,secondary_value,unit_name,location_id,frac_snow,frac_urban,...,q_mean,baseflow_index,river_forecast_center,month,year,water_year,primary_normalized_flow,secondary_normalized_flow,configuration_name,variable_name
0,2010-10-01 00:00:00,2010-10-01 00:00:00,usgs-14316700,nwm30-23894572,1.333723,1.29,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.909239533259516,0.508616082222394,NWRFC,10,2010,2011,0.002269,0.002194,nwm30_retrospective,streamflow_hourly_inst
1,2010-10-01 00:00:00,2010-10-01 00:00:00,usgs-14138800,nwm30-23736071,0.129691,0.07,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787265,0.457869583655904,NWRFC,10,2010,2011,0.006118,0.003302,nwm30_retrospective,streamflow_hourly_inst
2,2010-10-01 01:00:00,2010-10-01 01:00:00,usgs-14316700,nwm30-23894572,1.333723,1.30,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.909239533259516,0.508616082222394,NWRFC,10,2010,2011,0.002269,0.002211,nwm30_retrospective,streamflow_hourly_inst
3,2010-10-01 01:00:00,2010-10-01 01:00:00,usgs-14138800,nwm30-23736071,0.122329,0.07,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787265,0.457869583655904,NWRFC,10,2010,2011,0.005770,0.003302,nwm30_retrospective,streamflow_hourly_inst
4,2010-10-01 02:00:00,2010-10-01 02:00:00,usgs-14316700,nwm30-23894572,1.333723,1.30,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.909239533259516,0.508616082222394,NWRFC,10,2010,2011,0.002269,0.002211,nwm30_retrospective,streamflow_hourly_inst
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34133,2012-09-29 22:00:00,2012-09-29 22:00:00,usgs-14138800,nwm30-23736071,0.052953,0.07,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787265,0.457869583655904,NWRFC,9,2012,2012,0.002498,0.003302,nwm30_retrospective,streamflow_hourly_inst
34134,2012-09-29 23:00:00,2012-09-29 23:00:00,usgs-14316700,nwm30-23894572,1.033565,0.54,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.909239533259516,0.508616082222394,NWRFC,9,2012,2012,0.001758,0.000919,nwm30_retrospective,streamflow_hourly_inst
34135,2012-09-29 23:00:00,2012-09-29 23:00:00,usgs-14138800,nwm30-23736071,0.052953,0.07,m^3/s,usgs-14138800,0.317266212149897,0.0,...,1.5975415858787265,0.457869583655904,NWRFC,9,2012,2012,0.002498,0.003302,nwm30_retrospective,streamflow_hourly_inst
34136,2012-09-30 00:00:00,2012-09-30 00:00:00,usgs-14316700,nwm30-23894572,1.061882,0.54,m^3/s,usgs-14316700,0.176336580742005,0.0,...,19.909239533259516,0.508616082222394,NWRFC,9,2012,2012,0.001806,0.000919,nwm30_retrospective,streamflow_hourly_inst
