In [10]:
import teehr
from pathlib import Path
import shutil

In [11]:
# Scratch directory for this evaluation: <home>/temp/real_study.
# Any leftovers from a previous run are removed first so every run starts
# from an empty directory (missing directory is not an error).
TEST_STUDY_DIR = Path.home() / "temp" / "real_study"
shutil.rmtree(TEST_STUDY_DIR, ignore_errors=True)
TEST_STUDY_DIR.mkdir(parents=True, exist_ok=True)

In [12]:
# The sample inputs are resolved relative to the imported teehr package:
# three directory levels above teehr/__init__.py, then tests/data/two_locations.
# NOTE(review): this presumably requires a source checkout of teehr (a plain
# site-packages install would not have tests/ at that location) - confirm.
TEST_DATA = Path(teehr.__file__).parents[2] / "tests" / "data" / "two_locations"
LOCATIONS = TEST_DATA / "two_locations.parquet"
XWALKS = TEST_DATA / "two_crosswalks.parquet"
LOCATION_ATTRS = TEST_DATA / "two_location_attributes.parquet"

In [13]:
# Create an Evaluation object rooted at TEST_STUDY_DIR; every later cell
# works through this `ev` handle.
ev = teehr.Evaluation(dir_path=TEST_STUDY_DIR)

# Enable logging
# NOTE(review): presumably the source of the INFO/DEBUG teehr messages that
# appear in later cell outputs - confirm.
ev.enable_logging()

In [14]:
# Clone the template
# NOTE(review): presumably populates TEST_STUDY_DIR with the standard
# evaluation layout that the loading/fetching calls below rely on - confirm.
ev.clone_template()

In [15]:
# Load the location data (the sites the observations belong to) from the
# two_locations parquet file into the evaluation's locations table.
ev.locations.load_spatial(in_path=LOCATIONS)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [16]:
# Display the loaded locations (id, name, geometry) to verify the load above.
ev.locations.to_geopandas()

Unnamed: 0,id,name,geometry
0,usgs-14316700,"STEAMBOAT CREEK NEAR GLIDE, OR",POINT (-122.72894 43.34984)
1,usgs-14138800,"BLAZED ALDER CREEK NEAR RHODODENDRON, OR",POINT (-121.89147 45.45262)


In [17]:
# Fetch USGS streamflow observations for the requested date range.
# Per the log output, the data are filtered to hourly values, converted to
# SI units, cached as parquet, and then validated and inserted as timeseries.
ev.fetch.usgs_streamflow(
    start_date="2000-10-01",
    end_date="2012-09-30"
)

INFO:teehr.fetching.usgs.usgs:Fetching USGS streamflow data.
DEBUG:teehr.fetching.utils:Creating periods based on chunk_by.
DEBUG:teehr.fetching.usgs.usgs:Fetching USGS streamflow data from NWIS.
DEBUG:teehr.fetching.usgs.usgs:Formatting column names.
DEBUG:teehr.fetching.utils:Formatting timeseries data types.
DEBUG:teehr.fetching.usgs.usgs:Filtering to hourly data.
DEBUG:teehr.fetching.usgs.usgs:Filtering out no data values.
DEBUG:teehr.fetching.usgs.usgs:Converting to SI units.
DEBUG:teehr.fetching.usgs.usgs:Formatting output filename.
DEBUG:teehr.fetching.utils:Writing parquet file: /Users/mdenno/temp/real_study/cache/fetching/usgs/usgs_observations/streamflow_hourly_inst/2000-10-01_2012-09-30.parquet
INFO:teehr.loading.timeseries:Validating and inserting timeseries data from /Users/mdenno/temp/real_study/cache/fetching/usgs
                                                                                

In [18]:
# Inspect the primary timeseries table (the fetched USGS observations) as a
# pandas DataFrame.
ev.primary_timeseries.to_pandas()

Unnamed: 0,reference_time,value_time,value,unit_name,location_id,configuration_name,variable_name
0,NaT,2000-10-01 00:00:00,3.341388,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
1,NaT,2000-10-01 01:00:00,3.992675,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
2,NaT,2000-10-01 02:00:00,4.445745,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
3,NaT,2000-10-01 03:00:00,5.408518,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
4,NaT,2000-10-01 04:00:00,5.606736,m^3/s,usgs-14138800,usgs_observations,streamflow_hourly_inst
...,...,...,...,...,...,...,...
200345,NaT,2012-09-29 20:00:00,1.033565,m^3/s,usgs-14316700,usgs_observations,streamflow_hourly_inst
200346,NaT,2012-09-29 21:00:00,1.033565,m^3/s,usgs-14316700,usgs_observations,streamflow_hourly_inst
200347,NaT,2012-09-29 22:00:00,1.033565,m^3/s,usgs-14316700,usgs_observations,streamflow_hourly_inst
200348,NaT,2012-09-29 23:00:00,1.033565,m^3/s,usgs-14316700,usgs_observations,streamflow_hourly_inst


In [None]:
# Load the crosswalk data from the two_crosswalks parquet file.
# No field_mapping is passed, so the default field mapping is used.
# NOTE(review): the crosswalk presumably links the primary (USGS) location ids
# to the ids used by the secondary timeseries fetched below - confirm.
ev.location_crosswalks.load_parquet(
    in_path=XWALKS
)

INFO:teehr.loading.location_crosswalks:Converting crosswalks data: /Users/mdenno/repos/teehr/tests/data/two_locations/two_crosswalks.parquet
DEBUG:teehr.loading.location_crosswalks:Using default field mapping.
INFO:teehr.loading.location_crosswalks:Converting location crosswalks data from: /Users/mdenno/repos/teehr/tests/data/two_locations/two_crosswalks.parquet
INFO:teehr.loading.location_crosswalks:Converted 1 files.
INFO:teehr.loading.location_crosswalks:Validating and inserting location crosswalks data from /Users/mdenno/temp/real_study/cache/loading/location_crosswalks


24/11/12 07:53:39 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 316334 ms exceeds timeout 120000 ms
24/11/12 07:53:39 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/12 07:53:48 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [None]:
# Fetch NWM retrospective ("nwm30") streamflow at points for the same date
# range used for the USGS observations above.
# NOTE(review): presumably this populates the secondary timeseries table
# inspected in the next cell, using the crosswalk loaded above to pick the
# points - confirm.
ev.fetch.nwm_retrospective_points(
    nwm_version="nwm30",
    variable_name="streamflow",
    start_date="2000-10-01",
    end_date="2012-09-30"
)

In [None]:
# Inspect the secondary timeseries table as a pandas DataFrame.
ev.secondary_timeseries.to_pandas()

In [None]:
from teehr.models.tables import (
    Attribute,
)
import duckdb

In [None]:
# Collect the distinct attribute names present in the location-attributes
# parquet file.
df = duckdb.query(
    f"SELECT distinct(attribute_name) FROM read_parquet('{LOCATION_ATTRS}');"
).to_df()

# Build one Attribute record per distinct name, reusing the name as the
# description. The type literal was previously misspelled "catagorical".
# NOTE(review): every attribute is registered as categorical; if the file
# also holds numeric attributes they may need type="continuous" - confirm.
attrs_list = [
    Attribute(name=attr_name, type="categorical", description=attr_name)
    for attr_name in df.attribute_name
]

In [None]:
# Add the attribute definitions built from the location-attributes file
# (attrs_list) to the evaluation's attributes table.
# NOTE(review): presumably these must be registered before the
# location-attribute values are loaded in the next cell - confirm.
ev.attributes.add(attrs_list)

In [None]:
# Load the location attribute values from the parquet file.
# NOTE(review): field_mapping presumably renames the file's "attribute_value"
# column to the table's "value" field - confirm the mapping direction.
ev.location_attributes.load_parquet(LOCATION_ATTRS, field_mapping={"attribute_value": "value"})

In [None]:
# Create the joined timeseries
# NOTE(review): execute_udf=True presumably also runs the evaluation's
# user-defined functions while building the joined table - confirm.
ev.joined_timeseries.create(execute_udf=True)

In [None]:
# Inspect the joined timeseries table as a pandas DataFrame.
ev.joined_timeseries.to_pandas()