In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import shutil
import glob

import teehr

### Setup

In [2]:
# Input data for the evaluation: everything lives under
# ~/temp/PSU_data/teehr_inputs, one sub-folder per table that gets loaded.
inputDir = Path.home() / 'temp' / 'PSU_data' / 'teehr_inputs'
primaryTS_dir = inputDir / 'primary_timeseries'
secondaryTS_dir = inputDir / 'secondary_timeseries_TEST'
crosswalk_dir = inputDir / 'crosswalk'
configurations_dir = inputDir / 'configurations'

# Evaluation directory: start from a clean slate on every run by removing
# whatever a previous run left behind (removal errors are ignored).
eval_dir = Path.home() / 'temp' / 'PSU_data' / 'PSU_evaluation'
if eval_dir.exists():
    shutil.rmtree(eval_dir, ignore_errors=True)

### Create eval

In [3]:
# create the evaluation object rooted at eval_dir
# (the setup cell removed eval_dir if it existed; create_dir=True presumably
# tells teehr to create the directory when missing — confirm in the teehr docs)
ev = teehr.Evaluation(dir_path=eval_dir, create_dir=True)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/02 12:51:12 WARN Utils: Your hostname, RTI-504155, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/10/02 12:51:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/slandsteiner/repos/teehr/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/slandsteiner/.ivy2.5.2/cache
The jars for the packages stored in: /home/slandsteiner/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0f2a3a79-eb84-4688-bc1a-67d349630e1d;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.4.1 in central
	found software.amazon.awssdk#bundle;2.24.6 in central
	found org.wildfly.openssl#wildfly-openssl;1.1.3.Final in central
:: resolution report :: r

In [4]:
# clone the operational template ('e4_nwm_operational') from S3 into the evaluation
# NOTE: per the recorded output, the primary, secondary and joined timeseries
# tables come back empty; they are filled in by later cells of this notebook.
ev.clone_from_s3('e4_nwm_operational')

25/10/02 12:51:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
An empty dataframe was returned for 'primary_timeseries'.                       
An empty dataframe was returned for 'secondary_timeseries'.
An empty dataframe was returned for 'joined_timeseries'.


### Inspect initial data

In [5]:
# configurations table as cloned from the template, pulled into pandas for display
configurations_df = ev.configurations.to_pandas()
configurations_df

Unnamed: 0,name,type,description
0,usgs_observations,primary,USGS streamflow gauge observations


In [6]:
# location crosswalk table as cloned from the template, pulled into pandas for display
crosswalk_df = ev.location_crosswalks.to_pandas()
crosswalk_df

Unnamed: 0,primary_location_id,secondary_location_id
0,huc12-010100020101,huc12-010100020101
1,huc12-010100020102,huc12-010100020102
2,huc12-010100020103,huc12-010100020103
3,huc12-010100020104,huc12-010100020104
4,huc12-010100020105,huc12-010100020105
...,...,...
170370,usgs-04109000,nwm30-12246166
170371,usgs-12205000,nwm21-23956418
170372,usgs-11526500,nwm20-8244332
170373,usgs-1203951610,usgs-1203951610


In [7]:
# secondary timeseries table right after cloning (the recorded output has no rows yet)
secondary_timeseries_df = ev.secondary_timeseries.to_pandas()
secondary_timeseries_df

Unnamed: 0,reference_time,value_time,value,variable_name,configuration_name,unit_name,location_id,member


In [8]:
# units table as cloned from the template, pulled into pandas for display
units_df = ev.units.to_pandas()
units_df

Unnamed: 0,name,long_name
0,m^3/s,Cubic Meters Per Second
1,ft^3/s,Cubic Feet Per Second
2,km^2,Square Kilometers
3,mm/s,Millimeters Per Second
4,in,Inches
5,in/hr,Inches per Hour
6,mi^2,Square Miles
7,mm,Millimeters
8,mm/hr,Millimeters per Hour


In [9]:
# variables table as cloned from the template, pulled into pandas for display
variables_df = ev.variables.to_pandas()
variables_df

Unnamed: 0,name,long_name
0,streamflow_hourly_inst,Hourly Instantaneous Streamflow
1,streamflow_daily_mean,Daily Mean Streamflow
2,rainfall_hourly_rate,Hourly Rainfall Rate
3,precip_hourly_accum,Hourly Accumulated Precipitation
4,precip_daily_accum,Daily Accumulated Precipitation


In [10]:
# full locations table from the template; locations_df is reused further down
# when the locations are trimmed to the PSU gages
locations_df = ev.locations.to_pandas()
locations_df

                                                                                

Unnamed: 0,id,name,geometry
0,usgsbasin-01049000,"Sebasticook River near Pittsfield, Maine",b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x85\x02...
1,usgsbasin-01066500,"LITTLE OSSIPEE RIVER NEAR SOUTH LIMINGTON, ME",b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x01...
2,usgsbasin-01072800,"COCHECO RIVER NEAR ROCHESTER, NH",b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xbb\x00...
3,usgsbasin-01075000,"PEMIGEWASSET RIVER AT WOODSTOCK, NH",b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xfd\x00...
4,usgsbasin-01075800,"STEVENS BROOK NEAR WENTWORTH, NH",b'\x01\x03\x00\x00\x00\x01\x00\x00\x00F\x00\x0...
...,...,...,...
146640,usgs-50332000,"RIVER GUT AT RIVER, ST. CROIX USVI",b'\x01\x01\x00\x00\x005\xd4($\x194P\xc0\x11l\x...
146641,usgs-50333500,"RIVER GUT NR GOLDEN GROVE, ST. CROIX USVI",b'\x01\x01\x00\x00\x00\x0cx\x99a#3P\xc0\x9f\xf...
146642,usgs-50337500,"GUT 4.5 AT CANE VALLEY ST. CROIX, USVI",b'\x01\x01\x00\x00\x00\x0e\x9a\x02>d6P\xc0\xc9...
146643,usgs-50345000,"JOLLY HILL GUT AT JOLLY HILL, ST. CROIX USVI",b'\x01\x01\x00\x00\x00h\xe1\xfc\x9757P\xc0\x95...


### Load PSU data into template

#### Add configurations

In [11]:
# Read the PSU configuration definitions (name / type / description) from CSV
# and display them.
PSU_config_path = configurations_dir / 'PSU_configurations.csv'
PSU_configurations_df = pd.read_csv(PSU_config_path)
PSU_configurations_df

Unnamed: 0,name,type,description
0,d_ensemble,secondary,PSU ensemble mean of dHBV dPRMS dSacSMA
1,d_hbv,secondary,PSU differentiable HBV
2,d_prms,secondary,PSU differentiable PRMS
3,d_sacsma,secondary,PSU differentiable SacSMA


In [12]:
# add to the table
# De-duplicate on `name` once (first occurrence wins, original row order is
# kept) and walk the resulting rows directly, instead of re-filtering the whole
# frame for every configuration name and indexing into `.values[0]`.
PSU_configurations_unique = PSU_configurations_df.drop_duplicates(
    subset='name', keep='first'
)
# kept for compatibility: the array of distinct configuration names
unique_configurations = PSU_configurations_unique['name'].to_numpy()
for config_row in PSU_configurations_unique.itertuples(index=False):
    # one teehr.Configuration per distinct name, built from that row's
    # name / type / description columns
    new_configuration = teehr.Configuration(
        name=config_row.name,
        type=config_row.type,
        description=config_row.description,
    )
    ev.configurations.add(new_configuration)

In [13]:
# reload and inspect: re-read the configurations table from the evaluation to
# confirm the PSU configurations now appear next to the template's own entry
configurations_df = ev.configurations.to_pandas()
configurations_df

Unnamed: 0,name,type,description
0,d_ensemble,secondary,PSU ensemble mean of dHBV dPRMS dSacSMA
1,usgs_observations,primary,USGS streamflow gauge observations
2,d_sacsma,secondary,PSU differentiable SacSMA
3,d_prms,secondary,PSU differentiable PRMS
4,d_hbv,secondary,PSU differentiable HBV


#### Update crosswalk

In [14]:
# Read the PSU crosswalk (primary_location_id -> secondary_location_id) from CSV
# and display it.
PSU_crosswalk_path = crosswalk_dir / 'PSU_crosswalk.csv'
PSU_crosswalk_df = pd.read_csv(PSU_crosswalk_path)
PSU_crosswalk_df

Unnamed: 0,primary_location_id,secondary_location_id
0,usgs-01011000,d_ensemble-01011000
1,usgs-01013500,d_ensemble-01013500
2,usgs-01015800,d_ensemble-01015800
3,usgs-01017000,d_ensemble-01017000
4,usgs-01019000,d_ensemble-01019000
...,...,...
13510,usgs-14366000,nwm30-78026518
13511,usgs-14369500,nwm30-78026512
13512,usgs-14372300,nwm30-78026497
13513,usgs-14377100,nwm30-78026603


In [15]:
# Load the PSU crosswalk rows into the evaluation's location crosswalk table
# (no write_mode is given, so the library default applies).
ev.location_crosswalks.load_dataframe(
    df=PSU_crosswalk_df
)

In [16]:
# reload and inspect: re-read the crosswalk table after loading the PSU rows
# (the recorded row count grows compared with the freshly cloned table)
crosswalk_df = ev.location_crosswalks.to_pandas()
crosswalk_df

Unnamed: 0,primary_location_id,secondary_location_id
0,huc12-010100020101,huc12-010100020101
1,huc12-010100020102,huc12-010100020102
2,huc12-010100020103,huc12-010100020103
3,huc12-010100020104,huc12-010100020104
4,huc12-010100020105,huc12-010100020105
...,...,...
183834,usgs-04109000,nwm30-12246166
183835,usgs-12205000,nwm21-23956418
183836,usgs-11526500,nwm20-8244332
183837,usgs-1203951610,usgs-1203951610


#### Trim the locations and location_crosswalks tables to only the relevant gages

##### Trim locations

In [17]:
# Distinct primary (gage) ids referenced by the PSU crosswalk.
unique_PSU_gages = PSU_crosswalk_df['primary_location_id'].unique()

# Keep only the locations whose id is one of those gages, renumber the rows
# from zero, and inspect the result.
locations_df_trimmed = (
    locations_df
    .loc[locations_df['id'].isin(unique_PSU_gages)]
    .reset_index(drop=True)
)
locations_df_trimmed

Unnamed: 0,id,name,geometry
0,usgs-01011000,"Allagash River near Allagash, Maine",b'\x01\x01\x00\x00\x00l\xf6\xf6\x9d\x15EQ\xc0\...
1,usgs-01013500,"Fish River near Fort Kent, Maine","b'\x01\x01\x00\x00\x00\xdf{,;L%Q\xc0fffff\x9eG@'"
2,usgs-01015800,"Aroostook River near Masardis, Maine",b'\x01\x01\x00\x00\x00\xc9` c\xc9\x17Q\xc0r\x1...
3,usgs-01017000,"Aroostook River at Washburn, Maine","b'\x01\x01\x00\x00\x00""\xcd\xb3\xed\x0f\nQ\xc0..."
4,usgs-01019000,"Grand Lake Stream at Grand Lake Stream, Maine",b'\x01\x01\x00\x00\x00\xb23\xbfy5\xf1P\xc0\xae...
...,...,...,...
2698,usgs-14366000,"APPLEGATE RIVER NEAR APPLEGATE, OR",b'\x01\x01\x00\x00\x00\xa6\xd6\xfb\x8d\xf6\xc8...
2699,usgs-14369500,"APPLEGATE RIVER NEAR WILDERVILLE, OR",b'\x01\x01\x00\x00\x00[\x183\xd3\x07\xda^\xc0\...
2700,usgs-14372300,"ROGUE RIVER NEAR AGNESS, OR",b'\x01\x01\x00\x00\x00\xb9\xbc\xde\xa2\xb8\x03...
2701,usgs-14377100,"ILLINOIS RIVER NEAR KERBY, OR",b'\x01\x01\x00\x00\x00\x98\xc0\xad\xbby\xea^\x...


In [18]:
# overwrite, reload, and inspect
# write_mode='overwrite' is expected to replace the stored locations table with
# the trimmed frame (confirm exact semantics in the teehr docs); the recorded
# output shows only the trimmed rows afterwards.
ev.locations.load_dataframe(
    df=locations_df_trimmed,
    write_mode='overwrite'
)
# NOTE: locations_df is rebound here and no longer holds the full template table
locations_df = ev.locations.to_pandas()
locations_df

Unnamed: 0,id,name,geometry
0,usgs-01011000,"Allagash River near Allagash, Maine",b'\x01\x01\x00\x00\x00l\xf6\xf6\x9d\x15EQ\xc0\...
1,usgs-01013500,"Fish River near Fort Kent, Maine","b'\x01\x01\x00\x00\x00\xdf{,;L%Q\xc0fffff\x9eG@'"
2,usgs-01015800,"Aroostook River near Masardis, Maine",b'\x01\x01\x00\x00\x00\xc9` c\xc9\x17Q\xc0r\x1...
3,usgs-01017000,"Aroostook River at Washburn, Maine","b'\x01\x01\x00\x00\x00""\xcd\xb3\xed\x0f\nQ\xc0..."
4,usgs-01019000,"Grand Lake Stream at Grand Lake Stream, Maine",b'\x01\x01\x00\x00\x00\xb23\xbfy5\xf1P\xc0\xae...
...,...,...,...
2698,usgs-14366000,"APPLEGATE RIVER NEAR APPLEGATE, OR",b'\x01\x01\x00\x00\x00\xa6\xd6\xfb\x8d\xf6\xc8...
2699,usgs-14369500,"APPLEGATE RIVER NEAR WILDERVILLE, OR",b'\x01\x01\x00\x00\x00[\x183\xd3\x07\xda^\xc0\...
2700,usgs-14372300,"ROGUE RIVER NEAR AGNESS, OR",b'\x01\x01\x00\x00\x00\xb9\xbc\xde\xa2\xb8\x03...
2701,usgs-14377100,"ILLINOIS RIVER NEAR KERBY, OR",b'\x01\x01\x00\x00\x00\x98\xc0\xad\xbby\xea^\x...


##### Trim crosswalk

In [19]:
# Keep only the crosswalk rows whose primary location is one of the PSU gages,
# renumber the rows from zero, and inspect the result.
crosswalk_df_trimmed = (
    crosswalk_df
    .loc[crosswalk_df['primary_location_id'].isin(unique_PSU_gages)]
    .reset_index(drop=True)
)
crosswalk_df_trimmed

Unnamed: 0,primary_location_id,secondary_location_id
0,usgs-01011000,d_ensemble-01011000
1,usgs-01013500,d_ensemble-01013500
2,usgs-01015800,d_ensemble-01015800
3,usgs-01017000,d_ensemble-01017000
4,usgs-01019000,d_ensemble-01019000
...,...,...
23805,usgs-08110430,nwm20-5574947
23806,usgs-02425000,nwm20-21662336
23807,usgs-08175000,nwm20-1623207
23808,usgs-04109000,nwm30-12246166


In [20]:
# overwrite, reload, and inspect
# write_mode='overwrite' is expected to replace the stored crosswalk table with
# the trimmed frame (confirm exact semantics in the teehr docs)
ev.location_crosswalks.load_dataframe(
    df=crosswalk_df_trimmed,
    write_mode='overwrite'
)
# NOTE: crosswalk_df is rebound here to the table as stored in the evaluation
crosswalk_df = ev.location_crosswalks.to_pandas()
crosswalk_df

Unnamed: 0,primary_location_id,secondary_location_id
0,usgs-01011000,d_ensemble-01011000
1,usgs-01013500,d_ensemble-01013500
2,usgs-01015800,d_ensemble-01015800
3,usgs-01017000,d_ensemble-01017000
4,usgs-01019000,d_ensemble-01019000
...,...,...
23805,usgs-14366000,usgs-14366000
23806,usgs-14369500,usgs-14369500
23807,usgs-14372300,usgs-14372300
23808,usgs-14377100,usgs-14377100


#### Load observed USGS data into Eval

In [21]:
# load the observed (primary) timeseries parquet files from primaryTS_dir
# (max_workers=1 presumably restricts loading to a single worker — confirm)
ev.primary_timeseries.load_parquet(in_path=primaryTS_dir, max_workers=1)

                                                                                

In [22]:
# fetch the primary timeseries as a Spark DataFrame and print its first rows
primary_timeseries_sdf = ev.primary_timeseries.to_sdf()
primary_timeseries_sdf.show()

+--------------+-------------------+---------+---------+-------------+------------------+--------------------+
|reference_time|         value_time|    value|unit_name|  location_id|configuration_name|       variable_name|
+--------------+-------------------+---------+---------+-------------+------------------+--------------------+
|          NULL|1994-04-30 00:00:00|326.46964|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|          NULL|1994-06-17 00:00:00|142.38655|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|          NULL|1995-10-30 00:00:00|127.68539|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|          NULL|1995-11-12 00:00:00|93.598976|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|          NULL|1996-07-19 00:00:00| 67.93684|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|          NULL|1996-12-08 00:00:00| 88.76151|    m^3/s|usgs-01011000| usgs_observations|streamflow_daily_...|
|

#### Load simulated PSU data into the Eval

In [23]:
# add data to eval: load the simulated (secondary) timeseries parquet files
# NOTE(review): secondaryTS_dir points at the 'secondary_timeseries_TEST'
# folder — confirm this is the intended dataset and not a leftover test subset
ev.secondary_timeseries.load_parquet(in_path=secondaryTS_dir, max_workers=1)

                                                                                

In [24]:
# load table, and inspect: fetch the secondary timeseries as a Spark DataFrame
# and print its first rows
secondary_timeseries_sdf = ev.secondary_timeseries.to_sdf()
secondary_timeseries_sdf.show()

+--------------+-------------------+----------+---------+-------------+------+------------------+--------------------+
|reference_time|         value_time|     value|unit_name|  location_id|member|configuration_name|       variable_name|
+--------------+-------------------+----------+---------+-------------+------+------------------+--------------------+
|          NULL|2003-08-09 00:00:00|  0.544612|    m^3/s|usgs-05452200|  None|          d_sacsma|streamflow_daily_...|
|          NULL|1992-10-01 00:00:00|0.05656362|    m^3/s|usgs-01049000|  None|          d_sacsma|streamflow_daily_...|
|          NULL|2003-08-09 00:00:00| 1.7304579|    m^3/s|usgs-05471200|  None|          d_sacsma|streamflow_daily_...|
|          NULL|1992-10-01 00:00:00|0.16337647|    m^3/s|usgs-01111500|  None|          d_sacsma|streamflow_daily_...|
|          NULL|2003-08-09 00:00:00| 1.2884283|    m^3/s|usgs-05496000|  None|          d_sacsma|streamflow_daily_...|
|          NULL|1992-10-01 00:00:00|   2.21335| 

### Write joined timeseries (JTS)

In [27]:
# joined timeseries BEFORE it is built: the recorded output is an empty table
# NOTE(review): the execution counter jumps from In [24] to In [27], so two
# executed cells are missing from this notebook — confirm that a fresh
# Restart & Run All reproduces the outputs below
sdf = ev.joined_timeseries.to_sdf()
sdf.show()

+--------------+----------+-------------+---------------+-------------+------------------+---------+-------------------+---------------------+------+
|reference_time|value_time|primary_value|secondary_value|variable_name|configuration_name|unit_name|primary_location_id|secondary_location_id|member|
+--------------+----------+-------------+---------------+-------------+------------------+---------+-------------------+---------------------+------+
+--------------+----------+-------------+---------------+-------------+------------------+---------+-------------------+---------------------+------+



In [28]:
# build the joined timeseries table from the loaded primary/secondary data
# (add_attrs=False presumably skips joining location attributes — confirm)
ev.joined_timeseries.create(add_attrs=False)

                                                                                

In [29]:
# joined timeseries AFTER creation: re-read the table and print its first rows
sdf = ev.joined_timeseries.to_sdf()
sdf.show()

+--------------+-------------------+-------------------+---------------------+-------------+---------------+---------+------+------------------+--------------------+
|reference_time|         value_time|primary_location_id|secondary_location_id|primary_value|secondary_value|unit_name|member|configuration_name|       variable_name|
+--------------+-------------------+-------------------+---------------------+-------------+---------------+---------+------+------------------+--------------------+
|          NULL|1993-03-24 00:00:00|      usgs-06799100|        usgs-06799100|     9.542777|      13.559723|    m^3/s|  None|          d_sacsma|streamflow_daily_...|
|          NULL|1992-10-26 00:00:00|      usgs-01011000|        usgs-01011000|     33.93774|       71.04006|    m^3/s|  None|          d_sacsma|streamflow_daily_...|
|          NULL|1993-04-07 00:00:00|      usgs-06799100|        usgs-06799100|     10.67545|      14.996458|    m^3/s|  None|          d_sacsma|streamflow_daily_...|
|   

In [30]:
# shut down the Spark session held by the evaluation; `ev` cannot be used for
# further Spark work after this without being re-created
ev.spark.stop()