This file contains an example of how to build a simple TEEHR dataset.

All of the input data are CSV and GeoJSON files.  This is intended to be
the simplest example of how TEEHR can be used.

In [None]:
import pandas as pd
import geopandas as gpd

# Setup database paths
from pathlib import Path
from teehr.classes.duckdb_database import DuckDBDatabase
from teehr.classes.duckdb_joined_parquet import DuckDBJoinedParquet

In [None]:
# NOTE(review): this command is incomplete as written - `aws s3 cp` is given
# no source or destination. Presumably it should copy the raw example data
# into the folder used as RAW_DATA_FILEPATH ("../raw"); fill in the arguments
# before running this cell.
!aws s3 cp

In [None]:
# Folder holding the raw CSV and GeoJSON input files
RAW_DATA_FILEPATH = Path("..") / "raw"

# Base directory of the TEEHR dataset built by this example.
# To keep it under the home directory instead, use:
# TEEHR_BASE = Path.home() / "teehr/example_1/teehr_base"
TEEHR_BASE = Path("..") / "teehr_base"

In [None]:
# One folder per type of TEEHR 'table', plus the database file,
# all located directly under TEEHR_BASE
PRIMARY_FILEPATH = Path(TEEHR_BASE) / "primary"
SECONDARY_FILEPATH = Path(TEEHR_BASE) / "secondary"
CROSSWALK_FILEPATH = Path(TEEHR_BASE) / "crosswalk"
GEOMETRY_FILEPATH = Path(TEEHR_BASE) / "geometry"
ATTRIBUTE_FILEPATH = Path(TEEHR_BASE) / "attribute"
JOINED_FILEPATH = Path(TEEHR_BASE) / "joined"
DB_FILEPATH = Path(TEEHR_BASE) / "teehr.db"

# Create the folders together with any missing parents;
# folders that already exist are left as they are
PRIMARY_FILEPATH.mkdir(parents=True, exist_ok=True)
SECONDARY_FILEPATH.mkdir(parents=True, exist_ok=True)
CROSSWALK_FILEPATH.mkdir(parents=True, exist_ok=True)
GEOMETRY_FILEPATH.mkdir(parents=True, exist_ok=True)
ATTRIBUTE_FILEPATH.mkdir(parents=True, exist_ok=True)
JOINED_FILEPATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Convert the location data (GeoJSON) to parquet
locations = gpd.read_file(Path(RAW_DATA_FILEPATH) / "gages.geojson")
locations.to_parquet(Path(GEOMETRY_FILEPATH) / "locations.parquet")
locations

In [None]:
# Convert the crosswalks from CSV to parquet, starting with the sim crosswalk
sim_xw = pd.read_csv(Path(RAW_DATA_FILEPATH) / "sim-crosswalk.csv")
sim_xw.to_parquet(Path(CROSSWALK_FILEPATH) / "sim-crosswalk.parquet")
sim_xw

In [None]:
# Same conversion for the baseline crosswalk
baseline_xw = pd.read_csv(Path(RAW_DATA_FILEPATH) / "baseline-crosswalk.csv")
baseline_xw.to_parquet(Path(CROSSWALK_FILEPATH) / "baseline-crosswalk.parquet")
baseline_xw

In [None]:
# Convert each attribute CSV to a parquet file in the attribute folder
# and display the table that was read

# 2-year discharge
attr1 = pd.read_csv(Path(RAW_DATA_FILEPATH) / "gage_attr_2yr_discharge.csv")
attr1.to_parquet(Path(ATTRIBUTE_FILEPATH) / "2yr_discharge.parquet")
display(attr1)

# drainage area
attr2 = pd.read_csv(Path(RAW_DATA_FILEPATH) / "gage_attr_drainage_area_km2.csv")
attr2.to_parquet(Path(ATTRIBUTE_FILEPATH) / "drainage_area.parquet")
display(attr2)

# ecoregion
attr3 = pd.read_csv(Path(RAW_DATA_FILEPATH) / "gage_attr_ecoregion.csv")
attr3.to_parquet(Path(ATTRIBUTE_FILEPATH) / "ecoregion.parquet")
display(attr3)

In [None]:
# Convert the timeseries, starting by reading the observations CSV
obs = pd.read_csv(Path(RAW_DATA_FILEPATH) / "obs.csv")
obs

In [None]:
# Fill in the remaining columns that TEEHR requires
obs['configuration'] = 'usgs'
obs['variable_name'] = 'streamflow_daily_mean'
obs['measurement_unit'] = 'cms'
obs['reference_time'] = None
# A reference_time column that is set to None has to be cast to
# datetime64[ns] before it is written out
obs = obs.astype({'reference_time': 'datetime64[ns]'})
obs.to_parquet(Path(PRIMARY_FILEPATH) / "obs.parquet")
obs

In [None]:
# Read the baseline timeseries CSV
baseline_ts = pd.read_csv(Path(RAW_DATA_FILEPATH) / "baseline.csv")
baseline_ts

In [None]:
# Fill in the remaining columns that TEEHR requires
baseline_ts['configuration'] = 'modeled'
baseline_ts['variable_name'] = 'streamflow_daily_mean'
baseline_ts['measurement_unit'] = 'cms'
baseline_ts['reference_time'] = None
# A reference_time column that is set to None has to be cast to
# datetime64[ns] before it is written out
baseline_ts = baseline_ts.astype({'reference_time': 'datetime64[ns]'})
baseline_ts.to_parquet(Path(SECONDARY_FILEPATH) / "baseline.parquet")
baseline_ts

In [None]:
# Read the sim timeseries CSV
sim_ts = pd.read_csv(Path(RAW_DATA_FILEPATH) / "sim.csv")
sim_ts

In [None]:
# Add the other columns required for TEEHR
sim_ts['configuration'] = 'sim'
sim_ts['variable_name'] = 'streamflow_daily_mean'
sim_ts['measurement_unit'] = 'cms'
sim_ts['reference_time'] = None
# Reference_time column must be cast as type datetime64[ns] if set to None.
# The cast is applied to sim_ts' own column: taking the column from
# baseline_ts instead would make this cell depend on the baseline cell having
# run, and would copy baseline reference times into the sim data by index.
sim_ts['reference_time'] = (
    sim_ts['reference_time'].astype('datetime64[ns]')
)
sim_ts.to_parquet(Path(SECONDARY_FILEPATH, "sim.parquet"))
sim_ts

In [None]:
# Rebind each table path to a glob pattern that matches every parquet file
# under that table's folder. The patterns are rebuilt from TEEHR_BASE rather
# than from the current value of each variable, so running this cell more
# than once does not append the "/**/*.parquet" suffix a second time.
PRIMARY_FILEPATH = f"{Path(TEEHR_BASE, 'primary')}/**/*.parquet"
SECONDARY_FILEPATH = f"{Path(TEEHR_BASE, 'secondary')}/**/*.parquet"
CROSSWALK_FILEPATH = f"{Path(TEEHR_BASE, 'crosswalk')}/**/*.parquet"
GEOMETRY_FILEPATH = f"{Path(TEEHR_BASE, 'geometry')}/**/*.parquet"
ATTRIBUTE_FILEPATH = f"{Path(TEEHR_BASE, 'attribute')}/**/*.parquet"

In [None]:
# Join the data
# Delete any database file already present at DB_FILEPATH before the
# DuckDBDatabase object is created on that same path.
if DB_FILEPATH.is_file():
    DB_FILEPATH.unlink()

# NOTE(review): presumably this creates a fresh database file at
# DB_FILEPATH - confirm against the TEEHR documentation.
db = DuckDBDatabase(DB_FILEPATH)

In [None]:
# Insert the timeseries data
# The primary, secondary and crosswalk arguments are the parquet glob
# patterns (strings), not the original folder paths.
# NOTE(review): the exact effect of drop_added_fields=True is not visible
# in this file - confirm against the TEEHR DuckDBDatabase documentation.
db.insert_joined_timeseries(
    primary_filepath=PRIMARY_FILEPATH,
    secondary_filepath=SECONDARY_FILEPATH,
    crosswalk_filepath=CROSSWALK_FILEPATH,
    drop_added_fields=True,
)

In [None]:
# Insert geometry
# GEOMETRY_FILEPATH is the glob pattern for the parquet files in the
# geometry folder (locations.parquet is written there earlier).
db.insert_geometry(GEOMETRY_FILEPATH)

In [None]:
# Insert attributes
# ATTRIBUTE_FILEPATH is the glob pattern for the parquet files in the
# attribute folder (the three attribute tables are written there earlier).
db.insert_attributes(ATTRIBUTE_FILEPATH)

In [None]:
# Export the joined_timeseries table, ordered by configuration, primary
# location and value time, to joined.parquet inside the JOINED_FILEPATH folder.
# JOINED_FILEPATH must still be the folder path when this runs; it is rebound
# to a glob pattern in a later cell.
db.query(f"""
    COPY (
        SELECT *
        FROM joined_timeseries
        ORDER BY configuration, primary_location_id, value_time
    )
   TO '{JOINED_FILEPATH}/joined.parquet' (FORMAT PARQUET)
""")


In [None]:
# Rebind JOINED_FILEPATH to a glob pattern matching every parquet file under
# the joined folder. The pattern is rebuilt from TEEHR_BASE rather than from
# the variable's current value, so running this cell more than once does not
# append the "/**/*.parquet" suffix a second time.
JOINED_FILEPATH = f"{Path(TEEHR_BASE, 'joined')}/**/*.parquet"

In [None]:
# Rebind `db` to a DuckDBJoinedParquet object built from the joined and
# geometry parquet glob patterns; from here on `db` no longer refers to the
# DuckDBDatabase object created earlier.
db = DuckDBJoinedParquet(JOINED_FILEPATH, GEOMETRY_FILEPATH)

In [None]:
# Fetch the joined timeseries for the single location "gage-A",
# ordered by configuration, location and value time
jts = db.get_joined_timeseries(
    filters=[
        dict(column="primary_location_id", operator="=", value="gage-A"),
    ],
    order_by=["configuration", "primary_location_id", "value_time"],
)
jts

In [None]:
# Request all metrics, grouped and ordered by location and configuration,
# then print the result
metrics = db.get_metrics(
    include_metrics="all",
    group_by=["primary_location_id", "configuration"],
    order_by=["primary_location_id", "configuration"],
)
print(metrics)