In [None]:
import hvplot.pandas
import numpy as np
import pandas as pd
import panel as pn
import geopandas as gpd
from trino.dbapi import connect
import folium
import param
import os

import holoviews as hv
import geoviews as gv
import cartopy.crs as ccrs
from holoviews import opts

from teehr.querying.utils import df_to_gdf

In [2]:
import warnings
warnings.filterwarnings("ignore", message="pandas only supports SQLAlchemy connectable")

In [None]:
# Trino connection configuration.
# Every setting can be overridden with an environment variable of the same
# name; the second argument is the fallback used when the variable is unset.
TRINO_HOST = os.environ.get("TRINO_HOST", "localhost")
# Environment values are always strings, so coerce to int; otherwise the port
# would be an int when defaulted but a str when TRINO_PORT is set.
TRINO_PORT = int(os.environ.get("TRINO_PORT", "8080"))
TRINO_USER = os.environ.get("TRINO_USER", "teehr")
TRINO_CATALOG = os.environ.get("TRINO_CATALOG", "iceberg")
TRINO_SCHEMA = os.environ.get("TRINO_SCHEMA", "teehr")

In [4]:
# Initialise Panel with the Tabulator extension, the "material" design and
# widgets that stretch to the available width by default.
pn.extension(
    "tabulator",
    design="material",
    sizing_mode="stretch_width",
)

In [5]:
class DashboardState(param.Parameterized):
    """Parameterized container holding dashboard state."""

    # A GeoDataFrame of locations; defaults to None until one is assigned.
    locations_gdf = param.ClassSelector(
        default=None,
        class_=gpd.GeoDataFrame,
    )


# Module-level state instance.
state = DashboardState()

In [6]:

def get_trino_connection():
    """Open a new Trino connection from the module-level TRINO_* settings.

    Returns:
        The connection object produced by ``trino.dbapi.connect``. A new
        connection is created on every call.
    """
    # Plain HTTP without credentials.
    # For production, add authentication:
    # auth=BasicAuthentication("username", "password")
    connection_kwargs = dict(
        host=TRINO_HOST,
        port=TRINO_PORT,
        user=TRINO_USER,
        catalog=TRINO_CATALOG,
        schema=TRINO_SCHEMA,
        http_scheme='http',
    )
    return connect(**connection_kwargs)


In [15]:
def get_unique_location_prefixes():
    """Count the locations stored under each location ID prefix.

    The prefix is the part of ``id`` before the first ``-`` in
    ``iceberg.teehr.locations``.

    Returns:
        pandas.DataFrame: one row per prefix with the columns
        ``primary_location_id_prefix`` and ``location_count``.
    """
    query = """
    WITH prefixes AS (
        SELECT 
            split(id, '-')[1] AS id_prefix,
            split(id, '-')[2] AS id_suffix
        FROM iceberg.teehr.locations
    )
    SELECT id_prefix as primary_location_id_prefix,
        count(*) AS location_count
    FROM prefixes
    GROUP BY id_prefix
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises; otherwise
        # every call leaves an open connection behind.
        conn.close()

get_unique_location_prefixes()

Unnamed: 0,primary_location_id_prefix,location_count
0,usgs,9


In [19]:
def get_unique_crosswalks_by_prefix():
    """Count crosswalk rows per (primary prefix, secondary prefix) pair.

    Prefixes are the part of ``primary_location_id`` and
    ``secondary_location_id`` before the first ``-`` in
    ``iceberg.teehr.location_crosswalks``.

    Returns:
        pandas.DataFrame: one row per prefix pair with the columns
        ``primary_id_prefix``, ``secondary_id_prefix`` and ``location_count``.
    """
    query = """
    WITH prefixes AS (
        SELECT 
            split(primary_location_id, '-')[1] AS primary_id_prefix,
            split(primary_location_id, '-')[2] AS primary_id_suffix,
            split(secondary_location_id, '-')[1] AS secondary_id_prefix,
            split(secondary_location_id, '-')[2] AS secondary_id_suffix,
            lc.*
        FROM iceberg.teehr.location_crosswalks lc
    )
    SELECT 
        primary_id_prefix, secondary_id_prefix,
        count(*) AS location_count
    FROM prefixes
    GROUP BY primary_id_prefix, secondary_id_prefix
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

get_unique_crosswalks_by_prefix()

Unnamed: 0,primary_id_prefix,secondary_id_prefix,location_count
0,usgs,nwm30,9


In [23]:
def get_unique_primary_timeseries_configurations():
    """Summarise primary timeseries by location prefix and configuration.

    Rows of ``iceberg.teehr.primary_timeseries`` are grouped by the part of
    ``location_id`` before the first ``-`` together with the configuration,
    variable and unit names.

    Returns:
        pandas.DataFrame: one row per group with the columns
        ``primary_id_prefix``, ``configuration_name``, ``variable_name``,
        ``unit_name`` and ``timeseries_value_count``.
    """
    query = """
    WITH prefixes AS (
        SELECT 
            split(location_id, '-')[1] AS primary_id_prefix,
            split(location_id, '-')[2] AS primary_id_suffix,
            pt.*
        FROM iceberg.teehr.primary_timeseries pt
    )
    SELECT 
        primary_id_prefix, configuration_name, variable_name, unit_name,
        count(*) AS timeseries_value_count
    FROM prefixes
    GROUP BY primary_id_prefix, configuration_name, variable_name, unit_name
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

get_unique_primary_timeseries_configurations()

Unnamed: 0,primary_id_prefix,configuration_name,variable_name,unit_name,timeseries_value_count
0,usgs,usgs_observations,streamflow_hourly_inst,m^3/s,2218992


In [20]:
def get_unique_secondary_timeseries_configurations():
    """Summarise secondary timeseries by location prefix and configuration.

    Rows of ``iceberg.teehr.secondary_timeseries`` are grouped by the part of
    ``location_id`` before the first ``-`` together with the configuration,
    variable and unit names.

    Returns:
        pandas.DataFrame: one row per group with the columns
        ``secondary_id_prefix``, ``configuration_name``, ``variable_name``,
        ``unit_name`` and ``timeseries_value_count``.
    """
    query = """
    WITH prefixes AS (
        SELECT 
            split(location_id, '-')[1] AS secondary_id_prefix,
            split(location_id, '-')[2] AS secondary_id_suffix,
            st.*
        FROM iceberg.teehr.secondary_timeseries st
    )
    SELECT 
        secondary_id_prefix, configuration_name, variable_name, unit_name,
        count(*) AS timeseries_value_count
    FROM prefixes
    GROUP BY secondary_id_prefix, configuration_name, variable_name, unit_name
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

get_unique_secondary_timeseries_configurations()

Unnamed: 0,secondary_id_prefix,configuration_name,variable_name,unit_name,timeseries_value_count
0,nwm30,nwm30_retrospective,streamflow_hourly_inst,m^3/s,3319920


In [28]:
def get_primary_timeseries_statistics_by_location():
    """Compute per-location coverage statistics for primary timeseries.

    Covers every location in ``iceberg.teehr.primary_timeseries`` (no
    location filter is applied), grouped by location, configuration,
    variable and unit.

    Returns:
        pandas.DataFrame: one row per group with the columns ``location_id``,
        ``configuration_name``, ``variable_name``, ``unit_name``,
        ``timeseries_value_count``, ``start_time``, ``end_time``,
        ``hours_between`` and ``days_between``. The last two are the
        ``date_diff`` between the earliest and latest ``value_time``.
    """
    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """
    WITH prefixes AS (
        SELECT 
            split(location_id, '-')[1] AS primary_id_prefix,
            split(location_id, '-')[2] AS primary_id_suffix,
            pt.*
        FROM iceberg.teehr.primary_timeseries pt
    )
    SELECT 
        location_id, configuration_name, variable_name, unit_name,
        count(*) AS timeseries_value_count,
        min(value_time) as start_time,
        max(value_time) as end_time,
        date_diff('hour', min(value_time), max(value_time)) as hours_between,
        date_diff('day', min(value_time), max(value_time)) as days_between
    FROM prefixes
    GROUP BY location_id, configuration_name, variable_name, unit_name
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

get_primary_timeseries_statistics_by_location()

Unnamed: 0,location_id,configuration_name,variable_name,unit_name,timeseries_value_count,start_time,end_time,hours_between,days_between
0,usgs-02424000,usgs_observations,streamflow_hourly_inst,m^3/s,240459,1994-10-01 06:00:00+00:00,2023-12-31 23:00:00+00:00,256409,10683
1,usgs-05443500,usgs_observations,streamflow_hourly_inst,m^3/s,245099,1987-04-01 06:00:00+00:00,2023-12-31 23:00:00+00:00,322169,13423
2,usgs-11421000,usgs_observations,streamflow_hourly_inst,m^3/s,275679,1987-10-01 08:00:00+00:00,2023-12-31 23:00:00+00:00,317775,13240
3,usgs-01570500,usgs_observations,streamflow_hourly_inst,m^3/s,205062,1985-10-01 05:00:00+00:00,2023-12-31 23:00:00+00:00,335298,13970
4,usgs-03068800,usgs_observations,streamflow_hourly_inst,m^3/s,216650,1998-01-23 05:00:00+00:00,2023-12-31 23:00:00+00:00,227370,9473
5,usgs-14319500,usgs_observations,streamflow_hourly_inst,m^3/s,287892,1988-10-02 07:00:00+00:00,2023-12-31 23:00:00+00:00,308968,12873
6,usgs-08313000,usgs_observations,streamflow_hourly_inst,m^3/s,274853,1990-10-01 07:00:00+00:00,2023-12-31 23:00:00+00:00,291472,12144
7,usgs-06770500,usgs_observations,streamflow_hourly_inst,m^3/s,220418,1990-10-01 06:00:00+00:00,2023-12-31 20:00:00+00:00,291470,12144
8,usgs-01347000,usgs_observations,streamflow_hourly_inst,m^3/s,252880,1990-10-01 05:00:00+00:00,2023-12-31 23:00:00+00:00,291474,12144


In [None]:
def get_secondary_timeseries_statistics_by_location():
    """Compute per-location coverage statistics for secondary timeseries.

    Joins ``iceberg.teehr.secondary_timeseries`` to
    ``iceberg.teehr.location_crosswalks`` on the secondary location ID so each
    row also carries its ``primary_location_id``. No location filter is
    applied; rows are grouped by primary location, secondary location,
    configuration, variable and unit.

    Returns:
        pandas.DataFrame: one row per group with the columns
        ``primary_location_id``, ``secondary_location_id``,
        ``configuration_name``, ``variable_name``, ``unit_name``,
        ``timeseries_value_count``, ``start_time``, ``end_time``,
        ``hours_between`` and ``days_between``. The last two are the
        ``date_diff`` between the earliest and latest ``value_time``.
    """
    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """
    WITH prefixes AS (
        SELECT 
            split(location_id, '-')[1] AS primary_id_prefix,
            split(location_id, '-')[2] AS primary_id_suffix,
            split(secondary_location_id, '-')[1] AS secondary_id_prefix,
            split(secondary_location_id, '-')[2] AS secondary_id_suffix,
            lc.primary_location_id,
            st.*
        FROM iceberg.teehr.secondary_timeseries st
        JOIN iceberg.teehr.location_crosswalks lc
            ON st.location_id = lc.secondary_location_id
    )
    SELECT 
        primary_location_id, location_id as secondary_location_id, configuration_name, variable_name, unit_name,
        count(*) AS timeseries_value_count,
        min(value_time) as start_time,
        max(value_time) as end_time,
        date_diff('hour', min(value_time), max(value_time)) as hours_between,
        date_diff('day', min(value_time), max(value_time)) as days_between
    FROM prefixes
    GROUP BY primary_location_id, location_id, configuration_name, variable_name, unit_name
    """
    conn = get_trino_connection()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

get_secondary_timeseries_statistics_by_location()

Unnamed: 0,primary_location_id,secondary_location_id,configuration_name,variable_name,unit_name,count,start_time,end_time,hours_between,days_between
0,usgs-14319500,nwm30-23893934,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
1,usgs-11421000,nwm30-7981844,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
2,usgs-08313000,nwm30-17865930,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
3,usgs-03068800,nwm30-3776515,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
4,usgs-01347000,nwm30-22741627,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
5,usgs-05443500,nwm30-10607692,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
6,usgs-01570500,nwm30-4710000,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
7,usgs-02424000,nwm30-21661814,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369
8,usgs-06770500,nwm30-7268297,nwm30_retrospective,streamflow_hourly_inst,m^3/s,368880,1981-01-01 00:00:00+00:00,2023-01-30 23:00:00+00:00,368879,15369


In [None]:
def _summary_table(df):
    """Wrap a query result in a Tabulator widget using the shared table settings.

    Args:
        df: The DataFrame to display.

    Returns:
        A ``pn.widgets.Tabulator`` with the bootstrap theme, remote
        pagination and 10 rows per page.
    """
    return pn.widgets.Tabulator(
        df,
        theme="bootstrap",
        pagination="remote",
        page_size=10,
    )


# One widget per summary query. Each query is executed once, here, in the
# order listed.
unique_location_prefixes_tabulator = _summary_table(
    get_unique_location_prefixes()
)
unique_primary_timeseries_configurations_tabulator = _summary_table(
    get_unique_primary_timeseries_configurations()
)
unique_secondary_timeseries_configurations_tabulator = _summary_table(
    get_unique_secondary_timeseries_configurations()
)
unique_crosswalks_by_prefix_tabulator = _summary_table(
    get_unique_crosswalks_by_prefix()
)
primary_timeseries_statistics_by_location_tabulator = _summary_table(
    get_primary_timeseries_statistics_by_location()
)
secondary_timeseries_statistics_by_location_tabulator = _summary_table(
    get_secondary_timeseries_statistics_by_location()
)

# locations_widget = pn.widgets.Select(
#     name="location",
#     options=list(state.metrics_gdf['primary_location_id']),
#     value=state.location_id,
# )

# Sidebar column: a Markdown pane that prints the values of the module-level
# TRINO_* settings inside a fenced code block, followed by a "---" entry.
sidebar = pn.Column(
    pn.pane.Markdown(f"""
        ### 🔧 Database Connection Information
        ```
        Host: {TRINO_HOST}
        Port: {TRINO_PORT}
        User: {TRINO_USER}
        Catalog: {TRINO_CATALOG}
        Schema: {TRINO_SCHEMA}
        ```
    """),
    "---",
)

# Main column: an introductory Markdown pane followed by six sections. Each
# section is a Markdown heading/description, the matching Tabulator widget
# built earlier in this cell, and a "---" entry separating it from the next.
main = pn.Column(
    pn.pane.Markdown("""
        # Data Summary Dashboard
        ### This dashboard is a placeholder for various data summaries and statistics related to the TEEHR warehouse.  
        ### We will update this as we identify useful summaries to include.
    """),
    # Section: location ID prefixes.
    pn.pane.Markdown("""
        ## 📊 Unique Location Prefixes
        ### This table provides a summary of unique location ID prefixes stored in the TEEHR warehouse.
    """),
    unique_location_prefixes_tabulator,
    "---",
    # Section: crosswalks by prefix pair.
    pn.pane.Markdown("""
        ## 🔄 Crosswalks by Location Prefixes
        ### This table provides a summary of unique crosswalks grouped by primary and secondary location ID prefixes.
    """),
    unique_crosswalks_by_prefix_tabulator,
    "---",
    # Section: primary timeseries configurations.
    pn.pane.Markdown("""
        ## 📈 Primary Time Series Configurations
        ### This table provides a summary of unique primary time series configurations stored in the TEEHR warehouse.
    """),
    unique_primary_timeseries_configurations_tabulator,
    "---",
    # Section: secondary timeseries configurations.
    pn.pane.Markdown("""
        ## 📈 Secondary Time Series Configurations
        ### This table provides a summary of unique secondary time series configurations stored in the TEEHR warehouse.
    """),
    unique_secondary_timeseries_configurations_tabulator,
    "---",
    # Section: per-location primary timeseries statistics.
    pn.pane.Markdown("""
        ## 📊 Primary Time Series Statistics by Location
        ### This table provides a summary of primary time series statistics grouped by location.
    """),
    primary_timeseries_statistics_by_location_tabulator,
    "---",
    # Section: per-location secondary timeseries statistics.
    pn.pane.Markdown("""
        ## 📊 Secondary Time Series Statistics by Location
        ### This table provides a summary of secondary time series statistics grouped by location.
    """),
    secondary_timeseries_statistics_by_location_tabulator,
    "---"
)


# Assemble the page from the sidebar and main columns, then mark it servable.
dashboard_template = pn.template.MaterialTemplate(
    title="Data Summary Dashboard",
    site="TEEHR",
    main=[main],
    sidebar=[sidebar],
)
dashboard_template.servable()