In [None]:
import os
from pathlib import Path
import shutil
from datetime import datetime
from datetime import timedelta

import teehr

from teehr.evaluation.spark_session_utils import create_spark_session

In [None]:
# Local directory handed to teehr.Evaluation as `dir_path` further down.
LOCAL_EV_DIR = "/data/temp_warehouse"

In [None]:
# Credentials passed to create_spark_session. They are read from the
# MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment variables when set, so real
# secrets never need to be committed with the notebook. The fallbacks are the
# previous hard-coded local-development values, so behaviour is unchanged when
# the variables are absent.
spark = create_spark_session(
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin123"),
)

In [None]:
# Open the evaluation rooted at LOCAL_EV_DIR using the Spark session `spark`.
# NOTE(review): check_evaluation_version=False presumably skips a version
# compatibility check on the evaluation directory — confirm against TEEHR docs.
ev = teehr.Evaluation(
    spark=spark,
    dir_path=LOCAL_EV_DIR,
    check_evaluation_version=False
)

In [None]:
# Make "remote" the evaluation's active catalog for the cells that follow.
# NOTE(review): presumably this is the catalog backing the iceberg.teehr.*
# tables queried later in the notebook — confirm.
ev.set_active_catalog("remote")

#### Get USGS locations

In [None]:
# Pull every location id and fold them into one "', '"-separated string, ready
# to be dropped between the outer quotes of a SQL `IN ('...')` list.
locations_list = "', '".join(
    row[0] for row in ev.locations.to_sdf().select("id").collect()
)

In [None]:
# Display the joined id string as a quick sanity check.
locations_list

In [None]:
# Single "now" timestamp: used as the end of both fetch windows and as the
# base for the one-day fallback start when no data has been loaded yet.
# NOTE(review): datetime.now() is naive local time — confirm it matches the
# timezone of value_time / reference_time in the warehouse.
CURRENT_DT = datetime.now()

#### Check latest available USGS in primary

This assumes every gage shares the same most-recent `value_time`. If the records are "sparse" (gages updated at different times), we may need to fetch one gage at a time.

In [None]:
# Newest USGS observation currently stored in primary_timeseries, taken across
# all locations. A per-location variant would add
# `location_id IN ('{locations_list}')` to the WHERE clause.
latest_usgs_value_time = ev.spark.sql("""
    SELECT value_time, location_id
    FROM iceberg.teehr.primary_timeseries
    WHERE 
        configuration_name = 'usgs_observations'
    ORDER BY value_time DESC
    LIMIT 1
;""").collect()

if not latest_usgs_value_time:
    # Nothing stored yet: start the fetch window one day back.
    start_dt = CURRENT_DT - timedelta(days=1)
else:
    # Resume one minute after the newest stored value.
    latest_usgs_value_time = latest_usgs_value_time[0]["value_time"]
    start_dt = latest_usgs_value_time + timedelta(minutes=1)

#### Fetch and load USGS data

The default `write_mode` is "append", which allows duplicate records to be written.

In [None]:
# Fetch USGS streamflow between start_dt (set by the latest-value check) and
# CURRENT_DT.
ev.fetch.usgs_streamflow(
    start_date=start_dt,
    end_date=CURRENT_DT
)

#### Check latest NWM forecast data

In [None]:
# from pyspark.sql import functions as F

In [None]:
# NWM configuration and version; both are fed to
# format_nwm_configuration_metadata to build the TEEHR configuration metadata.
NWM_CONFIGURATION = "short_range"
NWM_VERSION = "nwm30"

#### Fetch and load NWM forecasts

In [None]:
# (
#     ev
#     .location_crosswalks
#     .to_sdf()
#     .filter(F.col("secondary_location_id").startswith("nwm30-"))
#     .select("secondary_location_id")
#     .distinct()
#     .select("secondary_location_id").rdd.map(lambda row: row[0]).collect()
# )

In [None]:
from teehr.fetching.utils import format_nwm_configuration_metadata

In [None]:
# Build the TEEHR configuration metadata for the chosen NWM configuration and
# version. Its "name" entry is what the secondary_timeseries query filters on.
teehr_nwm_config = format_nwm_configuration_metadata(
    nwm_config_name=NWM_CONFIGURATION,
    nwm_version=NWM_VERSION
)
print(teehr_nwm_config)

In [None]:
# Most recent NWM reference_time already stored for this configuration, across
# all locations. The ORDER BY is required: LIMIT 1 on its own returns an
# arbitrary row, not the newest one.
latest_nwm_reference_time = ev.spark.sql(f"""
    SELECT reference_time, location_id
    FROM iceberg.teehr.secondary_timeseries
    WHERE 
        configuration_name = '{teehr_nwm_config["name"]}'
    ORDER BY reference_time DESC
    LIMIT 1
;""").collect()
if len(latest_nwm_reference_time) > 0:
    # Resume one minute after the newest stored forecast reference time.
    latest_nwm_reference_time = latest_nwm_reference_time[0].asDict()["reference_time"]
    start_dt = latest_nwm_reference_time + timedelta(minutes=1)
else:
    # Nothing stored yet for this configuration: start one day back.
    start_dt = CURRENT_DT - timedelta(days=1)

NOTE: Can we use dask for NWM fetching?

In [None]:
# Create a dask.distributed Client with no arguments and display it.
# NOTE(review): with no arguments this presumably starts a local cluster —
# confirm that is the intended deployment.
from dask.distributed import Client
client = Client()
client

In [None]:
# Point dask's dashboard link at a JupyterHub proxy path.
# NOTE(review): the {JUPYTERHUB_SERVICE_PREFIX} and {port} placeholders are
# passed through unformatted; presumably dask fills them in — confirm.
import dask
dask.config.set({"distributed.dashboard.link": "{JUPYTERHUB_SERVICE_PREFIX}proxy/{port}/status"})

Note: the NWM fetch below is currently running into memory issues.

In [None]:
%%time
# Fetch NWM operational point streamflow between start_dt and CURRENT_DT.
# Configuration and version come from NWM_CONFIGURATION / NWM_VERSION, the same
# constants used to build the reference-time lookup, so the data fetched always
# matches the configuration that start_dt was derived from.
ev.fetch.nwm_operational_points(
    start_date=start_dt,
    end_date=CURRENT_DT,
    nwm_configuration=NWM_CONFIGURATION,
    nwm_version=NWM_VERSION,
    output_type="channel_rt",
    variable_name="streamflow"
)

In [None]:

# Show the start of the fetch window. When the notebook is run top to bottom
# this is the NWM start_dt, since that cell assigns it last.
print(start_dt)