In [None]:
import os
from pathlib import Path
import shutil

import teehr
import pandas as pd
import requests

# Show the installed teehr version as this cell's output.
teehr.__version__

In [None]:
from teehr.evaluation.spark_session_utils import create_spark_session

# Object-store credentials are read from the environment so real secrets never
# have to be written into the notebook. When MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# are not set, the local development values used previously are kept.
aws_access_key_id = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
aws_secret_access_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

# NOTE(review): AnonymousAWSCredentialsProvider is Hadoop S3A's provider for
# unsigned (anonymous) requests. Confirm whether the key pair above is still
# used by the session once this provider is configured, or whether one of the
# two settings can be dropped.
spark = create_spark_session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    update_configs={
        "spark.hadoop.fs.s3a.aws.credentials.provider":  "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
    }
)

In [None]:
%%time
# Directory handed to the evaluation as its working location.
dir_path = "/data/temp_warehouse"

# Build the evaluation on top of the Spark session from the previous cell.
# NOTE(review): create_dir=True presumably creates dir_path when it is
# missing - confirm against the teehr.Evaluation documentation.
ev = teehr.Evaluation(dir_path=dir_path, spark=spark, create_dir=True)

In [None]:
# Make the catalog named "remote" the active one for the cells that follow.
ev.set_active_catalog("remote")

# Echo the active catalog as the cell output to confirm the switch.
ev.active_catalog

### Add configuration

In [None]:
from teehr import Configuration

# Describe the NWPS RFC streamflow forecast as a "secondary" configuration
# and register it with the evaluation.
configuration = Configuration(
    description="NWPS RFC Streamflow Forecast",
    type="secondary",
    name="nwpsrfc_streamflow_forecast",
)
ev.configurations.add(configuration)

### Add variable

In [None]:
from teehr import Variable

# Describe the instantaneous 6-hour streamflow variable and register it with
# the evaluation.
variable = Variable(
    long_name="Instantaneous 6-hour streamflow",
    name="streamflow_6hr_inst",
)
ev.variables.add(variable)

### Add crosswalk entries

In [None]:
def get_new_crosswalks(timeout: float = 30.0) -> pd.DataFrame:
    """Build primary -> NWPS RFC crosswalk rows for the evaluation's locations.

    Every distinct ``primary_location_id`` in ``ev.location_crosswalks`` has
    its ``usgs-`` prefix stripped and is looked up at the NWPS gauges
    endpoint. The ``lid`` field of the JSON response, prefixed with
    ``nwpsrfc-``, becomes the secondary location ID for that primary ID.

    Gauges whose request fails, or whose response carries no ``lid``, are
    reported with ``print`` and left out of the result. Each row is built as a
    (primary, secondary) pair, so the two columns always stay aligned.

    Parameters
    ----------
    timeout : float, default 30.0
        Seconds to wait on each HTTP request before treating it as failed.

    Returns
    -------
    pd.DataFrame
        Columns ``primary_location_id`` and ``secondary_location_id``, one row
        per gauge that was resolved. Empty (but with both columns) when no
        gauge could be resolved.
    """
    columns = ['primary_location_id', 'secondary_location_id']

    og_df = ev.location_crosswalks.to_pandas()
    # Look each gauge up once, even if it appears in several crosswalk rows.
    primary_ids = og_df['primary_location_id'].drop_duplicates()
    usgs_stripped = primary_ids.str.removeprefix('usgs-')

    records = []
    for primary_id, usgs_id in zip(primary_ids, usgs_stripped):
        endpoint = f"https://api.water.noaa.gov/nwps/v1/gauges/{usgs_id}"
        try:
            # Without a timeout a single stalled connection blocks the cell.
            response = requests.get(endpoint, timeout=timeout)
            response.raise_for_status()
            metadata = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers response bodies that are not valid JSON.
            print(f"exception: {e}")
            continue

        lid = metadata.get('lid') if isinstance(metadata, dict) else None
        if not lid:
            print(f"lid not found for usgs_id: {usgs_id}")
            continue

        # Keep the primary ID exactly as stored so it still matches the
        # existing location, and pair it with the LID found for it.
        records.append({
            'primary_location_id': primary_id,
            'secondary_location_id': 'nwpsrfc-' + lid,
        })

    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the primary -> NWPS RFC crosswalk rows (one HTTP lookup per gauge).
df = get_new_crosswalks()

# Hand the new rows to the evaluation's location_crosswalks loader.
ev.location_crosswalks.load_dataframe(df)

### Stop the Spark session

In [None]:
# Stop the Spark session held by the evaluation now that the work is done.
ev.spark.stop()