# Example NGEN to Parquet
This is an example notebook to convert ngen catchments, nexus, forcings, and outputs to parquet files for use in TEEHR.  It also loads USGS gage locations and grabs USGS gage data.

This code is not at all DRY at the moment.  There is probably room for tools that handle this process in a standardized yet configurable way.  The NextGen output can vary a lot depending on the model, configuration, etc., so careful planning and a good understanding of the output would be required.  Ideally, the code could simply be pointed at the `realization`, `catchment` and `nexus` files and everything else would be determined programmatically from there.  Implementing this would require a deeper understanding of NextGen, BMI, etc. than we currently have.

Before using any of the code below, you should examine the `*.csv` files carefully as the formats could be different.

In [None]:
%%capture
!pip install hydrotools

In [None]:
# Need to install TEEHR to avoid this
import sys
sys.path.insert(0, "../../src")

In [None]:
import pandas as pd
import geopandas as gpd
import geoviews as gv
from pathlib import Path
from datetime import datetime
import hvplot.pandas
import cartopy.crs as ccrs
from holoviews import opts

import teehr.loading.usgs as tlu

In [None]:
# Locations of the NextGen run (inputs) and of the study folder (outputs).
# The *_DIR roots stay plain strings; every sub-directory is a pathlib.Path.
NGEN_DIR = "/home/jovyan/shared-readwrite/rti-eval/awi_16_680661_001/ngen/"
STUDY_DIR = "/home/jovyan/shared-readwrite/rti-eval/awi_16_680661_001/"

# NextGen configuration, forcing and output folders.
NGEN_CONFIG_DIR = Path(NGEN_DIR) / "config"
NGEN_FORCINGS_DIR = Path(NGEN_DIR) / "forcings"
NGEN_OUTPUT_DIR = Path(NGEN_DIR) / "output"

# Study folders the parquet files are written to.
STUDY_TS_DIR = Path(STUDY_DIR) / "timeseries"
STUDY_GEO_DIR = Path(STUDY_DIR) / "geo"

## Catchment GeoJSON

In [None]:
# Read the catchment features, reuse the feature id as the display name,
# keep only the id/name/geometry columns and save them as parquet.
catchment_file_gdf = gpd.read_file(NGEN_CONFIG_DIR / "catchments.geojson")
catchment_file_gdf = catchment_file_gdf.assign(name=catchment_file_gdf["id"])[
    ["id", "name", "geometry"]
]
catchment_file_gdf.to_parquet(STUDY_GEO_DIR / "cat_geometry.parquet")

In [None]:
# Catchment-to-catchment crosswalk: both columns hold the same catchment ids.
catchment_crosswalk = pd.DataFrame(
    {
        column: catchment_file_gdf["id"]
        for column in ("primary_location_id", "secondary_location_id")
    }
)
catchment_crosswalk.to_parquet(STUDY_GEO_DIR / "cat_cat_crosswalk.parquet")

## Nexus GeoJSON

In [None]:
# Read the nexus features, reuse the feature id as the display name,
# keep only the id/name/geometry columns and save them as parquet.
nexus_file_gdf = gpd.read_file(NGEN_CONFIG_DIR / "nexus.geojson")
nexus_file_gdf = nexus_file_gdf.assign(name=nexus_file_gdf["id"])[
    ["id", "name", "geometry"]
]
nexus_file_gdf.to_parquet(STUDY_GEO_DIR / "nex_geometry.parquet")

In [None]:
# Nexus-to-nexus crosswalk: both columns hold the same nexus ids.
nexus_crosswalk = pd.DataFrame(
    {
        column: nexus_file_gdf["id"]
        for column in ("primary_location_id", "secondary_location_id")
    }
)
nexus_crosswalk.to_parquet(STUDY_GEO_DIR / "nex_nex_crosswalk.parquet")

## Catchment Forcings

In [None]:
# Combine the per-catchment forcing CSVs into a single timeseries parquet file.
# The files are sorted so the row order of the output is reproducible;
# glob() on its own yields paths in arbitrary, filesystem-dependent order.
cat_forcing_files = sorted(NGEN_FORCINGS_DIR.glob("cat16_cat-*.csv"))
if not cat_forcing_files:
    # Fail with the directory in the message instead of pandas' generic
    # "No objects to concatenate" error at the concat step below.
    raise FileNotFoundError(
        f"No 'cat16_cat-*.csv' forcing files found in {NGEN_FORCINGS_DIR}"
    )

cat_dfs = []
for file in cat_forcing_files:
    cat_df = pd.read_csv(file, parse_dates=["time"])
    cat_df["configuration"] = "awi_16_680661_001"
    cat_df["variable_name"] = "precipitation_rate"
    # reference_time is written as an empty string for the forcings.
    cat_df["reference_time"] = ""
    cat_df["measurement_unit"] = "mm/hr"
    # The location id is the last "_"-separated part of the file name,
    # e.g. "cat16_cat-123.csv" -> "cat-123".
    cat_df["location_id"] = file.stem.split("_")[-1]
    # errors="raise" makes a CSV without the expected columns fail loudly.
    cat_df = cat_df.rename(
        columns={"time": "value_time", "precip_rate": "value"},
        errors="raise",
    )
    # Scale by 3600 to match the "mm/hr" unit declared above
    # (assumes precip_rate is a per-second rate -- confirm against the CSVs).
    cat_df["value"] = cat_df["value"] * 3600
    cat_df = cat_df[
        [
            "reference_time",
            "location_id",
            "value_time",
            "value",
            "variable_name",
            "measurement_unit",
            "configuration",
        ]
    ]
    # Depending on size, it may be preferable to write one parquet file per
    # catchment here (e.g. named after `location_id`) instead of collecting
    # every frame in memory and concatenating at the end.
    cat_dfs.append(cat_df)

pd.concat(cat_dfs).to_parquet(Path(STUDY_TS_DIR, "catchment_forcings.parquet"))

## Catchment Output

In [None]:
# Combine the per-catchment NextGen output CSVs into one timeseries parquet file.
# The files are sorted so the row order of the output is reproducible;
# glob() on its own yields paths in arbitrary, filesystem-dependent order.
cat_output_files = sorted(NGEN_OUTPUT_DIR.glob("cat-*.csv"))
if not cat_output_files:
    # Fail with the directory in the message instead of pandas' generic
    # "No objects to concatenate" error at the concat step below.
    raise FileNotFoundError(
        f"No 'cat-*.csv' catchment output files found in {NGEN_OUTPUT_DIR}"
    )

cat_out_dfs = []
for file in cat_output_files:
    # Parse "Time" while reading so value_time and reference_time are stored
    # as timestamps rather than strings, matching how the forcing and nexus
    # CSVs are read elsewhere in this notebook.
    cat_out_df = pd.read_csv(file, parse_dates=["Time"])
    # errors="raise" makes a CSV without the expected columns fail loudly.
    cat_out_df = cat_out_df.rename(
        columns={"Time Step": "lead_time", "Time": "value_time", "Q_OUT": "value"},
        errors="raise",
    )
    # NOTE(review): this label lacks the trailing "1" of the study name
    # "awi_16_680661_001" used for the forcings -- confirm it is intentional
    # before changing it, since downstream filters may depend on this value.
    cat_out_df["configuration"] = "awi_16_680661_00"
    # File stems contain no "_", so this is the whole stem, e.g. "cat-123".
    cat_out_df["location_id"] = file.stem.split("_")[-1]
    cat_out_df["variable_name"] = "runoff"
    # The first timestamp in the file is used as the reference time of the run.
    cat_out_df["reference_time"] = cat_out_df["value_time"].iloc[0]
    cat_out_df["measurement_unit"] = "m3/s"
    cat_out_df = cat_out_df[
        [
            "reference_time",
            "location_id",
            "value_time",
            "value",
            "variable_name",
            "measurement_unit",
            "configuration",
        ]
    ]
    # Depending on size, it may be preferable to write one parquet file per
    # catchment here (e.g. named after `location_id`) instead of collecting
    # every frame in memory and concatenating at the end.
    cat_out_dfs.append(cat_out_df)

pd.concat(cat_out_dfs).to_parquet(Path(STUDY_TS_DIR, "catchment_simulation.parquet"))

## Nexus Output

In [None]:
# Combine the per-nexus NextGen output CSVs into one timeseries parquet file.
# The files are sorted so the row order of the output is reproducible;
# glob() on its own yields paths in arbitrary, filesystem-dependent order.
nexus_output_files = sorted(NGEN_OUTPUT_DIR.glob("nex-*.csv"))
if not nexus_output_files:
    # Fail with the directory in the message instead of pandas' generic
    # "No objects to concatenate" error at the concat step below.
    raise FileNotFoundError(
        f"No 'nex-*.csv' nexus output files found in {NGEN_OUTPUT_DIR}"
    )

nex_out_dfs = []
for file in nexus_output_files:
    # header=1 makes pandas treat the second line of the file as the header
    # row (replaced by `names`), so the first two lines are not read as data.
    # NOTE(review): confirm the nexus CSVs really start with two non-data
    # lines; if they have no header, header=None is needed to keep every row.
    nex_out_df = pd.read_csv(
        file,
        header=1,
        names=["lead_time", "value_time", "value"],
        parse_dates=["value_time"],
    )
    # NOTE(review): this label lacks the trailing "1" of the study name
    # "awi_16_680661_001" used for the forcings -- confirm it is intentional
    # before changing it, since downstream filters may depend on this value.
    nex_out_df["configuration"] = "awi_16_680661_00"
    # The location id is the part of the file name before the first "_".
    nex_out_df["location_id"] = file.stem.split("_")[0]
    nex_out_df["variable_name"] = "streamflow"
    # The first timestamp read from the file is used as the reference time.
    nex_out_df["reference_time"] = nex_out_df["value_time"].iloc[0]
    nex_out_df["measurement_unit"] = "m3/s"
    nex_out_df = nex_out_df[
        [
            "reference_time",
            "location_id",
            "value_time",
            "value",
            "variable_name",
            "measurement_unit",
            "configuration",
        ]
    ]
    # Depending on size, it may be preferable to write one parquet file per
    # nexus here (e.g. named after `location_id`) instead of collecting
    # every frame in memory and concatenating at the end.
    nex_out_dfs.append(nex_out_df)

pd.concat(nex_out_dfs).to_parquet(Path(STUDY_TS_DIR, "nexus_simulation.parquet"))

## USGS Gages and Crosswalk
The gage data GeoJSON file `usgs_awi_16_680661_001_gages.geojson` was created manually using QGIS and the nexus and catchment GeoJSON files from the study for this example project, but could be automated in the future.

First the `usgs_awi_16_680661_001_gages.geojson` is converted to parquet and then a crosswalk table is created and also saved as a parquet file.

In [None]:
# Convert the hand-made USGS gage GeoJSON to parquet.
usgs_gdf = gpd.read_file(STUDY_GEO_DIR / "usgs_awi_16_680661_001_gages.geojson")
# Map the gage columns onto id/name; errors="raise" fails if either is missing.
usgs_gdf = usgs_gdf.rename(columns={"STAID": "id", "STANAME": "name"}, errors="raise")
# Prefix every station id with "usgs-".
usgs_gdf["id"] = "usgs-" + usgs_gdf["id"].astype(str)
usgs_gdf = usgs_gdf[["id", "name", "geometry"]]
usgs_gdf.to_parquet(STUDY_GEO_DIR / "usgs_awi_16_680661_001_geometry.parquet")
# usgs_gdf

In [None]:
# The gage/nexus pairs below were matched by hand because the hydrofabric used
# in this example does not match the published hydrofabric. In the future this
# could be derived automatically from the hydrofabric data.
# Each tuple is (USGS gage id, nexus id).
usgs_nex_pairs = [
    ("usgs-10154200", "nex-680635"),
    ("usgs-10155000", "nex-680639"),
    ("usgs-10155200", "nex-680646"),
    ("usgs-10155500", "nex-680649"),
    ("usgs-10156000", "nex-680892"),
    ("usgs-10157500", "nex-680741"),
    ("usgs-10163000", "nex-680662"),
]
nex_location_ids = [nex_id for _, nex_id in usgs_nex_pairs]
usgs_gage_ids = [gage_id for gage_id, _ in usgs_nex_pairs]

# USGS gages are the primary locations, nexus points the secondary ones.
usgs_nex_crosswalk = pd.DataFrame(
    usgs_nex_pairs,
    columns=["primary_location_id", "secondary_location_id"],
)
usgs_nex_crosswalk.to_parquet(STUDY_GEO_DIR / "usgs_nex_crosswalk.parquet")

## Load USGS Gage Data

In [None]:
# Fetch USGS gage data with the TEEHR loader and write it as parquet.
# The loader is given the site codes with the "usgs-" prefix removed.
usgs_site_codes = [
    location_id.replace("usgs-", "")
    for location_id in usgs_nex_crosswalk["primary_location_id"]
]
usgs_start = datetime(1980, 1, 1)
usgs_end = datetime(1980, 2, 1)
tlu.usgs_to_parquet(
    sites=usgs_site_codes,
    start_date=usgs_start,
    end_date=usgs_end,
    output_parquet_dir=STUDY_TS_DIR,
)

## Let's look at the data

In [None]:
# OpenStreetMap tile source; used as the first (bottom) layer of the map overlay.
tiles = gv.tile_sources.OSM

In [None]:
# Load the catchment geometry and reproject it to EPSG:3857 (Web Mercator),
# the projection the plot is declared in.
cat_gdf = gpd.read_parquet(STUDY_GEO_DIR / "cat_geometry.parquet")
cat_gdf = cat_gdf.to_crs("EPSG:3857")
catchments = cat_gdf.hvplot(crs=ccrs.GOOGLE_MERCATOR)

In [None]:
# Load the nexus geometry, reproject it to EPSG:3857 (Web Mercator) and
# draw it in red.
nex_gdf = gpd.read_parquet(STUDY_GEO_DIR / "nex_geometry.parquet")
nex_gdf = nex_gdf.to_crs("EPSG:3857")
nexus = nex_gdf.hvplot(color=["red"], crs=ccrs.GOOGLE_MERCATOR)

In [None]:
# Load the USGS gage geometry, reproject it to EPSG:3857 (Web Mercator) and
# draw it in green.
usgs_gdf = gpd.read_parquet(STUDY_GEO_DIR / "usgs_awi_16_680661_001_geometry.parquet")
usgs_gdf = usgs_gdf.to_crs("EPSG:3857")
usgs = usgs_gdf.hvplot(color=["green"], crs=ccrs.GOOGLE_MERCATOR)

In [None]:
# Stack the basemap, catchments, nexus points and gages into one overlay and
# display it at 800x600.
map_overlay = tiles * catchments * nexus * usgs
map_overlay.opts(width=800, height=600)