# Example NGEN to Parquet
This is an example notebook to convert ngen catchments, nexus, forcings, and outputs to parquet files for use in TEEHR.

This code is not at all DRY at the moment.  There is probably room for tools to handle this process in a standardized yet configurable way.  The NextGen output can vary a lot depending on the model, configuration, etc., so careful planning and understanding of the output would be required. Ideally, code could just be pointed at the `realization`, `catchment` and `nexus` files and everything would be determined programmatically from there.  Implementing this would require a deeper understanding of NextGen, BMI, etc. than we currently have.

Before using any of the code below, you should examine the `*.csv` files carefully as the formats could be different.

In [None]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

In [None]:
# Notebook configuration.
# Source side: root folder of the NGEN run and its config / forcings / output subfolders.
NGEN_DIR = "/home/jovyan/cache/AWI_03W_113060_001_NGEN/"
NGEN_CONFIG_DIR = Path(NGEN_DIR) / "config"
NGEN_FORCINGS_DIR = Path(NGEN_DIR) / "forcings"
NGEN_OUTPUT_DIR = Path(NGEN_DIR) / "output"

# Destination side: study folder that receives the time series and geometry parquet files.
STUDY_DIR = "/home/jovyan/shared-readwrite/rti-eval/ngen-simulation-example/"
STUDY_TS_DIR = Path(STUDY_DIR) / "timeseries"
STUDY_GEO_DIR = Path(STUDY_DIR) / "geo"
# Not used by this notebook; kept for reference.
# STUDY_USGS_DIR = Path(STUDY_DIR, "timeseries", "usgs")

## Catchment GeoJSON

In [None]:
# Read the catchment GeoJSON, add a "name" column that is a copy of "id", and
# keep only the id / name / geometry columns before writing the geometry parquet.
catchment_file_gdf = gpd.read_file(Path(NGEN_CONFIG_DIR) / "catchment_data.geojson")
catchment_file_gdf = catchment_file_gdf.assign(name=catchment_file_gdf["id"])[
    ["id", "name", "geometry"]
]
catchment_file_gdf.to_parquet(Path(STUDY_GEO_DIR) / "cat_geometry.parquet")

In [None]:
# The catchment crosswalk maps every catchment id onto itself: both columns
# carry the same "id" values from the catchment geometry table.
catchment_ids = catchment_file_gdf["id"]
catchment_crosswalk = pd.DataFrame(
    {
        "primary_location_id": catchment_ids,
        "secondary_location_id": catchment_ids,
    }
)
catchment_crosswalk.to_parquet(Path(STUDY_GEO_DIR) / "cat_cat_crosswalk.parquet")

## Nexus GeoJSON

In [None]:
# Read the nexus GeoJSON, add a "name" column that is a copy of "id", and keep
# only the id / name / geometry columns before writing the geometry parquet.
nexus_file_gdf = gpd.read_file(Path(NGEN_CONFIG_DIR) / "nexus_data.geojson")
nexus_file_gdf = nexus_file_gdf.assign(name=nexus_file_gdf["id"])[
    ["id", "name", "geometry"]
]
nexus_file_gdf.to_parquet(Path(STUDY_GEO_DIR) / "nex_geometry.parquet")

In [None]:
# The nexus crosswalk maps every nexus id onto itself: both columns carry the
# same "id" values from the nexus geometry table.
nexus_ids = nexus_file_gdf["id"]
nexus_crosswalk = pd.DataFrame(
    {
        "primary_location_id": nexus_ids,
        "secondary_location_id": nexus_ids,
    }
)
nexus_crosswalk.to_parquet(Path(STUDY_GEO_DIR) / "nex_nex_crosswalk.parquet")

## Catchment Forcings

In [None]:
# Collect every catchment forcing CSV matching "cat03w_cat-*.csv". The text
# after the last underscore of the file stem is used as the location_id.
cat_forcing_files = NGEN_FORCINGS_DIR.glob("cat03w_cat-*.csv")

forcing_columns = [
    "reference_time",
    "location_id",
    "value_time",
    "value",
    "variable_name",
    "measurement_unit",
    "configuration",
]

cat_dfs = [
    pd.read_csv(csv_path)
    .rename(columns={"time": "value_time", "precip_rate": "value"}, errors="raise")
    .assign(
        # Scale by 3600 so the values agree with the "mm/hr" unit label
        # (presumably the raw rate is per second -- confirm against the CSVs).
        value=lambda frame: frame["value"] * 3600,
        reference_time="",
        location_id=csv_path.stem.split("_")[-1],
        variable_name="precipitation_rate",
        measurement_unit="mm/hr",
        configuration="AWI_03W_113060_001",
    )[forcing_columns]
    for csv_path in cat_forcing_files
]
# Depending on size, it may be preferable to write one parquet file per
# catchment instead of concatenating all of them in memory.

pd.concat(cat_dfs).to_parquet(Path(STUDY_TS_DIR) / "catchment_forcings.parquet")

## Catchment Output

In [None]:
# Collect every catchment output CSV matching "cat-*.csv". The text after the
# last underscore of the file stem (the whole stem when there is none) is used
# as the location_id.
cat_output_files = NGEN_OUTPUT_DIR.glob("cat-*.csv")

output_columns = [
    "reference_time",
    "location_id",
    "value_time",
    "value",
    "variable_name",
    "measurement_unit",
    "configuration",
]

cat_out_dfs = [
    pd.read_csv(csv_path)
    # "Time Step" is renamed too, even though "lead_time" is dropped by the
    # column selection below, so errors="raise" still rejects a CSV that is
    # missing any of the three expected columns.
    .rename(
        columns={"Time Step": "lead_time", "Time": "value_time", "Q_OUT": "value"},
        errors="raise",
    )
    .assign(
        reference_time="",
        location_id=csv_path.stem.split("_")[-1],
        variable_name="runoff",
        measurement_unit="m3/s",
        configuration="AWI_03W_113060_001",
    )[output_columns]
    for csv_path in cat_output_files
]
# Depending on size, it may be preferable to write one parquet file per
# catchment instead of concatenating all of them in memory.

pd.concat(cat_out_dfs).to_parquet(Path(STUDY_TS_DIR) / "catchment_simulation.parquet")

## Nexus Output

In [None]:
# Convert every nexus output CSV matching "nex-*.csv" into one time series
# table and write it as a single parquet file. The text before the first
# underscore of the file stem is used as the location identifier.
nexus_output_files = NGEN_OUTPUT_DIR.glob(pattern="nex-*.csv")

nex_out_dfs = []
for file in nexus_output_files:
    # Column names are supplied explicitly instead of being read from the file.
    # NOTE(review): header=1 together with names= makes pandas treat the second
    # line of the file as a header row and start the data after it, so the
    # first two lines of every file are not loaded. Confirm the nexus CSVs
    # really begin with two non-data lines; if they have no header at all,
    # header=None is the setting that keeps every row.
    nex_out_df = pd.read_csv(file, header=1, names=["lead_time", "value_time", "value"])
    nex_out_df["configuration"] = "AWI_03W_113060_001"
    # Use "location_id" (not "nexus_id") so this table has the same column
    # names as the catchment forcing and catchment simulation tables.
    nex_out_df["location_id"] = file.stem.split("_")[0]
    nex_out_df["variable_name"] = "streamflow"
    nex_out_df["reference_time"] = ""
    nex_out_df["measurement_unit"] = "m3/s"
    # "lead_time" is intentionally left out of the selected columns.
    nex_out_df = nex_out_df[
        [
            "reference_time",
            "location_id",
            "value_time",
            "value",
            "variable_name",
            "measurement_unit",
            "configuration",
        ]
    ]
    # Depending on size, it may be preferable to write one parquet file per
    # nexus instead of concatenating all of them in memory.
    nex_out_dfs.append(nex_out_df)

pd.concat(nex_out_dfs).to_parquet(Path(STUDY_TS_DIR, "nexus_simulation.parquet"))