# Example NGEN to Parquet
This is an example notebook to convert ngen catchments, nexus, forcings, and outputs to parquet files for use in TEEHR.

This code is not at all DRY at the moment.  There is probably room for tools that handle this process in a standardized yet configurable way.  The NextGen output can vary a lot depending on the model, configuration, etc., so careful planning and a good understanding of the output are required.

In [None]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

In [None]:
# Paths used throughout the notebook.

# Source: root of the ngen run and the sub-directories that are read from.
NGEN_DIR = "/home/jovyan/cache/AWI_03W_113060_001_NGEN/"
NGEN_CONFIG_DIR = Path(NGEN_DIR) / "config"
NGEN_FORCINGS_DIR = Path(NGEN_DIR) / "forcings"
NGEN_OUTPUT_DIR = Path(NGEN_DIR) / "output"

# Destination: root of the study directory and its sub-directories.
STUDY_DIR = "/home/jovyan/cache/ngen_simulation_example/"
STUDY_FORCINGS_DIR = Path(STUDY_DIR) / "timeseries" / "forcings"
STUDY_SIM_DIR = Path(STUDY_DIR) / "timeseries" / "simulation"
STUDY_GEO_DIR = Path(STUDY_DIR) / "geo"
STUDY_USGS_DIR = Path(STUDY_DIR) / "timeseries" / "usgs"

## Catchment GeoJSON

In [None]:
# Read the catchment GeoJSON from the ngen config directory with geopandas.
catchment_file_gdf = gpd.read_file(NGEN_CONFIG_DIR / "catchment_data.geojson")

## Nexus GeoJSON

In [None]:
# Read the nexus GeoJSON from the ngen config directory with geopandas.
nexus_file_gdf = gpd.read_file(NGEN_CONFIG_DIR / "nexus_data.geojson")

## Catchment Forcings

In [None]:
# Collect the per-catchment forcing CSVs.
# Path.glob() returns a one-shot generator, so materialize it as a sorted list:
# the conversion cell can then be re-run on its own without silently iterating
# over nothing, and files are processed in a stable order.
cat_forcing_files = sorted(NGEN_FORCINGS_DIR.glob("cat03w_cat-*.csv"))

In [None]:
# Convert every catchment forcing CSV into one parquet file per catchment.
# Create the destination up front so the writes below do not fail on a fresh
# study directory.
STUDY_FORCINGS_DIR.mkdir(parents=True, exist_ok=True)
for file in cat_forcing_files:
    # The id is whatever follows the last underscore of the file stem
    # (e.g. "cat03w_cat-<n>" -> "cat-<n>"); it is both a column value and
    # the output file name.
    catchment_id = file.stem.split("_")[-1]

    cat_df = pd.read_csv(file)
    cat_df["configuration"] = "AWI_03W_113060_001"
    cat_df["variable_name"] = "precipitation_flux"
    cat_df["reference_time"] = ""
    cat_df["measurement_unit"] = "mm s^-1"
    cat_df["catchment_id"] = catchment_id
    # errors="raise" makes a CSV without the expected columns fail loudly
    # here instead of producing a parquet file with missing fields.
    cat_df.rename(columns={"time":"value_time", "precip_rate":"value"}, inplace=True, errors="raise")
    # Keep only the output columns, in a fixed order.
    cat_df = cat_df[["reference_time", "catchment_id", "value_time", "value", "variable_name", "measurement_unit", "configuration"]]
    cat_df.to_parquet(Path(STUDY_FORCINGS_DIR, f"{catchment_id}.parquet"))

## Catchment Output

In [None]:
# Collect the per-catchment output CSVs.
# Path.glob() returns a one-shot generator, so materialize it as a sorted list:
# the conversion cell can then be re-run on its own without silently iterating
# over nothing, and files are processed in a stable order.
cat_output_files = sorted(NGEN_OUTPUT_DIR.glob("cat-*.csv"))

In [None]:
# Convert every catchment output CSV into one parquet file per catchment.
# Results go to the study's simulation timeseries directory; create it up
# front so the writes below do not fail on a fresh study directory.
STUDY_SIM_DIR.mkdir(parents=True, exist_ok=True)
for file in cat_output_files:
    # The id is whatever follows the last underscore of the file stem; for a
    # stem without underscores (e.g. "cat-<n>") that is the whole stem.
    catchment_id = file.stem.split("_")[-1]

    cat_out_df = pd.read_csv(file)
    cat_out_df["configuration"] = "AWI_03W_113060_001"
    cat_out_df["catchment_id"] = catchment_id
    cat_out_df["variable_name"] = "runoff"
    cat_out_df["reference_time"] = ""
    cat_out_df["measurement_unit"] = "m^3/s"
    # errors="raise" makes a CSV without the expected columns fail loudly.
    # "lead_time" is renamed here but not kept in the selection below.
    cat_out_df.rename(columns={"Time Step":"lead_time","Time":"value_time", "Q_OUT": "value"}, inplace=True, errors="raise")
    # Keep only the output columns, in a fixed order.
    cat_out_df = cat_out_df[["reference_time", "catchment_id", "value_time", "value", "variable_name", "measurement_unit", "configuration"]]
    cat_out_df.to_parquet(Path(STUDY_SIM_DIR, f"{catchment_id}.parquet"))

In [None]:
## Nexus Output

In [None]:
# Collect the per-nexus output CSVs.
# Path.glob() returns a one-shot generator, so materialize it as a sorted list:
# the conversion cell can then be re-run on its own without silently iterating
# over nothing, and files are processed in a stable order.
nexus_output_files = sorted(NGEN_OUTPUT_DIR.glob("nex-*.csv"))

In [None]:
# Convert every nexus output CSV into one parquet file per nexus.
# Results go to the study's simulation timeseries directory; create it up
# front so the writes below do not fail on a fresh study directory.
STUDY_SIM_DIR.mkdir(parents=True, exist_ok=True)
for file in nexus_output_files:
    # The id is the part of the file stem before the first underscore
    # (e.g. "nex-<n>_<suffix>" -> "nex-<n>"). It is used for both the column
    # value and the output file name, so each nexus gets its own parquet file
    # rather than all files sharing a suffix-derived name.
    nexus_id = file.stem.split("_")[0]

    # NOTE(review): header=1 combined with names= discards the first two
    # lines of the file — confirm the nexus CSVs really start with two
    # non-data lines; if they have no header row, header=None is needed.
    nex_out_df = pd.read_csv(file, header=1, names=["lead_time","value_time", "value"])
    nex_out_df["configuration"] = "AWI_03W_113060_001"
    nex_out_df["nexus_id"] = nexus_id
    nex_out_df["variable_name"] = "streamflow"
    nex_out_df["reference_time"] = ""
    nex_out_df["measurement_unit"] = "m^3/s"
    # Keep only the output columns, in a fixed order ("lead_time" is dropped).
    nex_out_df = nex_out_df[["reference_time", "nexus_id", "value_time", "value", "variable_name", "measurement_unit", "configuration"]]
    nex_out_df.to_parquet(Path(STUDY_SIM_DIR, f"{nexus_id}.parquet"))