## Explore downloading and formatting operational NWM streamflow forecasts from GCS

##### This notebook demonstrates a utility that downloads NWM streamflow forecasts from the GCS bucket and formats them according to a standardized data model

In [None]:
from nwm_to_parquet_sam import nwm_to_parquet

##### To download the data, you must first specify a number of parameters:
`gcs_dir` (str) - GCS directory of the operational NWM data

`configuration` (str) - NWM forecast category

`component` (str) - Component of the NWM forecast category

`ingest_days` (int) - Number of days to ingest, beginning from the start date

`start_date` (str) - Beginning date defining data ingestion time period

`variable_name` (str) - Name of the NWM data variable to download

`multifile_filepath` (str) - Name of the kerchunk combined reference file

`json_dir` (str) - Path to the directory for the kerchunk reference files

`output_parquet_dir` (str) - Path to the directory for the final parquet files

`location_ids` (np.array) - Array specifying NWM IDs of interest

In [None]:
# Source bucket and the NWM forecast (category, component, variable) to pull
gcs_dir = "gcs://national-water-model"
configuration = "medium_range_mem1"
component = "channel_rt"
variable_name = "streamflow"

# Ingestion window: `ingest_days` day(s) beginning at `start_date`
start_date = "2022-12-18"
ingest_days = 1

# Kerchunk reference files: directory for the reference files and the name
# of the combined reference file
json_dir = "jsons/"
multifile_filepath = "nwm.json"

# Directory that receives the final parquet files
output_parquet_dir = "/home/jovyan/temp/parquet"

##### For this demo, we'll get the `location_ids` from an existing route_link crosswalk table and use only those NWM IDs that coincide with a USGS gage

In [None]:
import pandas as pd
import numpy as np

route_link_filename = "/home/jovyan/shared/rti-eval/nwm/route_link_conus.parquet"

# Load the route link crosswalk, treat empty strings as missing values, and
# keep only the rows that have a USGS gage ID
df_route_link = (
    pd.read_parquet(route_link_filename)
    .replace('', np.nan)
    .dropna(subset=["gage_id"])
)

# NWM feature IDs of the remaining (gaged) rows, cast to float, used to
# select locations
location_ids = df_route_link["nwm_feature_id"].values.astype(float)

##### Now we can get the data. We can take advantage of Dask to improve performance

In [None]:
from dask.distributed import Client
# Create a Dask distributed client with default settings.
# NOTE(review): with no arguments this presumably starts a local cluster —
# confirm against the environment this notebook is run in.
client = Client()
# Bare expression so the notebook displays the client
client

In [None]:
%%time
# Run the download/format utility. Arguments are passed positionally, in the
# same order the parameters are described earlier in this notebook.
nwm_to_parquet(
    gcs_dir,
    configuration,
    component,
    ingest_days,
    start_date,
    variable_name,
    multifile_filepath,
    json_dir,
    output_parquet_dir,
    location_ids,
)

##### Let's take a look at the output files

In [None]:
# Read one of the output files back from the configured output directory.
# The directory comes from `output_parquet_dir` instead of a repeated literal,
# so this cell keeps working if the configuration is changed.
# NOTE(review): the file name appears tied to `start_date` and a 06Z
# reference time — confirm the naming scheme used by nwm_to_parquet.
df_nwm = pd.read_parquet(f"{output_parquet_dir.rstrip('/')}/20221218T06Z.parquet")
# Bare expression so the notebook renders the DataFrame
df_nwm