In [6]:
# Import the required packages.

# Run this if TEEHR is not installed
# import sys
# sys.path.insert(0, "../../src")

import os

import teehr.loading.nwm.nwm_points as tlp

from pathlib import Path
from dask.distributed import Client

In [7]:
# Notebook settings that point to the relevant study files.
# (Please refer to the ../loading/point_config_models.py file for all configuration options.)

# NWM configuration to ingest, e.g. analysis_assim, short_range,
# analysis_assim_hawaii, medium_range_mem1
CONFIGURATION = "short_range"
OUTPUT_TYPE = "channel_rt"
VARIABLE_NAME = "streamflow"

# Only used if an assimilation run is selected
T_MINUS = [0, 1, 2]

# Currently accepts "nwm22" or "nwm30".
# Use "nwm22" for dates prior to 09-19-2023.
NWM_VERSION = "nwm22"

# Specifies the remote location from which to fetch the data
# ("GCS", "NOMADS", "DSTOR")
DATA_SOURCE = "GCS"

# When data_source = "GCS", specifies the preference in creating Kerchunk reference json files:
#   "create"        - always create new json files from netcdf files in GCS and save locally
#   "use_available" - read the CIROH pre-generated jsons from s3, ignoring any that are unavailable
#   "auto"          - read the CIROH pre-generated jsons from s3, and create any that are
#                     unavailable, storing locally
KERCHUNK_METHOD = "auto"

# If True, NWM files will be processed by z-hour per day. If False, files will be
# processed in chunks (defined by STEPSIZE). This can help if you want to read many
# reaches at once (all ~2.7 million for medium range for example).
PROCESS_BY_Z_HOUR = True

# Only used if PROCESS_BY_Z_HOUR = False. Controls how many files are processed in
# memory at once. Higher values can increase performance at the expense of memory
# (default value: 100).
STEPSIZE = 100

# If True, the missing file(s) will be skipped and the process will resume.
# If False, TEEHR will fail if a missing NWM file is encountered.
IGNORE_MISSING_FILE = True

# If True (default), existing output files will be overwritten.
# If False, existing files are retained.
OVERWRITE_OUTPUT = True

START_DATE = "2023-03-18"
INGEST_DAYS = 1

# All outputs are written beneath a "temp" folder in the user's home directory.
OUTPUT_ROOT = Path.home() / "temp"
JSON_DIR = OUTPUT_ROOT / "zarr" / CONFIGURATION
OUTPUT_DIR = OUTPUT_ROOT / "timeseries" / CONFIGURATION

# For this simple example, we'll get data for 10 NWM reaches that coincide with USGS gauges
LOCATION_IDS = [
    7086109,
    7040481,
    7053819,
    7111205,
    7110249,
    14299781,
    14251875,
    14267476,
    7152082,
    14828145,
]

In [8]:
# Start a local Dask cluster, leaving one CPU free for the notebook kernel.
# os.cpu_count() returns None when the CPU count cannot be determined, so fall
# back to 1 before subtracting; at least one worker is always requested.
n_workers = max((os.cpu_count() or 1) - 1, 1)
client = Client(n_workers=n_workers)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36847 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:36847/status,

0,1
Dashboard: http://127.0.0.1:36847/status,Workers: 7
Total threads: 14,Total memory: 19.52 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44565,Workers: 7
Dashboard: http://127.0.0.1:36847/status,Total threads: 14
Started: Just now,Total memory: 19.52 GiB

0,1
Comm: tcp://127.0.0.1:45459,Total threads: 2
Dashboard: http://127.0.0.1:35913/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:36379,
Local directory: /tmp/dask-scratch-space/worker-y6nrddpb,Local directory: /tmp/dask-scratch-space/worker-y6nrddpb

0,1
Comm: tcp://127.0.0.1:36969,Total threads: 2
Dashboard: http://127.0.0.1:44813/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:33983,
Local directory: /tmp/dask-scratch-space/worker-e5_5n9z7,Local directory: /tmp/dask-scratch-space/worker-e5_5n9z7

0,1
Comm: tcp://127.0.0.1:39603,Total threads: 2
Dashboard: http://127.0.0.1:42777/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:36691,
Local directory: /tmp/dask-scratch-space/worker-3cspbkxz,Local directory: /tmp/dask-scratch-space/worker-3cspbkxz

0,1
Comm: tcp://127.0.0.1:35583,Total threads: 2
Dashboard: http://127.0.0.1:43295/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:34721,
Local directory: /tmp/dask-scratch-space/worker-l4l1a5t0,Local directory: /tmp/dask-scratch-space/worker-l4l1a5t0

0,1
Comm: tcp://127.0.0.1:42763,Total threads: 2
Dashboard: http://127.0.0.1:36529/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:39937,
Local directory: /tmp/dask-scratch-space/worker-dqduvw15,Local directory: /tmp/dask-scratch-space/worker-dqduvw15

0,1
Comm: tcp://127.0.0.1:43277,Total threads: 2
Dashboard: http://127.0.0.1:37085/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:41631,
Local directory: /tmp/dask-scratch-space/worker-m43umlr1,Local directory: /tmp/dask-scratch-space/worker-m43umlr1

0,1
Comm: tcp://127.0.0.1:37853,Total threads: 2
Dashboard: http://127.0.0.1:43941/status,Memory: 2.79 GiB
Nanny: tcp://127.0.0.1:35859,
Local directory: /tmp/dask-scratch-space/worker-825vr78u,Local directory: /tmp/dask-scratch-space/worker-825vr78u


In [9]:
%%time
tlp.nwm_to_parquet(
    CONFIGURATION,
    OUTPUT_TYPE,
    VARIABLE_NAME,
    START_DATE,
    INGEST_DAYS,
    LOCATION_IDS,
    JSON_DIR,
    OUTPUT_DIR,
    T_MINUS,
    PROCESS_BY_Z_HOUR,
    STEPSIZE,
    IGNORE_MISSING_FILE,
    OVERWRITE_OUTPUT,
)

Overwriting 20230318T00Z.parquet
Overwriting 20230318T01Z.parquet
Overwriting 20230318T02Z.parquet
Overwriting 20230318T03Z.parquet
Overwriting 20230318T04Z.parquet
Overwriting 20230318T05Z.parquet
Overwriting 20230318T06Z.parquet
Overwriting 20230318T07Z.parquet
Overwriting 20230318T08Z.parquet
Overwriting 20230318T09Z.parquet
Overwriting 20230318T10Z.parquet
Overwriting 20230318T11Z.parquet
Overwriting 20230318T12Z.parquet
Overwriting 20230318T13Z.parquet
Overwriting 20230318T14Z.parquet
Overwriting 20230318T15Z.parquet
Overwriting 20230318T16Z.parquet
Overwriting 20230318T17Z.parquet
Overwriting 20230318T18Z.parquet
Overwriting 20230318T19Z.parquet
Overwriting 20230318T20Z.parquet
Overwriting 20230318T21Z.parquet
Overwriting 20230318T22Z.parquet
Overwriting 20230318T23Z.parquet
CPU times: user 13.1 s, sys: 1.49 s, total: 14.6 s
Wall time: 1min 41s
