In [None]:
import os
import shutil
import fsspec
import ujson
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
from datetime import datetime, timedelta
import pandas as pd
import pickle

In [None]:
from dask.distributed import Client, LocalCluster, progress

# Start a local Dask cluster with default settings and attach a client to it.
cluster = LocalCluster()
client = Client(address=cluster)
# Bare expression so the notebook renders the client summary.
client

In [None]:
# Put the project directories at the front of sys.path so their modules can be
# imported from this notebook. Each entry is inserted at position 0 in turn, so
# the last directory listed ends up first on the search path.
import sys
for project_dir in (
    '../../evaluation',
    '../../evaluation/queries',
    '../../evaluation/loading',
):
    sys.path.insert(0, project_dir)

In [None]:
# Query some forecast data from parquet files
# This cell imports the helper modules used below and reloads each one right
# after importing, so that edits made to the module files are picked up without
# restarting the kernel.
# NOTE(review): these are presumably the project modules found through the
# sys.path entries added earlier in the notebook -- confirm.
import importlib
import queries
import config
import utils as hu
importlib.reload(queries)
importlib.reload(config)
importlib.reload(hu)
# grid_to_parquet is imported (and reloaded) only after the three modules above
# have been reloaded; keep this order.
import grid_to_parquet
importlib.reload(grid_to_parquet)
# NOTE(review): same names as the datetime import in the first cell.
from datetime import datetime, timedelta

In [None]:
# Run criteria
ingest_days = 30

# Reference time of the first forecast to process: 2022-12-18, hour 6.
# NOTE(review): the first forecast of a date is reportedly at 00Z, yet hour 6
# is used here -- confirm this is the intended start.
start_dt = datetime(year=2022, month=12, day=18, hour=6)

# Step between consecutive forecast reference times.
td = timedelta(hours=6)

# Only a single forecast is processed; the commented-out alternative was
# ingest_days * 4.
number_of_forecasts = 1

In [None]:
# Anonymous (unauthenticated) filesystem handle for the 'gcs' protocol.
fs = fsspec.filesystem(protocol='gcs', anon=True)
# Filesystem for the empty protocol string.
# NOTE(review): presumably resolves to fsspec's default local filesystem -- confirm.
fs2 = fsspec.filesystem(protocol='')

In [None]:
# Directory that receives the generated reference JSON files. The trailing
# slash matters: output paths are built by plain string concatenation.
json_dir = 'forcing_jsons/'

# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(json_dir, exist_ok=True)

In [None]:
# Keyword arguments passed to fs.open() for every source file.
# default_fill_cache=False avoids caching data in between file chunks, which
# lowers memory usage.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
def gen_json(u):
    """Write a kerchunk reference JSON for one remote HDF5 file.

    The file at ``u`` is opened through the notebook-level ``fs`` filesystem
    with the ``so`` options, translated with ``SingleHdf5ToZarr`` and the
    resulting references are saved under ``json_dir``.

    Parameters
    ----------
    u : str
        URL of the source file. The output name is built from components 3 and
        5 of ``u.split('/')``.
        NOTE(review): this presumably matches URLs shaped like
        ``gcs://<bucket>/<date folder>/<configuration>/<file name>`` -- confirm
        before using other layouts.

    Returns
    -------
    str
        Path of the JSON file that was written.
    """
    p = u.split('/')
    date = p[3]
    fname = p[5]
    outf = f'{json_dir}{date}.{fname}.json'

    # Build and serialise the references while the remote file is still open,
    # and before touching the output file: if translation fails (or is retried)
    # no empty or truncated JSON is left behind.
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        payload = ujson.dumps(h5chunks.translate()).encode()

    with open(outf, 'wb') as f:
        f.write(payload)

    # Returning the path gives the caller something useful to collect.
    return outf

In [None]:
%%time
print(datetime.now())

# The same configuration is requested for every forecast in the run.
configuration = "forcing_medium_range"

# For each forecast: list its forcing blobs, then build one reference JSON per
# blob in parallel through dask.
for forecast_idx in range(number_of_forecasts):
    reference_time = start_dt + forecast_idx * td
    ref_time_str = f"{reference_time:%Y%m%dT%HZ}"

    print(f"Start download of {ref_time_str}")

    blob_names = grid_to_parquet.list_blobs_forcing(
        configuration=configuration,
        reference_time=ref_time_str,
        must_contain="forcing",
    )

    # Turn the listed blob names into full URLs.
    blob_list = [f"gcs://national-water-model/{b}" for b in blob_names]

    # One delayed gen_json call per URL; failed tasks are retried up to 10 times.
    tasks = [dask.delayed(gen_json)(url) for url in blob_list]
    results = dask.compute(*tasks, retries=10)
    

In [None]:
# Reference JSON to open. This is a fixed path to a single file; it is not
# derived from the run criteria or from json_dir.
reference_json = 'forcing_jsons/nwm.20221218.nwm.t06z.medium_range.forcing.f001.conus.nc.json'

# Options for the zarr backend: read through the "reference://" filesystem,
# with the referenced data fetched anonymously over the "gcs" protocol.
backend_args = dict(
    consolidated=False,
    storage_options=dict(
        fo=reference_json,
        remote_protocol="gcs",
        remote_options=dict(anon=True),
    ),
)

ds = xr.open_dataset("reference://", engine="zarr", backend_kwargs=backend_args)

In [None]:
%%time
# Select the RAINRATE variable and ask for it to be persisted.
# NOTE(review): ds is opened without a `chunks=` argument, so this variable may
# not be dask-backed and persist() may not load anything here -- confirm.
rainrate = ds["RAINRATE"]
src = rainrate.persist()

In [None]:
%%time
# Compute zonal statistics of the selected variable using the weights file
# named in the project config.
weights_path = config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH
df = grid_to_parquet.calc_zonal_stats_weights(src=src, weights_filepath=weights_path)
# Bare expression so the notebook renders the result.
df