In [None]:
import os
import shutil
import fsspec
import ujson
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
from datetime import datetime, timedelta

In [None]:
from dask.distributed import Client, LocalCluster, progress

# Start a local Dask cluster with default settings and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Last expression of the cell: display the client summary.
client

In [None]:
# adding project dirs to path so code may be referenced from the notebook
import sys

# Each entry is prepended, so the LAST path listed ends up first on sys.path
# (the same final order as three consecutive insert(0, ...) calls).
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
for _project_dir in (
    '../../evaluation',
    '../../evaluation/queries',
    '../../evaluation/loading',
):
    if _project_dir not in sys.path:
        sys.path.insert(0, _project_dir)


In [None]:
# Import the project helper modules and reload each of them, so that edits
# made to those modules on disk are picked up without restarting the kernel.
import importlib
import queries
import config
import utils
importlib.reload(queries)
importlib.reload(config)
importlib.reload(utils)
import grid_to_parquet
importlib.reload(grid_to_parquet)
from datetime import datetime, timedelta

In [None]:
# Setup some criteria
# ingest_days is only referenced by the commented-out expression on the
# number_of_forecasts line in the cells visible here.
ingest_days = 30
# NOTE(review): the trailing comment mentions 00Z but the hour passed is 6 — confirm which cycle is intended.
start_dt = datetime(2022, 12, 18, 6) # First one is at 00Z in date
# Step between consecutive forecast reference times (start_dt + td * f in the generation loop).
td = timedelta(hours=6)
# Number of reference times the generation loop iterates over.
number_of_forecasts = 1 #ingest_days * 4

In [None]:
# Google Cloud Storage filesystem opened anonymously (anon=True); used to read the source files.
fs = fsspec.filesystem('gcs', anon=True)
# Empty protocol string — presumably resolves to fsspec's local filesystem
# (it is used to glob the local JSON directory); confirm.
fs2 = fsspec.filesystem('')

In [None]:
# Directory (with trailing slash) where the per-file reference JSONs are written.
json_dir = 'jsons/'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(json_dir, exist_ok=True)

In [None]:
# Keyword arguments forwarded to fs.open() for every source file.
# default_fill_cache=False avoids caching data in between file chunks, which lowers memory usage.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
def gen_json(u):
    """Write a kerchunk reference JSON for one remote HDF5 file.

    Opens ``u`` through the notebook-level ``fs`` filesystem (using the ``so``
    open options), scans it with ``SingleHdf5ToZarr`` and writes the resulting
    references to ``{json_dir}{date}.{fname}.json``.

    Parameters
    ----------
    u : str
        URL of the source file. Elements 3 and 5 of ``u.split('/')`` are used
        as the date folder and the file name, which fits URLs shaped like
        ``gcs://<bucket>/<date>/<configuration>/<file>``.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``u`` has fewer than six ``/``-separated parts.
    """
    p = u.split('/')
    date = p[3]
    fname = p[5]
    outf = f'{json_dir}{date}.{fname}.json'

    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # Scan and serialise BEFORE the output file is created: if the scan
        # fails (and the task is retried), no empty or truncated JSON is left
        # behind for the later glob to pick up.
        payload = ujson.dumps(h5chunks.translate()).encode()

    with open(outf, 'wb') as f:
        f.write(payload)

In [None]:
%%time
print(datetime.now())
# Loop though forecasts, fetch and insert
for f in range(number_of_forecasts):
    reference_time = start_dt + td * f
    ref_time_str = reference_time.strftime("%Y%m%dT%HZ")
    configuration = "medium_range_mem1"

    print(f"Start download of {ref_time_str}")

    blob_list = grid_to_parquet.list_blobs_forcing(
        configuration=configuration,
        reference_time = ref_time_str,
        must_contain = "channel_rt"
    )
    
    blob_list = [f"gcs://national-water-model/{b}" for b in blob_list]
    
    results = dask.compute(*[dask.delayed(gen_json)(u) for u in blob_list], retries=10)
    

In [None]:
# Collect the reference JSONs written for the forecast cycle starting at
# start_dt. The pattern is derived from start_dt (instead of a hard-coded
# date/cycle) so it stays in sync with the criteria cell, and it is joined with
# os.path.join so json_dir's trailing slash does not produce "jsons//...".
json_pattern = os.path.join(
    json_dir, f'nwm.{start_dt:%Y%m%d}.nwm.t{start_dt:%H}z*.json'
)
# Plain lexicographic sort of the matched paths.
json_list = sorted(fs2.glob(json_pattern))
# Fail here with a clear message rather than later inside the combine step.
assert json_list, f'no reference JSONs match {json_pattern!r}; run the generation cell first'

In [None]:
# Set up the combination of the per-file reference JSONs, concatenating along
# 'time'. remote_protocol/remote_options describe how the referenced source
# files are accessed (anonymous GCS).
# NOTE(review): identical_dims lists 'x' and 'y' — confirm these dimensions
# actually exist in the channel_rt files.
mzz = MultiZarrToZarr(json_list,
        remote_protocol='gcs',
        remote_options={'anon':True},
        concat_dims=['time'],
        identical_dims = ['x', 'y'],
    )

In [None]:
%%time
# Run the combine step, passing 'nwm.json' as the output file name (the same
# name the dataset-opening cell passes as "fo").
mzz.translate('nwm.json')

In [None]:
# Open the combined reference file through fsspec's "reference://" filesystem
# with the zarr engine. "fo" names the reference JSON; remote_protocol and
# remote_options describe how the referenced source files are read.
backend_args = {
    "consolidated": False,
    "storage_options": {
        "fo": 'nwm.json',
        "remote_protocol": "gcs",
        "remote_options": {'anon': True},
    },
}
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs=backend_args,
    # chunks={} makes xarray load the variables as Dask arrays using the
    # engine's preferred chunking. Without it the dataset is not Dask-backed,
    # so a later .persist() is a no-op and reductions run eagerly in the
    # notebook kernel instead of on the Dask cluster.
    chunks={},
)

In [None]:
# Display the dataset summary as the cell output.
ds

In [None]:
# Load the route-link table from the parquet path set in the project config
# (presumably a GeoDataFrame, judging by the helper's name — confirm).
rl_gdf = utils.parquet_to_gdf(config.ROUTE_LINK_PARQUET)
# Display it as the cell output.
rl_gdf

In [None]:
# Variable to summarise.
var = 'streamflow'
# Mean of that variable over the 'time' dimension.
# NOTE(review): persist() only has an effect when ds is backed by Dask arrays —
# confirm how ds was opened.
var_mean = ds[var].mean(dim=['time']).persist()

In [None]:
# Convert the time-averaged DataArray to a pandas DataFrame.
df = var_mean.to_dataframe()

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# Display the route-link table again (it is also shown by the cell that loads it).
rl_gdf

In [None]:
# Move the index levels into regular columns so the merge can match on
# "feature_id". The frame is rebuilt from var_mean instead of being mutated
# in place, so re-running this cell always gives the same result (an in-place
# reset_index adds 'index'/'level_0' columns on repeated runs and then raises).
df = var_mean.to_dataframe().reset_index()

In [None]:
# Join the mean values onto the route-link rows, matching df's "feature_id"
# to rl_gdf's "nwm_feature_id" (merge's default join type, i.e. inner).
df2 = df.merge(rl_gdf, left_on="feature_id", right_on="nwm_feature_id")

In [None]:
# Display the merged table as the cell output.
df2

In [None]:
import numpy as np
import hvplot.pandas
import hvplot.xarray
import geoviews as gv
from holoviews.operation.datashader import rasterize
import cartopy.crs as ccrs

In [None]:
# Point plot of the merged table: x='longitude', y='latitude', coloured by the
# 'streamflow' column with the viridis colormap; the coordinates are declared
# as PlateCarree via crs.
p = df2.hvplot.points('longitude', 'latitude', crs=ccrs.PlateCarree(),
                     c='streamflow', colorbar=True, size=14, cmap='viridis')

In [None]:
# Rasterize the point plot with a mean aggregator, then apply display options
# (hover tool, equal aspect, log colour scale, viridis).
# NOTE(review): clim=(1e-2, np.nan) presumably pins only the lower colour limit
# and leaves the upper one automatic — confirm against the HoloViews docs.
g = rasterize(p, aggregator='mean', x_sampling=0.02, y_sampling=0.02, width=500).opts(tools=['hover'],
                aspect='equal', logz=True, cmap='viridis', clim=(1e-2, np.nan))

In [None]:
# Overlay the rasterized layer with the OpenStreetMap tile source and display it.
g * gv.tile_sources.OSM