# CNAPS ReferenceFileSystem JSON 64 bit
Create ReferenceFileSystem JSON file for a collection of COAWST NetCDF 3 64 bit files on S3 

In [None]:
import os
import fsspec
import ujson   # fast json
from kerchunk.netCDF3 import NetCDF3ToZarr
from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad
from pathlib import Path
import xarray as xr
import cf_xarray
import dask
import hvplot.xarray

In [None]:
# Local-filesystem fsspec interface; used below for the glob calls that
# locate the NetCDF inputs and any existing reference JSONs.
fs_read = fsspec.filesystem('file')

In [None]:
# Collect the history files to be referenced. Each `?` matches exactly one
# character, so only names of the form his_<8 characters>.nc are selected
# (presumably a YYYYMMDD date stamp -- confirm against the directory listing).
nc_list = fs_read.glob('/shared/users/rsignell/data/jzambon/nc64/his_????????.nc')
nc_list

In [None]:
# try opening a file:
# xr.open_dataset(fs_read.open(nc_list[0]))    # netCDF4 files can be opened directly from a bucket, but not NetCDF3 files

In [None]:
# Show the first and last input file as a quick sanity check of the glob.
# Guard against an empty result so this cell reports the problem instead of
# failing with an IndexError.
if nc_list:
    print(nc_list[0])
    print(nc_list[-1])
else:
    print('No NetCDF files matched the glob pattern')

In [None]:
json_dir = '/shared/users/rsignell/data/jzambon/jsons/'

In [None]:
# List any reference JSON files that already exist in json_dir and report
# how many there are, plus the first and last path when the list is non-empty.
json_list = fs_read.glob(f'{json_dir}*.json')
print(len(json_list))
if json_list:
    print(json_list[0], json_list[-1], sep='\n')

In [None]:
# Map each existing reference JSON back to the NetCDF file it describes so the
# result can be subtracted from nc_list.  The NetCDF inputs live in the `nc64`
# sibling directory of `jsons`, so the directory component must be swapped
# (not just removed) for the paths to compare equal to entries in nc_list.
# Assumes each JSON is named `<netcdf filename>.json` -- confirm against the
# code that writes the per-file JSONs.
nc_processed_list = [
    j.split('.json')[0].replace('/jsons', '/nc64') for j in json_list
]
if nc_processed_list:
    print(nc_processed_list[0])
    print(nc_processed_list[-1])

In [None]:
# Files still to be processed: every input that has no reference JSON yet.
# Sorted so that the downstream per-file references are generated in
# filename order.
nc_process_list = sorted(set(nc_list) - set(nc_processed_list))
print(len(nc_process_list))
# Only index into the list when there is something left to process;
# otherwise [0] / [-1] would raise IndexError.
if nc_process_list:
    print(nc_process_list[0])
    print(nc_process_list[-1])

Generate references in memory, returning a dict

In [None]:
def gen_ref(f):
    """Build the kerchunk references for a single NetCDF3 file.

    Parameters
    ----------
    f : str
        Path of the NetCDF3 file to scan.

    Returns
    -------
    The object produced by ``NetCDF3ToZarr.translate()`` (the in-memory
    reference set for ``f``).
    """
    # NOTE(review): version=2 presumably selects the 64-bit-offset NetCDF3
    # variant for the underlying reader -- confirm against the kerchunk docs.
    translator = NetCDF3ToZarr(f, inline_threshold=300, version=2)
    return translator.translate()

Test one file:

In [None]:
d = gen_ref(nc_process_list[0])

In [None]:
# Expose the single-file reference set `d` as a virtual filesystem.
# skip_instance_cache=True asks fsspec for a fresh instance rather than a
# cached one.
fs5 = fsspec.filesystem("reference", fo=d, skip_instance_cache=True)
# Key/value view of the filesystem root, suitable for the zarr engine.
m = fs5.get_mapper("")

In [None]:
# Open the referenced file lazily through the zarr engine. The reference
# set carries no consolidated metadata, hence consolidated=False.
ds = xr.open_dataset(
    m,
    engine="zarr",
    chunks={},
    backend_kwargs={"consolidated": False},
)

In [None]:
ds

In [None]:
#ds.Hwave.hvplot.quadmesh(x='lon_rho', y='lat_rho', rasterize=True, geo=True, cmap='turbo', clim=(0,5))

#### Parallel creation of JSON for each file using Dask Bag

Use a local cluster because the files are local; Dask Gateway workers can't see the local filesystem.

In [None]:
from dask.distributed import Client

In [None]:
client = Client()

In [None]:
client

In [None]:
nc_process_list

In [None]:
import dask.bag as db
from dask.distributed import progress

# One bag element per file still to be processed, split over 4 partitions;
# gen_ref is applied lazily to each element.
bag = (
    db.from_sequence(nc_process_list, npartitions=4)
    .map(gen_ref)
)

In [None]:
bag.visualize()

In [None]:
# Start the computation on the cluster and keep the results there.
bag = bag.persist()
# Display a progress bar for the persisted computation.
progress(bag)

In [None]:
# Gather the per-file results of gen_ref into this process.
dicts = bag.compute()

In [None]:
import zarr

def modify_fill_value(out):
    """Set the fill value of the ``lon`` and ``lat`` arrays to -999.

    ``out`` is opened with ``zarr.open`` and modified through that handle;
    the same ``out`` object is returned so the function can be used as a
    pass-through hook.
    """
    root = zarr.open(out)
    for coord_name in ('lon', 'lat'):
        getattr(root, coord_name).fill_value = -999
    return out

def modify_metadata(out):
    """Patch array metadata in the store ``out`` and return ``out``.

    * Every array in the root group that has more than one dimension and
      dtype ``'>f4'`` gets ``fill_value`` set to ``1.0e+37``
      (NOTE(review): presumably the model's missing-value sentinel --
      confirm).
    * ``ocean_time`` gets the attribute ``standard_name = 'time'``.
    """
    root = zarr.open(out)
    for name, arr in root.arrays():
        # Skip 0-D/1-D arrays and anything that is not big-endian float32.
        if len(arr.shape) <= 1 or arr.dtype != '>f4':
            continue
        root[name].fill_value = 1.0e+37
    root.ocean_time.attrs['standard_name'] = 'time'
    return out
    
def postprocess(out):
    """Apply the metadata fixes from ``modify_metadata`` to ``out`` and return the result."""
    return modify_metadata(out)

In [None]:
# Combine the per-file reference sets along ocean_time. The lat/lon
# coordinate arrays of the four grids (psi, rho, u, v) are declared identical
# across inputs, and `postprocess` is run on the combined output.
mzz = MultiZarrToZarr(
    dicts,
    concat_dims='ocean_time',
    inline_threshold=500,
    # Expands to lat_psi, lat_rho, lat_u, lat_v, lon_psi, lon_rho, lon_u, lon_v.
    identical_dims=[
        f'{axis}_{grid}'
        for axis in ('lat', 'lon')
        for grid in ('psi', 'rho', 'u', 'v')
    ],
    postprocess=postprocess,
)

In [None]:
%time mzz.translate('./combined64.json')

Examine the resulting dataset

In [None]:
# Expose the references stored in combined64.json as a virtual filesystem.
# skip_instance_cache=True asks fsspec for a fresh instance rather than a
# cached one.
fs5 = fsspec.filesystem("reference", fo='combined64.json', skip_instance_cache=True)
# Key/value view of the filesystem root, suitable for the zarr engine.
m = fs5.get_mapper("")

In [None]:
# Open the combined dataset lazily through the zarr engine. The reference
# file carries no consolidated metadata, hence consolidated=False.
ds = xr.open_dataset(
    m,
    engine="zarr",
    chunks={},
    backend_kwargs={"consolidated": False},
)

In [None]:
ds

In [None]:
ds['Hwave'][:,200,200].hvplot(x='ocean_time', grid=True)

In [None]:
ds['Hwave'][0,:,:].plot()

In [None]:
client.close()