# CNAPS ReferenceFileSystem JSON 64 bit
Create a ReferenceFileSystem JSON file for a collection of COAWST NetCDF 3 64-bit files on S3

In [None]:
import os
import fsspec
import ujson   # fast json
from kerchunk.netCDF3 import NetCDF3ToZarr
from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad
from pathlib import Path
import xarray as xr
import cf_xarray
import dask
import hvplot.xarray

In [None]:
fs_read = fsspec.filesystem('file')

In [None]:
nc_list = fs_read.glob('/shared/users/rsignell/data/jzambon/nc64/his_????????.nc')
nc_list

In [None]:
# try opening a file:
# xr.open_dataset(fs_read.open(nc_list[0]))    # netCDF4 files can be opened directly from a bucket, but not NetCDF3 files

In [None]:
print(nc_list[0])
print(nc_list[-1])

In [None]:
json_dir = '/shared/users/rsignell/data/jzambon/jsons/'

In [None]:
json_list = fs_read.glob(f'{json_dir}*.json')
print(len(json_list))
if len(json_list)>0:
    print(json_list[0])
    print(json_list[-1])

In [None]:
# Map each existing JSON path back to a source-file path: keep the text before
# '.json' and remove the '/jsons' directory component.
# NOTE(review): the resulting paths contain no '/nc64' component, whereas every
# entry of nc_list is globbed from '.../jzambon/nc64/', so these paths cannot
# equal any nc_list entry and the set difference computed afterwards removes
# nothing -- confirm whether '/jsons' was meant to become '/nc64'.
nc_processed_list = [j.split('.json')[0].replace('/jsons','') for j in json_list]
if len(nc_processed_list)>0:
    # Show the first and last mapped paths as a sanity check.
    print(nc_processed_list[0])
    print(nc_processed_list[-1])

In [None]:
# Files still to be processed: everything in nc_list that is not already in
# nc_processed_list.  A bare set difference has arbitrary order, so sort it to
# make the run reproducible and the first/last printout meaningful.
nc_process_list = sorted(set(nc_list) - set(nc_processed_list))
print(len(nc_process_list))
# Indexing an empty list raises IndexError; only show first/last when there is
# at least one file left to process.
if nc_process_list:
    print(nc_process_list[0])
    print(nc_process_list[-1])

Generate references in memory, returning a dict

In [None]:
def gen_ref(f):
    """Generate kerchunk references for a single NetCDF3 file, in memory.

    Parameters
    ----------
    f : str
        Path of the NetCDF3 file to scan.

    Returns
    -------
    dict
        The reference set returned by ``NetCDF3ToZarr(...).translate()``.
    """
    # NetCDF3ToZarr takes the file *path* as its first argument and opens the
    # file itself; the (open_file, url) calling convention belongs to
    # SingleHdf5ToZarr.  Passing the path directly matches the single-file
    # test call used elsewhere in this notebook.
    return NetCDF3ToZarr(f, inline_threshold=300, version=2).translate()

Test one file:

In [None]:
d = NetCDF3ToZarr(nc_process_list[0], inline_threshold=300, version=2).translate()

In [None]:
skip_instance_cache=True

In [None]:
fs5 = fsspec.filesystem("reference", fo=d,
                       skip_instance_cache=True)
m = fs5.get_mapper("")

In [None]:
ds = xr.open_dataset(m, engine="zarr", chunks={}, 
                     backend_kwargs=dict(consolidated=False))

In [None]:
ds.Hwave.hvplot.quadmesh(x='lon_rho', y='lat_rho', rasterize=True, geo=True, cmap='turbo', clim=(0,5))

#### Parallel creation of JSON for each file using Dask Futures

Use local cluster as files are local -- Dask Gateway workers can't see local filesystem

In [None]:
from dask.distributed import Client

In [None]:
client = Client()

In [None]:
import dask.bag as db
bag = db.from_sequence(nc_process_list, npartitions=len(nc_process_list)).map(gen_ref)

In [None]:
bag.visualize()

In [None]:
%time dicts = bag.compute()

In [None]:
# Build one reference dict per input file (serial version of the Dask bag
# computation).  Each file must be translated from its own path `f`; indexing
# nc_process_list[0] here would yield N copies of the first file's references
# and the combined dataset would contain a single time step repeated.
# NOTE: this rebinds `d`, which previously held the single-file test result.
d = [
    NetCDF3ToZarr(f, inline_threshold=300, version=2).translate()
    for f in nc_process_list
]

In [None]:
import zarr

def modify_metadata(out):
    """Patch metadata of the reference set ``out`` and return it.

    Opens ``out`` with ``zarr.open``, sets ``fill_value`` to 1e37 on every
    array that has more than one dimension, and tags ``ocean_time`` with the
    attribute ``standard_name = 'time'``.  The same ``out`` object that was
    passed in is returned.
    """
    group = zarr.open(out)
    for _name, arr in group.arrays():
        # Leave 0-D and 1-D arrays untouched.
        if len(arr.shape) <= 1:
            continue
        arr.fill_value = 1e37
    group["ocean_time"].attrs["standard_name"] = "time"
    return out
    
def postprocess(out):
    """Post-processing hook: apply ``modify_metadata`` to ``out`` and return the result."""
    return modify_metadata(out)

In [None]:
len(d)

In [None]:
mzz = MultiZarrToZarr(
    d,
    concat_dims='ocean_time',
    inline_threshold=500,
    identical_dims=['lat_psi','lat_rho','lat_u','lat_v',
                                'lon_psi','lon_rho','lon_u','lon_v'],
    #postprocess=postprocess   
)

In [None]:
%time mzz.translate('./combined.json')

Update the json_list with the new files and convert to s3 urls

In [None]:
# Re-list the JSON files in json_dir, then prefix each path with 's3://'.
# NOTE(review): fs_read is the local 'file' filesystem and json_dir is an
# absolute path, so the prefixed strings look like 's3:///shared/...'.
# Presumably bucket-relative paths were intended -- confirm.  json_list is only
# referenced afterwards in commented-out code.
json_list = fs_read.glob(f'{json_dir}*.json')
json_list = [f's3://{j}' for j in json_list]

Examine the resulting dataset

In [None]:
fs5 = fsspec.filesystem("reference", fo='combined.json',
                       skip_instance_cache=True)
m = fs5.get_mapper("")

In [None]:
fs5.ls('temp')

In [None]:
ds = xr.open_dataset(m, engine="zarr", chunks={}, 
                     backend_kwargs=dict(consolidated=False))

In [None]:
ds

Use CF conventions to select times

In [None]:
ds.cf.isel(time=-1)

Write combined JSON to S3

In [None]:
combined_json = 's3://rsignellbucket1/cnaps/archive.json'

In [None]:
%%time
with fs_write.open(combined_json, 'wb') as f:
    f.write(ujson.dumps(d).encode());

In [None]:
fs_write.size(combined_json)/1e6  # combined JSON size in MB

In [None]:
fs_write.info(combined_json)

#### Try opening the consolidated JSON file from S3

In [None]:
# repeating lines from above for convenience in case notebook is started here:
combined_json = 's3://rsignellbucket1/cnaps/archive.json'
opts = dict(anon=True, client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'})

In [None]:
%%time
fs5 = fsspec.filesystem("reference", fo=combined_json, target_options=opts,
                       remote_protocol='s3', remote_options=opts,
                       skip_instance_cache=True)
m = fs5.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", chunks={}, 
                     backend_kwargs=dict(consolidated=False))

In [None]:
#fs5 = fsspec.filesystem("reference", fo=json_list[-1], target_options=opts,
#                       remote_protocol='s3', remote_options=opts)
#m = fs5.get_mapper("")
#ds = xr.open_dataset(m, engine="zarr", chunks={'ocean_time':12}, 
#                     backend_kwargs=dict(consolidated=False))

In [None]:
ds.salt

In [None]:
%%time
da = ds['temp'][-10:,-1,:,:].load()

In [None]:
lon_name = da.cf['longitude'].name
lat_name = da.cf['latitude'].name

da.hvplot.quadmesh(x=lon_name, y=lat_name, geo=True, cmap='turbo', tiles='OSM', rasterize=True)

In [None]:
da[:,150,150].hvplot(x='ocean_time', grid=True)

In [None]:
# Client() was created without an explicit cluster object, so there is no
# `cluster` name to shut down (referencing it raises NameError).  Closing the
# client also closes the local cluster that the client started for itself.
client.close()