# Combine yearly CNAPS zarr datasets with kerchunk, save as Parquet

In [None]:
import fsspec
import numpy as np
import zarr
import xarray as xr

import kerchunk.combine
import kerchunk.zarr

from kerchunk.combine import MultiZarrToZarr
from fsspec.implementations.reference import LazyReferenceMapper

In [None]:
# Anonymous S3 filesystem pointed at the OSN endpoint; used below to list the
# source Zarr datasets and to obtain mappers for them.
fs = fsspec.filesystem(
    's3',
    anon=True,
    client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'},
)

#### List all Zarr datasets

In [None]:
# Every path under the xbitinfo-zarr prefix that matches useast_avg_*_999.zarr.
flist = fs.glob('s3://rsignellbucket1/rsignell/CNAPS/xbitinfo-zarr/useast_avg_*_999.zarr')

#### Create references for each zarr dataset
(all datasets in `flist` are processed; slice it, e.g. `flist[:3]`, to try just the first three)

In [None]:
# Anonymous-access S3 settings for the OSN endpoint.
opts = {
    'anon': True,
    'client_kwargs': {'endpoint_url': 'https://mghp.osn.xsede.org'},
}
# Storage options for opening references: the same S3 settings are supplied
# both as the remote options and as the target options.
so = {'remote_protocol': 's3', 'remote_options': opts, 'target_options': opts}

In [None]:
%%time
# Build one set of references per Zarr dataset listed in `flist`.
ref_list = [
    kerchunk.zarr.single_zarr(
        fs.get_mapper(zarr_path),
        storage_options=so,
        inline_threshold=300,
    )
    for zarr_path in flist
]

#### Open one set of references and determine the identical dims

In [None]:
# Open the first set of references as a dataset so its variables' dimensions
# can be inspected.
ds = xr.open_dataset(
    ref_list[0],
    engine="kerchunk",
    storage_options=so,
    chunks={},
)

In [None]:
# Names of all variables in `ds` that do not have an 'ocean_time' dimension.
identical_dims = [
    name
    for name in ds.variables.keys()
    if 'ocean_time' not in ds[name].dims
]

In [None]:
import zarr

def modify_attrs(refs):
    """Set ``standard_name`` to ``'time'`` on the ``ocean_time`` member of *refs*.

    *refs* is opened with ``zarr.open`` and the attribute is assigned through
    that handle; the *refs* object that was passed in is returned.
    """
    root = zarr.open(refs)
    time_attrs = root.ocean_time.attrs
    time_attrs['standard_name'] = 'time'
    return refs

def postprocess(refs):
    """Post-processing hook: hand *refs* to ``modify_attrs`` and return its result."""
    return modify_attrs(refs)

def preprocess(refs):
    """Drop the ``dstart`` variable from a set of references.

    Reference keys are flat store paths (for example ``dstart/.zarray``,
    ``dstart/.zattrs`` or ``dstart/0``), so comparing a key with the bare
    name ``'dstart'`` alone never matches the variable's metadata or chunks.
    Every key that is exactly ``'dstart'`` or lives under the ``dstart/``
    prefix is removed. Keys of other variables whose names merely start with
    the same letters (e.g. ``dstart_x/...``) are left untouched.

    Parameters
    ----------
    refs : MutableMapping
        Mapping of reference keys to their entries. It is modified in place.

    Returns
    -------
    MutableMapping
        The same *refs* object, with the ``dstart`` entries removed.
    """
    variable = 'dstart'
    prefix = variable + '/'
    # Iterate over a snapshot of the keys because entries are removed in the loop.
    for key in list(refs):
        if key == variable or key.startswith(prefix):
            refs.pop(key)
    return refs

#### Create combined parquet references

In [None]:
# Local filesystem handle, plus the local directory that will hold the
# combined Parquet references (created if it does not exist yet).
fs_local = fsspec.filesystem("file")
combined_parquet = 'combined.parq'
fs_local.makedirs(combined_parquet, exist_ok=True)

In [None]:
# Reference mapper rooted at the local Parquet directory; it is handed to
# MultiZarrToZarr as the output target.
out = LazyReferenceMapper.create(
    combined_parquet,
    record_size=100000,
    fs=fs_local,
)

In [None]:
%%time
# Combine the per-dataset references along 'ocean_time', writing into `out`.
# The value returned by translate() is not needed, hence the throwaway name.
_ = MultiZarrToZarr(
    ref_list,
    out=out,
    concat_dims=["ocean_time"],
    coo_map={"ocean_time": "cf:ocean_time"},
    identical_dims=identical_dims,
    remote_protocol="s3",
    remote_options=opts,
    preprocess=preprocess,
    postprocess=postprocess,
).translate()
# Flush `out` once the combine step has finished.
out.flush()

In [None]:
# S3 destination to which the local combined Parquet references are uploaded.
combined_parquet_osn = 's3://rsignellbucket1/rsignell/CNAPS/combined.parq'

In [None]:
# Re-open the combined references from the local Parquet directory.
# NOTE(review): `combined_parquet` is a local path, yet `target_options`
# carries the S3 settings in `opts` -- confirm they are needed here.
ds = xr.open_dataset(
    combined_parquet,
    engine="kerchunk",
    chunks={},
    storage_options={
        'remote_protocol': 's3',
        'remote_options': opts,
        'target_options': opts,
        'lazy': True,
    },
)

In [None]:
# S3 filesystem for the same OSN endpoint, configured with the
# 'osn-rsignellbucket1' profile; used for the uploads that follow.
fs_write = fsspec.filesystem(
    's3',
    profile='osn-rsignellbucket1',
    client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'},
)

In [None]:
# Recursively upload the local Parquet reference directory to its S3 location.
_ = fs_write.upload(
    combined_parquet,
    combined_parquet_osn,
    recursive=True,
)

In [None]:
# Upload the local cnaps_intake.yml file to the CNAPS prefix on the bucket.
fs_write.upload(
    'cnaps_intake.yml',
    's3://rsignellbucket1/rsignell/CNAPS/cnaps_intake.yml',
)

In [None]:
# Look up the uploaded yml file's info on the bucket.
fs_write.info('s3://rsignellbucket1/rsignell/CNAPS/cnaps_intake.yml')