# Creating and Appending to an Icechunk Store with Virtual References
This notebook demonstrates how to create an icechunk store and then append to it.

See this blog post for more info: https://tom-nicholas.com/blog/2025/cloud-optimized-scientific-data

In [1]:
import os
import warnings

import fsspec
import icechunk
import virtualizarr
import xarray as xr
from dotenv import load_dotenv
from obstore.store import from_url
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import HDFParser
from virtualizarr.registry import ObjectStoreRegistry

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Record the library versions this notebook was executed with so the run
# can be reproduced. Both packages are imported in the imports cell at the
# top of the notebook.
print(icechunk.__version__)
print(virtualizarr.__version__)

1.1.9
2.1.1


In [3]:
# Load AWS credentials for Pangeo-EOSC storage as environment variables.
# The path is assembled with os.path.join instead of an f-string that reuses
# single quotes inside its replacement field: that form is only valid syntax
# on Python >= 3.12 and is a SyntaxError on older kernels.
_ = load_dotenv(os.path.join(os.environ['HOME'], 'dotenv', 'school_2025.env'))

# JupyterHub user name; used below to give each user their own store prefix.
username = os.environ['JUPYTERHUB_USER']

# Define storage
storage_endpoint = 'https://pangeo-eosc-minioapi.vm.fedcloud.eu'
data_bucket = 'rsignell4-protocoast'        # bucket holding the source NetCDF files
storage_bucket = 'protocoast-school-2025'   # bucket that will hold the Icechunk repository
storage_name = f'{username}-taranto'

### Create a list of files that will make up the virtual dataset
First we will handle the "nos" SHYFEM files

In [4]:
# Anonymous S3 filesystem pointed at the storage endpoint.
fs = fsspec.filesystem('s3', anon=True, endpoint_url=storage_endpoint)

# Find every "nos" NetCDF file under the per-forecast directories and turn
# the bucket-relative keys returned by glob into full s3:// URLs.
nos_pattern = f'{data_bucket}/full_dataset/shyfem/taranto/forecast/*/*nos*.nc'
flist = [f's3://{key}' for key in fs.glob(nos_pattern)]

In [5]:
print(len(flist))
print(flist[-1])

19
s3://rsignell4-protocoast/full_dataset/shyfem/taranto/forecast/20251001/taranto_nos_20251001_nc4.nc


### Try opening one of these NetCDF files

In [6]:
# Open the first file with plain xarray (through an fsspec file object) to
# inspect it before building virtual references.
# NOTE: the name `ds` is rebound later in the notebook to the merged virtual
# dataset, so re-running the inspection cells out of order shows other data.
ds = xr.open_dataset(fs.open(flist[0]))

In [7]:
ds['salinity'][:5,100,0].values

array([38.831997, 38.834663, 38.83676 , 38.834667, 38.83502 ],
      dtype=float32)

In [8]:
ds

### Define our Virtualizarr `Parser` and `ObjectStoreRegistry`

In [9]:
# Parser instance handed to every open_virtual_dataset call below.
parser = HDFParser()

# Object store for the source-data bucket. The region value is a placeholder
# and requests are sent without a signature.
bucket = f"s3://{data_bucket}"
store = from_url(
    bucket,
    endpoint=storage_endpoint,
    region="not-used",
    skip_signature=True,
)

# Registry mapping the bucket URL to the store that serves it.
registry = ObjectStoreRegistry({bucket: store})

## Create virtual datasets from each file with VirtualiZarr's `open_virtual_dataset`

In [15]:
%%time
# One virtual dataset per "nos" file. The final entry of the file list is
# deliberately left out of this initial batch. "time" is requested as a
# loadable variable; fix_ds later does arithmetic on its values.
nos_urls = flist[:-1]
ds_list = [
    open_virtual_dataset(url=nos_url, parser=parser, registry=registry, loadable_variables=["time"])
    for nos_url in nos_urls
]

CPU times: user 7.14 s, sys: 8.33 s, total: 15.5 s
Wall time: 27.8 s


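### "Fix" each dataset to match the requirements of the "Rolodex" FMRC (Forecast Model Run Collection) indexing
Each file is one forecast run: after the fix, `time` holds the forecast reference time of the run and `step` holds the forecast period (offset from the reference time).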

In [16]:
def fix_ds(ds: "xr.Dataset") -> "xr.Dataset":
    """Restructure one forecast-run dataset into (time, step) form.

    The incoming ``time`` variable/dimension is turned into a ``step``
    dimension whose coordinate is the offset from the first time value, and a
    scalar ``time`` coordinate holding that first time value is attached.

    Parameters
    ----------
    ds
        Dataset with a ``time`` variable and dimension, and the variables
        ``latitude``, ``longitude``, ``element_index``, ``topology`` and
        ``total_depth``.

    Returns
    -------
    A new dataset with:
      * dimension ``time`` renamed to ``step``;
      * coordinate ``step`` = time - time[0], tagged with
        ``standard_name="forecast_period"``;
      * scalar coordinate ``time`` = time[0], tagged with
        ``standard_name="forecast_reference_time"``;
      * the intermediate ``valid_time`` variable and its index removed;
      * the listed static variables promoted to coordinates.
    """
    # Move the original time values out of the way under the name
    # "valid_time" so that "time" can be reused for the reference time.
    ds = ds.rename_vars(time='valid_time')
    ds = ds.rename_dims(time='step')
    # Offset of every valid time from the first one in this file.
    step = (ds.valid_time - ds.valid_time[0]).assign_attrs({"standard_name": "forecast_period"})
    # First valid time of the file, used as the (scalar) reference time.
    time = ds.valid_time[0].assign_attrs({"standard_name": "forecast_reference_time"})
    ds = ds.assign_coords(step=step, time=time)
    # valid_time is now redundant (step + time carry the same information);
    # its index is dropped before the variable itself.
    ds = ds.drop_indexes("valid_time")
    ds = ds.drop_vars('valid_time')
    # Treat the mesh/geometry variables as coordinates rather than data.
    ds = ds.set_coords(['latitude', 'longitude', 'element_index', 'topology', 'total_depth'])
    return ds

In [17]:
ds_list = [fix_ds(ds) for ds in ds_list]

In [18]:
# Stack the per-run "nos" datasets along "time". Each dataset carries "time"
# as a scalar coordinate after fix_ds, so concatenation creates the axis.
concat_options = {
    "dim": "time",
    "coords": "minimal",
    "compat": "override",
    "combine_attrs": "override",
}
combined_nos = xr.concat(ds_list, **concat_options)

In [19]:
combined_nos

Now we handle the "ous" SHYFEM files

In [20]:
# Same listing as for the "nos" files, now for the "ous" files.
# NOTE: this rebinds `flist`, replacing the "nos" file list.
ous_pattern = f'{data_bucket}/full_dataset/shyfem/taranto/forecast/*/*ous*.nc'
flist = [f's3://{key}' for key in fs.glob(ous_pattern)]

In [21]:
%%time
# One virtual dataset per "ous" file, again leaving out the final entry of
# the file list. NOTE: this rebinds `ds_list`, replacing the "nos" datasets.
ous_urls = flist[:-1]
ds_list = [
    open_virtual_dataset(url=ous_url, parser=parser, registry=registry, loadable_variables=["time"])
    for ous_url in ous_urls
]

CPU times: user 7.66 s, sys: 9.32 s, total: 17 s
Wall time: 32.4 s


In [22]:
ds_list = [fix_ds(ds) for ds in ds_list]

In [23]:
# Stack the per-run "ous" datasets along "time", using the same options as
# for the "nos" datasets.
concat_options = {
    "dim": "time",
    "coords": "minimal",
    "compat": "override",
    "combine_attrs": "override",
}
combined_ous = xr.concat(ds_list, **concat_options)

### Now create a virtual dataset with the "nos" and "ous" datasets merged together

In [24]:
# Combine the "nos" and "ous" variables into a single virtual dataset.
# NOTE: this rebinds `ds`, which earlier held the single NetCDF file opened
# for inspection.
ds = xr.merge([combined_nos, combined_ous], compat='override')

In [25]:
ds

## Initialize the Icechunk Store
We need to configure the `virtual_chunk_container` and make sure the Icechunk container credentials allow anonymous access.
Details on this can be found [here](https://icechunk.io/en/stable/virtual/).

In [26]:
# Filesystem handle used to manage the bucket that holds the Icechunk
# repository. It is created with the same arguments as `fs` (anonymous
# access, same endpoint).
fs_school = fsspec.filesystem('s3', anon=True, endpoint_url=storage_endpoint)

In [27]:
# Remove any existing Icechunk storage with this name so that the
# repository can be created from scratch below.
# Use the same prefix as the storage.
old_store_path = f's3://{storage_bucket}/icechunk/{storage_name}'
try:
    fs_school.rm(old_store_path, recursive=True)
except FileNotFoundError:
    # Nothing to clean up: the store has not been created yet.
    pass
except Exception as err:
    # Keep the cleanup best-effort, but do not hide the failure. A store that
    # could not be removed (e.g. missing permissions) is the likely cause if
    # repository creation fails afterwards. print() is used rather than
    # warnings.warn because UserWarning is filtered out in the imports cell.
    print(f'Could not remove {old_store_path}: {err!r}')

In [28]:
fs_school.ls(f'{storage_bucket}')

['protocoast-school-2025/rsignell4']

In [29]:
# Location of the Icechunk repository itself: the per-user prefix inside the
# school bucket, reached anonymously through the Pangeo-EOSC endpoint.
storage = icechunk.s3_storage(
    endpoint_url=storage_endpoint,
    bucket=storage_bucket,
    prefix=f"icechunk/{storage_name}",
    region='not-used',   # N/A for Pangeo-EOSC bucket, but required param
    anonymous=True,
    force_path_style=True,
)

In [30]:
# Start from the default repository configuration.
config = icechunk.RepositoryConfig.default()

# Describe where the virtual chunks live: the source-data bucket, read
# anonymously from the same S3-compatible endpoint.
source_store = icechunk.s3_store(
    endpoint_url=storage_endpoint,
    region="not-used",
    anonymous=True,
    s3_compatible=True,
    force_path_style=True,
)
source_container = icechunk.VirtualChunkContainer(
    url_prefix=f"s3://{data_bucket}/",
    store=source_store,
)
config.set_virtual_chunk_container(source_container)

# Create the repository with that configuration and open a writable session
# on the "main" branch.
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")

## Write the virtual datasets to the icechunk store and commit

In [31]:
# Write the merged virtual dataset into the writable session's store.
# Nothing is committed until session.commit() is called.
ds.virtualize.to_icechunk(session.store)

In [32]:
# Commit the session; the returned value is displayed as the cell output.
session.commit("all but last day of Taranto data")

'X5F5SCEW7XCQYDFNXPSG'

## Check the icechunk dataset

In [33]:
# Anonymous credentials for the virtual chunk container, keyed by the same
# URL prefix that was registered in the repository configuration.
container_prefix = f"s3://{data_bucket}/"
anonymous_access = icechunk.s3_credentials(anonymous=True)
credentials = icechunk.containers_credentials({container_prefix: anonymous_access})

# Re-open the repository, authorizing access to the virtual chunks, and take
# a read-only session on "main".
read_repo = icechunk.Repository.open(
    storage,
    config,
    authorize_virtual_chunk_access=credentials,
)
read_session = read_repo.readonly_session("main")

In [34]:
ds = xr.open_zarr(read_session.store, consolidated=False, zarr_format=3)
ds

Unnamed: 0,Array,Chunk
Bytes,683.03 kiB,683.03 kiB
Shape,"(58285, 3)","(58285, 3)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 683.03 kiB 683.03 kiB Shape (58285, 3) (58285, 3) Dask graph 1 chunks in 2 graph layers Data type int32 numpy.ndarray",3  58285,

Unnamed: 0,Array,Chunk
Bytes,683.03 kiB,683.03 kiB
Shape,"(58285, 3)","(58285, 3)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 120.04 kiB 62.50 kiB Shape (30731,) (16000,) Dask graph 2 chunks in 2 graph layers Data type float32 numpy.ndarray",30731  1,

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 120.04 kiB 62.50 kiB Shape (30731,) (16000,) Dask graph 2 chunks in 2 graph layers Data type float32 numpy.ndarray",30731  1,

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 120.04 kiB 62.50 kiB Shape (30731,) (16000,) Dask graph 2 chunks in 2 graph layers Data type float32 numpy.ndarray",30731  1,

Unnamed: 0,Array,Chunk
Bytes,120.04 kiB,62.50 kiB
Shape,"(30731,)","(16000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20.77 GiB 4.39 MiB Shape (18, 144, 30731, 70) (1, 72, 16000, 1) Dask graph 5040 chunks in 2 graph layers Data type float32 numpy.ndarray",18  1  70  30731  144,

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20.77 GiB 4.39 MiB Shape (18, 144, 30731, 70) (1, 72, 16000, 1) Dask graph 5040 chunks in 2 graph layers Data type float32 numpy.ndarray",18  1  70  30731  144,

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20.77 GiB 4.39 MiB Shape (18, 144, 30731, 70) (1, 72, 16000, 1) Dask graph 5040 chunks in 2 graph layers Data type float32 numpy.ndarray",18  1  70  30731  144,

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,303.86 MiB,4.39 MiB
Shape,"(18, 144, 30731)","(1, 72, 16000)"
Dask graph,72 chunks in 2 graph layers,72 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 303.86 MiB 4.39 MiB Shape (18, 144, 30731) (1, 72, 16000) Dask graph 72 chunks in 2 graph layers Data type float32 numpy.ndarray",30731  144  18,

Unnamed: 0,Array,Chunk
Bytes,303.86 MiB,4.39 MiB
Shape,"(18, 144, 30731)","(1, 72, 16000)"
Dask graph,72 chunks in 2 graph layers,72 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20.77 GiB 4.39 MiB Shape (18, 144, 30731, 70) (1, 72, 16000, 1) Dask graph 5040 chunks in 2 graph layers Data type float32 numpy.ndarray",18  1  70  30731  144,

Unnamed: 0,Array,Chunk
Bytes,20.77 GiB,4.39 MiB
Shape,"(18, 144, 30731, 70)","(1, 72, 16000, 1)"
Dask graph,5040 chunks in 2 graph layers,5040 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [35]:
# Open the same store through xarray's generic open_dataset entry point with
# the zarr engine, instead of xr.open_zarr as in the previous cell.
# NOTE: this rebinds `ds` again.
zarr_options = {'consolidated': False, 'zarr_format': 3}
ds = xr.open_dataset(read_session.store, engine='zarr', backend_kwargs=zarr_options)

In [36]:
ds