# Creating and Appending to an Icechunk Store with Virtual References
This notebook demonstrates how to create an icechunk store and then append to it.

See this blog post for more information: https://tom-nicholas.com/blog/2025/cloud-optimized-scientific-data

In [None]:
import warnings
import os
import fsspec
import icechunk
import xarray as xr
from obstore.store import from_url

from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import HDFParser
from virtualizarr.registry import ObjectStoreRegistry

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Report the installed library versions so this run can be reproduced.
import virtualizarr

print(icechunk.__version__, virtualizarr.__version__, sep="\n")

In [None]:
# Load AWS credentials for Pangeo-EOSC storage into environment variables.
from dotenv import load_dotenv

# Resolve the credentials file relative to the user's home directory.
# os.path.expanduser avoids nesting same-type quotes inside an f-string
# (a SyntaxError before Python 3.12) and reads $HOME on POSIX systems.
dotenv_path = os.path.expanduser('~/dotenv/rsignell4.env')
_ = load_dotenv(dotenv_path)

# Define storage
storage_endpoint = 'https://pangeo-eosc-minioapi.vm.fedcloud.eu'
storage_bucket = 'rsignell4-protocoast'
storage_name = 'taranto-icechunk-test2'

# Authenticated fsspec filesystem used for listing and deleting objects.
fs = fsspec.filesystem('s3', anon=False, endpoint_url=storage_endpoint)

### Create a list of files that will make up the virtual dataset
First we will handle the "nos" SHYFEM files

In [None]:
# List every "nos" NetCDF file under the forecast tree and prefix each
# returned path with the s3:// scheme.
flist = [
    f's3://{path}'
    for path in fs.glob(
        f's3://{storage_bucket}/full_dataset/shyfem/taranto/forecast/*/*nos*.nc'
    )
]

In [None]:
# Sanity check: number of matched files and the last path in the list.
print(len(flist))
print(flist[-1])

### Try opening one of these NetCDF files

In [None]:
# Open the first file through the fsspec filesystem to inspect its contents.
xr.open_dataset(fs.open(flist[0]))

### Define our Virtualizarr `Parser` and `ObjectStoreRegistry`

In [None]:
# Derive the bucket URL from storage_bucket (set in the configuration cell) so
# the registry key cannot drift from the bucket used to list the files.
bucket = f"s3://{storage_bucket}"
# region is a placeholder value here; the custom endpoint is what locates the data.
store = from_url(bucket, region="not-used", endpoint=storage_endpoint)
# Map the bucket URL to its object store so the parser can resolve file URLs.
registry = ObjectStoreRegistry({bucket: store})
parser = HDFParser()

## Create virtual datasets from each file with VirtualiZarr's `open_virtual_dataset`

In [None]:
%%time
ds_list = [
    open_virtual_dataset(
        url=url,
        parser=parser,
        registry=registry,
        loadable_variables=["time"],
    )
    for url in flist[0:-1]
]

### "fix" each dataset to match the requirements of the "Rolodex" FMRC indexing

In [None]:
def fix_ds(ds):
    """Reshape one forecast dataset into reference-time / lead-time form.

    The ``time`` variable is renamed to ``valid_time`` and the ``time``
    dimension to ``step``. Two coordinates are then attached:

    * ``step`` -- each valid time minus the first valid time, tagged with
      ``standard_name="forecast_period"``.
    * ``time`` -- the first valid time, tagged with
      ``standard_name="forecast_reference_time"``.

    Finally the ``valid_time`` index and variable are dropped.

    Parameters
    ----------
    ds : dataset with a ``time`` variable defined on a ``time`` dimension.

    Returns
    -------
    A new dataset; the input object is not modified.
    """
    renamed = ds.rename_vars(time='valid_time').rename_dims(time='step')

    valid_time = renamed.valid_time
    first_valid_time = valid_time[0]

    lead_time = (valid_time - first_valid_time).assign_attrs(
        {"standard_name": "forecast_period"}
    )
    reference_time = first_valid_time.assign_attrs(
        {"standard_name": "forecast_reference_time"}
    )

    return (
        renamed
        .assign_coords(step=lead_time, time=reference_time)
        .drop_indexes("valid_time")
        .drop_vars('valid_time')
    )

In [None]:
# Apply fix_ds to every "nos" virtual dataset. ds_list is rebound here, so
# re-running this cell alone would pass already-fixed datasets to fix_ds.
ds_list = list(map(fix_ds, ds_list))

In [None]:
# Concatenate the "nos" datasets along "time" (the coordinate fix_ds assigns
# from each file's first valid time).
combined_nos = xr.concat(
    ds_list, dim="time", coords="minimal", compat="override", combine_attrs="override"
)

Now we handle the "ous" SHYFEM files

In [None]:
# List the "ous" NetCDF files. The bucket comes from storage_bucket rather
# than a second hard-coded copy of its name, matching the "nos" listing.
flist = fs.glob(f's3://{storage_bucket}/full_dataset/shyfem/taranto/forecast/*/*ous*.nc')
# Prefix each returned path with the s3:// scheme.
flist = [f's3://{f}' for f in flist]

In [None]:
%%time
ds_list = [
    open_virtual_dataset(
        url=url,
        parser=parser,
        registry=registry,
        loadable_variables=["time"],
    )
    for url in flist[0:-1]
]

In [None]:
# Show the "ous" file URLs that were opened (every entry except the last).
flist[0:-1]

In [None]:
# Apply fix_ds to every "ous" virtual dataset. ds_list is rebound here, so
# re-running this cell alone would pass already-fixed datasets to fix_ds.
ds_list = list(map(fix_ds, ds_list))

In [None]:
# Concatenate the "ous" datasets along "time" (the coordinate fix_ds assigns
# from each file's first valid time).
combined_ous = xr.concat(
    ds_list, dim="time", coords="minimal", compat="override", combine_attrs="override"
)

### Now create a virtual dataset with the "nos" and "ous" datasets merged together

In [None]:
# Merge the "nos" and "ous" variables into a single virtual dataset.
# compat='override' is passed so xarray skips comparing same-named variables.
ds = xr.merge([combined_nos, combined_ous], compat='override')

In [None]:
# Display the merged virtual dataset.
ds

## Initialize the Icechunk Store
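We need to configure the `virtual_chunk_container` and make sure the Icechunk container credentials match the access that the bucket holding the referenced files requires (this notebook uses credentialed access, i.e. `anonymous=False`).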
Details on this can be found [here](https://icechunk.io/en/stable/virtual/).

In [None]:
# remove old existing icechunk storage with this name
# Remove any repository already stored under this name so the notebook can be
# re-run from scratch.
try:
    # Use the same prefix as the storage
    fs.rm(f's3://{storage_bucket}/icechunk/{storage_name}', recursive=True)
except FileNotFoundError:
    # Nothing to delete on a first run. Any other failure (bad credentials,
    # network errors, a keyboard interrupt) is left to surface instead of
    # being silently swallowed.
    pass

In [None]:
# Icechunk repository location: the "icechunk/<storage_name>" prefix of the
# bucket, reached through the custom S3 endpoint.
storage = icechunk.s3_storage(
    bucket=storage_bucket,
    prefix=f"icechunk/{storage_name}",
    endpoint_url=storage_endpoint,
    region='not-used',   # N/A for Pangeo-EOSC bucket, but required param
    force_path_style=True,
    from_env=True,
)

In [None]:
# Start from the default repository configuration.
config = icechunk.RepositoryConfig.default()

# Register the source bucket as a virtual chunk container so references whose
# URLs start with this prefix are served by the S3-compatible store below.
config.set_virtual_chunk_container(
    icechunk.VirtualChunkContainer(
        url_prefix=f"s3://{storage_bucket}/",
        store=icechunk.s3_store(
            region="not-used",
            endpoint_url=storage_endpoint,
            anonymous=False,
            s3_compatible=True,
            force_path_style=True,
        ),
    ),
)

# Create the repository and open a writable session on the "main" branch.
repo = icechunk.Repository.create(storage, config)
session = repo.writable_session("main")

## Write the virtual datasets to the icechunk store and commit

In [None]:
# Write the merged virtual dataset into the writable session's store.
ds.virtualize.to_icechunk(session.store)

In [None]:
# Commit the session (opened on "main") with a message describing what was written.
session.commit("all but last day of Taranto data")

## Check the icechunk dataset

In [None]:
# Credentials for the virtual chunk container, keyed by its URL prefix.
credentials = icechunk.containers_credentials(
    {
        f"s3://{storage_bucket}/": icechunk.s3_credentials(anonymous=False),
    }
)

# Reopen the repository with virtual chunk access authorized, then start a
# read-only session on "main".
read_repo = icechunk.Repository.open(
    storage,
    config,
    authorize_virtual_chunk_access=credentials,
)
read_session = read_repo.readonly_session("main")

In [None]:
# Open the committed data from the read-only session's store as a Zarr v3
# dataset (no consolidated metadata) and display it.
# NOTE(review): this rebinds `ds`, which previously held the merged virtual dataset.
ds = xr.open_zarr(read_session.store, consolidated=False, zarr_format=3)
ds