zarr-developers · TomNicholas · May 14, 2024 · Mar 29, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/ci/doc.yml b/ci/doc.yml
@@ -13,4 +13,4 @@ dependencies:
       - "sphinx_design"
       - "sphinx_togglebutton"
       - "sphinx-autodoc-typehints"
-      - -e ..
+      - -e  "..[test]"
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ dynamic = ["version"]
 dependencies = [
     "xarray@git+https://github.com/TomNicholas/xarray.git@concat-avoid-index-auto-creation#egg=xarray",
     "kerchunk==0.2.2",
+    "universal-pathlib",
     "h5netcdf",
     "pydantic",
     "numpy",
@@ -40,6 +41,7 @@ test = [
     "scipy",
     "pooch",
     "ruff",
+    "s3fs"
 
 ]
 

diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
@@ -5,6 +5,7 @@
 import xarray as xr
 
 from virtualizarr.zarr import ZArray, ZAttrs
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 
 # Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean
 # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
@@ -35,7 +36,9 @@ class FileType(AutoName):
     zarr = auto()
 
 def read_kerchunk_references_from_file(
-    filepath: str, filetype: Optional[FileType]
+    filepath: str, filetype: Optional[FileType],
+    reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+
 ) -> KerchunkStoreRefs:
     """
     Read a single legacy file and return kerchunk references to its contents.
@@ -47,55 +50,57 @@ def read_kerchunk_references_from_file(
     filetype : FileType, default: None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
         If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
+    reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+        Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
+        so ensure reader_options match selected Kerchunk reader arguments.
     """
 
     if filetype is None:
-        filetype = _automatically_determine_filetype(filepath)
+        filetype = _automatically_determine_filetype(filepath=filepath, reader_options=reader_options)
 
     # if filetype is user defined, convert to FileType
     filetype = FileType(filetype)
 
     if filetype.name.lower() == "netcdf3":
         from kerchunk.netCDF3 import NetCDF3ToZarr
-        refs = NetCDF3ToZarr(filepath, inline_threshold=0).translate()
 
+        refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate()
     elif filetype.name.lower() == "netcdf4":
         from kerchunk.hdf import SingleHdf5ToZarr
-
-        refs = SingleHdf5ToZarr(filepath, inline_threshold=0).translate()
+        refs = SingleHdf5ToZarr(filepath, inline_threshold=0, **reader_options).translate()
     elif filetype.name.lower() == "grib":
         # TODO Grib files should be handled as a DataTree object
         # see https://github.com/TomNicholas/VirtualiZarr/issues/11
         raise NotImplementedError(f"Unsupported file type: {filetype}")
     elif filetype.name.lower() == "tiff":
         from kerchunk.tiff import tiff_to_zarr
 
-        refs = tiff_to_zarr(filepath, inline_threshold=0)
+        refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options)
     elif filetype.name.lower() == "fits":
         from kerchunk.fits import process_file
 
-        refs = process_file(filepath, inline_threshold=0)
+        refs = process_file(filepath, inline_threshold=0, **reader_options)
     else:
         raise NotImplementedError(f"Unsupported file type: {filetype.name}")
 
     # TODO validate the references that were read before returning?
     return refs
 
 
-def _automatically_determine_filetype(filepath: str) -> FileType:
+def _automatically_determine_filetype(*,filepath: str, reader_options: Optional[dict]={}) -> FileType:
     file_extension = Path(filepath).suffix
+    fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options)
 
     if file_extension == ".nc":
         # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
-        with open(filepath, 'rb') as f:
-            magic = f.read()
+        magic = fpath.read()
+
         if magic[0:3] == b"CDF":
             filetype = FileType.netcdf3
         elif magic[1:4] == b"HDF":
             filetype = FileType.netcdf4
         else:
             raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4")
-
     elif file_extension == ".zarr":
         # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
         raise NotImplementedError()
@@ -108,6 +113,7 @@ def _automatically_determine_filetype(filepath: str) -> FileType:
     else:
         raise NotImplementedError(f"Unrecognised file extension: {file_extension}")
 
+    fpath.close()
     return filetype
 
 

diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py
@@ -159,8 +159,8 @@ def test_automatically_determine_filetype_netcdf3_netcdf4():
     ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC")
     ds.to_netcdf(netcdf4_file_path, engine="h5netcdf")
 
-    assert FileType("netcdf3") == _automatically_determine_filetype(netcdf3_file_path)
-    assert FileType("netcdf4") == _automatically_determine_filetype(netcdf4_file_path)
+    assert FileType("netcdf3") == _automatically_determine_filetype(filepath = netcdf3_file_path)
+    assert FileType("netcdf4") == _automatically_determine_filetype(filepath = netcdf4_file_path)
 
 
 

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
@@ -4,6 +4,7 @@
 import xarray as xr
 import xarray.testing as xrt
 from xarray.core.indexes import Index
+import pytest
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -271,6 +272,19 @@ def test_combine_by_coords(self, netcdf4_files):
         assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing
 
 
+
+
+
+
+
+@pytest.mark.parametrize("filetype", ['netcdf4', None], ids=["netcdf4 filetype", "None filetype"])
+@pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"])
+def test_anon_read_s3(filetype, indexes):
+    """Open a public netCDF file from s3 anonymously, for each filetype/indexes combination."""
+    pytest.importorskip("s3fs")  # called inside the test: at module level it would skip the whole module
+    fpath = 's3://carbonplan-share/virtualizarr/local.nc'  # TODO: Switch away from this s3 url after minIO is implemented.
+    assert open_virtual_dataset(fpath, filetype=filetype, indexes=indexes)
+
 class TestLoadVirtualDataset:
     def test_loadable_variables(self, netcdf4_file):
         vars_to_load = ['air', 'time']
@@ -283,6 +297,8 @@ def test_loadable_variables(self, netcdf4_file):
                 assert isinstance(vds[name].data, ManifestArray), name
 
         full_ds = xr.open_dataset(netcdf4_file)
+
         for name in full_ds.variables:
+
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional, Union
+
+if TYPE_CHECKING:
+    # Imported for annotations only: s3fs is an optional (test-extra) dependency,
+    # so a runtime import here would make importing this module fail without it.
+    from s3fs.core import S3File
+    from fsspec.implementations.local import LocalFileOpener
+
+
+def _fsspec_openfile_from_filepath(*, filepath: str, reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}}) -> Union[S3File, LocalFileOpener]:
+    """Utility function to facilitate reading local or remote (s3) file paths using fsspec.
+
+    The returned file object is open; the caller is responsible for closing it.
+
+    :param filepath: Input filepath (a local path or an ``s3://`` URL)
+    :type filepath: str
+    :param reader_options: Dict whose 'storage_options' entry is passed to the s3 filesystem. Default: {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+    :type reader_options: Optional[dict]
+    :raises NotImplementedError: If the filepath's protocol is neither local nor s3
+    :rtype: Union[S3File, LocalFileOpener]
+    """
+    import fsspec
+    from upath import UPath
+
+    # UPath reports an empty-string protocol for plain local paths
+    protocol = UPath(filepath).protocol
+
+    if protocol in ("", "file"):
+        fpath = fsspec.open(filepath, 'rb').open()
+
+    elif protocol == "s3":
+        s3_anon_defaults = {'key':'', 'secret':'', 'anon':True}
+
+        # A missing/empty reader_options, or one without 'storage_options',
+        # falls back to anonymous access instead of failing on `dict | None`.
+        storage_options = (reader_options or {}).get('storage_options') or {}
+
+        # dict merge: user-supplied keys override the anonymous defaults
+        fpath = fsspec.filesystem(protocol, **(s3_anon_defaults | storage_options)).open(filepath)
+
+    else:
+        raise NotImplementedError("Only local and s3 file protocols are currently supported")
+
+    return fpath
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
@@ -9,6 +9,7 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.kerchunk import KerchunkStoreRefs, FileType
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json
@@ -27,6 +28,7 @@ def open_virtual_dataset(
     loadable_variables: Optional[Iterable[str]] = None,
     indexes: Optional[Mapping[str, Index]] = None,
     virtual_array_class=ManifestArray,
+    reader_options: Optional[dict] = {'storage_options':{'key':'', 'secret':'', 'anon':True}},
 ) -> xr.Dataset:
     """
     Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
@@ -55,13 +57,17 @@ def open_virtual_dataset(
     virtual_array_class
         Virtual array class to use to represent the references to the chunks in each on-disk array.
         Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
+    reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+        Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
+        so ensure reader_options match selected Kerchunk reader arguments.
 
     Returns
     -------
     vds
         An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables.
     """
 
+
     if drop_variables is None:
         drop_variables = []
     elif isinstance(drop_variables, str):
@@ -103,7 +109,9 @@ def open_virtual_dataset(
             # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
             # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
             # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-            ds = xr.open_dataset(filepath, drop_variables=drop_variables)
+            fpath = _fsspec_openfile_from_filepath(filepath=filepath,reader_options=reader_options)
+
+            ds = xr.open_dataset(fpath, drop_variables=drop_variables)
 
             if indexes is None:
                 # add default indexes by reading data from file
@@ -139,6 +147,7 @@ def open_virtual_dataset(
         return vds
 
 
+
 def open_virtual_dataset_from_v3_store(
     storepath: str,
     drop_variables: List[str],