4 changes: 4 additions & 0 deletions .envrc
@@ -1,2 +1,6 @@
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_ENDPOINT_URL=http://localhost:9009

watch_file pixi.toml pixi.lock
eval "$(pixi shell-hook)"
17 changes: 16 additions & 1 deletion .github/workflows/ci.yml
@@ -56,10 +56,25 @@ jobs:
uses: prefix-dev/setup-pixi@8ca4608ef7f4daeb54f5205b20d0b7cb42f11143 # v0.8.14
with:
environments: ${{ matrix.environment }}
- name: Start minio for s3 tests
if: matrix.os == 'ubuntu-latest'
run: docker compose up minio -d --wait
- name: Install repository
run: pixi run -e ${{ matrix.environment }} postinstall
- name: Derive test markers
id: markers
run: |
if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
echo 'marker=${{ matrix.with_optionals && '""' || '"not with_optionals"'}}' >> $GITHUB_OUTPUT
else
echo 'marker=${{ matrix.with_optionals && '"not s3"' || '"not with_optionals and not s3"' }}' >> $GITHUB_OUTPUT
fi
- name: Run pytest
run: pixi run -e ${{ matrix.environment }} test-coverage --color=yes ${{ matrix.with_optionals && '-m with_optionals' || '-m "not with_optionals"'}} --cov=dataframely --cov-report=xml
run: pixi run -e ${{ matrix.environment }} test-coverage --color=yes -m ${{ steps.markers.outputs.marker }} --cov=dataframely --cov-report=xml
env:
AWS_SECRET_ACCESS_KEY: minioadmin
AWS_ACCESS_KEY_ID: minioadmin
AWS_ENDPOINT_URL: http://localhost:9009
- name: Upload codecov
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24 # v5.4.3
with:
52 changes: 32 additions & 20 deletions dataframely/_storage/parquet.py
@@ -1,11 +1,12 @@
# Copyright (c) QuantCo 2025-2025
# SPDX-License-Identifier: BSD-3-Clause

import os
from collections.abc import Iterable
from pathlib import Path
from typing import Any
from typing import Any, cast

import polars as pl
from fsspec import AbstractFileSystem, url_to_fs

from ._base import (
SerializedCollection,
@@ -68,7 +69,7 @@ def sink_collection(
serialized_schemas: dict[str, str],
**kwargs: Any,
) -> None:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))

# The collection schema is serialized as part of the member parquet metadata
kwargs["metadata"] = kwargs.get("metadata", {}) | {
@@ -77,7 +78,9 @@ def sink_collection(

for key, lf in dfs.items():
destination = (
path / key if "partition_by" in kwargs else path / f"{key}.parquet"
os.path.join(path, key)
if "partition_by" in kwargs
else os.path.join(path, f"{key}.parquet")
)
self.sink_frame(
lf,
@@ -93,7 +96,7 @@ def write_collection(
serialized_schemas: dict[str, str],
**kwargs: Any,
) -> None:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))

# The collection schema is serialized as part of the member parquet metadata
kwargs["metadata"] = kwargs.get("metadata", {}) | {
@@ -102,7 +105,9 @@ def write_collection(

for key, lf in dfs.items():
destination = (
path / key if "partition_by" in kwargs else path / f"{key}.parquet"
os.path.join(path, key)
if "partition_by" in kwargs
else os.path.join(path, f"{key}.parquet")
)
self.write_frame(
lf.collect(),
@@ -114,53 +119,60 @@ def write_collection(
def scan_collection(
self, members: Iterable[str], **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))
return self._collection_from_parquet(
path=path, members=members, scan=True, **kwargs
)

def read_collection(
self, members: Iterable[str], **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))
return self._collection_from_parquet(
path=path, members=members, scan=False, **kwargs
)

def _collection_from_parquet(
self, path: Path, members: Iterable[str], scan: bool, **kwargs: Any
self, path: str, members: Iterable[str], scan: bool, **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
# Utility method encapsulating the logic that is common
# between lazy and eager reads
data = {}
collection_types = []

fs: AbstractFileSystem = url_to_fs(path)[0]
for key in members:
if (source_path := self._member_source_path(path, key)) is not None:
if (source_path := self._member_source_path(path, fs, key)) is not None:
data[key] = (
pl.scan_parquet(source_path, **kwargs)
if scan
else pl.read_parquet(source_path, **kwargs).lazy()
)
if source_path.is_file():
if fs.isfile(source_path):
collection_types.append(_read_serialized_collection(source_path))
else:
for file in source_path.glob("**/*.parquet"):
collection_types.append(_read_serialized_collection(file))
for file in fs.glob(os.path.join(source_path, "**/*.parquet")):
collection_types.append(
_read_serialized_collection(cast(str, file))
)

# Backward compatibility: If the parquets do not have schema information,
# fall back to looking for schema.json
if not any(collection_types) and (schema_file := path / "schema.json").exists():
collection_types.append(schema_file.read_text())
if not any(collection_types) and fs.exists(
schema_file := os.path.join(path, "schema.json")
):
collection_types.append(fs.read_text(schema_file))

return data, collection_types

@classmethod
def _member_source_path(cls, base_path: Path, name: str) -> Path | None:
if (path := base_path / name).exists() and base_path.is_dir():
def _member_source_path(
cls, base_path: str, fs: AbstractFileSystem, name: str
) -> str | None:
if fs.exists(path := os.path.join(base_path, name)) and fs.isdir(base_path):
# We assume that the member is stored as a hive-partitioned dataset
return path
if (path := base_path / f"{name}.parquet").exists():
if fs.exists(path := os.path.join(base_path, f"{name}.parquet")):
# We assume that the member is stored as a single parquet file
return path
return None
@@ -229,11 +241,11 @@ def scan_failure_info(
return lf, serialized_rules, serialized_schema


def _read_serialized_collection(path: Path) -> SerializedCollection | None:
def _read_serialized_collection(path: str) -> SerializedCollection | None:
meta = pl.read_parquet_metadata(path)
return meta.get(COLLECTION_METADATA_KEY)


def _read_serialized_schema(path: Path) -> SerializedSchema | None:
def _read_serialized_schema(path: str) -> SerializedSchema | None:
meta = pl.read_parquet_metadata(path)
return meta.get(SCHEMA_METADATA_KEY)
13 changes: 12 additions & 1 deletion docker-compose.yml
@@ -1,4 +1,3 @@
version: "3"
services:
mssql:
image: mcr.microsoft.com/azure-sql-edge:latest
@@ -8,3 +7,15 @@ services:
SA_PASSWORD: P@ssword1
ports:
- "1455:1433"

minio:
Member:
Did you consider using moto for s3 tests? It's easy to use and doesn't require running a container in the background (see http://docs.getmoto.org/en/latest/docs/server_mode.html#start-within-python).

Member Author:
Ah neat, I remember that we had talked about this. I will investigate. Currently trying to fix the polars issue since this is a prerequisite anyways 😅
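
For reference, a rough sketch of what moto's in-process server mode could look like as a pytest fixture (fixture name, port, and wiring are illustrative, not part of this PR):

import pytest
from moto.server import ThreadedMotoServer


@pytest.fixture(scope="session")
def s3_endpoint():
    # Start moto's threaded server in-process; no container needed.
    # moto's default address is 127.0.0.1:5000 (illustrative here).
    server = ThreadedMotoServer()
    server.start()
    yield "http://127.0.0.1:5000"
    server.stop()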

image: quay.io/minio/minio:latest
ports:
- "9009:9009"
- "9010:9010"
volumes:
- minio-data:/data
command: server /data --address ":9009" --console-address ":9010"

volumes:
minio-data: