4 changes: 4 additions & 0 deletions .envrc
@@ -1,2 +1,6 @@
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_ENDPOINT_URL=http://localhost:9009

watch_file pixi.toml pixi.lock
eval "$(pixi shell-hook)"
17 changes: 16 additions & 1 deletion .github/workflows/ci.yml
@@ -56,10 +56,25 @@ jobs:
uses: prefix-dev/setup-pixi@8ca4608ef7f4daeb54f5205b20d0b7cb42f11143 # v0.8.14
with:
environments: ${{ matrix.environment }}
- name: Start minio for s3 tests
if: matrix.os == 'ubuntu-latest'
run: docker compose up minio -d --wait
- name: Install repository
run: pixi run -e ${{ matrix.environment }} postinstall
- name: Derive test markers
id: markers
run: |
if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
echo 'marker=${{ matrix.with_optionals && '""' || '"not with_optionals"'}}' >> $GITHUB_OUTPUT
else
echo 'marker=${{ matrix.with_optionals && '"not s3"' || '"not with_optionals and not s3"' }}' >> $GITHUB_OUTPUT
fi
- name: Run pytest
run: pixi run -e ${{ matrix.environment }} test-coverage --color=yes ${{ matrix.with_optionals && '-m with_optionals' || '-m "not with_optionals"'}} --cov=dataframely --cov-report=xml
run: pixi run -e ${{ matrix.environment }} test-coverage --color=yes -m ${{ steps.markers.outputs.marker }} --cov=dataframely --cov-report=xml
env:
AWS_SECRET_ACCESS_KEY: minioadmin
AWS_ACCESS_KEY_ID: minioadmin
AWS_ENDPOINT_URL: http://localhost:9009
- name: Upload codecov
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24 # v5.4.3
with:
52 changes: 32 additions & 20 deletions dataframely/_storage/parquet.py
@@ -1,11 +1,12 @@
# Copyright (c) QuantCo 2025-2025
# SPDX-License-Identifier: BSD-3-Clause

import os
from collections.abc import Iterable
from pathlib import Path
from typing import Any
from typing import Any, cast

import polars as pl
from fsspec import AbstractFileSystem, url_to_fs

from ._base import (
SerializedCollection,
@@ -68,7 +69,7 @@ def sink_collection(
serialized_schemas: dict[str, str],
**kwargs: Any,
) -> None:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))

# The collection schema is serialized as part of the member parquet metadata
kwargs["metadata"] = kwargs.get("metadata", {}) | {
@@ -77,7 +78,9 @@ def sink_collection(

for key, lf in dfs.items():
destination = (
path / key if "partition_by" in kwargs else path / f"{key}.parquet"
os.path.join(path, key)
if "partition_by" in kwargs
else os.path.join(path, f"{key}.parquet")
)
self.sink_frame(
lf,
@@ -93,7 +96,7 @@ def write_collection(
serialized_schemas: dict[str, str],
**kwargs: Any,
) -> None:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))

# The collection schema is serialized as part of the member parquet metadata
kwargs["metadata"] = kwargs.get("metadata", {}) | {
@@ -102,7 +105,9 @@ def write_collection(

for key, lf in dfs.items():
destination = (
path / key if "partition_by" in kwargs else path / f"{key}.parquet"
os.path.join(path, key)
if "partition_by" in kwargs
else os.path.join(path, f"{key}.parquet")
)
self.write_frame(
lf.collect(),
@@ -114,53 +119,60 @@ def write_collection(
def scan_collection(
self, members: Iterable[str], **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))
return self._collection_from_parquet(
path=path, members=members, scan=True, **kwargs
)

def read_collection(
self, members: Iterable[str], **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
path = Path(kwargs.pop("directory"))
path = str(kwargs.pop("directory"))
return self._collection_from_parquet(
path=path, members=members, scan=False, **kwargs
)

def _collection_from_parquet(
self, path: Path, members: Iterable[str], scan: bool, **kwargs: Any
self, path: str, members: Iterable[str], scan: bool, **kwargs: Any
) -> tuple[dict[str, pl.LazyFrame], list[SerializedCollection | None]]:
# Utility method encapsulating the logic that is common
# between lazy and eager reads
data = {}
collection_types = []

fs: AbstractFileSystem = url_to_fs(path)[0]
for key in members:
if (source_path := self._member_source_path(path, key)) is not None:
if (source_path := self._member_source_path(path, fs, key)) is not None:
data[key] = (
pl.scan_parquet(source_path, **kwargs)
if scan
else pl.read_parquet(source_path, **kwargs).lazy()
)
if source_path.is_file():
if fs.isfile(source_path):
collection_types.append(_read_serialized_collection(source_path))
else:
for file in source_path.glob("**/*.parquet"):
collection_types.append(_read_serialized_collection(file))
for file in fs.glob(os.path.join(source_path, "**/*.parquet")):
collection_types.append(
_read_serialized_collection(cast(str, file))
)

# Backward compatibility: If the parquets do not have schema information,
# fall back to looking for schema.json
if not any(collection_types) and (schema_file := path / "schema.json").exists():
collection_types.append(schema_file.read_text())
if not any(collection_types) and fs.exists(
schema_file := os.path.join(path, "schema.json")
):
collection_types.append(fs.read_text(schema_file))

return data, collection_types

@classmethod
def _member_source_path(cls, base_path: Path, name: str) -> Path | None:
if (path := base_path / name).exists() and base_path.is_dir():
def _member_source_path(
cls, base_path: str, fs: AbstractFileSystem, name: str
) -> str | None:
if fs.exists(path := os.path.join(base_path, name)) and fs.isdir(base_path):
# We assume that the member is stored as a hive-partitioned dataset
return path
if (path := base_path / f"{name}.parquet").exists():
if fs.exists(path := os.path.join(base_path, f"{name}.parquet")):
# We assume that the member is stored as a single parquet file
return path
return None
@@ -229,11 +241,11 @@ def scan_failure_info(
return lf, serialized_rules, serialized_schema


def _read_serialized_collection(path: Path) -> SerializedCollection | None:
def _read_serialized_collection(path: str) -> SerializedCollection | None:
meta = pl.read_parquet_metadata(path)
return meta.get(COLLECTION_METADATA_KEY)


def _read_serialized_schema(path: Path) -> SerializedSchema | None:
def _read_serialized_schema(path: str) -> SerializedSchema | None:
meta = pl.read_parquet_metadata(path)
return meta.get(SCHEMA_METADATA_KEY)
13 changes: 12 additions & 1 deletion docker-compose.yml
@@ -1,4 +1,3 @@
version: "3"
services:
mssql:
image: mcr.microsoft.com/azure-sql-edge:latest
@@ -8,3 +7,15 @@ services:
SA_PASSWORD: P@ssword1
ports:
- "1455:1433"

minio:
Member:
Did you consider using moto for s3 tests? It's easy to use and doesn't require running a container in the background (see http://docs.getmoto.org/en/latest/docs/server_mode.html#start-within-python).

Member Author:
Ah neat, I remember that we had talked about this. I will investigate. Currently trying to fix the polars issue since this is a prerequisite anyways 😅
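
For reference, a rough sketch of what moto's in-process server mode could look like as a pytest fixture (fixture name, port, and wiring are illustrative, not part of this PR):

import pytest
from moto.server import ThreadedMotoServer


@pytest.fixture(scope="session")
def s3_endpoint():
    # Start moto's threaded server in-process; no container needed.
    # moto's default address is 127.0.0.1:5000 (illustrative here).
    server = ThreadedMotoServer()
    server.start()
    yield "http://127.0.0.1:5000"
    server.stop()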

image: quay.io/minio/minio:latest
ports:
- "9009:9009"
- "9010:9010"
volumes:
- minio-data:/data
command: server /data --address ":9009" --console-address ":9010"

volumes:
minio-data: