Skip to content

Commit

Permalink
feat(python): Implement DataFrame Interchange Protocol through `pyarr…
Browse files Browse the repository at this point in the history
…ow` (pola-rs#6581)
  • Loading branch information
stinodego authored and vincent committed Feb 9, 2023
1 parent af79390 commit 9132988
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 0 deletions.
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/dataframe/export.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Export DataFrame data to other formats:
.. autosummary::
:toctree: api/

DataFrame.__dataframe__
DataFrame.to_arrow
DataFrame.to_dict
DataFrame.to_dicts
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Conversion
:toctree: api/

from_arrow
from_dataframe
from_dict
from_dicts
from_numpy
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def version() -> str:
from polars.cfg import Config
from polars.convert import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
Expand Down Expand Up @@ -291,6 +292,7 @@ def version() -> str:
"duration",
"coalesce",
# polars.convert
"from_dataframe",
"from_dict",
"from_dicts",
"from_records",
Expand Down
48 changes: 48 additions & 0 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING, Any, Mapping, Sequence, overload

from polars.datatypes import N_INFER_DEFAULT, SchemaDefinition, SchemaDict
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -479,3 +480,50 @@ def from_pandas(
)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")


def from_dataframe(df: Any, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. it must implement
        the ``__dataframe__`` method. A Polars DataFrame is returned unchanged.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False,
        conversions that are not zero-copy fail.

    Raises
    ------
    TypeError
        If `df` has no ``__dataframe__`` attribute.
    ImportError
        If pyarrow is unavailable or its major version is below 11.
    NotImplementedError
        If ``allow_copy=False`` is requested for a non-Polars input.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Zero-copy conversions currently cannot be guaranteed and will throw a
    ``NotImplementedError``.

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
    efficient method of conversion.

    """
    # A Polars frame needs no conversion (and no pyarrow): hand it straight back.
    if isinstance(df, DataFrame):
        return df

    supports_protocol = hasattr(df, "__dataframe__")
    if not supports_protocol:
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange"
            " protocol."
        )

    # The version is only inspected when pyarrow is flagged as available, so the
    # `pa` module is never touched otherwise.
    pyarrow_usable = _PYARROW_AVAILABLE and int(pa.__version__.split(".")[0]) >= 11
    if not pyarrow_usable:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    if not allow_copy:
        raise NotImplementedError(
            "Polars cannot guarantee zero-copy conversion from dataframe interchange"
            " objects at this time."
        )

    # Route through pyarrow: interchange object -> Arrow table -> Polars frame.
    arrow_table = pa.interchange.from_dataframe(df, allow_copy=allow_copy)
    return from_arrow(arrow_table, rechunk=allow_copy)  # type: ignore[return-value]
39 changes: 39 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
INTEGER_DTYPES,
N_INFER_DEFAULT,
Boolean,
Categorical,
DataTypeClass,
Float64,
Int8,
Expand Down Expand Up @@ -105,6 +106,8 @@
from typing_extensions import Concatenate, ParamSpec, TypeAlias

if TYPE_CHECKING:
from pyarrow.interchange.dataframe import _PyArrowDataFrame

from polars.internals.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -1126,6 +1129,42 @@ def schema(self) -> SchemaDict:
"""
return dict(zip(self.columns, self.dtypes))

def __dataframe__(
    self, nan_as_null: bool = False, allow_copy: bool = True
) -> _PyArrowDataFrame:
    """
    Convert to a dataframe object implementing the dataframe interchange protocol.

    Parameters
    ----------
    nan_as_null
        Overwrite null values in the data with ``NaN``.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False,
        conversions that are not zero-copy fail.

    Raises
    ------
    ImportError
        If pyarrow is unavailable or its major version is below 11.
    NotImplementedError
        If ``allow_copy=False`` and the frame contains a categorical column.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    `nan_as_null` currently has no effect; once support for nullable extension
    dtypes is added, this value should be propagated to columns.

    """
    # The version is only inspected when pyarrow is flagged as available.
    pyarrow_usable = _PYARROW_AVAILABLE and int(pa.__version__.split(".")[0]) >= 11
    if not pyarrow_usable:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
            " dataframe interchange object."
        )

    # The schema is only consulted when the caller insists on zero-copy.
    if not allow_copy:
        if Categorical in self.schema.values():
            raise NotImplementedError(
                "Polars does not offer zero-copy conversion to Arrow for categorical"
                " columns. Set `allow_copy=True` or cast categorical columns to"
                " string first."
            )

    # Delegate to the Arrow table's own interchange implementation.
    arrow_table = self.to_arrow()
    return arrow_table.__dataframe__(nan_as_null, allow_copy)

def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame:
"""Compare a DataFrame with another object."""
if isinstance(other, DataFrame):
Expand Down
118 changes: 118 additions & 0 deletions py-polars/tests/unit/test_interchange.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import sys

import pandas as pd
import pytest
from _pytest.monkeypatch import MonkeyPatch

import polars as pl
from polars.testing import assert_frame_equal


def test_interchange() -> None:
    """Smoke-test the interchange object exported from a small mixed-dtype frame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
    interchange = frame.__dataframe__()

    # Spot-check a handful of properties to confirm the conversion went through
    assert interchange.num_rows() == 2

    first_column_dtype = interchange.get_column(0).dtype
    assert first_column_dtype[1] == 64

    string_buffers = interchange.get_column_by_name("c").get_buffers()
    assert string_buffers["data"][0].bufsize == 6


def test_interchange_pyarrow_required(monkeypatch: MonkeyPatch) -> None:
    """Exporting raises ``ImportError`` when pyarrow is flagged as unavailable."""
    frame_module = pl.internals.dataframe.frame
    monkeypatch.setattr(frame_module, "_PYARROW_AVAILABLE", False)

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        frame.__dataframe__()


def test_interchange_pyarrow_min_version(monkeypatch: MonkeyPatch) -> None:
    """Exporting raises ``ImportError`` when pyarrow reports a version below 11."""
    pyarrow_module = pl.internals.dataframe.frame.pa  # type: ignore[attr-defined]
    monkeypatch.setattr(pyarrow_module, "__version__", "10.0.0")

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        frame.__dataframe__()


def test_interchange_categorical() -> None:
    """Categorical columns export with a copy and are rejected without one."""
    frame = pl.DataFrame({"a": ["foo", "bar"]}, schema={"a": pl.Categorical})

    # With copying permitted the column comes through as categorical
    interchange = frame.__dataframe__(allow_copy=True)
    column_kind = interchange.get_column_by_name("a").dtype[0]
    assert column_kind == 23  # 23 signifies categorical dtype

    # With copying forbidden the export must fail
    with pytest.raises(NotImplementedError, match="categorical"):
        frame.__dataframe__(allow_copy=False)


def test_from_dataframe() -> None:
    """Round-trip a Polars frame through its own interchange object."""
    expected = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
    interchange = expected.__dataframe__()

    roundtripped = pl.from_dataframe(interchange)

    assert_frame_equal(roundtripped, expected)


@pytest.mark.xfail(
    sys.version_info < (3, 8),
    reason="Pandas does not implement the protocol on Python 3.7",
)
def test_from_dataframe_pandas() -> None:
    """Convert a pandas frame via the interchange protocol."""
    data = {"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]}

    # Feed a pandas dataframe through the protocol
    pandas_frame = pd.DataFrame(data)
    converted = pl.from_dataframe(pandas_frame)

    # The result should match a Polars frame built directly from the same data
    reference = pl.DataFrame(data)
    assert_frame_equal(converted, reference)


@pytest.mark.xfail(
    sys.version_info < (3, 8),
    reason="Pandas does not implement the protocol on Python 3.7",
)
def test_from_dataframe_allow_copy() -> None:
    """Check how ``allow_copy=False`` is handled for Polars and non-Polars inputs."""
    # Zero copy only allowed when input is already a Polars dataframe.
    # The call must pass `allow_copy=False`; with `allow_copy=True` the zero-copy
    # pass-through for Polars input would never actually be exercised.
    df = pl.DataFrame({"a": [1, 2]})
    result = pl.from_dataframe(df, allow_copy=False)
    assert_frame_equal(result, df)

    # Zero copy cannot be guaranteed for other inputs at this time
    df_pandas = pd.DataFrame({"a": [1, 2]})
    with pytest.raises(NotImplementedError):
        pl.from_dataframe(df_pandas, allow_copy=False)


def test_from_dataframe_invalid_type() -> None:
    """Objects without ``__dataframe__`` are rejected with ``TypeError``."""
    not_a_dataframe = [[1, 2], [3, 4]]

    with pytest.raises(TypeError):
        pl.from_dataframe(not_a_dataframe)


def test_from_dataframe_pyarrow_required(monkeypatch: MonkeyPatch) -> None:
    """Importing an interchange object needs pyarrow; a Polars frame does not."""
    convert_module = pl.convert
    monkeypatch.setattr(convert_module, "_PYARROW_AVAILABLE", False)

    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(ImportError, match="pyarrow"):
        pl.from_dataframe(frame.__dataframe__())

    # Passing a Polars dataframe directly never reaches the pyarrow requirement
    passthrough = pl.from_dataframe(frame)
    assert_frame_equal(passthrough, frame)


def test_from_dataframe_pyarrow_min_version(monkeypatch: MonkeyPatch) -> None:
    """Importing raises ``ImportError`` when pyarrow reports a version below 11."""
    # Build the interchange object before the reported version is patched
    interchange = pl.DataFrame({"a": [1, 2]}).__dataframe__()

    pyarrow_module = pl.convert.pa  # type: ignore[attr-defined]
    monkeypatch.setattr(pyarrow_module, "__version__", "10.0.0")

    with pytest.raises(ImportError, match="pyarrow"):
        pl.from_dataframe(interchange)

0 comments on commit 9132988

Please sign in to comment.