Skip to content

Commit

Permalink
feat: create column types for polars data types (#208)
Browse files Browse the repository at this point in the history
Partially closes #196.

### Summary of Changes

* Add `polars`
* Create `ColumnType` for `polars` data types
* Create `Schema` for `polars` data frames

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot committed Apr 18, 2023
1 parent 102f0b9 commit e18b362
Show file tree
Hide file tree
Showing 17 changed files with 600 additions and 180 deletions.
100 changes: 99 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pandas = "^2.0.0"
pillow = "^9.5.0"
scikit-learn = "^1.2.0"
seaborn = "^0.12.2"
polars = {extras = ["pandas", "pyarrow", "xlsx2csv"], version = "^0.17.5"}

[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"
Expand Down
2 changes: 1 addition & 1 deletion src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, name: str, data: Iterable, type_: ColumnType | None = None) -
self._name: str = name
self._data: pd.Series = data if isinstance(data, pd.Series) else pd.Series(data)
# noinspection PyProtectedMember
self._type: ColumnType = type_ if type_ is not None else ColumnType._from_numpy_dtype(self._data.dtype)
self._type: ColumnType = type_ if type_ is not None else ColumnType._from_numpy_data_type(self._data.dtype)

def __eq__(self, other: object) -> bool:
if not isinstance(other, Column):
Expand Down
6 changes: 3 additions & 3 deletions src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def from_dict(data: dict[str, Any]) -> Row:
"""
row_frame = pd.DataFrame([data.values()], columns=list(data.keys()))
# noinspection PyProtectedMember
return Row(data.values(), Schema._from_dataframe(row_frame))
return Row(data.values(), Schema._from_pandas_dataframe(row_frame))

# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
Expand All @@ -65,7 +65,7 @@ def __init__(self, data: Iterable, schema: Schema | None = None):
dataframe = self._data.to_frame().T
dataframe.columns = column_names
# noinspection PyProtectedMember
self._schema = Schema._from_dataframe(dataframe)
self._schema = Schema._from_pandas_dataframe(dataframe)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, Row):
Expand Down Expand Up @@ -136,7 +136,7 @@ def get_value(self, column_name: str) -> Any:
if not self._schema.has_column(column_name):
raise UnknownColumnNameError([column_name])
# noinspection PyProtectedMember
return self._data[self._schema._get_column_index_by_name(column_name)]
return self._data[self._schema._get_column_index(column_name)]

def has_column(self, column_name: str) -> bool:
"""
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def __init__(self, data: Iterable, schema: Schema | None = None):
| [from_rows][safeds.data.tabular.containers._table.Table.from_rows] | Create a table from a list of rows. |
"""
self._data: pd.DataFrame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
self._schema: Schema = Schema._from_dataframe(self._data) if schema is None else schema
self._schema: Schema = Schema._from_pandas_dataframe(self._data) if schema is None else schema

if self._data.empty:
self._data = pd.DataFrame(columns=self._schema.get_column_names())
Expand Down Expand Up @@ -305,7 +305,7 @@ def get_column(self, column_name: str) -> Column:
if self._schema.has_column(column_name):
output_column = Column(
column_name,
self._data.iloc[:, [self._schema._get_column_index_by_name(column_name)]].squeeze(),
self._data.iloc[:, [self._schema._get_column_index(column_name)]].squeeze(),
self._schema.get_type_of_column(column_name),
)
return output_column
Expand Down
93 changes: 68 additions & 25 deletions src/safeds/data/tabular/typing/_column_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,63 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from polars import FLOAT_DTYPES as POLARS_FLOAT_DTYPES
from polars import INTEGER_DTYPES as POLARS_INTEGER_DTYPES
from polars import TEMPORAL_DTYPES as POLARS_TEMPORAL_DTYPES
from polars import Boolean as PolarsBoolean
from polars import Decimal as PolarsDecimal
from polars import Object as PolarsObject
from polars import PolarsDataType
from polars import Utf8 as PolarsUtf8

if TYPE_CHECKING:
import numpy as np


class ColumnType(ABC):
"""Abstract base class for column types."""

@abstractmethod
def is_nullable(self) -> bool:
@staticmethod
def _from_numpy_data_type(data_type: np.dtype) -> ColumnType:
"""
Return whether the given column type is nullable.
Return the column type for a given `numpy` data type.
Parameters
----------
data_type : numpy.dtype
The `numpy` data type.
Returns
-------
is_nullable : bool
True if the column is nullable.
"""
column_type : ColumnType
The ColumnType.
@abstractmethod
def is_numeric(self) -> bool:
Raises
------
NotImplementedError
If the given data type is not supported.
"""
Return whether the given column type is numeric.
if data_type.kind in ("u", "i"):
return Integer()
if data_type.kind == "b":
return Boolean()
if data_type.kind == "f":
return RealNumber()
if data_type.kind in ("S", "U", "O", "M", "m"):
return String()

Returns
-------
is_numeric : bool
True if the column is numeric.
"""
message = f"Unsupported numpy data type '{data_type}'."
raise NotImplementedError(message)

@staticmethod
def _from_numpy_dtype(dtype: np.dtype) -> ColumnType:
def _from_polars_data_type(data_type: PolarsDataType) -> ColumnType:
"""
Return the column type for a given numpy dtype.
Return the column type for a given `polars` data type.
Parameters
----------
dtype : numpy.dtype
The numpy dtype.
data_type : PolarsDataType
The `polars` data type.
Returns
-------
Expand All @@ -50,18 +69,42 @@ def _from_numpy_dtype(dtype: np.dtype) -> ColumnType:
Raises
------
TypeError
If the given dtype is not supported.
NotImplementedError
If the given data type is not supported.
"""
if dtype.kind in ("u", "i"):
if data_type in POLARS_INTEGER_DTYPES:
return Integer()
if dtype.kind == "b":
if data_type is PolarsBoolean:
return Boolean()
if dtype.kind == "f":
if data_type in POLARS_FLOAT_DTYPES or data_type is PolarsDecimal:
return RealNumber()
if dtype.kind in ("S", "U", "O", "M", "m"):
if data_type is PolarsUtf8 or data_type is PolarsObject or data_type in POLARS_TEMPORAL_DTYPES:
return String()
raise TypeError("Unexpected column type")

message = f"Unsupported polars data type '{data_type}'."
raise NotImplementedError(message)

@abstractmethod
def is_nullable(self) -> bool:
"""
Return whether the given column type is nullable.
Returns
-------
is_nullable : bool
True if the column is nullable.
"""

@abstractmethod
def is_numeric(self) -> bool:
"""
Return whether the given column type is numeric.
Returns
-------
is_numeric : bool
True if the column is numeric.
"""


@dataclass
Expand Down

0 comments on commit e18b362

Please sign in to comment.