Skip to content

Commit

Permalink
feat: back Row by a polars.DataFrame (#214)
Browse files Browse the repository at this point in the history
Partially closes #196.
Closes #149.

### Summary of Changes

* `Row` now uses a `polars.DataFrame` instead of a `pandas.Series` to
store its data. The `DataFrame` can directly store the column names.
* Remove the `__hash__` method. A `Row` can no longer be used in a `set`
or as the key of a `dict`. If we find a use case for this, we'll add it
back.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot committed Apr 19, 2023
1 parent 655f07f commit 62ca34d
Show file tree
Hide file tree
Showing 6 changed files with 364 additions and 112 deletions.
213 changes: 154 additions & 59 deletions src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,26 @@
from __future__ import annotations

from hashlib import md5
from typing import TYPE_CHECKING, Any

import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from pandas.core.util.hashing import hash_pandas_object
import polars as pl

from safeds.data.tabular.exceptions import UnknownColumnNameError
from safeds.data.tabular.typing import ColumnType, Schema

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator
from collections.abc import Iterator


class Row:
"""
A row is a collection of values, where each value is associated with a column name.
Parameters
----------
data : Iterable
The data.
schema : Schema
The schema of the row.
To create a row manually, use the static method [from_dict][safeds.data.tabular.containers._row.Row.from_dict].
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
"""

# ------------------------------------------------------------------------------------------------------------------
Expand All @@ -44,60 +41,111 @@ def from_dict(data: dict[str, Any]) -> Row:
-------
row : Row
The generated row.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
"""
row_frame = pd.DataFrame([data.values()], columns=list(data.keys()))
# noinspection PyProtectedMember
return Row(data.values(), Schema._from_pandas_dataframe(row_frame))
return Row(pl.DataFrame(data))

# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, data: Iterable, schema: Schema | None = None):
self._data: pd.Series = data if isinstance(data, pd.Series) else pd.Series(data)
self._data = self._data.reset_index(drop=True)
def __init__(self, data: pl.DataFrame, schema: Schema | None = None):
"""
Initialize a row from a `polars.DataFrame`.
**Do not use this method directly.** It is not part of the public interface and may change in the future
without a major version bump. Use the static method
[from_dict][safeds.data.tabular.containers._row.Row.from_dict] instead.
Parameters
----------
data : polars.DataFrame
The data.
schema : Schema | None
The schema. If None, the schema is inferred from the data.
"""
self._data: pl.DataFrame = data

self._schema: Schema
if schema is not None:
self._schema = schema
else:
column_names = [f"column_{i}" for i in range(len(self._data))]
dataframe = self._data.to_frame().T
dataframe.columns = column_names
# noinspection PyProtectedMember
self._schema = Schema._from_pandas_dataframe(dataframe)
self._schema = Schema._from_polars_dataframe(self._data)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, Row):
return NotImplemented
if self is other:
return True
return self._schema == other._schema and self._data.equals(other._data)
return self._schema == other._schema and self._data.frame_equal(other._data)

def __getitem__(self, column_name: str) -> Any:
return self.get_value(column_name)
"""
Return the value of a specified column.
def __hash__(self) -> int:
data_hash_string = md5(hash_pandas_object(self._data, index=True).values).hexdigest()
column_names_frozenset = frozenset(self.get_column_names())
Parameters
----------
column_name : str
The column name.
Returns
-------
value : Any
The value of the column.
return hash((data_hash_string, column_names_frozenset))
Raises
------
UnknownColumnNameError
If the row does not contain the specified column.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row["a"]
1
"""
return self.get_value(column_name)

def __iter__(self) -> Iterator[Any]:
return iter(self.get_column_names())

def __len__(self) -> int:
return len(self._data)
"""
Return the number of columns in this row.
Returns
-------
count : int
The number of columns.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> len(row)
2
"""
return self._data.shape[1]

def __repr__(self) -> str:
tmp = self._data.to_frame().T
tmp.columns = self.get_column_names()
return tmp.__repr__()
return f"Row({str(self)})"

def __str__(self) -> str:
tmp = self._data.to_frame().T
tmp.columns = self.get_column_names()
return tmp.__str__()
match len(self):
case 0:
return "{}"
case 1:
return str(self.to_dict())
case _:
lines = (f" {name!r}: {value!r}" for name, value in self.to_dict().items())
joined = ",\n".join(lines)
return f"{{\n{joined}\n}}"

# ------------------------------------------------------------------------------------------------------------------
# Properties
Expand All @@ -112,6 +160,12 @@ def schema(self) -> Schema:
-------
schema : Schema
The schema.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> schema = row.schema
"""
return self._schema

Expand All @@ -130,50 +184,73 @@ def get_value(self, column_name: str) -> Any:
Returns
-------
value :
value : Any
The value of the column.
Raises
------
UnknownColumnNameError
If the row does not contain the specified column.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.get_value("a")
1
"""
if not self._schema.has_column(column_name):
if not self.has_column(column_name):
raise UnknownColumnNameError([column_name])
# noinspection PyProtectedMember
return self._data[self._schema._get_column_index(column_name)]

return self._data[0, column_name]

def has_column(self, column_name: str) -> bool:
"""
Return whether the row contains a given column.
Alias for self.schema.hasColumn(column_name: str) -> bool.
Parameters
----------
column_name : str
The name of the column.
Returns
-------
contains : bool
has_column : bool
True, if row contains the column.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.has_column("a")
True
>>> row.has_column("c")
False
"""
return self._schema.has_column(column_name)

def get_column_names(self) -> list[str]:
"""
Return a list of all column names saved in this schema.
Alias for self.schema.get_column_names() -> list[str].
Return a list of all column names in the row.
Returns
-------
column_names : list[str]
The column names.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.get_column_names()
['a', 'b']
"""
return self._schema.get_column_names()

def get_type_of_column(self, column_name: str) -> ColumnType:
"""
Return the type of a specified column.
Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType.
Return the type of the specified column.
Parameters
----------
Expand All @@ -187,8 +264,15 @@ def get_type_of_column(self, column_name: str) -> ColumnType:
Raises
------
ColumnNameError
If the specified target column name does not exist.
UnknownColumnNameError
If the row does not contain the specified column.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.get_type_of_column("a")
Integer
"""
return self._schema.get_type_of_column(column_name)

Expand All @@ -204,8 +288,15 @@ def count(self) -> int:
-------
count : int
The number of columns.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.count()
2
"""
return len(self._data)
return self._data.shape[1]

# ------------------------------------------------------------------------------------------------------------------
# Conversion
Expand All @@ -219,24 +310,28 @@ def to_dict(self) -> dict[str, Any]:
-------
data : dict[str, Any]
Dictionary representation of the row.
Examples
--------
>>> from safeds.data.tabular.containers import Row
>>> row = Row.from_dict({"a": 1, "b": 2})
>>> row.to_dict()
{'a': 1, 'b': 2}
"""
return {column_name: self.get_value(column_name) for column_name in self.get_column_names()}

# ------------------------------------------------------------------------------------------------------------------
# IPython integration
# ------------------------------------------------------------------------------------------------------------------

def _ipython_display_(self) -> DisplayHandle:
def _repr_html_(self) -> str:
"""
Return a display object for the column to be used in Jupyter Notebooks.
Return an HTML representation of the row.
Returns
-------
output : DisplayHandle
Output object.
output : str
The generated HTML.
"""
tmp = self._data.to_frame().T
tmp.columns = self.get_column_names()

with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]):
return display(tmp)
# noinspection PyProtectedMember
return self._data._repr_html_()

0 comments on commit 62ca34d

Please sign in to comment.