feat: back Row by a polars.DataFrame (#214)

Closes partially #196. Closes #149.

### Summary of Changes

* `Row` now uses a `polars.DataFrame` instead of a `pandas.Series` to store its data. The `DataFrame` can directly store the column names.
* Remove the `__hash__` method. A `Row` can no longer be used in a `set` or as the key of a `dict`. If we find a use case for this, we'll add it back.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Safe-DS · Apr 19, 2023 · 62ca34d · 62ca34d
1 parent 655f07f
commit 62ca34d
Show file tree

Hide file tree

Showing 6 changed files with 364 additions and 112 deletions.
diff --git a/src/safeds/data/tabular/containers/_row.py b/src/safeds/data/tabular/containers/_row.py
@@ -1,29 +1,26 @@
 from __future__ import annotations
 
-from hashlib import md5
 from typing import TYPE_CHECKING, Any
 
-import pandas as pd
-from IPython.core.display_functions import DisplayHandle, display
-from pandas.core.util.hashing import hash_pandas_object
+import polars as pl
 
 from safeds.data.tabular.exceptions import UnknownColumnNameError
 from safeds.data.tabular.typing import ColumnType, Schema
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable, Iterator
+    from collections.abc import Iterator
 
 
 class Row:
     """
     A row is a collection of values, where each value is associated with a column name.
 
-    Parameters
-    ----------
-    data : Iterable
-        The data.
-    schema : Schema
-        The schema of the row.
+    To create a row manually, use the static method [from_dict][safeds.data.tabular.containers._row.Row.from_dict].
+
+    Examples
+    --------
+    >>> from safeds.data.tabular.containers import Row
+    >>> row = Row.from_dict({"a": 1, "b": 2})
     """
 
     # ------------------------------------------------------------------------------------------------------------------
@@ -44,60 +41,111 @@ def from_dict(data: dict[str, Any]) -> Row:
         -------
         row : Row
             The generated row.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
         """
-        row_frame = pd.DataFrame([data.values()], columns=list(data.keys()))
-        # noinspection PyProtectedMember
-        return Row(data.values(), Schema._from_pandas_dataframe(row_frame))
+        return Row(pl.DataFrame(data))
 
     # ------------------------------------------------------------------------------------------------------------------
     # Dunder methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def __init__(self, data: Iterable, schema: Schema | None = None):
-        self._data: pd.Series = data if isinstance(data, pd.Series) else pd.Series(data)
-        self._data = self._data.reset_index(drop=True)
+    def __init__(self, data: pl.DataFrame, schema: Schema | None = None):
+        """
+        Initialize a row from a `polars.DataFrame`.
+
+        **Do not use this method directly.** It is not part of the public interface and may change in the future
+        without a major version bump. Use the static method
+        [from_dict][safeds.data.tabular.containers._row.Row.from_dict] instead.
+
+        Parameters
+        ----------
+        data : polars.DataFrame
+            The data.
+        schema : Schema | None
+            The schema. If None, the schema is inferred from the data.
+        """
+        self._data: pl.DataFrame = data
 
         self._schema: Schema
         if schema is not None:
             self._schema = schema
         else:
-            column_names = [f"column_{i}" for i in range(len(self._data))]
-            dataframe = self._data.to_frame().T
-            dataframe.columns = column_names
             # noinspection PyProtectedMember
-            self._schema = Schema._from_pandas_dataframe(dataframe)
+            self._schema = Schema._from_polars_dataframe(self._data)
 
     def __eq__(self, other: Any) -> bool:
         if not isinstance(other, Row):
             return NotImplemented
         if self is other:
             return True
-        return self._schema == other._schema and self._data.equals(other._data)
+        return self._schema == other._schema and self._data.frame_equal(other._data)
 
     def __getitem__(self, column_name: str) -> Any:
-        return self.get_value(column_name)
+        """
+        Return the value of a specified column.
 
-    def __hash__(self) -> int:
-        data_hash_string = md5(hash_pandas_object(self._data, index=True).values).hexdigest()
-        column_names_frozenset = frozenset(self.get_column_names())
+        Parameters
+        ----------
+        column_name : str
+            The column name.
+
+        Returns
+        -------
+        value : Any
+            The value of the column.
 
-        return hash((data_hash_string, column_names_frozenset))
+        Raises
+        ------
+        UnknownColumnNameError
+            If the row does not contain the specified column.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row["a"]
+        1
+        """
+        return self.get_value(column_name)
 
     def __iter__(self) -> Iterator[Any]:
         return iter(self.get_column_names())
 
     def __len__(self) -> int:
-        return len(self._data)
+        """
+        Return the number of columns in this row.
+
+        Returns
+        -------
+        count : int
+            The number of columns.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> len(row)
+        2
+        """
+        return self._data.shape[1]
 
     def __repr__(self) -> str:
-        tmp = self._data.to_frame().T
-        tmp.columns = self.get_column_names()
-        return tmp.__repr__()
+        return f"Row({str(self)})"
 
     def __str__(self) -> str:
-        tmp = self._data.to_frame().T
-        tmp.columns = self.get_column_names()
-        return tmp.__str__()
+        match len(self):
+            case 0:
+                return "{}"
+            case 1:
+                return str(self.to_dict())
+            case _:
+                lines = (f"    {name!r}: {value!r}" for name, value in self.to_dict().items())
+                joined = ",\n".join(lines)
+                return f"{{\n{joined}\n}}"
 
     # ------------------------------------------------------------------------------------------------------------------
     # Properties
@@ -112,6 +160,12 @@ def schema(self) -> Schema:
         -------
         schema : Schema
             The schema.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> schema = row.schema
         """
         return self._schema
 
@@ -130,50 +184,73 @@ def get_value(self, column_name: str) -> Any:
 
         Returns
         -------
-        value :
+        value : Any
             The value of the column.
+
+        Raises
+        ------
+        UnknownColumnNameError
+            If the row does not contain the specified column.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.get_value("a")
+        1
         """
-        if not self._schema.has_column(column_name):
+        if not self.has_column(column_name):
             raise UnknownColumnNameError([column_name])
-        # noinspection PyProtectedMember
-        return self._data[self._schema._get_column_index(column_name)]
+
+        return self._data[0, column_name]
 
     def has_column(self, column_name: str) -> bool:
         """
         Return whether the row contains a given column.
 
-        Alias for self.schema.hasColumn(column_name: str) -> bool.
-
         Parameters
         ----------
         column_name : str
             The name of the column.
 
         Returns
         -------
-        contains : bool
+        has_column : bool
             True, if row contains the column.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.has_column("a")
+        True
+
+        >>> row.has_column("c")
+        False
         """
         return self._schema.has_column(column_name)
 
     def get_column_names(self) -> list[str]:
         """
-        Return a list of all column names saved in this schema.
-
-        Alias for self.schema.get_column_names() -> list[str].
+        Return a list of all column names in the row.
 
         Returns
         -------
         column_names : list[str]
             The column names.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.get_column_names()
+        ['a', 'b']
         """
         return self._schema.get_column_names()
 
     def get_type_of_column(self, column_name: str) -> ColumnType:
         """
-        Return the type of a specified column.
-
-        Alias for self.schema.get_type_of_column(column_name: str) -> ColumnType.
+        Return the type of the specified column.
 
         Parameters
         ----------
@@ -187,8 +264,15 @@ def get_type_of_column(self, column_name: str) -> ColumnType:
 
         Raises
         ------
-        ColumnNameError
-            If the specified target column name does not exist.
+        UnknownColumnNameError
+            If the row does not contain the specified column.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.get_type_of_column("a")
+        Integer
         """
         return self._schema.get_type_of_column(column_name)
 
@@ -204,8 +288,15 @@ def count(self) -> int:
         -------
         count : int
             The number of columns.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.count()
+        2
         """
-        return len(self._data)
+        return self._data.shape[1]
 
     # ------------------------------------------------------------------------------------------------------------------
     # Conversion
@@ -219,24 +310,28 @@ def to_dict(self) -> dict[str, Any]:
         -------
         data : dict[str, Any]
             Dictionary representation of the row.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Row
+        >>> row = Row.from_dict({"a": 1, "b": 2})
+        >>> row.to_dict()
+        {'a': 1, 'b': 2}
         """
         return {column_name: self.get_value(column_name) for column_name in self.get_column_names()}
 
     # ------------------------------------------------------------------------------------------------------------------
     # IPython integration
     # ------------------------------------------------------------------------------------------------------------------
 
-    def _ipython_display_(self) -> DisplayHandle:
+    def _repr_html_(self) -> str:
         """
-        Return a display object for the column to be used in Jupyter Notebooks.
+        Return an HTML representation of the row.
 
         Returns
         -------
-        output : DisplayHandle
-            Output object.
+        output : str
+            The generated HTML.
         """
-        tmp = self._data.to_frame().T
-        tmp.columns = self.get_column_names()
-
-        with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]):
-            return display(tmp)
+        # noinspection PyProtectedMember
+        return self._data._repr_html_()