feat: enable copy-on-write for pandas dataframes (#494)
Closes #428

### Summary of Changes

* Enable copy-on-write for pandas dataframes (see the sketch below). Copy-on-write becomes the default in pandas 3.0, so this opts in early.
* Remove unnecessary cloning.
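
A minimal sketch of the semantics this opts into (the option line mirrors the diff below; the toy dataframe and variable names are illustrative only, not part of the change):

```python
import pandas as pd

# Opt in to copy-on-write; from pandas 3.0 onward this is the default.
pd.options.mode.copy_on_write = True

df = pd.DataFrame({"a": [1, 2, 3]})
subset = df[["a"]]       # shares the underlying data lazily, no eager copy
subset.loc[0, "a"] = 99  # this write triggers the copy of the shared block

print(df.loc[0, "a"])    # prints 1; the original dataframe is unchanged
```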

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
lars-reimann and megalinter-bot committed Nov 16, 2023
1 parent 5bf0e75 commit 9a19389
Showing 7 changed files with 36 additions and 120 deletions.
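
The recurring change in the file diffs below is that defensive deep copies (`DataFrame.copy(deep=True)`, `copy.deepcopy`, the private `_copy` helpers) are dropped or replaced by a cheap `reset_index(drop=True)`. A hedged sketch of why that is safe once copy-on-write is on (the frame and variable names here are illustrative, not taken from the repository):

```python
import pandas as pd

pd.options.mode.copy_on_write = True

original = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Old pattern: eagerly deep-copy before mutating the result.
eager = original.copy(deep=True)
eager.loc[0, "a"] = 99

# New pattern: take a cheap derived frame; copy-on-write defers the real
# copy until (and unless) the derived frame is written to.
lazy = original.reset_index(drop=True)
lazy.loc[0, "a"] = 99

print(original.loc[0, "a"])  # prints 1; the parent is untouched either way
```

Both patterns protect the parent frame; the new one just avoids paying for a copy when the derived frame is never mutated.
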
20 changes: 3 additions & 17 deletions src/safeds/data/tabular/containers/_column.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import io
from collections.abc import Sequence
from numbers import Number
@@ -27,6 +26,9 @@
T = TypeVar("T")
R = TypeVar("R")

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


class Column(Sequence[T]):
"""
@@ -1031,19 +1033,3 @@ def _count_missing_values(self) -> int:
2
"""
return self._data.isna().sum()

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Column:
"""
Return a copy of this column.
Returns
-------
column : Column
The copy of this column.
"""
return copy.deepcopy(self)
21 changes: 4 additions & 17 deletions src/safeds/data/tabular/containers/_row.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import functools
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any
@@ -13,6 +12,9 @@
if TYPE_CHECKING:
from collections.abc import Iterator

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


class Row(Mapping[str, Any]):
"""
@@ -152,7 +154,7 @@ def __contains__(self, obj: Any) -> bool:
"""
return isinstance(obj, str) and self.has_column(obj)

def __eq__(self, other: Any) -> bool:
def __eq__(self, other: object) -> bool:
"""
Check whether this row is equal to another object.
@@ -528,18 +530,3 @@ def _repr_html_(self) -> str:
The generated HTML.
"""
return self._data.to_html(max_rows=1, max_cols=self._data.shape[1], notebook=True)

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Row:
"""
Return a copy of this row.
Returns
-------
copy : Row
The copy of this row.
"""
return copy.deepcopy(self)
71 changes: 26 additions & 45 deletions src/safeds/data/tabular/containers/_table.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import functools
import io
import warnings
@@ -39,6 +38,9 @@

from ._tagged_table import TaggedTable

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


# noinspection PyProtectedMember
class Table:
@@ -420,7 +422,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:
self._data = self._data.reset_index(drop=True)
self._schema: Schema = Schema._from_pandas_dataframe(self._data)

def __eq__(self, other: Any) -> bool:
def __eq__(self, other: object) -> bool:
"""
Compare two table instances.
@@ -469,12 +471,12 @@ def __repr__(self) -> str:
>>> repr(table)
' a b\n0 1 2\n1 3 4'
"""
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__repr__()

def __str__(self) -> str:
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__str__()

@@ -831,13 +833,13 @@ def summarize_statistics(self) -> Table:
# Transformations
# ------------------------------------------------------------------------------------------------------------------

# This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
# additional constraints that might have to hold in the subclass.
# Override accordingly in subclasses.
def _as_table(self: Table) -> Table:
"""
Transform the table to an instance of the Table class.
This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
additional constraints that might have to hold in the subclass. Override accordingly in subclasses.
Returns
-------
table: Table
@@ -879,7 +881,7 @@ def add_column(self, column: Column) -> Table:
if column.number_of_rows != self.number_of_rows and self.number_of_columns != 0:
raise ColumnSizeError(str(self.number_of_rows), str(column._data.size))

result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
result[column.name] = column._data
return Table._from_pandas_dataframe(result)
@@ -920,7 +922,7 @@ def add_columns(self, columns: list[Column] | Table) -> Table:
"""
if isinstance(columns, Table):
columns = columns.to_columns()
result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
for column in columns:
if column.name in result.columns:
@@ -969,7 +971,7 @@ def add_row(self, row: Row) -> Table:
1 3 4
"""
int_columns = []
result = self._copy()

if self.number_of_columns == 0:
return Table.from_rows([row])
if len(set(self.column_names) - set(row.column_names)) > 0:
@@ -980,12 +982,12 @@ def add_row(self, row: Row) -> Table:
),
)

if result.number_of_rows == 0:
if self.number_of_rows == 0:
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names))

new_df = pd.concat([result._data, row._data]).infer_objects()
new_df.columns = result.column_names
schema = Schema.merge_multiple_schemas([result.schema, row.schema])
new_df = pd.concat([self._data, row._data]).infer_objects()
new_df.columns = self.column_names
schema = Schema.merge_multiple_schemas([self.schema, row.schema])
result = Table._from_pandas_dataframe(new_df, schema)

for column in int_columns:
Expand Down Expand Up @@ -1033,7 +1035,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
rows = rows.to_rows()

if len(rows) == 0:
return self._copy()
return self

different_column_names = set()
for row in rows:
@@ -1046,8 +1048,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
),
)

result = self._copy()

result = self
for row in rows:
result = result.add_row(row)

@@ -1153,9 +1154,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
if len(invalid_columns) != 0:
raise UnknownColumnNameError(invalid_columns, similar_columns)

clone = self._copy()
clone = clone.remove_columns(list(set(self.column_names) - set(column_names)))
return clone
return self.remove_columns(list(set(self.column_names) - set(column_names)))

def remove_columns(self, column_names: list[str]) -> Table:
"""
@@ -1309,8 +1308,7 @@ def remove_rows_with_missing_values(self) -> Table:
a b
0 1.0 2.0
"""
result = self._data.copy(deep=True)
result = result.dropna(axis="index")
result = self._data.dropna(axis="index")
return Table._from_pandas_dataframe(result)

def remove_rows_with_outliers(self) -> Table:
@@ -1350,13 +1348,11 @@ def remove_rows_with_outliers(self) -> Table:
9 0.0 0.00 0.0 -1000000
10 0.0 0.00 0.0 -1000000
"""
copy = self._data.copy(deep=True)

table_without_nonnumericals = self.remove_columns_with_non_numerical_values()
z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

return Table._from_pandas_dataframe(copy[filter_], self._schema)
return Table._from_pandas_dataframe(self._data[filter_], self._schema)

def rename_column(self, old_name: str, new_name: str) -> Table:
"""
@@ -1399,7 +1395,7 @@ def rename_column(self, old_name: str, new_name: str) -> Table:
if new_name in self._schema.column_names:
raise DuplicateColumnNameError(new_name)

new_df = self._data.copy()
new_df = self._data.reset_index(drop=True)
new_df.columns = self._schema.column_names
return Table._from_pandas_dataframe(new_df.rename(columns={old_name: new_name}))

@@ -2128,7 +2124,7 @@ def to_csv_file(self, path: str | Path) -> None:
if path.suffix != ".csv":
raise WrongFileExtensionError(path, ".csv")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_csv = self._data.copy()
data_to_csv = self._data.reset_index(drop=True)
data_to_csv.columns = self._schema.column_names
data_to_csv.to_csv(path, index=False)

@@ -2166,7 +2162,7 @@ def to_excel_file(self, path: str | Path) -> None:
tmp_table_file.save(path)

path.parent.mkdir(parents=True, exist_ok=True)
data_to_excel = self._data.copy()
data_to_excel = self._data.reset_index(drop=True)
data_to_excel.columns = self._schema.column_names
data_to_excel.to_excel(path)

@@ -2197,7 +2193,7 @@ def to_json_file(self, path: str | Path) -> None:
if path.suffix != ".json":
raise WrongFileExtensionError(path, ".json")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_json = self._data.copy()
data_to_json = self._data.reset_index(drop=True)
data_to_json.columns = self._schema.column_names
data_to_json.to_json(path)

@@ -2333,21 +2329,6 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): #
if not allow_copy:
raise NotImplementedError("For the moment we need to copy the data, so `allow_copy` must be True.")

data_copy = self._data.copy()
data_copy = self._data.reset_index(drop=True)
data_copy.columns = self.column_names
return data_copy.__dataframe__(nan_as_null, allow_copy)

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Table:
"""
Return a copy of this table.
Returns
-------
table : Table
The copy of this table.
"""
return copy.deepcopy(self)
13 changes: 0 additions & 13 deletions tests/safeds/data/tabular/containers/_column/test_copy.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/safeds/data/tabular/containers/_table/test_add_row.py
@@ -27,13 +27,13 @@
Schema({"col1": Anything(), "col2": Integer(is_nullable=True)}),
),
(
Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
Table({"col1": [], "col2": []}),
Row({"col1": 5, "col2": 6}),
Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
),
(
Table({"col1": [], "col2": []}),
Table(),
Row({"col1": 5, "col2": 6}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
13 changes: 0 additions & 13 deletions tests/safeds/data/tabular/containers/_table/test_copy.py

This file was deleted.

12 changes: 0 additions & 12 deletions tests/safeds/data/tabular/containers/test_row.py
@@ -514,18 +514,6 @@ def test_should_contain_td_element_for_each_value(self, row: Row) -> None:
assert f"<td>{value}</td>" in row._repr_html_()


class TestCopy:
@pytest.mark.parametrize(
"row",
[Row(), Row({"a": 3, "b": 4})],
ids=["empty", "normal"],
)
def test_should_copy_table(self, row: Row) -> None:
copied = row._copy()
assert copied == row
assert copied is not row


class TestSortColumns:
@pytest.mark.parametrize(
("row", "comparator", "expected"),
