perf: remove unneeded copy operations in table transformers (#496)

### Summary of Changes

#494 enabled Pandas's copy-on-write feature. This PR removes the explicit deep copies that were created in table transformers.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Safe-DS · Nov 16, 2023 · 6443beb · 6443beb
1 parent 9a19389
commit 6443beb
Show file tree

Hide file tree

Showing 10 changed files with 33 additions and 87 deletions.
diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -2075,7 +2075,7 @@ def plot_histograms(self) -> Image:
         """
         col_wrap = min(self.number_of_columns, 3)
 
-        data = pd.melt(self._data.applymap(lambda value: str(value)), value_vars=self.column_names)
+        data = pd.melt(self._data.map(lambda value: str(value)), value_vars=self.column_names)
         grid = sns.FacetGrid(data=data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False)
         grid.map(sns.histplot, "value")
         grid.set_xlabels("")

diff --git a/src/safeds/data/tabular/containers/_tagged_table.py b/src/safeds/data/tabular/containers/_tagged_table.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import copy
 from typing import TYPE_CHECKING
 
 from safeds.data.tabular.containers import Column, Row, Table
@@ -193,21 +192,6 @@ def target(self) -> Column:
         """
         return self._target
 
-    # ------------------------------------------------------------------------------------------------------------------
-    # Helpers
-    # ------------------------------------------------------------------------------------------------------------------
-
-    def _copy(self) -> TaggedTable:
-        """
-        Return a copy of this tagged table.
-
-        Returns
-        -------
-        table : TaggedTable
-            The copy of this tagged table.
-        """
-        return copy.deepcopy(self)
-
     # ------------------------------------------------------------------------------------------------------------------
     # Specific methods from TaggedTable class:
     # ------------------------------------------------------------------------------------------------------------------

diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py
@@ -139,7 +139,7 @@ def transform(self, table: Table) -> Table:
             if not table.get_column(column).type.is_numeric():
                 raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.")
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data.columns = table.column_names
         data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)

diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py
@@ -153,11 +153,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
                     multiple_most_frequent[name] = table.get_column(name).mode()
             if len(multiple_most_frequent) > 0:
                 warnings.warn(
-                    (
-                        "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
-                        " are being chosen in this cases. The following columns have multiple most frequent"
-                        f" values:\n{multiple_most_frequent}"
-                    ),
+                    "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
+                    " are being chosen in this cases. The following columns have multiple most frequent"
+                    f" values:\n{multiple_most_frequent}",
                     UserWarning,
                     stacklevel=2,
                 )
@@ -210,7 +208,7 @@ def transform(self, table: Table) -> Table:
         if table.number_of_rows == 0:
             raise ValueError("The Imputer cannot transform the table because it contains 0 rows")
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data[self._column_names] = pd.DataFrame(
             self._wrapped_transformer.transform(data[self._column_names]),
             columns=self._column_names,

diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py
@@ -56,12 +56,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
 
         if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0:
             warnings.warn(
-                (
-                    "The columns"
-                    f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
-                    " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical"
-                    " values"
-                ),
+                "The columns"
+                f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
+                " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values",
                 UserWarning,
                 stacklevel=2,
             )
@@ -112,7 +109,7 @@ def transform(self, table: Table) -> Table:
         if table.number_of_rows == 0:
             raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data.columns = table.column_names
         data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)
@@ -171,7 +168,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        data = transformed_table._data.copy()
+        data = transformed_table._data.reset_index(drop=True)
         data.columns = transformed_table.column_names
         data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)

diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py
@@ -111,17 +111,14 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
             > 0
         ):
             warnings.warn(
-                (
-                    "The columns"
-                    f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
-                    " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical"
-                    " values"
-                ),
+                "The columns"
+                f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
+                " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values",
                 UserWarning,
                 stacklevel=2,
             )
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data.columns = table.column_names
 
         result = OneHotEncoder()
@@ -223,7 +220,7 @@ def transform(self, table: Table) -> Table:
         # New columns may not be sorted:
         column_names = []
         for name in table.column_names:
-            if name not in self._column_names.keys():
+            if name not in self._column_names:
                 column_names.append(name)
             else:
                 column_names.extend(
@@ -322,11 +319,11 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 name
                 if name not in [value for value_list in list(self._column_names.values()) for value in value_list]
                 else list(self._column_names.keys())[
-                    [
+                    next(
                         list(self._column_names.values()).index(value)
                         for value in list(self._column_names.values())
                         if name in value
-                    ][0]
+                    )
                 ]
             )
             for name in table.column_names

diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py
@@ -151,7 +151,7 @@ def transform(self, table: Table) -> Table:
                 ),
             )
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data.columns = table.column_names
         data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)
@@ -213,7 +213,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        data = transformed_table._data.copy()
+        data = transformed_table._data.reset_index(drop=True)
         data.columns = transformed_table.column_names
         data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)

diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py
@@ -133,7 +133,7 @@ def transform(self, table: Table) -> Table:
                 ),
             )
 
-        data = table._data.copy()
+        data = table._data.reset_index(drop=True)
         data.columns = table.column_names
         data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)
@@ -195,7 +195,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        data = transformed_table._data.copy()
+        data = transformed_table._data.reset_index(drop=True)
         data.columns = transformed_table.column_names
         data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
         return Table._from_pandas_dataframe(data)

diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py
@@ -52,11 +52,9 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
     if len(non_numerical_column_names) != 0:
         raise NonNumericColumnError(
             str(non_numerical_column_names),
-            (
-                "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
-                " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
-                " different values\nor is ordinal, you should use the LabelEncoder."
-            ),
+            "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
+            " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
+            " different values\nor is ordinal, you should use the LabelEncoder.",
         )
 
     null_containing_column_names = set(tagged_table.features.column_names) - set(
@@ -65,10 +63,8 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
     if len(null_containing_column_names) != 0:
         raise MissingValuesColumnError(
             str(null_containing_column_names),
-            (
-                "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
-                " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
-            ),
+            "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
+            " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.",
         )
 
     try:
@@ -138,11 +134,9 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
     if len(non_numerical_column_names) != 0:
         raise NonNumericColumnError(
             str(non_numerical_column_names),
-            (
-                "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
-                " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
-                " different values\nor is ordinal, you should use the LabelEncoder.\n"
-            ),
+            "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
+            " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
+            " different values\nor is ordinal, you should use the LabelEncoder.\n",
         )
 
     null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set(
@@ -151,16 +145,14 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
     if len(null_containing_column_names) != 0:
         raise MissingValuesColumnError(
             str(null_containing_column_names),
-            (
-                "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
-                " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
-            ),
+            "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
+            " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.",
         )
 
     dataset_df = dataset.keep_only_columns(feature_names)._data
     dataset_df.columns = feature_names
 
-    result_set = dataset._data.copy(deep=True)
+    result_set = dataset._data.reset_index(drop=True)
     result_set.columns = dataset.column_names
 
     try:

diff --git a/tests/safeds/data/tabular/containers/_table/_tagged_table/test_copy.py b/tests/safeds/data/tabular/containers/_table/_tagged_table/test_copy.py