Skip to content

Commit

Permalink
perf: remove unneeded copy operations in table transformers (#496)
Browse files Browse the repository at this point in the history
### Summary of Changes

#494 enabled pandas's copy-on-write feature. This PR removes the
explicit deep copies that were created in table transformers, replacing
the `_data.copy()` calls with `_data.reset_index(drop=True)`.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot committed Nov 16, 2023
1 parent 9a19389 commit 6443beb
Show file tree
Hide file tree
Showing 10 changed files with 33 additions and 87 deletions.
2 changes: 1 addition & 1 deletion src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,7 +2075,7 @@ def plot_histograms(self) -> Image:
"""
col_wrap = min(self.number_of_columns, 3)

data = pd.melt(self._data.applymap(lambda value: str(value)), value_vars=self.column_names)
data = pd.melt(self._data.map(lambda value: str(value)), value_vars=self.column_names)
grid = sns.FacetGrid(data=data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False)
grid.map(sns.histplot, "value")
grid.set_xlabels("")
Expand Down
16 changes: 0 additions & 16 deletions src/safeds/data/tabular/containers/_tagged_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
from typing import TYPE_CHECKING

from safeds.data.tabular.containers import Column, Row, Table
Expand Down Expand Up @@ -193,21 +192,6 @@ def target(self) -> Column:
"""
return self._target

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> TaggedTable:
"""
Return a copy of this tagged table.
Returns
-------
table : TaggedTable
The copy of this tagged table.
"""
return copy.deepcopy(self)

# ------------------------------------------------------------------------------------------------------------------
# Specific methods from TaggedTable class:
# ------------------------------------------------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/safeds/data/tabular/transformation/_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def transform(self, table: Table) -> Table:
if not table.get_column(column).type.is_numeric():
raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.")

data = table._data.copy()
data = table._data.reset_index(drop=True)
data.columns = table.column_names
data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down
10 changes: 4 additions & 6 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
multiple_most_frequent[name] = table.get_column(name).mode()
if len(multiple_most_frequent) > 0:
warnings.warn(
(
"There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
" are being chosen in this cases. The following columns have multiple most frequent"
f" values:\n{multiple_most_frequent}"
),
"There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
" are being chosen in this cases. The following columns have multiple most frequent"
f" values:\n{multiple_most_frequent}",
UserWarning,
stacklevel=2,
)
Expand Down Expand Up @@ -210,7 +208,7 @@ def transform(self, table: Table) -> Table:
if table.number_of_rows == 0:
raise ValueError("The Imputer cannot transform the table because it contains 0 rows")

data = table._data.copy()
data = table._data.reset_index(drop=True)
data[self._column_names] = pd.DataFrame(
self._wrapped_transformer.transform(data[self._column_names]),
columns=self._column_names,
Expand Down
13 changes: 5 additions & 8 deletions src/safeds/data/tabular/transformation/_label_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:

if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0:
warnings.warn(
(
"The columns"
f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The LabelEncoder is designed to encode non-numerical values into numerical"
" values"
),
"The columns"
f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values",
UserWarning,
stacklevel=2,
)
Expand Down Expand Up @@ -112,7 +109,7 @@ def transform(self, table: Table) -> Table:
if table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")

data = table._data.copy()
data = table._data.reset_index(drop=True)
data.columns = table.column_names
data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down Expand Up @@ -171,7 +168,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

data = transformed_table._data.copy()
data = transformed_table._data.reset_index(drop=True)
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down
17 changes: 7 additions & 10 deletions src/safeds/data/tabular/transformation/_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,14 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
> 0
):
warnings.warn(
(
"The columns"
f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical"
" values"
),
"The columns"
f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values",
UserWarning,
stacklevel=2,
)

data = table._data.copy()
data = table._data.reset_index(drop=True)
data.columns = table.column_names

result = OneHotEncoder()
Expand Down Expand Up @@ -223,7 +220,7 @@ def transform(self, table: Table) -> Table:
# New columns may not be sorted:
column_names = []
for name in table.column_names:
if name not in self._column_names.keys():
if name not in self._column_names:
column_names.append(name)
else:
column_names.extend(
Expand Down Expand Up @@ -322,11 +319,11 @@ def inverse_transform(self, transformed_table: Table) -> Table:
name
if name not in [value for value_list in list(self._column_names.values()) for value in value_list]
else list(self._column_names.keys())[
[
next(
list(self._column_names.values()).index(value)
for value in list(self._column_names.values())
if name in value
][0]
)
]
)
for name in table.column_names
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/transformation/_range_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def transform(self, table: Table) -> Table:
),
)

data = table._data.copy()
data = table._data.reset_index(drop=True)
data.columns = table.column_names
data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down Expand Up @@ -213,7 +213,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

data = transformed_table._data.copy()
data = transformed_table._data.reset_index(drop=True)
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/transformation/_standard_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def transform(self, table: Table) -> Table:
),
)

data = table._data.copy()
data = table._data.reset_index(drop=True)
data.columns = table.column_names
data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down Expand Up @@ -195,7 +195,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

data = transformed_table._data.copy()
data = transformed_table._data.reset_index(drop=True)
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
return Table._from_pandas_dataframe(data)
Expand Down
30 changes: 11 additions & 19 deletions src/safeds/ml/classical/_util_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,9 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
if len(non_numerical_column_names) != 0:
raise NonNumericColumnError(
str(non_numerical_column_names),
(
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder."
),
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder.",
)

null_containing_column_names = set(tagged_table.features.column_names) - set(
Expand All @@ -65,10 +63,8 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
if len(null_containing_column_names) != 0:
raise MissingValuesColumnError(
str(null_containing_column_names),
(
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
),
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.",
)

try:
Expand Down Expand Up @@ -138,11 +134,9 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
if len(non_numerical_column_names) != 0:
raise NonNumericColumnError(
str(non_numerical_column_names),
(
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder.\n"
),
"You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical"
" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many"
" different values\nor is ordinal, you should use the LabelEncoder.\n",
)

null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set(
Expand All @@ -151,16 +145,14 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_
if len(null_containing_column_names) != 0:
raise MissingValuesColumnError(
str(null_containing_column_names),
(
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`."
),
"You can use the Imputer to replace the missing values based on different strategies.\nIf you want to"
" remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.",
)

dataset_df = dataset.keep_only_columns(feature_names)._data
dataset_df.columns = feature_names

result_set = dataset._data.copy(deep=True)
result_set = dataset._data.reset_index(drop=True)
result_set.columns = dataset.column_names

try:
Expand Down

This file was deleted.

0 comments on commit 6443beb

Please sign in to comment.