feat: enable copy-on-write for pandas dataframes (#494)
Closes #428

### Summary of Changes

* Enable copy-on-write for pandas dataframes (see the sketch below). Copy-on-write becomes the default in pandas 3.0, so this opts in early.
* Remove unnecessary cloning.
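
A minimal sketch of the semantics this opts into (the option line mirrors the diff below; the toy dataframe and variable names are illustrative only, not part of the change):

```python
import pandas as pd

# Opt in to copy-on-write; from pandas 3.0 onward this is the default.
pd.options.mode.copy_on_write = True

df = pd.DataFrame({"a": [1, 2, 3]})
subset = df[["a"]]       # shares the underlying data lazily, no eager copy
subset.loc[0, "a"] = 99  # this write triggers the copy of the shared block

print(df.loc[0, "a"])    # prints 1; the original dataframe is unchanged
```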

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
lars-reimann and megalinter-bot committed Nov 16, 2023
1 parent 5bf0e75 commit 9a19389
Showing 7 changed files with 36 additions and 120 deletions.
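
The recurring change in the file diffs below is that defensive deep copies (`DataFrame.copy(deep=True)`, `copy.deepcopy`, the private `_copy` helpers) are dropped or replaced by a cheap `reset_index(drop=True)`. A hedged sketch of why that is safe once copy-on-write is on (the frame and variable names here are illustrative, not taken from the repository):

```python
import pandas as pd

pd.options.mode.copy_on_write = True

original = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Old pattern: eagerly deep-copy before mutating the result.
eager = original.copy(deep=True)
eager.loc[0, "a"] = 99

# New pattern: take a cheap derived frame; copy-on-write defers the real
# copy until (and unless) the derived frame is written to.
lazy = original.reset_index(drop=True)
lazy.loc[0, "a"] = 99

print(original.loc[0, "a"])  # prints 1; the parent is untouched either way
```

Both patterns protect the parent frame; the new one just avoids paying for a copy when the derived frame is never mutated.
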
20 changes: 3 additions & 17 deletions src/safeds/data/tabular/containers/_column.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import io
from collections.abc import Sequence
from numbers import Number
@@ -27,6 +26,9 @@
T = TypeVar("T")
R = TypeVar("R")

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


class Column(Sequence[T]):
"""
@@ -1031,19 +1033,3 @@ def _count_missing_values(self) -> int:
2
"""
return self._data.isna().sum()

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Column:
"""
Return a copy of this column.
Returns
-------
column : Column
The copy of this column.
"""
return copy.deepcopy(self)
21 changes: 4 additions & 17 deletions src/safeds/data/tabular/containers/_row.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import functools
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any
@@ -13,6 +12,9 @@
if TYPE_CHECKING:
from collections.abc import Iterator

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


class Row(Mapping[str, Any]):
"""
@@ -152,7 +154,7 @@ def __contains__(self, obj: Any) -> bool:
"""
return isinstance(obj, str) and self.has_column(obj)

def __eq__(self, other: Any) -> bool:
def __eq__(self, other: object) -> bool:
"""
Check whether this row is equal to another object.
@@ -528,18 +530,3 @@ def _repr_html_(self) -> str:
The generated HTML.
"""
return self._data.to_html(max_rows=1, max_cols=self._data.shape[1], notebook=True)

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Row:
"""
Return a copy of this row.
Returns
-------
copy : Row
The copy of this row.
"""
return copy.deepcopy(self)
71 changes: 26 additions & 45 deletions src/safeds/data/tabular/containers/_table.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import functools
import io
import warnings
@@ -39,6 +38,9 @@

from ._tagged_table import TaggedTable

# Enable copy-on-write for pandas dataframes
pd.options.mode.copy_on_write = True


# noinspection PyProtectedMember
class Table:
@@ -420,7 +422,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:
self._data = self._data.reset_index(drop=True)
self._schema: Schema = Schema._from_pandas_dataframe(self._data)

def __eq__(self, other: Any) -> bool:
def __eq__(self, other: object) -> bool:
"""
Compare two table instances.
@@ -469,12 +471,12 @@ def __repr__(self) -> str:
>>> repr(table)
' a b\n0 1 2\n1 3 4'
"""
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__repr__()

def __str__(self) -> str:
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__str__()

@@ -831,13 +833,13 @@ def summarize_statistics(self) -> Table:
# Transformations
# ------------------------------------------------------------------------------------------------------------------

# This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
# additional constraints that might have to hold in the subclass.
# Override accordingly in subclasses.
def _as_table(self: Table) -> Table:
"""
Transform the table to an instance of the Table class.
This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
additional constraints that might have to hold in the subclass. Override accordingly in subclasses.
Returns
-------
table: Table
@@ -879,7 +881,7 @@ def add_column(self, column: Column) -> Table:
if column.number_of_rows != self.number_of_rows and self.number_of_columns != 0:
raise ColumnSizeError(str(self.number_of_rows), str(column._data.size))

result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
result[column.name] = column._data
return Table._from_pandas_dataframe(result)
@@ -920,7 +922,7 @@ def add_columns(self, columns: list[Column] | Table) -> Table:
"""
if isinstance(columns, Table):
columns = columns.to_columns()
result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
for column in columns:
if column.name in result.columns:
@@ -969,7 +971,7 @@ def add_row(self, row: Row) -> Table:
1 3 4
"""
int_columns = []
result = self._copy()

if self.number_of_columns == 0:
return Table.from_rows([row])
if len(set(self.column_names) - set(row.column_names)) > 0:
@@ -980,12 +982,12 @@ def add_row(self, row: Row) -> Table:
),
)

if result.number_of_rows == 0:
if self.number_of_rows == 0:
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names))

new_df = pd.concat([result._data, row._data]).infer_objects()
new_df.columns = result.column_names
schema = Schema.merge_multiple_schemas([result.schema, row.schema])
new_df = pd.concat([self._data, row._data]).infer_objects()
new_df.columns = self.column_names
schema = Schema.merge_multiple_schemas([self.schema, row.schema])
result = Table._from_pandas_dataframe(new_df, schema)

for column in int_columns:
Expand Down Expand Up @@ -1033,7 +1035,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
rows = rows.to_rows()

if len(rows) == 0:
return self._copy()
return self

different_column_names = set()
for row in rows:
@@ -1046,8 +1048,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
),
)

result = self._copy()

result = self
for row in rows:
result = result.add_row(row)

@@ -1153,9 +1154,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
if len(invalid_columns) != 0:
raise UnknownColumnNameError(invalid_columns, similar_columns)

clone = self._copy()
clone = clone.remove_columns(list(set(self.column_names) - set(column_names)))
return clone
return self.remove_columns(list(set(self.column_names) - set(column_names)))

def remove_columns(self, column_names: list[str]) -> Table:
"""
@@ -1309,8 +1308,7 @@ def remove_rows_with_missing_values(self) -> Table:
a b
0 1.0 2.0
"""
result = self._data.copy(deep=True)
result = result.dropna(axis="index")
result = self._data.dropna(axis="index")
return Table._from_pandas_dataframe(result)

def remove_rows_with_outliers(self) -> Table:
@@ -1350,13 +1348,11 @@ def remove_rows_with_outliers(self) -> Table:
9 0.0 0.00 0.0 -1000000
10 0.0 0.00 0.0 -1000000
"""
copy = self._data.copy(deep=True)

table_without_nonnumericals = self.remove_columns_with_non_numerical_values()
z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

return Table._from_pandas_dataframe(copy[filter_], self._schema)
return Table._from_pandas_dataframe(self._data[filter_], self._schema)

def rename_column(self, old_name: str, new_name: str) -> Table:
"""
@@ -1399,7 +1395,7 @@ def rename_column(self, old_name: str, new_name: str) -> Table:
if new_name in self._schema.column_names:
raise DuplicateColumnNameError(new_name)

new_df = self._data.copy()
new_df = self._data.reset_index(drop=True)
new_df.columns = self._schema.column_names
return Table._from_pandas_dataframe(new_df.rename(columns={old_name: new_name}))

@@ -2128,7 +2124,7 @@ def to_csv_file(self, path: str | Path) -> None:
if path.suffix != ".csv":
raise WrongFileExtensionError(path, ".csv")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_csv = self._data.copy()
data_to_csv = self._data.reset_index(drop=True)
data_to_csv.columns = self._schema.column_names
data_to_csv.to_csv(path, index=False)

@@ -2166,7 +2162,7 @@ def to_excel_file(self, path: str | Path) -> None:
tmp_table_file.save(path)

path.parent.mkdir(parents=True, exist_ok=True)
data_to_excel = self._data.copy()
data_to_excel = self._data.reset_index(drop=True)
data_to_excel.columns = self._schema.column_names
data_to_excel.to_excel(path)

@@ -2197,7 +2193,7 @@ def to_json_file(self, path: str | Path) -> None:
if path.suffix != ".json":
raise WrongFileExtensionError(path, ".json")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_json = self._data.copy()
data_to_json = self._data.reset_index(drop=True)
data_to_json.columns = self._schema.column_names
data_to_json.to_json(path)

@@ -2333,21 +2329,6 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): #
if not allow_copy:
raise NotImplementedError("For the moment we need to copy the data, so `allow_copy` must be True.")

data_copy = self._data.copy()
data_copy = self._data.reset_index(drop=True)
data_copy.columns = self.column_names
return data_copy.__dataframe__(nan_as_null, allow_copy)

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Table:
"""
Return a copy of this table.
Returns
-------
table : Table
The copy of this table.
"""
return copy.deepcopy(self)
13 changes: 0 additions & 13 deletions tests/safeds/data/tabular/containers/_column/test_copy.py

This file was deleted.

6 changes: 3 additions & 3 deletions tests/safeds/data/tabular/containers/_table/test_add_row.py
@@ -27,13 +27,13 @@
Schema({"col1": Anything(), "col2": Integer(is_nullable=True)}),
),
(
Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
Table({"col1": [], "col2": []}),
Row({"col1": 5, "col2": 6}),
Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
),
(
Table({"col1": [], "col2": []}),
Table(),
Row({"col1": 5, "col2": 6}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
13 changes: 0 additions & 13 deletions tests/safeds/data/tabular/containers/_table/test_copy.py

This file was deleted.

12 changes: 0 additions & 12 deletions tests/safeds/data/tabular/containers/test_row.py
@@ -514,18 +514,6 @@ def test_should_contain_td_element_for_each_value(self, row: Row) -> None:
assert f"<td>{value}</td>" in row._repr_html_()


class TestCopy:
@pytest.mark.parametrize(
"row",
[Row(), Row({"a": 3, "b": 4})],
ids=["empty", "normal"],
)
def test_should_copy_table(self, row: Row) -> None:
copied = row._copy()
assert copied == row
assert copied is not row


class TestSortColumns:
@pytest.mark.parametrize(
("row", "comparator", "expected"),
