From 8d534eda32572a57f60b8946f203d4247b941727 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 17:59:35 +0200 Subject: [PATCH 1/5] test: improve outlier test --- .../_table/test_drop_rows_with_outliers.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py index 25b12a458..355958f65 100644 --- a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py +++ b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py @@ -22,7 +22,7 @@ def test_drop_rows_with_outliers_no_outliers() -> None: def test_drop_rows_with_outliers_with_outliers() -> None: - table = Table( + input_ = Table( pd.DataFrame( data={ "col1": [ @@ -41,12 +41,24 @@ def test_drop_rows_with_outliers_with_outliers() -> None: ], "col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "col3": [2, 3, 1, 1_000_000_000, 1, 1, 1, 1, 1, 1, 1, 1], + "col4": ["s", 3, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1], } ) ) - result = table.drop_rows_with_outliers() - assert result.count_rows() == 11 - assert result.count_columns() == 3 + result = input_.drop_rows_with_outliers() + + expected = Table( + pd.DataFrame( + data={ + "col1": ["A", "B", "C", "a", "a", "a", "a", "a", "a", "a", "a"], + "col2": [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "col3": [2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "col4": ["s", 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], + } + ) + ) + + assert result == expected def test_drop_rows_with_outliers_no_rows() -> None: From c20aa656bed78b8dfa431bf54384535cec3c1a6c Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 18:00:05 +0200 Subject: [PATCH 2/5] refactor: remove `list_columns_with_numerical_values` --- src/safeds/data/tabular/containers/_table.py | 25 ++++------------ ...test_list_columns_with_numerical_values.py | 30 ------------------- 2 files changed, 6 insertions(+), 49 deletions(-) delete mode 100644 tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index fd62f988f..2d222e62f 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -596,7 +596,11 @@ def drop_columns_with_non_numerical_values(self) -> Table: A table without the columns that contain non-numerical values. """ - return Table.from_columns(self._list_columns_with_numerical_values()) + return Table.from_columns([ + column + for column in self.to_columns() + if column.type.is_numeric() + ]) def drop_duplicate_rows(self) -> Table: """ @@ -636,9 +640,7 @@ def drop_rows_with_outliers(self) -> Table: """ result = self._data.copy(deep=True) - table_without_nonnumericals = Table.from_columns( - self._list_columns_with_numerical_values() - ) + table_without_nonnumericals = self.drop_columns_with_non_numerical_values() result = result[ (np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all( @@ -1098,18 +1100,3 @@ def _ipython_display_(self) -> DisplayHandle: "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] ): return display(tmp) - - def _list_columns_with_numerical_values(self) -> list[Column]: - """ - Return a list of columns only containing numerical values. - - Returns - ------- - cols : list[Column] - The list with only numerical columns. - """ - cols = [] - for column_name, data_type in self._schema._schema.items(): - if data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols diff --git a/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py b/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py deleted file mode 100644 index bc5cba885..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import pandas as pd -from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import ColumnType, TableSchema - - -def test_list_columns_with_numerical_values_valid() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - "col4": [2, 3, 1, 4], - } - ) - ) - columns = table._list_columns_with_numerical_values() - assert columns[0] == table.get_column("col3") - assert columns[1] == table.get_column("col4") - assert len(columns) == 2 - - -def test_list_columns_with_numerical_values_invalid() -> None: - table = Table( - [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) - ) - columns = table._list_columns_with_numerical_values() - assert columns[0] == table.get_column("col1") - assert len(columns) == 1 From 99b3ffa51bd385b0c74366dd60991bd69835d37c Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 18:18:32 +0200 Subject: [PATCH 3/5] test: failing test (missing value) --- .../containers/_table/test_drop_rows_with_outliers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py index 355958f65..cfe37163c 100644 --- a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py +++ b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py @@ -39,9 +39,8 @@ def test_drop_rows_with_outliers_with_outliers() -> None: "a", "a", ], - "col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None], "col3": [2, 3, 1, 1_000_000_000, 1, 1, 1, 1, 1, 1, 1, 1], - "col4": ["s", 3, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1], } ) ) @@ -51,9 +50,8 @@ def test_drop_rows_with_outliers_with_outliers() -> None: pd.DataFrame( data={ "col1": ["A", "B", "C", "a", "a", "a", "a", "a", "a", "a", "a"], - "col2": [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "col2": [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None], "col3": [2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "col4": ["s", 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], } ) ) From bdf7dc7112d93b73b8c4ffd46b5bb023da2da808 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Mon, 27 Mar 2023 18:34:40 +0200 Subject: [PATCH 4/5] fix: handling of missing values when dropping rows with outliers --- src/safeds/data/tabular/containers/_table.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 2d222e62f..d4c79ea52 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -630,25 +630,24 @@ def drop_rows_with_missing_values(self) -> Table: def drop_rows_with_outliers(self) -> Table: """ - Remove all rows from the table that contain at least one outlier defined as having a value that has a distance - of more than 3 standard deviations from the column average. + Remove all rows from the table that contain at least one outlier. + + We define an outlier as a value that has a distance of more than 3 standard deviations from the column mean. + Missing values are not considered outliers. They are also ignored during the calculation of the standard + deviation. Returns ------- new_table : Table A new table without rows containing outliers. """ - result = self._data.copy(deep=True) + copy = self._data.copy(deep=True) table_without_nonnumericals = self.drop_columns_with_non_numerical_values() + z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit")) + filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1) - result = result[ - (np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all( - axis=1 - ) - ] - - return Table(result, self._schema) + return Table(copy[filter_], self._schema) def filter_rows(self, query: Callable[[Row], bool]) -> Table: """ From 307e25ca6a519e7951ad0b500c2fdd27cfbcaa00 Mon Sep 17 00:00:00 2001 From: lars-reimann Date: Mon, 27 Mar 2023 16:41:00 +0000 Subject: [PATCH 5/5] style: apply automated linter fixes --- src/safeds/data/tabular/containers/_table.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index d4c79ea52..fbfeeb962 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -596,11 +596,9 @@ def drop_columns_with_non_numerical_values(self) -> Table: A table without the columns that contain non-numerical values. """ - return Table.from_columns([ - column - for column in self.to_columns() - if column.type.is_numeric() - ]) + return Table.from_columns( + [column for column in self.to_columns() if column.type.is_numeric()] + ) def drop_duplicate_rows(self) -> Table: """ @@ -644,7 +642,9 @@ def drop_rows_with_outliers(self) -> Table: copy = self._data.copy(deep=True) table_without_nonnumericals = self.drop_columns_with_non_numerical_values() - z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit")) + z_scores = np.absolute( + stats.zscore(table_without_nonnumericals._data, nan_policy="omit") + ) filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1) return Table(copy[filter_], self._schema)