feat: remove OrdinalEncoder (#107)

### Summary of Changes

The `OrdinalEncoder` was a bit of an outlier compared to the other `Transformer` classes:

* It could only be applied to a single column instead of a list of columns. Because of this, it was not possible to implement #61.
* Nothing was "learned" since the user had to specify the value order explicitly. The `fit` step was completely unnecessary.

Therefore, I've removed the class `OrdinalEncoder`. Instead, the `transform_column` method on a `Table` can be used. If we eventually find this to be too cumbersome, we can implement a new method `transform_column_into_ordered_labels` on `Table`.

---------

Co-authored-by: lars-reimann <lars-reimann@users.noreply.github.com>
Safe-DS · Mar 28, 2023 · b92bba5 · b92bba5
1 parent fe68426
commit b92bba5
Show file tree

Hide file tree

Showing 7 changed files with 19 additions and 308 deletions.
diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -109,9 +109,7 @@ def from_json_file(path: str) -> Table:
         except FileNotFoundError as exception:
             raise FileNotFoundError(f'File "{path}" does not exist') from exception
         except Exception as exception:
-            raise ValueError(
-                f'Could not read file from "{path}" as JSON'
-            ) from exception
+            raise ValueError(f'Could not read file from "{path}" as JSON') from exception
 
     @staticmethod
     def from_columns(columns: list[Column]) -> Table:
@@ -143,9 +141,7 @@ def from_columns(columns: list[Column]) -> Table:
         for column in columns:
             if column._data.size != columns[0]._data.size:
                 raise ColumnLengthMismatchError(
-                    "\n".join(
-                        [f"{column.name}: {column._data.size}" for column in columns]
-                    )
+                    "\n".join([f"{column.name}: {column._data.size}" for column in columns])
                 )
             dataframe[column.name] = column._data
 
@@ -193,9 +189,7 @@ def from_rows(rows: list[Row]) -> Table:
     # ------------------------------------------------------------------------------------------------------------------
 
     def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
-        self._data: pd.Dataframe = (
-            data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
-        )
+        self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
         if schema is None:
             if self.count_columns() == 0:
                 raise MissingSchemaError()
@@ -272,9 +266,7 @@ def get_column(self, column_name: str) -> Column:
         if self._schema.has_column(column_name):
             output_column = Column(
                 column_name,
-                self._data.iloc[
-                    :, [self._schema._get_column_index_by_name(column_name)]
-                ].squeeze(),
+                self._data.iloc[:, [self._schema._get_column_index_by_name(column_name)]].squeeze(),
                 self._schema.get_type_of_column(column_name),
             )
             return output_column
@@ -533,9 +525,7 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table:
         for row in rows:
             if self._schema != row.schema:
                 raise SchemaMismatchError()
-        result = pd.concat(
-            [result, *[row._data.to_frame().T for row in rows]]
-        ).infer_objects()
+        result = pd.concat([result, *[row._data.to_frame().T for row in rows]]).infer_objects()
         result.columns = self._schema.get_column_names()
         return Table(result)
 
@@ -568,9 +558,7 @@ def drop_columns(self, column_names: list[str]) -> Table:
         if len(invalid_columns) != 0:
             raise UnknownColumnNameError(invalid_columns)
         transformed_data = self._data.drop(labels=column_indices, axis="columns")
-        transformed_data.columns = list(
-            name for name in self._schema.get_column_names() if name not in column_names
-        )
+        transformed_data.columns = list(name for name in self._schema.get_column_names() if name not in column_names)
         return Table(transformed_data)
 
     def drop_columns_with_missing_values(self) -> Table:
@@ -582,9 +570,7 @@ def drop_columns_with_missing_values(self) -> Table:
         table : Table
             A table without the columns that contain missing values.
         """
-        return Table.from_columns(
-            [column for column in self.to_columns() if not column.has_missing_values()]
-        )
+        return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()])
 
     def drop_columns_with_non_numerical_values(self) -> Table:
         """
@@ -596,9 +582,7 @@ def drop_columns_with_non_numerical_values(self) -> Table:
             A table without the columns that contain non-numerical values.
 
         """
-        return Table.from_columns(
-            [column for column in self.to_columns() if column.type.is_numeric()]
-        )
+        return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()])
 
     def drop_duplicate_rows(self) -> Table:
         """
@@ -642,9 +626,7 @@ def drop_rows_with_outliers(self) -> Table:
         copy = self._data.copy(deep=True)
 
         table_without_nonnumericals = self.drop_columns_with_non_numerical_values()
-        z_scores = np.absolute(
-            stats.zscore(table_without_nonnumericals._data, nan_policy="omit")
-        )
+        z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
         filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)
 
         return Table(copy[filter_], self._schema)
@@ -699,9 +681,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
         if len(invalid_columns) != 0:
             raise UnknownColumnNameError(invalid_columns)
         transformed_data = self._data[column_indices]
-        transformed_data.columns = list(
-            name for name in self._schema.get_column_names() if name in column_names
-        )
+        transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names)
         return Table(transformed_data)
 
     def rename_column(self, old_name: str, new_name: str) -> Table:
@@ -769,10 +749,7 @@ def replace_column(self, old_column_name: str, new_column: Column) -> Table:
         if old_column_name not in self._schema.get_column_names():
             raise UnknownColumnNameError([old_column_name])
 
-        if (
-            new_column.name in self._schema.get_column_names()
-            and new_column.name != old_column_name
-        ):
+        if new_column.name in self._schema.get_column_names() and new_column.name != old_column_name:
             raise DuplicateColumnNameError(new_column.name)
 
         if self.count_rows() != new_column._data.size:
@@ -838,13 +815,7 @@ def slice(
         if end is None:
             end = self.count_rows()
 
-        if (
-            start < 0
-            or end < 0
-            or start >= self.count_rows()
-            or end > self.count_rows()
-            or end < start
-        ):
+        if start < 0 or end < 0 or start >= self.count_rows() or end > self.count_rows() or end < start:
             raise ValueError("the given index is out of bounds")
 
         new_df = self._data.iloc[start:end:step]
@@ -853,9 +824,7 @@ def slice(
 
     def sort_columns(
         self,
-        comparator: Callable[[Column, Column], int] = lambda col1, col2: (
-            col1.name > col2.name
-        )
+        comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name)
         - (col1.name < col2.name),
     ) -> Table:
         """
@@ -891,9 +860,9 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
 
         The comparator is a function that takes two rows `row1` and `row2` and returns an integer:
 
-        * If `col1` should be ordered before `col2`, the function should return a negative number.
-        * If `col1` should be ordered after `col2`, the function should return a positive number.
-        * If the original order of `col1` and `col2` should be kept, the function should return 0.
+        * If `row1` should be ordered before `row2`, the function should return a negative number.
+        * If `row1` should be ordered after `row2`, the function should return a positive number.
+        * If the original order of `row1` and `row2` should be kept, the function should return 0.
 
         Parameters
         ----------
@@ -933,9 +902,7 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
             self.slice(round(percentage_in_first * self.count_rows())),
         )
 
-    def transform_column(
-        self, name: str, transformer: Callable[[Row], typing.Any]
-    ) -> Table:
+    def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table:
         """
         Transform provided column by calling provided transformer.
 
@@ -1103,9 +1070,7 @@ def to_rows(self) -> list[Row]:
         rows : list[Row]
             List of rows.
         """
-        return [
-            Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()
-        ]
+        return [Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()]
 
     # ------------------------------------------------------------------------------------------------------------------
     # Other
@@ -1123,7 +1088,5 @@ def _ipython_display_(self) -> DisplayHandle:
         tmp = self._data.copy(deep=True)
         tmp.columns = self.get_column_names()
 
-        with pd.option_context(
-            "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
-        ):
+        with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]):
             return display(tmp)
diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py
@@ -1,4 +1,3 @@
 from ._imputer import Imputer
 from ._label_encoder import LabelEncoder
 from ._one_hot_encoder import OneHotEncoder
-from ._ordinal_encoder import OrdinalEncoder
diff --git a/src/safeds/data/tabular/transformation/_ordinal_encoder.py b/src/safeds/data/tabular/transformation/_ordinal_encoder.py
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/__init__.py
diff --git a/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py b/tests/safeds/data/tabular/transformation/_ordinal_encoder/test_fit_transform.py