feat: major API redesign (WIP) (#752)
Closes #694
Closes #699
Closes #714
Closes #748 

### Summary of Changes

* Replace the old implementation of tabular containers
* New, more efficient implementation of metrics
* Standalone package for metrics (a minimal usage sketch follows this list)
* New regression metrics
* Abstract base class for classifiers & regressors
* Introspection methods to get information about the features and target of supervised models
* Rename `LogisticRegressionClassifier` to `LogisticClassifier` (shorter, and it no longer shows up when searching for regression)
* Rename `LinearRegressionRegressor` to `LinearRegressor` (shorter)
* Rename `SupportVectorMachineClassifier` to `SupportVectorClassifier` (slightly less precise, but still unambiguous and shorter)
* Rename `SupportVectorMachineRegressor` to `SupportVectorRegressor` (likewise)
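
As a concrete illustration of the standalone metrics package together with the new `Table` container, here is a minimal sketch. It mirrors the benchmark added in this commit (`benchmarks/metrics/classification.py` below): the imports, the `Table(...)` constructor, `get_column`, and the `ClassificationMetrics` methods all appear in that file, while the toy column values and the reading of the trailing `1` as the positive class are assumptions made for illustration.

```python
from safeds.data.tabular.containers import Table
from safeds.ml.metrics import ClassificationMetrics

# Toy predictions for a binary classification task (values made up for illustration).
table = Table(
    {
        "predicted": [1, 0, 1, 1, 0],
        "expected": [1, 0, 0, 1, 1],
    }
)

predicted = table.get_column("predicted")
expected = table.get_column("expected")

# The metrics are static methods operating on two columns; the trailing `1`
# is assumed to select the positive class, matching the benchmark below.
print(ClassificationMetrics.accuracy(predicted, expected))
print(ClassificationMetrics.f1_score(predicted, expected, 1))
print(ClassificationMetrics.precision(predicted, expected, 1))
print(ClassificationMetrics.recall(predicted, expected, 1))
```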
lars-reimann committed May 11, 2024
1 parent 0e5a54b commit 8e781f9
Showing 163 changed files with 7,217 additions and 15,007 deletions.
Empty file added benchmarks/metrics/__init__.py
71 changes: 71 additions & 0 deletions benchmarks/metrics/classification.py
@@ -0,0 +1,71 @@
from __future__ import annotations

from timeit import timeit
from typing import TYPE_CHECKING

import polars as pl

from benchmarks.table.utils import create_synthetic_table
from safeds.data.tabular.containers import Table
from safeds.ml.metrics import ClassificationMetrics


REPETITIONS = 10


def _run_accuracy() -> None:
    ClassificationMetrics.accuracy(table.get_column("predicted"), table.get_column("expected"))


def _run_f1_score() -> None:
    ClassificationMetrics.f1_score(table.get_column("predicted"), table.get_column("expected"), 1)


def _run_precision() -> None:
    ClassificationMetrics.precision(table.get_column("predicted"), table.get_column("expected"), 1)


def _run_recall() -> None:
    ClassificationMetrics.recall(table.get_column("predicted"), table.get_column("expected"), 1)


if __name__ == "__main__":
    # Create a synthetic Table
    table = (
        create_synthetic_table(10000, 2)
        .rename_column("column_0", "predicted")
        .rename_column("column_1", "expected")
    )

    # Run the benchmarks
    timings: dict[str, float] = {
        "accuracy": timeit(
            _run_accuracy,
            number=REPETITIONS,
        ),
        "f1_score": timeit(
            _run_f1_score,
            number=REPETITIONS,
        ),
        "precision": timeit(
            _run_precision,
            number=REPETITIONS,
        ),
        "recall": timeit(
            _run_recall,
            number=REPETITIONS,
        ),
    }

    # Print the timings
    with pl.Config(
        tbl_rows=-1,
    ):
        print(
            Table(
                {
                    "method": list(timings.keys()),
                    "timing": list(timings.values()),
                }
            )
        )
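
A note on reading these numbers: `timeit.timeit` returns the total wall-clock time for all `number` executions, so each value printed above is the sum over `REPETITIONS = 10` runs, not a per-call average. A purely illustrative way to derive a per-call figure (not part of this commit):

```python
from timeit import timeit

REPETITIONS = 10

# Any callable works here; sum(range(...)) is just a stand-in workload.
total_seconds = timeit(lambda: sum(range(1_000_000)), number=REPETITIONS)
per_call_seconds = total_seconds / REPETITIONS

print(f"total: {total_seconds:.4f} s, per call: {per_call_seconds:.4f} s")
```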
@@ -1,8 +1,8 @@
 from timeit import timeit

-from safeds.data.tabular.containers import ExperimentalTable
+from safeds.data.tabular.containers import Table

-from benchmarks.table.utils import create_synthetic_table_polars
+from benchmarks.table.utils import create_synthetic_table

 REPETITIONS = 10

@@ -21,7 +21,7 @@ def _run_summarize_statistics() -> None:

 if __name__ == "__main__":
     # Create a synthetic Table
-    table = create_synthetic_table_polars(100, 5000)
+    table = create_synthetic_table(100, 5000)

     # Run the benchmarks
     timings: dict[str, float] = {
@@ -41,7 +41,7 @@ def _run_summarize_statistics() -> None:

     # Print the timings
     print(
-        ExperimentalTable(
+        Table(
             {
                 "method": list(timings.keys()),
                 "timing": list(timings.values()),
69 changes: 38 additions & 31 deletions benchmarks/table/row_operations.py
@@ -1,54 +1,58 @@
 from timeit import timeit

+import polars as pl
+
 from safeds.data.tabular.containers import Table

 from benchmarks.table.utils import create_synthetic_table

 REPETITIONS = 10


-def _run_group_rows() -> None:
-    table.group_rows(lambda row: row.get_value("column_0") % 2 == 0)
-
-
 def _run_remove_duplicate_rows() -> None:
-    table.remove_duplicate_rows()
+    table.remove_duplicate_rows()._lazy_frame.collect()


 def _run_remove_rows_with_missing_values() -> None:
-    table.remove_rows_with_missing_values()
+    table.remove_rows_with_missing_values()._lazy_frame.collect()


 def _run_remove_rows_with_outliers() -> None:
     table.remove_rows_with_outliers()


 def _run_remove_rows() -> None:
-    table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)
+    table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect()


+def _run_remove_rows_by_column() -> None:
+    table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect()
+
+
 def _run_shuffle_rows() -> None:
-    table.shuffle_rows()
+    table.shuffle_rows()._lazy_frame.collect()


 def _run_slice_rows() -> None:
-    table.slice_rows(end=table.number_of_rows // 2)
+    table.slice_rows(length=table.number_of_rows // 2)._lazy_frame.collect()


 def _run_sort_rows() -> None:
-    table.sort_rows(lambda row1, row2: row1.get_value("column_0") - row2.get_value("column_0"))
+    table.sort_rows(lambda row: row.get_value("column_0"))._lazy_frame.collect()


-def _run_split_rows() -> None:
-    table.split_rows(0.5)
+def _run_sort_rows_by_column() -> None:
+    table.sort_rows_by_column("column_0")._lazy_frame.collect()


-def _run_to_rows() -> None:
-    table.to_rows()
+def _run_split_rows() -> None:
+    table_1, table_2 = table.split_rows(0.5)
+    table_1._lazy_frame.collect()
+    table_2._lazy_frame.collect()


 def _run_transform_column() -> None:
-    table.transform_column("column_0", lambda row: row.get_value("column_0") * 2)
+    table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect()


if __name__ == "__main__":
@@ -57,10 +61,6 @@ def _run_transform_column() -> None:

     # Run the benchmarks
     timings: dict[str, float] = {
-        "group_rows": timeit(
-            _run_group_rows,
-            number=REPETITIONS,
-        ),
         "remove_duplicate_rows": timeit(
             _run_remove_duplicate_rows,
             number=REPETITIONS,
@@ -77,6 +77,10 @@ def _run_transform_column() -> None:
             _run_remove_rows,
             number=REPETITIONS,
         ),
+        "remove_rows_by_column": timeit(
+            _run_remove_rows_by_column,
+            number=REPETITIONS,
+        ),
         "shuffle_rows": timeit(
             _run_shuffle_rows,
             number=REPETITIONS,
@@ -89,26 +93,29 @@ def _run_transform_column() -> None:
             _run_sort_rows,
             number=REPETITIONS,
         ),
-        "split_rows": timeit(
-            _run_split_rows,
+        "sort_rows_by_column": timeit(
+            _run_sort_rows_by_column,
             number=REPETITIONS,
         ),
-        "to_rows": timeit(
-            _run_to_rows,
+        "split_rows": timeit(
+            _run_split_rows,
             number=REPETITIONS,
         ),
-        "transform_colum": timeit(
+        "transform_column": timeit(
             _run_transform_column,
             number=REPETITIONS,
         ),
     }

     # Print the timings
-    print(
-        Table(
-            {  # noqa: T201
-                "method": list(timings.keys()),
-                "timing": list(timings.values()),
-            }
+    with pl.Config(
+        tbl_rows=-1,
+    ):
+        print(
+            Table(
+                {
+                    "method": list(timings.keys()),
+                    "timing": list(timings.values()),
+                }
+            )
         )
-    )
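
The recurring pattern in this file, appending `._lazy_frame.collect()` to every operation, suggests that the new `Table` defers work to a polars `LazyFrame`, so a call like `table.shuffle_rows()` alone would mostly measure query construction rather than the shuffle itself. The internals of `Table` are an assumption here, inferred only from the private `_lazy_frame` attribute used above; the lazy-versus-collected distinction itself can be shown with plain polars:

```python
from timeit import timeit

import polars as pl

# A synthetic lazy frame standing in for the table used in the benchmarks.
lf = pl.LazyFrame({"column_0": range(1_000_000)})

# Building the lazy query is cheap; .collect() is what does the actual filtering.
build_only = timeit(lambda: lf.filter(pl.col("column_0") % 2 == 0), number=10)
collected = timeit(lambda: lf.filter(pl.col("column_0") % 2 == 0).collect(), number=10)

print(f"query building only: {build_only:.4f} s")
print(f"with .collect():     {collected:.4f} s")
```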
121 changes: 0 additions & 121 deletions benchmarks/table/row_operations_polars.py

This file was deleted.

2 changes: 0 additions & 2 deletions benchmarks/table/utils/__init__.py
@@ -1,7 +1,5 @@
 from .create_synthetic_table import create_synthetic_table
-from .create_synthetic_table_polars import create_synthetic_table_polars

 __all__ = [
     "create_synthetic_table",
-    "create_synthetic_table_polars",
 ]
