feat: major API redesign (WIP) (#752)
Closes #694
Closes #699
Closes #714
Closes #748 

### Summary of Changes

* Replace the old implementation of tabular containers
* New, more efficient implementation of metrics
* Standalone package for metrics (a minimal usage sketch follows this list)
* New regression metrics
* Abstract base class for classifiers & regressors
* Introspection methods to get information about the features and target of supervised models
* Rename `LogisticRegressionClassifier` to `LogisticClassifier` (shorter, and it no longer shows up when searching for regression)
* Rename `LinearRegressionRegressor` to `LinearRegressor` (shorter)
* Rename `SupportVectorMachineClassifier` to `SupportVectorClassifier` (slightly less precise, but still unambiguous and shorter)
* Rename `SupportVectorMachineRegressor` to `SupportVectorRegressor` (likewise)
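
As a concrete illustration of the standalone metrics package together with the new `Table` container, here is a minimal sketch. It mirrors the benchmark added in this commit (`benchmarks/metrics/classification.py` below): the imports, the `Table(...)` constructor, `get_column`, and the `ClassificationMetrics` methods all appear in that file, while the toy column values and the reading of the trailing `1` as the positive class are assumptions made for illustration.

```python
from safeds.data.tabular.containers import Table
from safeds.ml.metrics import ClassificationMetrics

# Toy predictions for a binary classification task (values made up for illustration).
table = Table(
    {
        "predicted": [1, 0, 1, 1, 0],
        "expected": [1, 0, 0, 1, 1],
    }
)

predicted = table.get_column("predicted")
expected = table.get_column("expected")

# The metrics are static methods operating on two columns; the trailing `1`
# is assumed to select the positive class, matching the benchmark below.
print(ClassificationMetrics.accuracy(predicted, expected))
print(ClassificationMetrics.f1_score(predicted, expected, 1))
print(ClassificationMetrics.precision(predicted, expected, 1))
print(ClassificationMetrics.recall(predicted, expected, 1))
```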
lars-reimann committed May 11, 2024
1 parent 0e5a54b commit 8e781f9
Showing 163 changed files with 7,217 additions and 15,007 deletions.
Empty file added benchmarks/metrics/__init__.py
71 changes: 71 additions & 0 deletions benchmarks/metrics/classification.py
@@ -0,0 +1,71 @@
from __future__ import annotations

from timeit import timeit
from typing import TYPE_CHECKING

import polars as pl

from benchmarks.table.utils import create_synthetic_table
from safeds.data.tabular.containers import Table
from safeds.ml.metrics import ClassificationMetrics


REPETITIONS = 10


def _run_accuracy() -> None:
    ClassificationMetrics.accuracy(table.get_column("predicted"), table.get_column("expected"))


def _run_f1_score() -> None:
    ClassificationMetrics.f1_score(table.get_column("predicted"), table.get_column("expected"), 1)


def _run_precision() -> None:
    ClassificationMetrics.precision(table.get_column("predicted"), table.get_column("expected"), 1)


def _run_recall() -> None:
    ClassificationMetrics.recall(table.get_column("predicted"), table.get_column("expected"), 1)


if __name__ == "__main__":
    # Create a synthetic Table
    table = (
        create_synthetic_table(10000, 2)
        .rename_column("column_0", "predicted")
        .rename_column("column_1", "expected")
    )

    # Run the benchmarks
    timings: dict[str, float] = {
        "accuracy": timeit(
            _run_accuracy,
            number=REPETITIONS,
        ),
        "f1_score": timeit(
            _run_f1_score,
            number=REPETITIONS,
        ),
        "precision": timeit(
            _run_precision,
            number=REPETITIONS,
        ),
        "recall": timeit(
            _run_recall,
            number=REPETITIONS,
        ),
    }

    # Print the timings
    with pl.Config(
        tbl_rows=-1,
    ):
        print(
            Table(
                {
                    "method": list(timings.keys()),
                    "timing": list(timings.values()),
                }
            )
        )
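
A note on reading these numbers: `timeit.timeit` returns the total wall-clock time for all `number` executions, so each value printed above is the sum over `REPETITIONS = 10` runs, not a per-call average. A purely illustrative way to derive a per-call figure (not part of this commit):

```python
from timeit import timeit

REPETITIONS = 10

# Any callable works here; sum(range(...)) is just a stand-in workload.
total_seconds = timeit(lambda: sum(range(1_000_000)), number=REPETITIONS)
per_call_seconds = total_seconds / REPETITIONS

print(f"total: {total_seconds:.4f} s, per call: {per_call_seconds:.4f} s")
```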
@@ -1,8 +1,8 @@
 from timeit import timeit

-from safeds.data.tabular.containers import ExperimentalTable
+from safeds.data.tabular.containers import Table

-from benchmarks.table.utils import create_synthetic_table_polars
+from benchmarks.table.utils import create_synthetic_table

 REPETITIONS = 10

@@ -21,7 +21,7 @@ def _run_summarize_statistics() -> None:

 if __name__ == "__main__":
     # Create a synthetic Table
-    table = create_synthetic_table_polars(100, 5000)
+    table = create_synthetic_table(100, 5000)

     # Run the benchmarks
     timings: dict[str, float] = {
@@ -41,7 +41,7 @@ def _run_summarize_statistics() -> None:

     # Print the timings
     print(
-        ExperimentalTable(
+        Table(
             {
                 "method": list(timings.keys()),
                 "timing": list(timings.values()),
69 changes: 38 additions & 31 deletions benchmarks/table/row_operations.py
@@ -1,54 +1,58 @@
 from timeit import timeit

+import polars as pl
+
 from safeds.data.tabular.containers import Table

 from benchmarks.table.utils import create_synthetic_table

 REPETITIONS = 10


-def _run_group_rows() -> None:
-    table.group_rows(lambda row: row.get_value("column_0") % 2 == 0)
-
-
 def _run_remove_duplicate_rows() -> None:
-    table.remove_duplicate_rows()
+    table.remove_duplicate_rows()._lazy_frame.collect()


 def _run_remove_rows_with_missing_values() -> None:
-    table.remove_rows_with_missing_values()
+    table.remove_rows_with_missing_values()._lazy_frame.collect()


 def _run_remove_rows_with_outliers() -> None:
     table.remove_rows_with_outliers()


 def _run_remove_rows() -> None:
-    table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)
+    table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect()


+def _run_remove_rows_by_column() -> None:
+    table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect()
+
+
 def _run_shuffle_rows() -> None:
-    table.shuffle_rows()
+    table.shuffle_rows()._lazy_frame.collect()


 def _run_slice_rows() -> None:
-    table.slice_rows(end=table.number_of_rows // 2)
+    table.slice_rows(length=table.number_of_rows // 2)._lazy_frame.collect()


 def _run_sort_rows() -> None:
-    table.sort_rows(lambda row1, row2: row1.get_value("column_0") - row2.get_value("column_0"))
+    table.sort_rows(lambda row: row.get_value("column_0"))._lazy_frame.collect()


-def _run_split_rows() -> None:
-    table.split_rows(0.5)
+def _run_sort_rows_by_column() -> None:
+    table.sort_rows_by_column("column_0")._lazy_frame.collect()


-def _run_to_rows() -> None:
-    table.to_rows()
+def _run_split_rows() -> None:
+    table_1, table_2 = table.split_rows(0.5)
+    table_1._lazy_frame.collect()
+    table_2._lazy_frame.collect()


 def _run_transform_column() -> None:
-    table.transform_column("column_0", lambda row: row.get_value("column_0") * 2)
+    table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect()


if __name__ == "__main__":
@@ -57,10 +61,6 @@ def _run_transform_column() -> None:

     # Run the benchmarks
     timings: dict[str, float] = {
-        "group_rows": timeit(
-            _run_group_rows,
-            number=REPETITIONS,
-        ),
         "remove_duplicate_rows": timeit(
             _run_remove_duplicate_rows,
             number=REPETITIONS,
@@ -77,6 +77,10 @@ def _run_transform_column() -> None:
             _run_remove_rows,
             number=REPETITIONS,
         ),
+        "remove_rows_by_column": timeit(
+            _run_remove_rows_by_column,
+            number=REPETITIONS,
+        ),
         "shuffle_rows": timeit(
             _run_shuffle_rows,
             number=REPETITIONS,
@@ -89,26 +93,29 @@ def _run_transform_column() -> None:
             _run_sort_rows,
             number=REPETITIONS,
         ),
-        "split_rows": timeit(
-            _run_split_rows,
+        "sort_rows_by_column": timeit(
+            _run_sort_rows_by_column,
             number=REPETITIONS,
         ),
-        "to_rows": timeit(
-            _run_to_rows,
+        "split_rows": timeit(
+            _run_split_rows,
             number=REPETITIONS,
         ),
-        "transform_colum": timeit(
+        "transform_column": timeit(
             _run_transform_column,
             number=REPETITIONS,
         ),
     }

     # Print the timings
-    print(
-        Table(
-            {  # noqa: T201
-                "method": list(timings.keys()),
-                "timing": list(timings.values()),
-            }
+    with pl.Config(
+        tbl_rows=-1,
+    ):
+        print(
+            Table(
+                {
+                    "method": list(timings.keys()),
+                    "timing": list(timings.values()),
+                }
+            )
         )
-    )
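
The recurring pattern in this file, appending `._lazy_frame.collect()` to every operation, suggests that the new `Table` defers work to a polars `LazyFrame`, so a call like `table.shuffle_rows()` alone would mostly measure query construction rather than the shuffle itself. The internals of `Table` are an assumption here, inferred only from the private `_lazy_frame` attribute used above; the lazy-versus-collected distinction itself can be shown with plain polars:

```python
from timeit import timeit

import polars as pl

# A synthetic lazy frame standing in for the table used in the benchmarks.
lf = pl.LazyFrame({"column_0": range(1_000_000)})

# Building the lazy query is cheap; .collect() is what does the actual filtering.
build_only = timeit(lambda: lf.filter(pl.col("column_0") % 2 == 0), number=10)
collected = timeit(lambda: lf.filter(pl.col("column_0") % 2 == 0).collect(), number=10)

print(f"query building only: {build_only:.4f} s")
print(f"with .collect():     {collected:.4f} s")
```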
121 changes: 0 additions & 121 deletions benchmarks/table/row_operations_polars.py

This file was deleted.

2 changes: 0 additions & 2 deletions benchmarks/table/utils/__init__.py
@@ -1,7 +1,5 @@
 from .create_synthetic_table import create_synthetic_table
-from .create_synthetic_table_polars import create_synthetic_table_polars

 __all__ = [
     "create_synthetic_table",
-    "create_synthetic_table_polars",
 ]
