Skip to content

Commit

Permalink
feat: polars implementation of a column (#738)
Browse files Browse the repository at this point in the history
Partially closes #712

### Summary of Changes

* Implement a column backed by a `polars` series.
* Implement several methods in the `ExperimentalTable` class. Benchmarks
are looking promising.
* Add a new implementation of a cell backed by a `polars` expression.
* Schema and data type backed by corresponding `polars` concepts.

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot committed May 7, 2024
1 parent d783caa commit 732aa48
Show file tree
Hide file tree
Showing 25 changed files with 2,068 additions and 537 deletions.
32 changes: 8 additions & 24 deletions benchmarks/table/row_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,10 @@
REPETITIONS = 10


def _run_add_rows() -> None:
table.add_rows(table)


def _run_get_row() -> None:
table.get_row(0)


def _run_group_rows() -> None:
table.group_rows(lambda row: row.get_value("column_0") % 2 == 0)


def _run_keep_only_rows() -> None:
table.keep_only_rows(lambda row: row.get_value("column_0") % 2 == 0)


def _run_remove_duplicate_rows() -> None:
table.remove_duplicate_rows()

Expand Down Expand Up @@ -59,28 +47,20 @@ def _run_to_rows() -> None:
table.to_rows()


def _run_transform_column() -> None:
table.transform_column("column_0", lambda row: row.get_value("column_0") * 2)


if __name__ == "__main__":
# Create a synthetic Table
table = create_synthetic_table(1000, 50)

# Run the benchmarks
timings: dict[str, float] = {
"add_rows": timeit(
_run_add_rows,
number=REPETITIONS,
),
"get_row": timeit(
_run_get_row,
number=REPETITIONS,
),
"group_rows": timeit(
_run_group_rows,
number=REPETITIONS,
),
"keep_only_rows": timeit(
_run_keep_only_rows,
number=REPETITIONS,
),
"remove_duplicate_rows": timeit(
_run_remove_duplicate_rows,
number=REPETITIONS,
Expand Down Expand Up @@ -117,6 +97,10 @@ def _run_to_rows() -> None:
_run_to_rows,
number=REPETITIONS,
),
"transform_column": timeit(
_run_transform_column,
number=REPETITIONS,
),
}

# Print the timings
Expand Down
140 changes: 59 additions & 81 deletions benchmarks/table/row_operations_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,6 @@
REPETITIONS = 10


# def _run_add_rows() -> None:
# table.add_rows(table)
#
#
# def _run_get_row() -> None:
# table.get_row(0)
#
#
# def _run_group_rows() -> None:
# table.group_rows(lambda row: row.get_value("column_0") % 2 == 0)
#
#
# def _run_keep_only_rows() -> None:
# table.keep_only_rows(lambda row: row.get_value("column_0") % 2 == 0)


def _run_remove_duplicate_rows() -> None:
table.remove_duplicate_rows()._lazy_frame.collect()

Expand All @@ -33,30 +17,36 @@ def _run_remove_rows_with_missing_values() -> None:

# def _run_remove_rows_with_outliers() -> None:
# table.remove_rows_with_outliers()
#
#
# def _run_remove_rows() -> None:
# table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)
#
#
# def _run_shuffle_rows() -> None:
# table.shuffle_rows()
#
#
# def _run_slice_rows() -> None:
# table.slice_rows(end=table.number_of_rows // 2)
#
#
# def _run_sort_rows() -> None:
# table.sort_rows(lambda row1, row2: row1.get_value("column_0") - row2.get_value("column_0"))
#
#
# def _run_split_rows() -> None:
# table.split_rows(0.5)
#
#
# def _run_to_rows() -> None:
# table.to_rows()


def _run_remove_rows() -> None:
table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect()


def _run_shuffle_rows() -> None:
table.shuffle_rows()._lazy_frame.collect()


def _run_slice_rows() -> None:
table.slice_rows(length=table.number_of_rows // 2)._lazy_frame.collect()


def _run_sort_rows() -> None:
table.sort_rows(lambda row: row.get_value("column_0"))._lazy_frame.collect()


def _run_sort_rows_by_column() -> None:
table.sort_rows_by_column("column_0")._lazy_frame.collect()


def _run_split_rows() -> None:
table_1, table_2 = table.split_rows(0.5)
table_1._lazy_frame.collect()
table_2._lazy_frame.collect()


def _run_transform_column() -> None:
table.transform_column("column_0", lambda value: value * 2)._lazy_frame.collect()


if __name__ == "__main__":
Expand All @@ -65,22 +55,6 @@ def _run_remove_rows_with_missing_values() -> None:

# Run the benchmarks
timings: dict[str, float] = {
# "add_rows": timeit(
# _run_add_rows,
# number=REPETITIONS,
# ),
# "get_row": timeit(
# _run_get_row,
# number=REPETITIONS,
# ),
# "group_rows": timeit(
# _run_group_rows,
# number=REPETITIONS,
# ),
# "keep_only_rows": timeit(
# _run_keep_only_rows,
# number=REPETITIONS,
# ),
"remove_duplicate_rows": timeit(
_run_remove_duplicate_rows,
number=REPETITIONS,
Expand All @@ -93,36 +67,40 @@ def _run_remove_rows_with_missing_values() -> None:
# _run_remove_rows_with_outliers,
# number=REPETITIONS,
# ),
# "remove_rows": timeit(
# _run_remove_rows,
# number=REPETITIONS,
# ),
# "shuffle_rows": timeit(
# _run_shuffle_rows,
# number=REPETITIONS,
# ),
# "slice_rows": timeit(
# _run_slice_rows,
# number=REPETITIONS,
# ),
# "sort_rows": timeit(
# _run_sort_rows,
# number=REPETITIONS,
# ),
# "split_rows": timeit(
# _run_split_rows,
# number=REPETITIONS,
# ),
# "to_rows": timeit(
# _run_to_rows,
# number=REPETITIONS,
# ),
"remove_rows": timeit(
_run_remove_rows,
number=REPETITIONS,
),
"shuffle_rows": timeit(
_run_shuffle_rows,
number=REPETITIONS,
),
"slice_rows": timeit(
_run_slice_rows,
number=REPETITIONS,
),
"sort_rows": timeit(
_run_sort_rows,
number=REPETITIONS,
),
"sort_rows_by_column": timeit(
_run_sort_rows_by_column,
number=REPETITIONS,
),
"split_rows": timeit(
_run_split_rows,
number=REPETITIONS,
),
"transform_column": timeit(
_run_transform_column,
number=REPETITIONS,
),
}

# Print the timings
print(
Table(
{ # noqa: T201
{
"method": list(timings.keys()),
"timing": list(timings.values()),
}
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/table/utils/create_synthetic_table_polars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from random import randrange

from safeds.data.tabular.containers import ExperimentalPolarsTable
from safeds.data.tabular.containers import ExperimentalTable


def create_synthetic_table_polars(
Expand All @@ -9,7 +9,7 @@ def create_synthetic_table_polars(
*,
min_value: int = 0,
max_value: int = 1000,
) -> ExperimentalPolarsTable:
) -> ExperimentalTable:
"""Create a synthetic Table with random numerical data.
Parameters
Expand All @@ -28,7 +28,7 @@ def create_synthetic_table_polars(
Table
A Table with random numerical data.
"""
return ExperimentalPolarsTable(
return ExperimentalTable(
{
f"column_{i}": [randrange(min_value, max_value) for _ in range(number_of_rows)]
for i in range(number_of_columns)
Expand Down
20 changes: 11 additions & 9 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ matplotlib = "^3.6.3"
openpyxl = "^3.1.2"
pandas = "^2.0.0"
pillow = ">=9.5,<11.0"
polars = {extras = ["pyarrow"], version = "^0.20.23"}
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"}
scikit-learn = "^1.2.0"
seaborn = "^0.13.0"
statsmodels = "^0.14.1"
Expand Down
Binary file modified src/resources/to_excel_file.xlsx
Binary file not shown.
24 changes: 12 additions & 12 deletions src/safeds/data/tabular/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,32 @@

if TYPE_CHECKING:
from ._column import Column
from ._experimental_polars_cell import ExperimentalPolarsCell
from ._experimental_polars_column import ExperimentalPolarsColumn
from ._experimental_polars_row import ExperimentalPolarsRow
from ._experimental_polars_table import ExperimentalPolarsTable
from ._experimental_cell import ExperimentalCell
from ._experimental_column import ExperimentalColumn
from ._experimental_row import ExperimentalRow
from ._experimental_table import ExperimentalTable
from ._row import Row
from ._table import Table

apipkg.initpkg(
__name__,
{
"Column": "._column:Column",
"ExperimentalPolarsCell": "._experimental_polars_cell:ExperimentalPolarsCell",
"ExperimentalPolarsColumn": "._experimental_polars_column:ExperimentalPolarsColumn",
"ExperimentalPolarsRow": "._experimental_polars_row:ExperimentalPolarsRow",
"ExperimentalPolarsTable": "._experimental_polars_table:ExperimentalPolarsTable",
"ExperimentalCell": "._experimental_cell:ExperimentalCell",
"ExperimentalColumn": "._experimental_column:ExperimentalColumn",
"ExperimentalRow": "._experimental_row:ExperimentalRow",
"ExperimentalTable": "._experimental_table:ExperimentalTable",
"Row": "._row:Row",
"Table": "._table:Table",
},
)

__all__ = [
"Column",
"ExperimentalPolarsCell",
"ExperimentalPolarsColumn",
"ExperimentalPolarsRow",
"ExperimentalPolarsTable",
"ExperimentalCell",
"ExperimentalColumn",
"ExperimentalRow",
"ExperimentalTable",
"Row",
"Table",
]

0 comments on commit 732aa48

Please sign in to comment.