Skip to content

Commit

Permalink
feat: added Column.summarize_statistics() (#715)
Browse files Browse the repository at this point in the history
Closes #701

### Summary of Changes

Added `summarize_statistics` to the `Column` class to quickly get an
overview of relevant statistics.

The Column is converted into a Table with one Column and the results
from Table.summarize_statistics() are returned. This way, if someone
adds a new feature to Table.summarize_statistics(), it also appears in
Column.summarize_statistics().

---------

Co-authored-by: Lars Reimann <mail@larsreimann.com>
  • Loading branch information
SamanHushi and lars-reimann committed May 4, 2024
1 parent f2f4418 commit 71730a9
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,39 @@ def transform(self, transformer: Callable[[T], R]) -> Column[R]:
# Statistics
# ------------------------------------------------------------------------------------------------------------------

def summarize_statistics(self) -> Table:
    """
    Build a table that summarizes the key statistics of this column.

    The column is wrapped in a table whose only column is this one, and the summary is
    delegated to `Table.summarize_statistics()`. The original Column is not modified.

    Returns
    -------
    statistics:
        The table with statistics.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column
    >>> column = Column("a", [1, 3])
    >>> column.summarize_statistics()
                    metric                   a
    0              minimum                   1
    1              maximum                   3
    2                 mean                 2.0
    3                 mode              [1, 3]
    4               median                 2.0
    5             variance                 2.0
    6   standard deviation  1.4142135623730951
    7  missing value count                   0
    8  missing value ratio                 0.0
    9               idness                 1.0
    10           stability                 0.5
    """
    # Imported inside the method rather than at module level
    # (presumably to avoid a circular import between Column and Table -- confirm).
    from safeds.data.tabular.containers import Table

    single_column_table = Table({self._name: self._data})
    return single_column_table.summarize_statistics()

def correlation_with(self, other_column: Column) -> float:
"""
Calculate Pearson correlation between this and another column. Both columns have to be numerical.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from statistics import stdev

import pytest
from safeds.data.tabular.containers import Column, Table


# Row labels of the "metric" column, in the order every expected table below lists them.
_METRIC_NAMES = [
    "minimum",
    "maximum",
    "mean",
    "mode",
    "median",
    "variance",
    "standard deviation",
    "missing value count",
    "missing value ratio",
    "idness",
    "stability",
]


def _expected_statistics(column_name: str, values: list[str]) -> Table:
    """Build the expected summary table: the metric labels plus one column of stringified values."""
    # Copy the label list so no two expected tables are built from the same list object.
    return Table({"metric": list(_METRIC_NAMES), column_name: values})


@pytest.mark.parametrize(
    ("column", "expected"),
    [
        pytest.param(
            Column("col1", [1, 2, 1]),
            _expected_statistics(
                "col1",
                [
                    "1",
                    "2",
                    str(4.0 / 3),
                    "[1]",
                    "1.0",
                    str(1.0 / 3),
                    str(stdev([1, 2, 1])),
                    "0",
                    "0.0",
                    str(2.0 / 3),
                    str(2.0 / 3),
                ],
            ),
            id="Column of integers",
        ),
        pytest.param(
            Column("col1", ["a", "b", "c"]),
            _expected_statistics(
                "col1",
                [
                    "-",
                    "-",
                    "-",
                    "['a', 'b', 'c']",
                    "-",
                    "-",
                    "-",
                    "0",
                    "0.0",
                    "1.0",
                    str(1.0 / 3),
                ],
            ),
            id="Column of characters",
        ),
        pytest.param(
            Column("col", [None, None]),
            _expected_statistics(
                "col",
                ["-", "-", "-", "[]", "-", "-", "-", "2", "1.0", "0.0", "-"],
            ),
            id="Column of None",
        ),
    ],
)
def test_should_summarize_statistics(column: Column, expected: Table) -> None:
    """Check that the summary of `column` has the expected schema and the expected contents."""
    summary_schema = column.summarize_statistics().schema
    assert summary_schema == expected.schema
    assert column.summarize_statistics() == expected

0 comments on commit 71730a9

Please sign in to comment.