Skip to content

Commit

Permalink
feat: add schema conversions when adding new rows to a table and schema conversion when creating a new table (#432)
Browse files Browse the repository at this point in the history

Closes #404 
Closes #322  
Closes #127  
This pull request resolves issues #322 and #127 together.


### Summary of Changes

<!-- Please provide a summary of changes in this pull request, ensuring
all changes are explained. -->

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Co-authored-by: sibre28 <86068340+sibre28@users.noreply.github.com>
Co-authored-by: Alexander <47296670+Marsmaennchen221@users.noreply.github.com>
  • Loading branch information
4 people committed Jul 12, 2023
1 parent 65a3f48 commit 6e9ff69
Show file tree
Hide file tree
Showing 21 changed files with 742 additions and 197 deletions.
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col
result._name = data.name
result._data = data
# noinspection PyProtectedMember
result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype)
result._type = type_ if type_ is not None else ColumnType._data_type(data)

return result

Expand Down Expand Up @@ -106,7 +106,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None:
self._name: str = name
self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name)
# noinspection PyProtectedMember
self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype)
self._type: ColumnType = ColumnType._data_type(self._data)

def __contains__(self, item: Any) -> bool:
return item in self._data
Expand Down
36 changes: 35 additions & 1 deletion src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from __future__ import annotations

import copy
from collections.abc import Mapping
import functools
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any

import pandas as pd
Expand Down Expand Up @@ -441,6 +442,39 @@ def get_column_type(self, column_name: str) -> ColumnType:
"""
return self._schema.get_column_type(column_name)

# ------------------------------------------------------------------------------------------------------------------
# Transformations
# ------------------------------------------------------------------------------------------------------------------

def sort_columns(
    self,
    comparator: Callable[[tuple, tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]),
) -> Row:
    """
    Return a new `Row` whose columns are ordered according to the given comparator.

    This row is not modified.

    The comparator receives two `(column_name, value)` tuples, `col1` and `col2`, and returns an integer:

    * A negative number if `col1` should be ordered before `col2`.
    * A positive number if `col1` should be ordered after `col2`.
    * 0 if the original order of `col1` and `col2` should be kept.

    If no comparator is given, the columns are sorted alphabetically by their name.

    Parameters
    ----------
    comparator : Callable[[tuple, tuple], int]
        The function used to compare two tuples of (ColumnName, Value).

    Returns
    -------
    new_row : Row
        A new row with sorted columns.
    """
    # `sorted` expects a key function, so adapt the old-style two-argument comparator.
    sort_key = functools.cmp_to_key(comparator)
    ordered_items = sorted(self.to_dict().items(), key=sort_key)

    # Dicts keep insertion order, so the rebuilt mapping carries the new column order.
    return Row.from_dict(dict(ordered_items))

# ------------------------------------------------------------------------------------------------------------------
# Conversion
# ------------------------------------------------------------------------------------------------------------------
Expand Down
85 changes: 43 additions & 42 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
DuplicateColumnNameError,
IndexOutOfBoundsError,
NonNumericColumnError,
SchemaMismatchError,
UnknownColumnNameError,
WrongFileExtensionError,
)
Expand Down Expand Up @@ -302,8 +301,8 @@ def from_rows(rows: list[Row]) -> Table:
Raises
------
SchemaMismatchError
If any of the row schemas does not match with the others.
UnknownColumnNameError
If any of the row column names does not match with the first row.
Examples
--------
Expand All @@ -318,17 +317,22 @@ def from_rows(rows: list[Row]) -> Table:
if len(rows) == 0:
return Table._from_pandas_dataframe(pd.DataFrame())

schema_compare: Schema = rows[0]._schema
column_names_compare: list = list(rows[0].column_names)
unknown_column_names = set()
row_array: list[pd.DataFrame] = []

for row in rows:
if schema_compare != row._schema:
raise SchemaMismatchError
unknown_column_names.update(set(column_names_compare) - set(row.column_names))
row_array.append(row._data)
if len(unknown_column_names) > 0:
raise UnknownColumnNameError(list(unknown_column_names))

dataframe: DataFrame = pd.concat(row_array, ignore_index=True)
dataframe.columns = schema_compare.column_names
return Table._from_pandas_dataframe(dataframe)
dataframe.columns = column_names_compare

schema = Schema.merge_multiple_schemas([row.schema for row in rows])

return Table._from_pandas_dataframe(dataframe, schema)

@staticmethod
def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> Table:
Expand Down Expand Up @@ -906,6 +910,9 @@ def add_row(self, row: Row) -> Table:
If the table happens to be empty beforehand, respective columns will be added automatically.
The order of columns of the new row will be adjusted to the order of columns in the table.
The new table will contain the merged schema.
This table is not modified.
Parameters
Expand All @@ -920,8 +927,8 @@ def add_row(self, row: Row) -> Table:
Raises
------
SchemaMismatchError
If the schema of the row does not match the table schema.
UnknownColumnNameError
If the row has different column names than the table.
Examples
--------
Expand All @@ -935,20 +942,18 @@ def add_row(self, row: Row) -> Table:
"""
int_columns = []
result = self._copy()
if self.number_of_columns == 0:
return Table.from_rows([row])
if len(set(self.column_names) - set(row.column_names)) > 0:
raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names)))

if result.number_of_rows == 0:
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names))
if result.number_of_columns == 0:
for column in row.column_names:
result._data[column] = Column(column, [])
result._schema = Schema._from_pandas_dataframe(result._data)
elif result.column_names != row.column_names:
raise SchemaMismatchError
elif result._schema != row.schema:
raise SchemaMismatchError
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names))

new_df = pd.concat([result._data, row._data]).infer_objects()
new_df.columns = result.column_names
result = Table._from_pandas_dataframe(new_df)
schema = Schema.merge_multiple_schemas([result.schema, row.schema])
result = Table._from_pandas_dataframe(new_df, schema)

for column in int_columns:
result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))])
Expand All @@ -959,6 +964,9 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
"""
Add multiple rows to a table.
The order of columns of the new rows will be adjusted to the order of columns in the table.
The new table will contain the merged schema.
This table is not modified.
Parameters
Expand All @@ -973,8 +981,8 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
Raises
------
SchemaMismatchError
If the schema of one of the rows does not match the table schema.
UnknownColumnNameError
If at least one of the rows has different column names than the table.
Examples
--------
Expand All @@ -990,28 +998,21 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
"""
if isinstance(rows, Table):
rows = rows.to_rows()
int_columns = []
result = self._copy()

if len(rows) == 0:
return self._copy()

different_column_names = set()
for row in rows:
if result.number_of_rows == 0:
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names))
if result.number_of_columns == 0:
for column in row.column_names:
result._data[column] = Column(column, [])
result._schema = Schema._from_pandas_dataframe(result._data)
elif result.column_names != row.column_names:
raise SchemaMismatchError
elif result._schema != row.schema:
raise SchemaMismatchError

row_frames = (row._data for row in rows)

new_df = pd.concat([result._data, *row_frames]).infer_objects()
new_df.columns = result.column_names
result = Table._from_pandas_dataframe(new_df)
different_column_names.update(set(rows[0].column_names) - set(row.column_names))
if len(different_column_names) > 0:
raise UnknownColumnNameError(list(different_column_names))

for column in int_columns:
result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))])
result = self._copy()

for row in rows:
result = result.add_row(row)

return result

Expand Down Expand Up @@ -1269,7 +1270,7 @@ def remove_rows_with_missing_values(self) -> Table:
"""
result = self._data.copy(deep=True)
result = result.dropna(axis="index")
return Table._from_pandas_dataframe(result, self._schema)
return Table._from_pandas_dataframe(result)

def remove_rows_with_outliers(self) -> Table:
"""
Expand Down
6 changes: 3 additions & 3 deletions src/safeds/data/tabular/transformation/_label_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if transformed_table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")

if transformed_table.keep_only_columns(
self._column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names):
Expand All @@ -168,9 +171,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

if transformed_table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")

data = transformed_table._data.copy()
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
Expand Down
12 changes: 9 additions & 3 deletions src/safeds/data/tabular/transformation/_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if transformed_table.number_of_rows == 0:
raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows")

if transformed_table._as_table().keep_only_columns(
_transformed_column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names):
Expand All @@ -293,9 +296,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

if transformed_table.number_of_rows == 0:
raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows")

original_columns = {}
for original_column_name in self._column_names:
original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)]
Expand All @@ -306,6 +306,12 @@ def inverse_transform(self, transformed_table: Table) -> Table:
if transformed_table.get_column(constructed_column)[i] == 1.0:
original_columns[original_column_name][i] = value

for original_column_name in self._value_to_column_nans:
constructed_column = self._value_to_column_nans[original_column_name]
for i in range(transformed_table.number_of_rows):
if transformed_table.get_column(constructed_column)[i] == 1.0:
original_columns[original_column_name][i] = np.nan

table = transformed_table

for column_name, encoded_column in original_columns.items():
Expand Down
18 changes: 9 additions & 9 deletions src/safeds/data/tabular/transformation/_range_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler:
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows")

if (
table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns
< table.keep_only_columns(column_names).number_of_columns
Expand All @@ -83,9 +86,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler:
),
)

if table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows")

wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum))
wrapped_transformer.fit(table._data[column_names])

Expand Down Expand Up @@ -131,6 +131,9 @@ def transform(self, table: Table) -> Table:
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")

if (
table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns
< table.keep_only_columns(self._column_names).number_of_columns
Expand All @@ -148,9 +151,6 @@ def transform(self, table: Table) -> Table:
),
)

if table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")

data = table._data.copy()
data.columns = table.column_names
data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
Expand Down Expand Up @@ -191,6 +191,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if transformed_table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")

if (
transformed_table.keep_only_columns(self._column_names)
.remove_columns_with_non_numerical_values()
Expand All @@ -210,9 +213,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
),
)

if transformed_table.number_of_rows == 0:
raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")

data = transformed_table._data.copy()
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
Expand Down

0 comments on commit 6e9ff69

Please sign in to comment.