From 3402907f5cf88ad94925929087d0eeaafe22d2b0 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 14:37:58 +0200 Subject: [PATCH 01/16] feat: Raise errors on nullable primary keys --- dataframely/_deprecation.py | 11 ++--------- dataframely/columns/_base.py | 5 ++--- docs/sites/features/primary-keys.md | 2 +- tests/columns/test_base.py | 14 ++++++++++++++ 4 files changed, 19 insertions(+), 13 deletions(-) create mode 100644 tests/columns/test_base.py diff --git a/dataframely/_deprecation.py b/dataframely/_deprecation.py index 42cf951e..25cdfe62 100644 --- a/dataframely/_deprecation.py +++ b/dataframely/_deprecation.py @@ -39,12 +39,5 @@ def warn_nullable_default_change() -> None: ) -@skip_if(env="DATAFRAMELY_NO_FUTURE_WARNINGS") -def warn_no_nullable_primary_key() -> None: - warnings.warn( - "Nullable primary key columns are not supported. " - "Setting `nullable=True` on a primary key column is ignored " - "and will raise an error in a future release.", - FutureWarning, - stacklevel=4, - ) +def error_no_nullable_primary_key() -> None: + raise ValueError("Nullable primary key columns are not supported.") diff --git a/dataframely/columns/_base.py b/dataframely/columns/_base.py index 82662753..37a143b7 100644 --- a/dataframely/columns/_base.py +++ b/dataframely/columns/_base.py @@ -14,7 +14,7 @@ from dataframely._compat import pa, sa, sa_TypeEngine from dataframely._deprecation import ( - warn_no_nullable_primary_key, + error_no_nullable_primary_key, warn_nullable_default_change, ) from dataframely._polars import PolarsDataType @@ -79,8 +79,7 @@ def __init__( """ if nullable and primary_key: - warn_no_nullable_primary_key() - nullable = False + error_no_nullable_primary_key() if nullable is None: if primary_key: diff --git a/docs/sites/features/primary-keys.md b/docs/sites/features/primary-keys.md index 9aa8ea09..b0daa255 100644 --- a/docs/sites/features/primary-keys.md +++ b/docs/sites/features/primary-keys.md @@ -10,7 +10,7 @@ Dataframely supports marking columns as part of the primary key when defining a `primary_key=True` on the respective column(s). ```{note} -Primary key columns must not be nullable. +Primary key columns must not be nullable. Starting in `dataframely` version 2, attempts to declare a nullable primary key column raise an error. ``` ### One-column primary keys diff --git a/tests/columns/test_base.py b/tests/columns/test_base.py new file mode 100644 index 00000000..26e462d4 --- /dev/null +++ b/tests/columns/test_base.py @@ -0,0 +1,14 @@ +# Copyright (c) QuantCo 2025-2025 +# SPDX-License-Identifier: BSD-3-Clause + +import pytest + +import dataframely as dy + + +@pytest.mark.parametrize("column_type", [dy.Int64, dy.String, dy.Float32, dy.Decimal]) +def test_no_nullable_primary_key(column_type: type[dy.Column]) -> None: + with pytest.raises(ValueError): + + class MySchema(dy.Schema): + x = column_type(primary_key=True, nullable=True) From a00196503172e382e228f22689fd2ea416cae232 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 14:40:52 +0200 Subject: [PATCH 02/16] fix --- tests/columns/test_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/columns/test_base.py b/tests/columns/test_base.py index 26e462d4..d5a4583e 100644 --- a/tests/columns/test_base.py +++ b/tests/columns/test_base.py @@ -9,6 +9,4 @@ @pytest.mark.parametrize("column_type", [dy.Int64, dy.String, dy.Float32, dy.Decimal]) def test_no_nullable_primary_key(column_type: type[dy.Column]) -> None: with pytest.raises(ValueError): - - class MySchema(dy.Schema): - x = column_type(primary_key=True, nullable=True) + column_type(primary_key=True, nullable=True) From 623bb7fba5ec28beed07f5397d9cf9b5a99f121d Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 14:43:42 +0200 Subject: [PATCH 03/16] fix --- tests/test_deprecation.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index 278e7941..f77c6af1 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -27,31 +27,12 @@ def test_warning_deprecated_default_nullable( deprecated_default_nullable() -# ------------------------- Nullable primary key ---------------------------------# - - -def deprecated_nullable_primary_key() -> None: - """This function causes a FutureWarning because both `nullable` and `primary_key` - are set to `True` in the Column constructor.""" - dy.Integer(primary_key=True, nullable=True) - - -def test_warning_deprecated_nullable_primary_key( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("DATAFRAMELY_NO_FUTURE_WARNINGS", "") - with pytest.warns( - FutureWarning, match=r"Nullable primary key columns are not supported" - ): - deprecated_nullable_primary_key() - - # ------------------------- Common ---------------------------------# @pytest.mark.parametrize( "deprecated_behavior", - [deprecated_default_nullable, deprecated_nullable_primary_key], + [deprecated_default_nullable], ) @pytest.mark.parametrize("env_var", ["1", "True", "true"]) def test_future_warning_skip( From 02aeee704777d9a3bb875f6ac29ba602fc369083 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 14:49:02 +0200 Subject: [PATCH 04/16] feat!: Make columns non-nullable by default --- dataframely/_deprecation.py | 12 --------- dataframely/columns/_base.py | 12 +-------- tests/test_deprecation.py | 48 +++++++++--------------------------- 3 files changed, 13 insertions(+), 59 deletions(-) diff --git a/dataframely/_deprecation.py b/dataframely/_deprecation.py index 25cdfe62..27676f57 100644 --- a/dataframely/_deprecation.py +++ b/dataframely/_deprecation.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: BSD-3-Clause import os -import warnings from collections.abc import Callable from functools import wraps @@ -28,16 +27,5 @@ def wrapper() -> None: return decorator -@skip_if(env="DATAFRAMELY_NO_FUTURE_WARNINGS") -def warn_nullable_default_change() -> None: - warnings.warn( - "The 'nullable' argument was not explicitly set. In a future release, " - "'nullable=False' will be the default if 'nullable' is not specified. " - "Explicitly set 'nullable=True' if you want your column to be nullable.", - FutureWarning, - stacklevel=4, - ) - - def error_no_nullable_primary_key() -> None: raise ValueError("Nullable primary key columns are not supported.") diff --git a/dataframely/columns/_base.py b/dataframely/columns/_base.py index 37a143b7..0747e586 100644 --- a/dataframely/columns/_base.py +++ b/dataframely/columns/_base.py @@ -15,7 +15,6 @@ from dataframely._compat import pa, sa, sa_TypeEngine from dataframely._deprecation import ( error_no_nullable_primary_key, - warn_nullable_default_change, ) from dataframely._polars import PolarsDataType from dataframely.random import Generator @@ -46,7 +45,7 @@ class Column(ABC): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, check: Check | None = None, alias: str | None = None, @@ -56,8 +55,6 @@ def __init__( Args: nullable: Whether this column may contain null values. Explicitly set `nullable=True` if you want your column to be nullable. - In a future release, `nullable=False` will be the default if `nullable` - is not specified. primary_key: Whether this column is part of the primary key of the schema. If ``True``, ``nullable`` is automatically set to ``False``. check: A custom rule or multiple rules to run for this column. This can be: @@ -81,13 +78,6 @@ def __init__( if nullable and primary_key: error_no_nullable_primary_key() - if nullable is None: - if primary_key: - nullable = False - else: - warn_nullable_default_change() - nullable = True - self.nullable = nullable self.primary_key = primary_key self.check = check diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index f77c6af1..8f3d4acd 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -1,46 +1,22 @@ # Copyright (c) QuantCo 2025-2025 # SPDX-License-Identifier: BSD-3-Clause -import warnings -from collections.abc import Callable - import pytest -import dataframely as dy - -# --------------------- Nullability default change ------------------------------# - - -def deprecated_default_nullable() -> None: - """This function causes a FutureWarning because no value is specified for the - `nullable` argument to the Column constructor.""" - dy.Integer() - - -def test_warning_deprecated_default_nullable( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("DATAFRAMELY_NO_FUTURE_WARNINGS", "") - with pytest.warns( - FutureWarning, match="The 'nullable' argument was not explicitly set" - ): - deprecated_default_nullable() - +from dataframely._deprecation import skip_if # ------------------------- Common ---------------------------------# -@pytest.mark.parametrize( - "deprecated_behavior", - [deprecated_default_nullable], -) @pytest.mark.parametrize("env_var", ["1", "True", "true"]) -def test_future_warning_skip( - monkeypatch: pytest.MonkeyPatch, env_var: str, deprecated_behavior: Callable -) -> None: - """FutureWarnings should be avoidable by setting an environment variable.""" - monkeypatch.setenv("DATAFRAMELY_NO_FUTURE_WARNINGS", env_var) - # Elevates FutureWarning to an exception - with warnings.catch_warnings(): - warnings.simplefilter("error", FutureWarning) - deprecated_behavior() +def test_skip_if(monkeypatch: pytest.MonkeyPatch, env_var: str) -> None: + """The skip_if decorator should allow us to prevent execution of a wrapped + function.""" + variable_name = "DATAFRAMELY_NO_FUTURE_WARNINGS" + + @skip_if(variable_name) + def callable() -> None: + raise ValueError() + + monkeypatch.setenv(variable_name, env_var) + callable() From 13257abf532d8f17b15b6e84efda75ace45a644f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:16:34 +0200 Subject: [PATCH 05/16] fix --- dataframely/columns/categorical.py | 2 +- dataframely/columns/datetime.py | 8 ++++---- dataframely/columns/decimal.py | 2 +- dataframely/columns/enum.py | 2 +- dataframely/columns/float.py | 2 +- dataframely/columns/integer.py | 2 +- dataframely/columns/list.py | 2 +- dataframely/columns/string.py | 2 +- dataframely/columns/struct.py | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dataframely/columns/categorical.py b/dataframely/columns/categorical.py index efcb41a4..cdb25d9f 100644 --- a/dataframely/columns/categorical.py +++ b/dataframely/columns/categorical.py @@ -21,7 +21,7 @@ class Categorical(Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, check: Check | None = None, alias: str | None = None, diff --git a/dataframely/columns/datetime.py b/dataframely/columns/datetime.py index 82fd7d17..e87c86bc 100644 --- a/dataframely/columns/datetime.py +++ b/dataframely/columns/datetime.py @@ -34,7 +34,7 @@ class Date(OrdinalMixin[dt.date], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: dt.date | None = None, min_exclusive: dt.date | None = None, @@ -157,7 +157,7 @@ class Time(OrdinalMixin[dt.time], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: dt.time | None = None, min_exclusive: dt.time | None = None, @@ -286,7 +286,7 @@ class Datetime(OrdinalMixin[dt.datetime], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: dt.datetime | None = None, min_exclusive: dt.datetime | None = None, @@ -433,7 +433,7 @@ class Duration(OrdinalMixin[dt.timedelta], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: dt.timedelta | None = None, min_exclusive: dt.timedelta | None = None, diff --git a/dataframely/columns/decimal.py b/dataframely/columns/decimal.py index ae28b1e9..b870bbc0 100644 --- a/dataframely/columns/decimal.py +++ b/dataframely/columns/decimal.py @@ -27,7 +27,7 @@ def __init__( precision: int | None = None, scale: int = 0, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: decimal.Decimal | None = None, min_exclusive: decimal.Decimal | None = None, diff --git a/dataframely/columns/enum.py b/dataframely/columns/enum.py index 665e6ad6..f6384500 100644 --- a/dataframely/columns/enum.py +++ b/dataframely/columns/enum.py @@ -26,7 +26,7 @@ def __init__( self, categories: pl.Series | Iterable[str] | type[enum.Enum], *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, check: Check | None = None, alias: str | None = None, diff --git a/dataframely/columns/float.py b/dataframely/columns/float.py index ed357a00..5462562e 100644 --- a/dataframely/columns/float.py +++ b/dataframely/columns/float.py @@ -26,7 +26,7 @@ class _BaseFloat(OrdinalMixin[float], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, allow_inf_nan: bool = False, min: float | None = None, diff --git a/dataframely/columns/integer.py b/dataframely/columns/integer.py index 92384d4b..2ffc0e4f 100644 --- a/dataframely/columns/integer.py +++ b/dataframely/columns/integer.py @@ -24,7 +24,7 @@ class _BaseInteger(IsInMixin[int], OrdinalMixin[int], Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min: int | None = None, min_exclusive: int | None = None, diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py index 273fc1f9..8cf12036 100644 --- a/dataframely/columns/list.py +++ b/dataframely/columns/list.py @@ -31,7 +31,7 @@ def __init__( self, inner: Column, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, check: Check | None = None, alias: str | None = None, diff --git a/dataframely/columns/string.py b/dataframely/columns/string.py index 28f7d487..d98749c3 100644 --- a/dataframely/columns/string.py +++ b/dataframely/columns/string.py @@ -22,7 +22,7 @@ class String(Column): def __init__( self, *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, min_length: int | None = None, max_length: int | None = None, diff --git a/dataframely/columns/struct.py b/dataframely/columns/struct.py index 712fbbae..8626aff2 100644 --- a/dataframely/columns/struct.py +++ b/dataframely/columns/struct.py @@ -29,7 +29,7 @@ def __init__( self, inner: dict[str, Column], *, - nullable: bool | None = None, + nullable: bool = False, primary_key: bool = False, check: Check | None = None, alias: str | None = None, From 3fe1a613fb380251c9d445aec6062230c46ee4d8 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:23:07 +0200 Subject: [PATCH 06/16] fix --- tests/column_types/test_array.py | 26 +++++++++++++------------- tests/columns/test_default_dtypes.py | 2 +- tests/schema/test_serialization.py | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py index e15b4815..fb80eba5 100644 --- a/tests/column_types/test_array.py +++ b/tests/column_types/test_array.py @@ -12,8 +12,8 @@ @pytest.mark.parametrize( "inner", [ - (dy.Int64()), - (dy.Integer()), + (dy.Int64(nullable=True)), + (dy.Integer(nullable=True)), ], ) def test_integer_array(inner: Column) -> None: @@ -49,52 +49,52 @@ def test_invalid_shape() -> None: ("column", "dtype", "is_valid"), [ ( - dy.Array(dy.Int64(), 1), + dy.Array(dy.Int64(nullable=True), 1), pl.Array(pl.Int64(), 1), True, ), ( - dy.Array(dy.String(), 1), + dy.Array(dy.String(nullable=True), 1), pl.Array(pl.Int64(), 1), False, ), ( - dy.Array(dy.String(), 1), + dy.Array(dy.String(nullable=True), 1), pl.Array(pl.Int64(), 2), False, ), ( - dy.Array(dy.Int64(), (1,)), + dy.Array(dy.Int64(nullable=True), (1,)), pl.Array(pl.Int64(), (1,)), True, ), ( - dy.Array(dy.Int64(), (1,)), + dy.Array(dy.Int64(nullable=True), (1,)), pl.Array(pl.Int64(), (2,)), False, ), ( - dy.Array(dy.String(), 1), - dy.Array(dy.String(), 1), + dy.Array(dy.String(nullable=True), 1), + dy.Array(dy.String(nullable=True), 1), False, ), ( - dy.Array(dy.String(), 1), + dy.Array(dy.String(nullable=True), 1), dy.String(), False, ), ( - dy.Array(dy.String(), 1), + dy.Array(dy.String(nullable=True), 1), pl.String(), False, ), ( - dy.Array(dy.Array(dy.String(), 1), 1), + dy.Array(dy.Array(dy.String(nullable=True), 1), 1), pl.Array(pl.String(), (1, 1)), True, ), ( - dy.Array(dy.String(), (1, 1)), + dy.Array(dy.String(nullable=True), (1, 1)), pl.Array(pl.Array(pl.String(), 1), 1), True, ), diff --git a/tests/columns/test_default_dtypes.py b/tests/columns/test_default_dtypes.py index 278b7e83..2d959207 100644 --- a/tests/columns/test_default_dtypes.py +++ b/tests/columns/test_default_dtypes.py @@ -36,7 +36,7 @@ (dy.UInt64(), pl.UInt64()), (dy.String(), pl.String()), (dy.List(dy.String()), pl.List(pl.String())), - (dy.Array(dy.String(), 1), pl.Array(pl.String(), 1)), + (dy.Array(dy.String(nullable=True), 1), pl.Array(pl.String(), 1)), (dy.Struct({"a": dy.String()}), pl.Struct({"a": pl.String()})), (dy.Enum(["a", "b"]), pl.Enum(["a", "b"])), (dy.Categorical(), pl.Categorical()), diff --git a/tests/schema/test_serialization.py b/tests/schema/test_serialization.py index 46e12044..78e3bd62 100644 --- a/tests/schema/test_serialization.py +++ b/tests/schema/test_serialization.py @@ -53,7 +53,7 @@ def test_simple_serialization() -> None: {"a": dy.Int64()}, rules={"test": GroupRule(pl.len() > 2, group_columns=["a"])}, ), - create_schema("test", {"a": dy.Array(dy.Int64(), shape=(2, 2))}), + create_schema("test", {"a": dy.Array(dy.Int64(nullable=True), shape=(2, 2))}), create_schema("test", {"a": dy.List(dy.Int64(min=5))}), create_schema( "test", From 768841342f2d34ab02418dc74ba26ec427a6990a Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:41:34 +0200 Subject: [PATCH 07/16] fix --- tests/column_types/test_array.py | 14 +++--- tests/columns/test_matches.py | 6 ++- tests/columns/test_pyarrow.py | 76 ++++++++++++++++++++++---------- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py index fb80eba5..4b53911d 100644 --- a/tests/column_types/test_array.py +++ b/tests/column_types/test_array.py @@ -29,12 +29,12 @@ def test_integer_array(inner: Column) -> None: def test_invalid_inner_type() -> None: - schema = create_schema("test", {"a": dy.Array(dy.Int64(), 1)}) + schema = create_schema("test", {"a": dy.Array(dy.Int64(nullable=True), 1)}) assert not schema.is_valid(pl.DataFrame({"a": [["1"], ["2"], ["3"]]})) def test_invalid_shape() -> None: - schema = create_schema("test", {"a": dy.Array(dy.Int64(), 2)}) + schema = create_schema("test", {"a": dy.Array(dy.Int64(nullable=True), 2)}) assert not schema.is_valid( pl.DataFrame( {"a": [[1], [2], [3]]}, @@ -105,7 +105,9 @@ def test_validate_dtype(column: Column, dtype: pl.DataType, is_valid: bool) -> N def test_nested_arrays() -> None: - schema = create_schema("test", {"a": dy.Array(dy.Array(dy.Int64(), 1), 1)}) + schema = create_schema( + "test", {"a": dy.Array(dy.Array(dy.Int64(nullable=True), 1), 1)} + ) assert schema.is_valid( pl.DataFrame( {"a": [[[1]], [[2]], [[3]]]}, @@ -117,7 +119,9 @@ def test_nested_arrays() -> None: def test_nested_array() -> None: - schema = create_schema("test", {"a": dy.Array(dy.Array(dy.Int64(), 1), 1)}) + schema = create_schema( + "test", {"a": dy.Array(dy.Array(dy.Int64(nullable=True), 1), 1)} + ) assert schema.is_valid( pl.DataFrame( {"a": [[[1]], [[2]], [[3]]]}, @@ -147,7 +151,7 @@ def test_array_with_rules() -> None: def test_outer_nullability() -> None: schema = create_schema( "test", - {"nullable": dy.Array(inner=dy.Integer(), shape=1, nullable=True)}, + {"nullable": dy.Array(inner=dy.Integer(nullable=True), shape=1, nullable=True)}, ) df = pl.DataFrame({"nullable": [None, None]}) schema.validate(df, cast=True) diff --git a/tests/columns/test_matches.py b/tests/columns/test_matches.py index babee980..b8731dee 100644 --- a/tests/columns/test_matches.py +++ b/tests/columns/test_matches.py @@ -46,7 +46,11 @@ dy.String(check=[lambda x: x == "a"]), False, ), - (dy.Array(dy.Int32(), shape=(2, 2)), dy.Array(dy.Int32(), shape=(2, 2)), True), + ( + dy.Array(dy.Int32(nullable=True), shape=(2, 2)), + dy.Array(dy.Int32(nullable=True), shape=(2, 2)), + True, + ), (dy.List(dy.Int32()), dy.List(dy.Int32()), True), ( dy.Struct({"a": dy.Int32(check=lambda expr: expr > 4)}), diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index e9114b5a..c0275da6 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -1,6 +1,8 @@ # Copyright (c) QuantCo 2025-2025 # SPDX-License-Identifier: BSD-3-Clause +from typing import TypeVar + import pytest from polars._typing import TimeUnit @@ -17,9 +19,18 @@ pytestmark = pytest.mark.with_optionals +T = TypeVar("T", bound=dy.Column) + + +def _nullable(column_type: type[T]) -> T: + if column_type == dy.Any: + return column_type() + return column_type(nullable=True) + + @pytest.mark.parametrize("column_type", ALL_COLUMN_TYPES) def test_equal_to_polars_schema(column_type: type[Column]) -> None: - schema = create_schema("test", {"a": column_type()}) + schema = create_schema("test", {"a": _nullable(column_type)}) actual = schema.pyarrow_schema() expected = schema.create_empty().to_arrow().schema assert actual == expected @@ -39,7 +50,7 @@ def test_equal_to_polars_schema(column_type: type[Column]) -> None: ], ) def test_equal_polars_schema_enum(categories: list[str]) -> None: - schema = create_schema("test", {"a": dy.Enum(categories)}) + schema = create_schema("test", {"a": dy.Enum(categories, nullable=True)}) actual = schema.pyarrow_schema() expected = schema.create_empty().to_arrow().schema assert actual == expected @@ -49,11 +60,14 @@ def test_equal_polars_schema_enum(categories: list[str]) -> None: "inner", [c() for c in ALL_COLUMN_TYPES] + [dy.List(t()) for t in ALL_COLUMN_TYPES] - + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] + + [ + dy.Array(t() if t == dy.Any else t(nullable=True), 1) + for t in NO_VALIDATION_COLUMN_TYPES + ] + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES], ) def test_equal_polars_schema_list(inner: Column) -> None: - schema = create_schema("test", {"a": dy.List(inner)}) + schema = create_schema("test", {"a": dy.List(inner, nullable=True)}) actual = schema.pyarrow_schema() expected = schema.create_empty().to_arrow().schema assert actual == expected @@ -61,10 +75,13 @@ def test_equal_polars_schema_list(inner: Column) -> None: @pytest.mark.parametrize( "inner", - [c() for c in NO_VALIDATION_COLUMN_TYPES] - + [dy.List(t()) for t in NO_VALIDATION_COLUMN_TYPES] - + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] - + [dy.Struct({"a": t()}) for t in NO_VALIDATION_COLUMN_TYPES], + [_nullable(c) for c in NO_VALIDATION_COLUMN_TYPES] + + [dy.List(_nullable(t), nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [dy.Array(_nullable(t), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [ + dy.Struct({"a": _nullable(t)}, nullable=True) + for t in NO_VALIDATION_COLUMN_TYPES + ], ) @pytest.mark.parametrize( "shape", @@ -83,13 +100,16 @@ def test_equal_polars_schema_array(inner: Column, shape: int | tuple[int, ...]) @pytest.mark.parametrize( "inner", - [c() for c in ALL_COLUMN_TYPES] - + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES] - + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] - + [dy.List(t()) for t in ALL_COLUMN_TYPES], + [_nullable(c) for c in NO_VALIDATION_COLUMN_TYPES] + + [dy.List(_nullable(t), nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [dy.Array(_nullable(t), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [ + dy.Struct({"a": _nullable(t)}, nullable=True) + for t in NO_VALIDATION_COLUMN_TYPES + ], ) def test_equal_polars_schema_struct(inner: Column) -> None: - schema = create_schema("test", {"a": dy.Struct({"a": inner})}) + schema = create_schema("test", {"a": dy.Struct({"a": inner}, nullable=True)}) actual = schema.pyarrow_schema() expected = schema.create_empty().to_arrow().schema assert actual == expected @@ -110,10 +130,13 @@ def test_nullability_information_enum(nullable: bool) -> None: @pytest.mark.parametrize( "inner", - [c() for c in ALL_COLUMN_TYPES] - + [dy.List(t()) for t in ALL_COLUMN_TYPES] - + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] - + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES], + [_nullable(c) for c in NO_VALIDATION_COLUMN_TYPES] + + [dy.List(_nullable(t), nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [dy.Array(_nullable(t), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [ + dy.Struct({"a": _nullable(t)}, nullable=True) + for t in NO_VALIDATION_COLUMN_TYPES + ], ) @pytest.mark.parametrize("nullable", [True, False]) def test_nullability_information_list(inner: Column, nullable: bool) -> None: @@ -123,10 +146,13 @@ def test_nullability_information_list(inner: Column, nullable: bool) -> None: @pytest.mark.parametrize( "inner", - [c() for c in ALL_COLUMN_TYPES] - + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES] - + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] - + [dy.List(t()) for t in ALL_COLUMN_TYPES], + [_nullable(c) for c in NO_VALIDATION_COLUMN_TYPES] + + [dy.List(_nullable(t), nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [dy.Array(_nullable(t), 1, nullable=True) for t in NO_VALIDATION_COLUMN_TYPES] + + [ + dy.Struct({"a": _nullable(t)}, nullable=True) + for t in NO_VALIDATION_COLUMN_TYPES + ], ) @pytest.mark.parametrize("nullable", [True, False]) def test_nullability_information_struct(inner: Column, nullable: bool) -> None: @@ -135,11 +161,15 @@ def test_nullability_information_struct(inner: Column, nullable: bool) -> None: def test_multiple_columns() -> None: - schema = create_schema("test", {"a": dy.Int32(nullable=False), "b": dy.Integer()}) + schema = create_schema( + "test", {"a": dy.Int32(nullable=False), "b": dy.Integer(nullable=True)} + ) assert str(schema.pyarrow_schema()).split("\n") == ["a: int32 not null", "b: int64"] @pytest.mark.parametrize("time_unit", ["ns", "us", "ms"]) def test_datetime_time_unit(time_unit: TimeUnit) -> None: - schema = create_schema("test", {"a": dy.Datetime(time_unit=time_unit)}) + schema = create_schema( + "test", {"a": dy.Datetime(time_unit=time_unit, nullable=True)} + ) assert str(schema.pyarrow_schema()) == f"a: timestamp[{time_unit}]" From 757a3882a947e3c348e4c4b35c29c4f1f9633ae6 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:45:54 +0200 Subject: [PATCH 08/16] fix --- tests/column_types/test_datetime.py | 44 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/tests/column_types/test_datetime.py b/tests/column_types/test_datetime.py index f3aa93fd..7c5d96f5 100644 --- a/tests/column_types/test_datetime.py +++ b/tests/column_types/test_datetime.py @@ -216,47 +216,47 @@ def test_args_resolution_valid( ("column", "values", "valid"), [ ( - dy.Date(min=dt.date(2020, 4, 1)), + dy.Date(min=dt.date(2020, 4, 1), nullable=True), [dt.date(2020, 3, 31), dt.date(2020, 4, 1), dt.date(9999, 12, 31)], {"min": [False, True, True]}, ), ( - dy.Date(min_exclusive=dt.date(2020, 4, 1)), + dy.Date(min_exclusive=dt.date(2020, 4, 1), nullable=True), [dt.date(2020, 3, 31), dt.date(2020, 4, 1), dt.date(9999, 12, 31)], {"min_exclusive": [False, False, True]}, ), ( - dy.Date(max=dt.date(2020, 4, 1)), + dy.Date(max=dt.date(2020, 4, 1), nullable=True), [dt.date(2020, 3, 31), dt.date(2020, 4, 1), dt.date(2020, 4, 2)], {"max": [True, True, False]}, ), ( - dy.Date(max_exclusive=dt.date(2020, 4, 1)), + dy.Date(max_exclusive=dt.date(2020, 4, 1), nullable=True), [dt.date(2020, 3, 31), dt.date(2020, 4, 1), dt.date(2020, 4, 2)], {"max_exclusive": [True, False, False]}, ), ( - dy.Time(min=dt.time(3)), + dy.Time(min=dt.time(3), nullable=True), [dt.time(2, 59), dt.time(3, 0, 0), dt.time(4)], {"min": [False, True, True]}, ), ( - dy.Time(min_exclusive=dt.time(3)), + dy.Time(min_exclusive=dt.time(3), nullable=True), [dt.time(2, 59), dt.time(3, 0, 0), dt.time(4)], {"min_exclusive": [False, False, True]}, ), ( - dy.Time(max=dt.time(11, 59, 59, 999999)), + dy.Time(max=dt.time(11, 59, 59, 999999), nullable=True), [dt.time(11), dt.time(12), dt.time(13)], {"max": [True, False, False]}, ), ( - dy.Time(max_exclusive=dt.time(12)), + dy.Time(max_exclusive=dt.time(12), nullable=True), [dt.time(11), dt.time(12), dt.time(13)], {"max_exclusive": [True, False, False]}, ), ( - dy.Datetime(min=dt.datetime(2020, 3, 1, hour=12)), + dy.Datetime(min=dt.datetime(2020, 3, 1, hour=12), nullable=True), [ dt.datetime(2020, 2, 29, hour=14), dt.datetime(2020, 3, 1, hour=11), @@ -267,7 +267,7 @@ def test_args_resolution_valid( {"min": [False, False, True, True, True]}, ), ( - dy.Datetime(min_exclusive=dt.datetime(2020, 3, 1, hour=12)), + dy.Datetime(min_exclusive=dt.datetime(2020, 3, 1, hour=12), nullable=True), [ dt.datetime(2020, 2, 29, hour=14), dt.datetime(2020, 3, 1, hour=11), @@ -278,7 +278,7 @@ def test_args_resolution_valid( {"min_exclusive": [False, False, False, True, True]}, ), ( - dy.Datetime(max=dt.datetime(2020, 3, 1, hour=12)), + dy.Datetime(max=dt.datetime(2020, 3, 1, hour=12), nullable=True), [ dt.datetime(2020, 2, 29, hour=14), dt.datetime(2020, 3, 1, hour=11), @@ -289,7 +289,7 @@ def test_args_resolution_valid( {"max": [True, True, True, False, False]}, ), ( - dy.Datetime(max_exclusive=dt.datetime(2020, 3, 1, hour=12)), + dy.Datetime(max_exclusive=dt.datetime(2020, 3, 1, hour=12), nullable=True), [ dt.datetime(2020, 2, 29, hour=14), dt.datetime(2020, 3, 1, hour=11), @@ -300,7 +300,7 @@ def test_args_resolution_valid( {"max_exclusive": [True, True, False, False, False]}, ), ( - dy.Duration(min=dt.timedelta(days=1, seconds=14400)), + dy.Duration(min=dt.timedelta(days=1, seconds=14400), nullable=True), [ dt.timedelta(seconds=13000), dt.timedelta(days=1, seconds=14400), @@ -309,7 +309,9 @@ def test_args_resolution_valid( {"min": [False, True, True]}, ), ( - dy.Duration(min_exclusive=dt.timedelta(days=1, seconds=14400)), + dy.Duration( + min_exclusive=dt.timedelta(days=1, seconds=14400), nullable=True + ), [ dt.timedelta(seconds=13000), dt.timedelta(days=1, seconds=14400), @@ -318,7 +320,7 @@ def test_args_resolution_valid( {"min_exclusive": [False, False, True]}, ), ( - dy.Duration(max=dt.timedelta(days=1, seconds=14400)), + dy.Duration(max=dt.timedelta(days=1, seconds=14400), nullable=True), [ dt.timedelta(seconds=13000), dt.timedelta(days=1, seconds=14400), @@ -327,7 +329,9 @@ def test_args_resolution_valid( {"max": [True, True, False]}, ), ( - dy.Duration(max_exclusive=dt.timedelta(days=1, seconds=14400)), + dy.Duration( + max_exclusive=dt.timedelta(days=1, seconds=14400), nullable=True + ), [ dt.timedelta(seconds=13000), dt.timedelta(days=1, seconds=14400), @@ -350,17 +354,17 @@ def test_validate_min_max( ("column", "values", "valid"), [ ( - dy.Date(resolution="1mo"), + dy.Date(resolution="1mo", nullable=True), [dt.date(2020, 1, 1), dt.date(2021, 1, 15), dt.date(2022, 12, 1)], {"resolution": [True, False, True]}, ), ( - dy.Time(resolution="1h"), + dy.Time(resolution="1h", nullable=True), [dt.time(12, 0), dt.time(13, 15), dt.time(14, 0, 5)], {"resolution": [True, False, False]}, ), ( - dy.Datetime(resolution="1d"), + dy.Datetime(resolution="1d", nullable=True), [ dt.datetime(2020, 4, 5), dt.datetime(2021, 1, 1, 12), @@ -369,7 +373,7 @@ def test_validate_min_max( {"resolution": [True, False, False]}, ), ( - dy.Duration(resolution="12h"), + dy.Duration(resolution="12h", nullable=True), [ dt.timedelta(hours=12), dt.timedelta(days=2), From e99690ff571bb98ec645c2f6aaa8bc3dfe933bf2 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:48:24 +0200 Subject: [PATCH 09/16] fix --- tests/column_types/test_decimal.py | 11 +++++++++-- tests/column_types/test_float.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/column_types/test_decimal.py b/tests/column_types/test_decimal.py index 46ff42a9..11719e4f 100644 --- a/tests/column_types/test_decimal.py +++ b/tests/column_types/test_decimal.py @@ -89,7 +89,10 @@ def test_non_decimal_dtype_fails(dtype: DataTypeClass) -> None: ], ) def test_validate_min(inclusive: bool, valid: dict[str, list[bool]]) -> None: - kwargs = {("min" if inclusive else "min_exclusive"): decimal.Decimal(3)} + kwargs = { + ("min" if inclusive else "min_exclusive"): decimal.Decimal(3), + "nullable": True, + } column = dy.Decimal(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -105,7 +108,10 @@ def test_validate_min(inclusive: bool, valid: dict[str, list[bool]]) -> None: ], ) def test_validate_max(inclusive: bool, valid: dict[str, list[bool]]) -> None: - kwargs = {("max" if inclusive else "max_exclusive"): decimal.Decimal(3)} + kwargs = { + ("max" if inclusive else "max_exclusive"): decimal.Decimal(3), + "nullable": True, + } column = dy.Decimal(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -158,6 +164,7 @@ def test_validate_range( kwargs = { ("min" if min_inclusive else "min_exclusive"): decimal.Decimal(0), ("max" if max_inclusive else "max_exclusive"): decimal.Decimal(2), + "nullable": True, } column = dy.Decimal(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [-1, 0, 1, 2, 3]}) diff --git a/tests/column_types/test_float.py b/tests/column_types/test_float.py index d214d395..eda77f87 100644 --- a/tests/column_types/test_float.py +++ b/tests/column_types/test_float.py @@ -173,7 +173,7 @@ def test_validate_inf_nan(inf: Any, nan: Any) -> None: @pytest.mark.parametrize("inf", [np.inf, -np.inf, float("inf"), float("-inf")]) @pytest.mark.parametrize("nan", [np.nan, float("nan"), float("NaN")]) def test_validate_allow_inf_nan(inf: Any, nan: Any) -> None: - column = dy.Float(allow_inf_nan=True) + column = dy.Float(allow_inf_nan=True, nullable=True) lf = pl.LazyFrame({"a": pl.Series([inf, 2.0, nan, 4.0, 5.0])}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) assert len(actual.collect_schema().names()) == 0, ( From 0a7d785548a2aeac225769bf7867b2373adb7ce2 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:50:55 +0200 Subject: [PATCH 10/16] fix --- tests/column_types/test_integer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/column_types/test_integer.py b/tests/column_types/test_integer.py index 465c6786..5194698c 100644 --- a/tests/column_types/test_integer.py +++ b/tests/column_types/test_integer.py @@ -76,7 +76,7 @@ def test_non_integer_dtype_fails(dtype: DataTypeClass) -> None: @pytest.mark.parametrize("column_type", INTEGER_COLUMN_TYPES) @pytest.mark.parametrize("inclusive", [True, False]) def test_validate_min(column_type: type[_BaseInteger], inclusive: bool) -> None: - kwargs = {("min" if inclusive else "min_exclusive"): 3} + kwargs = {("min" if inclusive else "min_exclusive"): 3, "nullable": True} column = column_type(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -88,7 +88,7 @@ def test_validate_min(column_type: type[_BaseInteger], inclusive: bool) -> None: @pytest.mark.parametrize("column_type", INTEGER_COLUMN_TYPES) @pytest.mark.parametrize("inclusive", [True, False]) def test_validate_max(column_type: type[_BaseInteger], inclusive: bool) -> None: - kwargs = {("max" if inclusive else "max_exclusive"): 3} + kwargs = {("max" if inclusive else "max_exclusive"): 3, "nullable": True} column = column_type(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -103,7 +103,7 @@ def test_validate_min_zero(column_type: type[_BaseInteger], inclusive: bool) -> """Specific edge case where the minimum is `0`, which can lead to python bugs if we use `if value` instead of `if value is not None` somewhere.""" key = "min" if inclusive else "min_exclusive" - kwargs = {key: 0} + kwargs = {key: 0, "nullable": True} column = column_type(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [-1]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -117,7 +117,7 @@ def test_validate_max_zero(column_type: type[_BaseInteger], inclusive: bool) -> """Specific edge case where the maximum is `0`, which can lead to python bugs if we use `if value` instead of `if value is not None` somewhere.""" key = "max" if inclusive else "max_exclusive" - kwargs = {key: 0} + kwargs = {key: 0, "nullable": True} column = column_type(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) @@ -134,6 +134,7 @@ def test_validate_range( kwargs = { ("min" if min_inclusive else "min_exclusive"): 2, ("max" if max_inclusive else "max_exclusive"): 4, + "nullable": True, } column = column_type(**kwargs) # type: ignore lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) @@ -151,7 +152,7 @@ def test_validate_range( @pytest.mark.parametrize("column_type", INTEGER_COLUMN_TYPES) def test_validate_is_in(column_type: type[_BaseInteger]) -> None: - column = column_type(is_in=[3, 5]) + column = column_type(is_in=[3, 5], nullable=True) lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) expected = pl.LazyFrame({"is_in": [False, False, True, False, True]}) From a3563a2588cd31f7bca40c78453e0cac2aa60c3f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 15:52:29 +0200 Subject: [PATCH 11/16] wip --- tests/column_types/test_list.py | 2 +- tests/column_types/test_string.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/column_types/test_list.py b/tests/column_types/test_list.py index d7401fca..e860b85c 100644 --- a/tests/column_types/test_list.py +++ b/tests/column_types/test_list.py @@ -62,7 +62,7 @@ def test_nested_lists() -> None: def test_list_with_pk() -> None: schema = create_schema( "test", - {"a": dy.List(dy.String(), primary_key=True)}, + {"a": dy.List(dy.String(nullable=True), primary_key=True)}, ) df = pl.DataFrame({"a": [["ab"], ["a", "ab"], [None], ["a", "b"], ["a", "b"]]}) _, failures = schema.filter(df) diff --git a/tests/column_types/test_string.py b/tests/column_types/test_string.py index 10031022..1846ab84 100644 --- a/tests/column_types/test_string.py +++ b/tests/column_types/test_string.py @@ -9,7 +9,7 @@ def test_validate_min_length() -> None: - column = dy.String(min_length=2) + column = dy.String(min_length=2, nullable=True) lf = pl.LazyFrame({"a": ["foo", "x"]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) expected = pl.LazyFrame({"min_length": [True, False]}) @@ -17,7 +17,7 @@ def test_validate_min_length() -> None: def test_validate_max_length() -> None: - column = dy.String(max_length=2) + column = dy.String(max_length=2, nullable=True) lf = pl.LazyFrame({"a": ["foo", "x"]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) expected = pl.LazyFrame({"max_length": [False, True]}) @@ -25,7 +25,7 @@ def test_validate_max_length() -> None: def test_validate_regex() -> None: - column = dy.String(regex="[0-9][a-z]$") + column = dy.String(regex="[0-9][a-z]$", nullable=True) lf = pl.LazyFrame({"a": ["33x", "3x", "44"]}) actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a")))) expected = pl.LazyFrame({"regex": [True, True, False]}) From a41b64ab90c2145c7e9d5814882867e6fc7b257f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 17:03:10 +0200 Subject: [PATCH 12/16] fix --- tests/columns/test_sample.py | 10 +++++++--- tests/columns/test_sql_schema.py | 4 ++-- tests/columns/test_str.py | 2 +- tests/schema/test_base.py | 2 +- tests/schema/test_filter.py | 2 +- tests/schema/test_repr.py | 10 +++++----- tests/schema/test_sample.py | 4 ++-- tests/schema/test_validate.py | 2 +- 8 files changed, 20 insertions(+), 16 deletions(-) diff --git a/tests/columns/test_sample.py b/tests/columns/test_sample.py index 3bcac166..d0888475 100644 --- a/tests/columns/test_sample.py +++ b/tests/columns/test_sample.py @@ -176,20 +176,24 @@ def test_sample_enum(generator: Generator) -> None: def test_sample_list(generator: Generator) -> None: - column = dy.List(dy.String(regex="[abc]"), min_length=5, max_length=10) + column = dy.List( + dy.String(regex="[abc]"), nullable=True, min_length=5, max_length=10 + ) samples = sample_and_validate(column, generator, n=10_000) assert set(samples.list.len()) == set(range(5, 11)) | {None} def test_sample_array(generator: Generator) -> None: - column = dy.Array(dy.Bool(), (2, 3)) + column = dy.Array(dy.Bool(nullable=True), (2, 3)) samples = sample_and_validate(column, generator, n=10_000) assert samples.is_null().any() assert set(samples.arr.len()) == {2, None} def test_sample_struct(generator: Generator) -> None: - column = dy.Struct({"a": dy.String(regex="[abc]"), "b": dy.String(regex="[a-z]xx")}) + column = dy.Struct( + {"a": dy.String(regex="[abc]"), "b": dy.String(regex="[a-z]xx")}, nullable=True + ) samples = sample_and_validate(column, generator, n=10_000) assert samples.is_null().any() assert len(samples) == 10_000 diff --git a/tests/columns/test_sql_schema.py b/tests/columns/test_sql_schema.py index 5b414251..abb54a58 100644 --- a/tests/columns/test_sql_schema.py +++ b/tests/columns/test_sql_schema.py @@ -149,7 +149,7 @@ def test_raise_for_array_column(dialect: Dialect) -> None: with pytest.raises( NotImplementedError, match="SQL column cannot have 'Array' type." ): - dy.Array(dy.String(), 1).sqlalchemy_dtype(dialect) + dy.Array(dy.String(nullable=True), 1).sqlalchemy_dtype(dialect) @pytest.mark.parametrize("dialect", [MSDialect_pyodbc(), PGDialect_psycopg2()]) @@ -157,7 +157,7 @@ def test_raise_for_struct_column(dialect: Dialect) -> None: with pytest.raises( NotImplementedError, match="SQL column cannot have 'Struct' type." ): - dy.Struct({"a": dy.String()}).sqlalchemy_dtype(dialect) + dy.Struct({"a": dy.String(nullable=True)}).sqlalchemy_dtype(dialect) @pytest.mark.parametrize("dialect", [MSDialect_pyodbc(), PGDialect_psycopg2()]) diff --git a/tests/columns/test_str.py b/tests/columns/test_str.py index 387af519..95208221 100644 --- a/tests/columns/test_str.py +++ b/tests/columns/test_str.py @@ -25,7 +25,7 @@ def test_string_representation_list() -> None: def test_string_representation_array() -> None: - column = dy.Array(dy.String(), 1) + column = dy.Array(dy.String(nullable=True), 1) assert str(column) == dy.Array.__name__.lower() diff --git a/tests/schema/test_base.py b/tests/schema/test_base.py index 92b3b1e5..83117910 100644 --- a/tests/schema/test_base.py +++ b/tests/schema/test_base.py @@ -14,7 +14,7 @@ class MySchema(dy.Schema): a = dy.Integer(primary_key=True) b = dy.String(primary_key=True) - c = dy.Float64() + c = dy.Float64(nullable=True) d = dy.Any(alias="e") diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 5dbbe84f..7268e1ba 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -17,7 +17,7 @@ class MySchema(dy.Schema): a = dy.Int64(primary_key=True) - b = dy.String(max_length=3) + b = dy.String(max_length=3, nullable=True) @pytest.mark.parametrize( diff --git a/tests/schema/test_repr.py b/tests/schema/test_repr.py index 63958daf..64a0c8e3 100644 --- a/tests/schema/test_repr.py +++ b/tests/schema/test_repr.py @@ -9,7 +9,7 @@ def test_repr_no_rules() -> None: class SchemaNoRules(dy.Schema): - a = dy.Integer() + a = dy.Integer(nullable=True) assert repr(SchemaNoRules) == textwrap.dedent("""\ [Schema "SchemaNoRules"] @@ -20,7 +20,7 @@ class SchemaNoRules(dy.Schema): def test_repr_only_column_rules() -> None: class SchemaColumnRules(dy.Schema): - a = dy.Integer(min=10) + a = dy.Integer(min=10, nullable=True) assert repr(SchemaColumnRules) == textwrap.dedent("""\ [Schema "SchemaColumnRules"] @@ -46,8 +46,8 @@ def test_repr_with_rules() -> None: assert repr(SchemaWithRules) == textwrap.dedent("""\ [Schema "SchemaWithRules"] Columns: - - "a": Integer(nullable=True, min=10) - - "b2": String(nullable=False, primary_key=True, regex='^[A-Z]{3}$') + - "a": Integer(min=10) + - "b2": String(primary_key=True, regex='^[A-Z]{3}$') Rules: - "my_rule": [(col("a")) < (dyn int: 100)] - "my_group_rule": [(col("a").sum()) > (dyn int: 50)] grouped by ['a'] @@ -56,7 +56,7 @@ def test_repr_with_rules() -> None: def test_repr_enum() -> None: class SchemaNoRules(dy.Schema): - a = dy.Enum(["a"]) + a = dy.Enum(["a"], nullable=True) assert repr(SchemaNoRules) == textwrap.dedent("""\ [Schema "SchemaNoRules"] diff --git a/tests/schema/test_sample.py b/tests/schema/test_sample.py index 00ca9565..3702b34a 100644 --- a/tests/schema/test_sample.py +++ b/tests/schema/test_sample.py @@ -93,8 +93,8 @@ def _sampling_overrides(cls) -> dict[str, pl.Expr]: class MyAdvancedSchema(dy.Schema): - a = dy.Float64(min=20.0) - b = dy.String(regex=r"abc*") + a = dy.Float64(min=20.0, nullable=True) + b = dy.String(regex=r"abc*", nullable=True) # --------------------------------------- TESTS -------------------------------------- # diff --git a/tests/schema/test_validate.py b/tests/schema/test_validate.py index c2c7a07c..fc33b0f7 100644 --- a/tests/schema/test_validate.py +++ b/tests/schema/test_validate.py @@ -15,7 +15,7 @@ class MySchema(dy.Schema): a = dy.Int64(primary_key=True) b = dy.String(nullable=False, max_length=5) - c = dy.String() + c = dy.String(nullable=True) class MyComplexSchema(dy.Schema): From 2abba8e5fcc63c4ec105053e50b1102d389cc239 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 17:30:17 +0200 Subject: [PATCH 13/16] coverage --- tests/test_deprecation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index 8f3d4acd..c6d53fa5 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -18,5 +18,7 @@ def test_skip_if(monkeypatch: pytest.MonkeyPatch, env_var: str) -> None: def callable() -> None: raise ValueError() + with pytest.raises(ValueError): + callable() monkeypatch.setenv(variable_name, env_var) callable() From 56a612e39255179a0a470190ae9829130fda63e4 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 17:48:13 +0200 Subject: [PATCH 14/16] coverage --- tests/schema/test_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 7268e1ba..e6be4851 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -106,7 +106,7 @@ def test_filter_failure( @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame]) def test_filter_no_rules(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None: - schema = create_schema("test", {"a": dy.Int64()}) + schema = create_schema("test", {"a": dy.Int64(nullable=True)}) df = df_type({"a": [1, 2, 3]}) df_valid, failures = schema.filter(df) assert isinstance(df_valid, pl.DataFrame) From f22771ec6088c40ebf24f773c7598610f3e3da8d Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 17:54:22 +0200 Subject: [PATCH 15/16] coverage --- tests/schema/test_sample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/schema/test_sample.py b/tests/schema/test_sample.py index 3702b34a..d7e38a57 100644 --- a/tests/schema/test_sample.py +++ b/tests/schema/test_sample.py @@ -12,8 +12,8 @@ class MySimpleSchema(dy.Schema): - a = dy.Int64() - b = dy.String() + a = dy.Int64(nullable=True) + b = dy.String(nullable=True) class PrimaryKeySchema(dy.Schema): From 9ba9a46eec5282faeef7f4d0e0d81467903398e2 Mon Sep 17 00:00:00 2001 From: Andreas Albert <103571926+AndreasAlbertQC@users.noreply.github.com> Date: Mon, 20 Oct 2025 09:04:24 +0200 Subject: [PATCH 16/16] Apply suggestion from @delsner Co-authored-by: Daniel Elsner --- tests/columns/test_pyarrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index c0275da6..d2ef1e5f 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -23,6 +23,7 @@ def _nullable(column_type: type[T]) -> T: + # dy.Any doesn't have the `nullable` parameter. if column_type == dy.Any: return column_type() return column_type(nullable=True)