From bb3062b90620b081d24ecd60bef88f7a43b90ef9 Mon Sep 17 00:00:00 2001
From: Andreas Albert
Date: Fri, 17 Oct 2025 11:12:08 +0200
Subject: [PATCH 1/4] docs: Convert docs to markdown

---
 docs/conf.py                         |   8 +
 docs/{index.rst => index.md}         |  48 ++---
 docs/sites/development.md            |  41 +++++
 docs/sites/development.rst           |  49 -----
 docs/sites/faq.md                    |  30 +++
 docs/sites/faq.rst                   |  30 ---
 docs/sites/features/index.md         |   5 +
 docs/sites/features/index.rst        |   7 -
 docs/sites/features/primary-keys.md  |  51 ++++++
 docs/sites/features/primary-keys.rst |  47 -----
 docs/sites/installation.md           |   8 +
 docs/sites/installation.rst          |   9 -
 docs/sites/quickstart.md             | 246 +++++++++++++++++++++++++
 docs/sites/quickstart.rst            | 262 ---------------------------
 docs/sites/versioning.md             |  30 +++
 docs/sites/versioning.rst            |  20 --
 pixi.lock                            |  65 +++++++
 pixi.toml                            |   2 +-
 18 files changed, 510 insertions(+), 448 deletions(-)
 rename docs/{index.rst => index.md} (59%)
 create mode 100644 docs/sites/development.md
 delete mode 100644 docs/sites/development.rst
 create mode 100644 docs/sites/faq.md
 delete mode 100644 docs/sites/faq.rst
 create mode 100644 docs/sites/features/index.md
 delete mode 100644 docs/sites/features/index.rst
 create mode 100644 docs/sites/features/primary-keys.md
 delete mode 100644 docs/sites/features/primary-keys.rst
 create mode 100644 docs/sites/installation.md
 delete mode 100644 docs/sites/installation.rst
 create mode 100644 docs/sites/quickstart.md
 delete mode 100644 docs/sites/quickstart.rst
 create mode 100644 docs/sites/versioning.md
 delete mode 100644 docs/sites/versioning.rst

diff --git a/docs/conf.py b/docs/conf.py
index 8175cf44..0ada8e39 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -43,8 +43,16 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.linkcode",
     "sphinxcontrib.apidoc",
+    "sphinx.ext.intersphinx",
+    "myst_parser",
 ]

+myst_parser_config = {"myst_enable_extensions": ["rst_eval_roles"]}
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".txt": "markdown",
+    ".md": "markdown",
+}
 numpydoc_class_members_toctree = False

 apidoc_module_dir = "../dataframely"
diff --git a/docs/index.rst b/docs/index.md
similarity index 59%
rename from docs/index.rst
rename to docs/index.md
index 24d63927..fe461695 100644
--- a/docs/index.rst
+++ b/docs/index.md
@@ -1,41 +1,41 @@
-Dataframely
-============
+# Dataframely

-Dataframely is a Python package to validate the schema and content of `polars <https://pola.rs/>`_ data frames.
-Its purpose is to make data pipelines more robust by ensuring that data meet expectations and more readable by adding schema information to data frame type hints.
+Dataframely is a Python package to validate the schema and content of `polars <https://pola.rs/>`\_ data frames.
+Its purpose is to make data pipelines more robust by ensuring that data meet expectations and more readable by adding
+schema information to data frame type hints.

-Features
---------
+## Features

 - Declaratively define schemas as classes with arbitrary inheritance structure
 - Specify column-specific validation rules (e.g. nullability, minimum string length, ...)
-- Specify cross-column and group validation rules with built-in support for checking the primary key property of a column set
+- Specify cross-column and group validation rules with built-in support for checking the primary key property of a
+  column set
 - Specify validation constraints across collections of interdependent data frames
 - Validate data frames softly by simply filtering out rows violating rules instead of failing hard
 - Introspect validation failure information for run-time failures
-- Enhanced type hints for validated data frames allowing users to clearly express expectations about inputs and outputs (i.e., contracts) in data pipelines
-- Integrate schemas with external tools (e.g., ``sqlalchemy`` or ``pyarrow``)
+- Enhanced type hints for validated data frames allowing users to clearly express expectations about inputs and
+  outputs (i.e., contracts) in data pipelines
+- Integrate schemas with external tools (e.g., `sqlalchemy` or `pyarrow`)
 - Generate test data that comply with a schema or collection of schemas and its validation rules

-Contents
-========
+## Contents

-.. toctree::
-   :caption: Contents
-   :maxdepth: 2
+```{toctree}
+:caption: Contents
+:maxdepth: 2

-   Installation <sites/installation>
-   Quickstart <sites/quickstart>
 Real-world Example <sites/examples/real-world>
-   Features <sites/features/index>
-   FAQ <sites/faq>
-   Development Guide <sites/development>
-   Versioning <sites/versioning>
+```

-API Documentation
-=================
+## API Documentation

-.. toctree::
+```{toctree}
    :caption: API Documentation
    :maxdepth: 1

@@ -45,3 +45,5 @@ API Documentation
 Random Data Generation <_api/dataframely.random>
 Failure Information <_api/dataframely.failure>
 Schema <_api/dataframely.schema>
+
+```
diff --git a/docs/sites/development.md b/docs/sites/development.md
new file mode 100644
index 00000000..fd98a8ce
--- /dev/null
+++ b/docs/sites/development.md
@@ -0,0 +1,41 @@
+# Development
+
+Thanks for deciding to work on `dataframely`!
+You can create a development environment with the following steps:
+
+## Environment Installation
+
+```bash
+git clone https://github.com/Quantco/dataframely
+cd dataframely
+pixi install
+```
+
+Next make sure to install the package locally and set up pre-commit hooks:
+
+```bash
+pixi run postinstall
+pixi run pre-commit-install
+```
+
+## Running the tests
+
+```bash
+pixi run test
+```
+
+You can adjust the `tests/` path to run tests in a specific directory or module.
+
+## Building the Documentation
+
+When updating the documentation, you can compile a localized build of the
+documentation and then open it in your web browser using the commands below:
+
+```bash
+# Run build
+pixi run -e docs postinstall
+pixi run docs
+
+# Open documentation
+open docs/_build/html/index.html
+```
diff --git a/docs/sites/development.rst b/docs/sites/development.rst
deleted file mode 100644
index 464e77da..00000000
--- a/docs/sites/development.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-Development
-===========
-
-
-Thanks for deciding to work on ``dataframely``!
-You can create a development environment with the following steps:
-
-Environment Installation
-------------------------
-
-.. code-block:: bash
-
-    git clone https://github.com/Quantco/dataframely
-    cd dataframely
-    pixi install
-
-Next make sure to install the package locally and set up pre-commit hooks:
-
-.. code-block:: bash
-
-    pixi run postinstall
-    pixi run pre-commit-install
-
-
-Running the tests
------------------
-
-.. code-block:: bash
-
-    pixi run test
-
-
-You can adjust the ``tests/`` path to run tests in a specific directory or module.
-
-
-Building the Documentation
---------------------------
-
-When updating the documentation, you can compile a localized build of the
-documentation and then open it in your web browser using the commands below:
-
-.. code-block:: bash
-
-    # Run build
-    pixi run -e docs postinstall
-    pixi run docs
-
-    # Open documentation
-    open docs/_build/html/index.html
diff --git a/docs/sites/faq.md b/docs/sites/faq.md
new file mode 100644
index 00000000..66a93654
--- /dev/null
+++ b/docs/sites/faq.md
@@ -0,0 +1,30 @@
+# FAQ
+
+Whenever you find out something that you were surprised by or needed some non-trivial
+thinking, please add it here.
+
+## How do I define additional unique keys in a `dy.Schema`?
+
+By default, `dataframely` only supports defining a single non-nullable (composite) primary key in `dy.Schema`.
+However, in some scenarios it may be useful to define additional unique keys (which may include nullable fields and/or
+provide uniqueness in addition to the primary key).
+
+Consider the following example, which demonstrates two rules: one for validating that a field is entirely unique, and
+another for validating that a field, when provided, is unique.
+
+```python
+class UserSchema(dy.Schema):
+    user_id = dy.UInt64(primary_key=True, nullable=False)
+    username = dy.String(nullable=False)
+    email = dy.String(nullable=True)  # Must be unique, or null.
+
+    @dy.rule(group_by=["username"])
+    def unique_username() -> pl.Expr:
+        """Username, a non-nullable field, must be totally unique."""
+        return pl.len() == 1
+
+    @dy.rule()
+    def unique_email_or_null() -> pl.Expr:
+        """Email must be unique, if provided."""
+        return pl.col("email").is_null() | pl.col("email").is_unique()
+```
diff --git a/docs/sites/faq.rst b/docs/sites/faq.rst
deleted file mode 100644
index e290ecba..00000000
--- a/docs/sites/faq.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-FAQ
-===
-
-Whenever you find out something that you were surprised by or needed some non-trivial
-thinking, please add it here.
-
-How do I define additional unique keys in a ``dy.Schema``?
-----------------------------------------------------------
-
-By default, ``dataframely`` only supports defining a single non-nullable (composite) primary key in ``dy.Schema``.
-However, in some scenarios it may be useful to define additional unique keys (which support nullable fields and/or which are additionally unique).
-
-Consider the following example, which demonstrates two rules: one for validating that a field is entirely unique, and another for validating that a field, when provided, is unique.
-
-::
-
-    class UserSchema(dy.Schema):
-        user_id = dy.UInt64(primary_key=True, nullable=False)
-        username = dy.String(nullable=False)
-        email = dy.String(nullable=True)  # Must be unique, or null.
-
-        @dy.rule(group_by=["username"])
-        def unique_username() -> pl.Expr:
-            """Username, a non-nullable field, must be total unique."""
-            return pl.len() == 1
-
-        @dy.rule()
-        def unique_email_or_null() -> pl.Expr:
-            """Email must be unique, if provided."""
-            return pl.col("email").is_null() | pl.col("email").is_unique()
diff --git a/docs/sites/features/index.md b/docs/sites/features/index.md
new file mode 100644
index 00000000..31b0ca1d
--- /dev/null
+++ b/docs/sites/features/index.md
@@ -0,0 +1,5 @@
+# Features
+
+```{toctree}
+primary-keys
+```
diff --git a/docs/sites/features/index.rst b/docs/sites/features/index.rst
deleted file mode 100644
index 11cfcd9c..00000000
--- a/docs/sites/features/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Features
-========
-
-.. toctree::
-   :maxdepth: 1
-
-   primary-keys.rst
diff --git a/docs/sites/features/primary-keys.md b/docs/sites/features/primary-keys.md
new file mode 100644
index 00000000..9aa8ea09
--- /dev/null
+++ b/docs/sites/features/primary-keys.md
@@ -0,0 +1,51 @@
+# Primary keys
+
+## Defining primary keys in `dy.Schema`
+
+When working with tabular data, it is often useful to define a [primary key](https://en.wikipedia.org/wiki/Primary_key).
+A primary key is a set of one or multiple columns, the combined values of which form a unique identifier for every
+record in a table.
+
+Dataframely supports marking columns as part of the primary key when defining a `dy.Schema` by setting
+`primary_key=True` on the respective column(s).
+
+```{note}
+Primary key columns must not be nullable.
+```
+
+### One-column primary keys
+
+For example, when managing data about users, we might use an `id` column to uniquely identify users:
+
+```python
+class UserSchema(dy.Schema):
+    id = dy.String(primary_key=True)
+    name = dy.String()
+```
+
+When we later validate data with this schema, dataframely checks that the values of the primary key are unique, i.e.
+there are no two users with the same value of `id`. Having multiple users with the same `name` but different `id` is
+allowed in this case.
+
+### Composite primary keys
+
+In another scenario, we might be tracking line items on invoices. We have many invoices, and each invoice may contain
+any number of line items. To uniquely identify a line item, we need to specify the invoice, as well as the line item's
+position within the invoice. To encode this, we set `primary_key=True` on both the `invoice_id` and `item_id` columns:
+
+```python
+class LineItemSchema(dy.Schema):
+    invoice_id = dy.Int64(primary_key=True)
+    item_id = dy.Int64(primary_key=True)
+    price = dy.Decimal()
+```
+
+Validation will now ensure that all pairs of (`invoice_id`, `item_id`) are unique.
+
+## Primary keys in `dy.Collection`
+
+The central idea behind `dy.Collection` is to unify multiple tables relating to the same set of underlying entities.
+This is useful because it allows us to write `dy.filter`s that use information from multiple tables to identify whether
+the underlying entity is valid or not. If any `dy.filter`s are defined, dataframely requires the tables in a
+`dy.Collection` to have an overlapping primary key (i.e., there must be at least one column that is a primary key in all
+tables).
diff --git a/docs/sites/features/primary-keys.rst b/docs/sites/features/primary-keys.rst
deleted file mode 100644
index f5873d52..00000000
--- a/docs/sites/features/primary-keys.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-Primary keys
-============
-
-Defining primary keys in ``dy.Schema``
---------------------------------------
-
-When working with tabular data, it is often useful to define a `primary key <https://en.wikipedia.org/wiki/Primary_key>`_. A primary key is a set of one or multiple column, the combined values of which form a unique identifier for every record in a table.
-
-Dataframely supports marking columns as part of the primary key when defining a ``dy.Schema`` by setting ``primary_key=True`` on the respective column(s).
-
-.. note::
-
-    Primary key columns must not be nullable.
-
-Single primary keys
-^^^^^^^^^^^^^^^^^^^
-
-For example, when managing data about users, we might use an ``id`` column to uniquely identify users:
-
-::
-
-    class UserSchema(dy.Schema):
-        name = dy.String(primary_key=True)
-        name = dy.String()
-
-When we later validate data with this schema, ``dataframely`` checks that the values of the primary key are unique, i.e. there are no two users with the same value of ``id``. Having multiple users with the same ``name`` but different ``id`` but be allowed in this case.
-
-Composite primary keys
-^^^^^^^^^^^^^^^^^^^^^^
-
-In another scenario, we might be tracking line items on invoices. We have many invoices, and each invoice may contain any number of line items. To uniquely identify a line item, we need to specify the invoice, as well as the line items position within the invoice. To encode this, we set ``primary_key=True`` on both the ``invoice_id`` and ``item_id`` columns:
-
-::
-
-    class LineItemSchema(dy.Schema):
-        invoice_id = dy.Int64(primary_key=True)
-        item_id = dy.Int64(primary_key=True)
-        price = dy.Decimal()
-
-Validation will now ensure that all pairs of (``invoice_id``, ``item_id``) are unique.
-
-
-Primary keys in ``dy.Collection``
----------------------------------
-
-The central idea behind ``dy.Collection`` is to unify multiple tables relating to the same set of underlying entities.
-This is useful because it allows us to write ``dy.filter``s that use information from multiple tables to identify whether the underlying entity is valid or not. If any ``dy.filter``s are defined, ``dataframely`` requires the tables in a ``dy.Collection`` to have an overlapping primary key (i.e., there must be at least one column that is a primary key in all tables).
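To make the overlapping-primary-key requirement above concrete, here is a minimal sketch of a collection whose members share the key column `invoice_id`. The class and member names are illustrative only (they are not part of dataframely), and the `dy.filter` definitions that would relate the two tables are omitted:

```python
import dataframely as dy


class InvoiceSchema(dy.Schema):
    invoice_id = dy.Int64(primary_key=True)
    total_amount = dy.Decimal()


class LineItemSchema(dy.Schema):
    invoice_id = dy.Int64(primary_key=True)
    item_id = dy.Int64(primary_key=True)
    price = dy.Decimal()


class InvoiceData(dy.Collection):
    # Both members carry ``invoice_id`` in their primary key, so the
    # overlapping-primary-key requirement is satisfied and ``dy.filter``
    # rules may relate line items to their invoice.
    invoices: dy.LazyFrame[InvoiceSchema]
    line_items: dy.LazyFrame[LineItemSchema]
```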
diff --git a/docs/sites/installation.md b/docs/sites/installation.md
new file mode 100644
index 00000000..5982fb33
--- /dev/null
+++ b/docs/sites/installation.md
@@ -0,0 +1,8 @@
+# Installation
+
+To install `dataframely`, use your favorite package manager, e.g., using `pixi` or `pip`:
+
+```bash
+    pixi add dataframely
+    pip install dataframely
+```
diff --git a/docs/sites/installation.rst b/docs/sites/installation.rst
deleted file mode 100644
index ab0646cc..00000000
--- a/docs/sites/installation.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Installation
-============
-
-To install ``dataframely``, use your favorite package manager, e.g., using ``pixi`` or ``pip``:
-
-.. code:: bash
-
-    pixi add dataframely
-    pip install dataframely
diff --git a/docs/sites/quickstart.md b/docs/sites/quickstart.md
new file mode 100644
index 00000000..45a9d203
--- /dev/null
+++ b/docs/sites/quickstart.md
@@ -0,0 +1,246 @@
+# Quickstart
+
+For the purpose of this guide, let's assume that we're working with data that we use to predict housing prices.
+To this end, we want to ensure that all the data we're using meets several expectations.
+As a running example, consider the following data set:
+| `zip_code` | `num_bedrooms` | `num_bathrooms` | `price` |
+|-------|--------------|--------------|----------|
+| "01234" | 2 | 1 | 100,000 |
+| "01234" | 2 | 2 | 110,000 |
+| "1" | 1 | 1 | 50,000 |
+| "213" | NULL | 1 | 80,000 |
+| "123" | NULL | 0 | 60,000 |
+| "213" | 2 | 8 | 160,000 |
+
+## Creating a {class}`~dataframely.Schema` class
+
+To get started with dataframely, you'll always want to define a schema ({class}`~dataframely.Schema`). For example, we
+might set up the following:
+
+```python
+import dataframely as dy
+
+
+class HouseSchema(dy.Schema):
+    zip_code = dy.String(nullable=False, min_length=3)
+    num_bedrooms = dy.UInt8(nullable=False)
+    num_bathrooms = dy.UInt8(nullable=False)
+    price = dy.Float64(nullable=False)
+```
+
+This translates into the following expectations on our data:
+
+- We require exactly four columns `zip_code`, `num_bedrooms`, `num_bathrooms`, `price`
+- We expect a particular data type for each of these, requiring all of them to be non-nullable
+- The zip code must be at least three characters as we consider any other zip code invalid
+
+## Custom rules
+
+While parameters in the column-initializers allow for defining expectations on a single column (e.g. `min_length` for
+string columns), this might not always be sufficient.
+In many cases, we want to check expectations across columns: for example, the ratio between the number of bathrooms and
+bedrooms should not be too high.
+
+In `dataframely`, we can do this by adding a custom rule to our schema:
+
+```python
+
+import dataframely as dy
+
+
+class HouseSchema(dy.Schema):
+    zip_code = dy.String(nullable=False, min_length=3)
+    num_bedrooms = dy.UInt8(nullable=False)
+    num_bathrooms = dy.UInt8(nullable=False)
+    price = dy.Float64(nullable=False)
+
+    @dy.rule()
+    def reasonable_bathroom_to_bedrooom_ratio() -> pl.Expr:
+        ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
+        return (ratio >= 1 / 3) & (ratio <= 3)
+```
+
+The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e.
+`reasonable_bathroom_to_bedrooom_ratio`).
+The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data
+are valid with respect to this rule.
+
+## Group rules
+
+For defining even more complex rules, the `@dy.rule` decorator allows for a `group_by`
+parameter: this allows a rule to be evaluated across _rows_.
+For our housing data, this allows us to specify, for example, that we want to observe at least two houses per zip code:
+
+```python
+import dataframely as dy
+
+
+class HouseSchema(dy.Schema):
+    zip_code = dy.String(nullable=False, min_length=3)
+    num_bedrooms = dy.UInt8(nullable=False)
+    num_bathrooms = dy.UInt8(nullable=False)
+    price = dy.Float64(nullable=False)
+
+    @dy.rule()
+    def reasonable_bathroom_to_bedrooom_ratio() -> pl.Expr:
+        ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
+        return (ratio >= 1 / 3) & (ratio <= 3)
+
+    @dy.rule(group_by=["zip_code"])
+    def minimum_zip_code_count() -> pl.Expr:
+        return pl.len() >= 2
+```
+
+When defining rules on groups, we have to take care to use some kind of "aggregate function"
+in order to produce exactly one value per group:
+in group rules, the "input" that the expression is evaluated on is a set of rows.
+
+````{note}
+If you are using `mypy` to type check your code, the usage of rules requires you to use the dataframely `mypy` plugin.
+In order to use it, add the following to your `pyproject.toml`:
+
+```toml
+    [tool.mypy]
+    plugins = ["dataframely.mypy"]
+```
+
+````
+
+## Validating data against a schema
+
+Once we're done defining our schema, we want to apply the schema to our data.
+To validate data against a schema, we can use the `validate` method of the schema class. For example, we can validate
+the data set above as follows:
+
+```python
+
+import polars as pl
+
+df = pl.DataFrame({
+    "zip_code": ["01234", "01234", "1", "213", "123", "213"],
+    "num_bedrooms": [2, 2, 1, None, None, 2],
+    "num_bathrooms": [1, 2, 1, 1, 0, 8],
+    "price": [100_000, 110_000, 50_000, 80_000, 60_000, 160_000]
+})
+
+# Validate the data and cast columns to expected types
+validated_df = HouseSchema.validate(df, cast=True)
+```
+
+If any row in `df` is invalid, i.e., any rule defined on individual columns or the entire schema evaluates to
+`False`, a validation exception is raised.
+Here, we have invalid data in the `num_bedrooms` and `zip_code` columns.
+
+    RuleValidationError: 2 rules failed validation:
+    * Column 'num_bedrooms' failed validation for 1 rules:
+      - 'nullability' failed for 2 rows
+    * Column 'zip_code' failed validation for 1 rules:
+      - 'min_length' failed for 1 rows
+
+Otherwise, if all rows in `df` are valid, `validate` returns a validated data frame of type
+`dy.DataFrame[HouseSchema]`.
+The same applies when a `pl.LazyFrame` is passed to `validate`.
+The generic data frame types allow for more readable function signatures to express
+expectations on the schema of the data frame, e.g.:
+
+```python
+def train_model(df: dy.DataFrame[HouseSchema]) -> None:
+    ...
+```
+
+The type checker (typically `mypy`) then ensures that it is actually a
+`dy.DataFrame[HouseSchema]` that is passed to the function and it complains if a plain
+(i.e., non-validated) `pl.DataFrame` or a data frame with a different schema is used.
+The `train_model` function can be implemented with peace of mind: `df` looks exactly as needed.
+
+```{note}
+Make sure that you do not bypass the type checker by using `# type: ignore` annotations in these contexts.
+This defies the entire purpose of the typed data frames.
+Also note that the frame types generic over a schema are *only* available to the static type checker.
+If you call `isinstance()` checking for `dy.DataFrame`, it will *always* evaluate to `False`.
+The run-time type of the data frame is still a `pl.DataFrame`.
+```
+
+# Using soft-validation to introspect validation failures
+
+While `validate` is useful for ensuring that the entire dataset meets expectations,
+it is not always useful in production systems where invalid rows should be ignored while all valid rows should be
+salvaged.
+
+To this end, `dataframely` provides the `filter` method that performs "soft-validation":
+
+```python
+    # Filter the data and cast columns to expected types
+good, failure = HouseSchema.filter(df, cast=True)
+
+# Inspect the reasons for the failed rows
+print(failure.counts())
+```
+
+In this case, `good` remains a `dy.DataFrame[HouseSchema]`, albeit with potentially fewer rows than `df`.
+The `failure` object is of type :class:`~dataframely.FailureInfo` and provides means to inspect
+the reasons for validation failures for invalid rows.
+ +Given the example data above and the schema that we defined, we know that rows 2, 3, 4, and 5 are invalid (0-indexed): + +- Row 2 has a zip code that does not appear at least twice +- Row 3 has a NULL value for the number of bedrooms +- Row 4 violates both of the rules above +- Row 5 violates the reasonable bathroom to bedroom ratio + +Using the `counts` method on the :class:`~dataframely.FailureInfo` object will result in the following dictionary: + +```python +{ + "reasonable_bathroom_to_bedrooom_ratio": 1, + "minimum_zip_code_count": 2, + "zip_code|min_length": 1, + "num_bedrooms|nullability": 2, +} +``` + +To get a data frame containing all failed rows, we can use the `invalid` method: + +```python +failed_df = failure.invalid() +``` + +This information tends to be very useful in tracking down issues with the data, +both in productive systems and analytics environments. + +## Type casting + +In rare cases, you might already be _absolutely certain_ that a data frame is valid with +respect to a particular schema and do not want to pay the runtime cost of calling `validate` or `filter`. +To this end, you can use the `cast` method to tell this to the type checker without inspecting the contents of the +data frame: + +```python +df_valid = HouseSchema.cast(df) +``` + +A use case for `cast` could be the concatenation of two data frames with known schema, e.g.: + +```python +df1: dy.DataFrame[HouseSchema] +df2: dy.DataFrame[HouseSchema] +df_concat = HouseSchema.cast(pl.concat([df1, df2])) +``` + +## Integration with external tools + +Lastly, `dataframely` schemas can be used to integrate with external tools: + +- `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing +- `HouseSchema.sql_schema()` provides a list of `sqlalchemy `\_ columns that can be used to + create SQL tables using types and constraints in line with the schema +- `HouseSchema.pyarrow_schema()` provides a `pyarrow `\_ schema with + appropriate column dtypes and nullability information +- You can use `dy.DataFrame[HouseSchema]` (or the `LazyFrame` equivalent) as fields in + `pydantic `\_ models, including support for validation and serialization. Integration with + pydantic is unstable. + +## Outlook + +This concludes the quickstart guide. For more information, please see the +`real-world example `\_\_ or dive into the API documentation. diff --git a/docs/sites/quickstart.rst b/docs/sites/quickstart.rst deleted file mode 100644 index b191d343..00000000 --- a/docs/sites/quickstart.rst +++ /dev/null @@ -1,262 +0,0 @@ -Quickstart -========== - -For the purpose of this guide, let's assume that we're working with data that we use to predict housing prices. -To this end, we want to ensure that all the data we're using meets several expectations. -As a running example, consider the following data set: - -.. list-table:: - :header-rows: 1 - - * - ``zip_code`` - - ``num_bedrooms`` - - ``num_bathrooms`` - - ``price`` - * - "01234" - - 2 - - 1 - - 100,000 - * - "01234" - - 2 - - 2 - - 110,000 - * - "1" - - 1 - - 1 - - 50,000 - * - "213" - - - - 1 - - 80,000 - * - "123" - - - - 0 - - 60,000 - * - "213" - - 2 - - 8 - - 160,000 - - -Creating a :class:`~dataframely.Schema` class ---------------------------------------------------- - -To get started with dataframely, you'll always want to define a schema (``dataframely.Schema``). 
For example, we might set up the following: - -:: - - import dataframely as dy - - class HouseSchema(dy.Schema): - zip_code = dy.String(nullable=False, min_length=3) - num_bedrooms = dy.UInt8(nullable=False) - num_bathrooms = dy.UInt8(nullable=False) - price = dy.Float64(nullable=False) - -This translates into the following expectations on our data: - -- We require exactly four columns ``zip_code``, ``num_bedrooms``, ``num_bathrooms``, ``price`` -- We expect a particular data type for each of these, requiring all of them to be non-nullable -- The zip code must be at least three characters as we consider any other zip code invalid - -Custom rules ------------- - -While parameters in the column-initializers allow for defining expectations on a single column (e.g. ``min_length`` for string columns), this might not always by sufficient. -In many cases, we want to check expectations across columns: for example, the ratio between the number of bathrooms and bedrooms should not be too high. - -In ``dataframely``, we can do this by adding a custom rule to our schema: - -:: - - import dataframely as dy - - class HouseSchema(dy.Schema): - zip_code = dy.String(nullable=False, min_length=3) - num_bedrooms = dy.UInt8(nullable=False) - num_bathrooms = dy.UInt8(nullable=False) - price = dy.Float64(nullable=False) - - @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio() -> pl.Expr: - ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") - return (ratio >= 1 / 3) & (ratio <= 3) - -The decorator ``@dy.rule()`` "registers" the function as a rule using its name (i.e. ``reasonable_bathroom_to_bedrooom_ratio``). -The returned expression provides a boolean value for each row of the data which evaluates to ``True`` whenever the data are valid with respect to this rule. - -Group rules ------------ - -For defining even more complex rules, the ``@dy.rule`` decorator allows for a ``group_by`` -parameter: this allows to evaluate a rule across *rows*. -For our housing data, this allows us to specify, for example, that we want to observe at least two houses per zip code: - -:: - - import dataframely as dy - - class HouseSchema(dy.Schema): - zip_code = dy.String(nullable=False, min_length=3) - num_bedrooms = dy.UInt8(nullable=False) - num_bathrooms = dy.UInt8(nullable=False) - price = dy.Float64(nullable=False) - - @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio() -> pl.Expr: - ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") - return (ratio >= 1 / 3) & (ratio <= 3) - - @dy.rule(group_by=["zip_code"]) - def minimum_zip_code_count() -> pl.Expr: - return pl.len() >= 2 - -When defining rules on groups, we have to take care to use some kind of "aggregate function" -in order to produce exactly one value per group: -in group rules, the "input" that the expression is evaluated on is a set of rows. - -.. note:: - If you are using `mypy` to type check your code, the usage of rules requires you to use the dataframely `mypy` plugin. In order to use it, add the following to your ``pyproject.toml``: - - :: - - [tool.mypy] - plugins = ["dataframely.mypy"] - - -Validating data against a schema --------------------------------- - -Once we're done defining our schema, we want to apply the schema to our data. -To validate data against a schema, we can use the ``validate`` method of the schema class. 
For example, we can validate the data set above as follows: - -:: - - import polars as pl - - df = pl.DataFrame({ - "zip_code": ["01234", "01234", "1", "213", "123", "213"], - "num_bedrooms": [2, 2, 1, None, None, 2], - "num_bathrooms": [1, 2, 1, 1, 0, 8], - "price": [100_000, 110_000, 50_000, 80_000, 60_000, 160_000] - }) - - # Validate the data and cast columns to expected types - validated_df = HouseSchema.validate(df, cast=True) - -If any row in ``df`` is invalid, i.e., any rule defined on individual columns or the entire schema evaluates to ``False``, a validation exception is raised. -Here, we have invalid data in the ``num_bedrooms`` and ``zip_code`` columns. - -:: - - RuleValidationError: 2 rules failed validation: - * Column 'num_bedrooms' failed validation for 1 rules: - - 'nullability' failed for 2 rows - * Column 'zip_code' failed validation for 1 rules: - - 'min_length' failed for 1 rows - -Otherwise, if all rows in ``df`` are valid, ``validate`` returns a validated data frame of type ``dy.DataFrame[HouseSchema]``. -The same applies when a ``pl.LazyFrame`` is passed to `validate`. -The generic data frame types allow for more readable function signatures to express -expectations on the schema of the data frame, e.g.: - -:: - - def train_model(df: dy.DataFrame[HouseSchema]) -> None: - ... - -The type checker (typically ``mypy``) then ensures that it is actually a -``dy.DataFrame[HouseSchema]`` that is passed to the function and it complains if a plain -(i.e., non-validated) ``pl.DataFrame`` or a data frame with a different schema is used. -The ``train_model`` function can be implemented with peace of mind: ``df`` looks exactly as needed. - -.. note:: - Make sure that you do not bypass the type checker by using ``# type: ignore`` annotations in these contexts. - This defies the entire purpose of the typed data frames. - Also note that the frame types generic over a schema are *only* available to the static type checker. - If you call ``isinstance()`` checking for ``dy.DataFrame``, it will *always* evaluate to ``False``. - The run-time type of the data frame is still a ``pl.DataFrame``. - -Using soft-validation to introspect validation failures -------------------------------------------------------- - -While ``validate`` is useful for ensuring that the entire dataset meets expectations, -it is not always useful in production systems where invalid rows should be ignored while all valid rows should be salvaged. - -To this end, ``dataframely`` provides the ``filter`` method that performs "soft-validation": - -:: - - # Filter the data and cast columns to expected types - good, failure = HouseSchema.filter(df, cast=True) - - # Inspect the reasons for the failed rows - print(failure.counts()) - -In this case, ``good`` remains to be a ``dy.DataFrame[HouseSchema]``, albeit with potentially fewer rows than ``df``. -The ``failure`` object is of type :class:`~dataframely.FailureInfo` and provides means to inspect -the reasons for validation failures for invalid rows. 
-
-Given the example data above, and the schema that we defined, we know that rows 2, 3, 4, and 5 are invalid (0-indexed):
-
-- Row 2 has a zip code that does not appear at least twice
-- Row 3 has a NULL value for the number of bedrooms
-- Row 4 violates both of the rules above
-- Row 5 violates the reasonable bathroom to bedroom ratio
-
-Using the ``counts`` method on the :class:`~dataframely.FailureInfo` object will result in the following dictionary:
-
-::
-
-    {
-        "reasonable_bathroom_to_bedrooom_ratio": 1,
-        "minimum_zip_code_count": 2,
-        "zip_code|min_length": 1,
-        "num_bedrooms|nullability": 2,
-    }
-
-
-To get a data frame containing all failed rows, we can use the ``invalid`` method:
-
-::
-
-    failed_df = failure.invalid()
-
-This information tends to be very useful in tracking down issues with the data,
-both in productive systems and analytics environments.
-
-Type casting
-------------
-
-In rare cases, you might already be *absolutely certain* that a data frame is valid with
-respect to a particular schema and do not want to pay the runtime cost of calling ``validate`` or ``filter``.
-To this end, you can use the ``cast`` method to tell this to the type checker without inspecting the contents of the data frame:
-
-::
-
-    df_valid = HouseSchema.cast(df)
-
-A use case for ``cast`` could be the concatenation of two data frames with known schema, e.g.:
-
-::
-
-    df1: dy.DataFrame[HouseSchema]
-    df2: dy.DataFrame[HouseSchema]
-    df_concat = HouseSchema.cast(pl.concat([df1, df2]))
-
-
-Integration with external tools
--------------------------------
-
-Lastly, ``dataframely`` schemas can be used to integrate with external tools:
-
-- ``HouseSchema.create_empty()`` creates an empty ``dy.DataFrame[HouseSchema]`` that can be used for testing
-- ``HouseSchema.sql_schema()`` provides a list of `sqlalchemy <https://www.sqlalchemy.org>`_ columns that can be used to create SQL tables using types and constraints in line with the schema
-- ``HouseSchema.pyarrow_schema()`` provides a `pyarrow <https://arrow.apache.org/docs/python/index.html>`_ schema with appropriate column dtypes and nullability information
-- You can use ``dy.DataFrame[HouseSchema]`` (or the ``LazyFrame`` equivalent) as fields in `pydantic <https://pydantic.dev>`_ models, including support for validation and serialization. Integration with pydantic is unstable.
-
-
-Outlook
--------
-
-This concludes the quickstart guide. For more information, please see the `real-world example `__ or dive into the API documentation.
diff --git a/docs/sites/versioning.md b/docs/sites/versioning.md
new file mode 100644
index 00000000..5a089759
--- /dev/null
+++ b/docs/sites/versioning.md
@@ -0,0 +1,30 @@
+# Versioning policy and breaking changes
+
+Dataframely uses [semantic versioning](https://semver.org/).
+This versioning scheme is designed to make it easy for users to anticipate what types of change they can expect from a
+given version update in their dependencies.
+We generally recommend that users take measures to control dependency versions. Personally, we like to use `pixi` as a
+package manager, which comes with built-in
+support for lockfiles. Many other package managers support similar functionality. When updating the lockfiles, we
+recommend using automated testing
+to ensure that user code still works with newer versions of dependencies such as `dataframely`.
+
+Most importantly, semantic versioning implies that breaking changes of user-facing functionality are only introduced in
+**major releases**.
+We therefore recommend that users are particularly vigilant when updating their environments to a newer major release of
+`dataframely`.
+As always, automated testing is useful here, but we also recommend checking the release notes
+[published on GitHub](https://github.com/Quantco/dataframely/releases).
+
+In order to give users a heads-up before breaking changes are released, we introduce
+[FutureWarnings](https://docs.python.org/3/library/exceptions.html#FutureWarning).
+Warnings are the most direct and effective tool at our disposal for reaching users directly.
+We therefore generally recommend that users do not silence such warnings explicitly, but instead migrate their code
+proactively, whenever possible.
+However, we also understand that the need for migration may catch users at an inconvenient time, and a temporary
+band-aid solution might be required.
+Users can disable `FutureWarnings` either through
+[python builtins](https://docs.python.org/3/library/warnings.html#warnings.filterwarnings),
+builtins from tools
+like [pytest](https://docs.pytest.org/en/stable/how-to/capture-warnings.html#controlling-warnings),
+or by setting the `DATAFRAMELY_NO_FUTURE_WARNINGS` environment variable to `true` or `1`.
diff --git a/docs/sites/versioning.rst b/docs/sites/versioning.rst
deleted file mode 100644
index ecf0fe39..00000000
--- a/docs/sites/versioning.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Versioning policy and breaking changes
-======================================
-
-Dataframely uses `semantic versioning <https://semver.org/>`_.
-This versioning scheme is designed to make it easy for users to anticipate what types of change they can expect from a given version update in their dependencies.
-We generally recommend that users take measures to control dependency versions. Personally, we like to use ``pixi`` as a package manager, which comes with builtin
-support for lockfiles. Many other package managers support similar functionality. When updating the lockfiles, we recommend to use automated testing
-to ensure that user code still works with newer versions of dependencies such as ``dataframely``.
-
-Most importantly, semantic versioning implies that breaking changes of user-facing functionality are only introduced in **major releases**.
-We therefore recommend that users are particularly vigilant when updating their environments to a newer major release of `dataframely`.
-As always, automated testing is useful here, but we also recommend checking the release notes `published on GitHub <https://github.com/Quantco/dataframely/releases>`_.
-
-In order to give users a heads-up before breaking changes are released, we introduce `FutureWarnings <https://docs.python.org/3/library/exceptions.html#FutureWarning>`_ .
-Warnings are the most direct and effective tool at our disposal for reaching users directly.
-We therefore generally recommend that users do not silence such warnings explicitly, but instead migrate their code proactively, whenever possible.
-However, we also understand that the need for migration may catch users at an inconvenient time, and a temporary band aid solution might be required.
-Users can disable ``FutureWarnings`` either through `python builtins <https://docs.python.org/3/library/warnings.html#warnings.filterwarnings>`_,
-builtins from tools like `pytest <https://docs.pytest.org/en/stable/how-to/capture-warnings.html#controlling-warnings>`_ ,
-or by setting the ``DATAFRAMELY_NO_FUTURE_WARNINGS`` environment variable to ``true`` or ``1``.
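As a sketch of the three silencing options named in the versioning policy above (the `warnings` call is the standard-library API; the docs do not specify when the environment variable is read, so setting it before the import is the safe assumption):

```python
import os
import warnings

# Option 1: Python builtin; ignores all FutureWarnings raised in this process.
warnings.filterwarnings("ignore", category=FutureWarning)

# Option 2: the environment variable checked by dataframely; set it before
# importing the package to be safe.
os.environ["DATAFRAMELY_NO_FUTURE_WARNINGS"] = "true"

import dataframely as dy  # noqa: E402
```

For pytest, the equivalent is a `filterwarnings` entry in the test configuration, as described in the linked pytest documentation.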
diff --git a/pixi.lock b/pixi.lock index 27cec496..708867c6 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1942,12 +1942,16 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.2-he9a06e4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/make-4.4.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.7-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/maturin-1.9.5-py310hf7d6592_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.1.4-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mypy-1.18.2-py313h07c4f96_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-7.16.6-hb482800_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.6-pyh29332c3_0.conda @@ -1986,6 +1990,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.7-h4df99d1_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyzmq-27.1.0-py312hfb55c3c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.36.2-pyh29332c3_0.conda @@ -2023,6 +2028,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.5-h387f397_9.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.25.0-py313h54dd161_0.conda @@ -2096,12 +2102,16 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.2-h3e4203c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.1-h86ecc28_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/make-4.4.1-h2a6d0cb_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py313hfa222a2_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.7-pyhd8ed1ab_1.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-aarch64/maturin-1.9.5-py310h49ee9a9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.1.4-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mypy-1.18.2-py313h6194ac5_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-7.16.6-hb482800_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.6-pyh29332c3_0.conda @@ -2140,6 +2150,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.7-h4df99d1_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pyyaml-6.0.3-py313hd3a54cf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pyzmq-27.1.0-py312h4552c38_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.2-h8382b9d_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.36.2-pyh29332c3_0.conda @@ -2177,6 +2188,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/yaml-0.2.5-h80f16a2_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zeromq-4.3.5-hefbcea8_9.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstandard-0.25.0-py313h62ef0ea_0.conda @@ -2238,12 +2250,16 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/llvm-openmp-21.1.2-h472b3d1_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/make-4.4.1-h00291cd_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/markupsafe-3.0.3-py313h0f4d31d_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.7-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/maturin-1.9.5-py310h765790a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.1.4-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/mypy-1.18.2-py313hf050af9_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-7.16.6-hb482800_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.6-pyh29332c3_0.conda @@ -2282,6 +2298,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.7-h4df99d1_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/pyyaml-6.0.3-py313h0f4d31d_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/pyzmq-27.1.0-py312hb7d603e_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h7cca4af_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.36.2-pyh29332c3_0.conda @@ -2318,6 +2335,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/osx-64/yaml-0.2.5-h4132b18_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zeromq-4.3.5-h6c33b1e_9.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.25.0-py313hcb05632_0.conda @@ -2380,12 +2398,16 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/libzlib-1.3.1-h8359307_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/llvm-openmp-21.1.2-h4a912ad_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/make-4.4.1-hc9fafa5_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/markupsafe-3.0.3-py313h7d74516_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.7-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/maturin-1.9.5-py310h34f76f2_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.1.4-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/mypy-1.18.2-py313h6535dbc_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-7.16.6-hb482800_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.6-pyh29332c3_0.conda @@ -2424,6 +2446,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.7-h4df99d1_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.3-py313h7d74516_0.conda - conda: 
https://conda.anaconda.org/conda-forge/osx-arm64/pyzmq-27.1.0-py312hd65ceae_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h1d1bf99_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.36.2-pyh29332c3_0.conda @@ -2460,6 +2483,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zeromq-4.3.5-h888dc83_9.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstandard-0.25.0-py313h9734d34_0.conda @@ -2525,13 +2549,17 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/llvm-openmp-21.1.2-hfa2b4ca_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/make-4.4.1-h0e40799_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/markupsafe-3.0.3-py313hd650c13_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.7-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/maturin-1.9.5-py310h194dfaf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mistune-3.1.4-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/mkl-2024.2.2-h57928b3_16.conda - conda: https://conda.anaconda.org/conda-forge/win-64/mypy-1.18.2-py313h5ea7bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-7.16.6-hb482800_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.6-pyh29332c3_0.conda @@ -2568,6 +2596,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/pywin32-311-py313h40c08fc_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/pyyaml-6.0.3-py313hd650c13_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/pyzmq-27.1.0-py312hbb5da91_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.36.2-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda @@ -2609,6 +2638,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/win_inet_pton-1.1.0-pyh7428d3b_8.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/yaml-0.2.5-h6a83c73_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zeromq-4.3.5-h5bddc39_9.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstandard-0.25.0-py313h5fd188c_0.conda @@ -12706,6 +12736,16 @@ packages: license_family: GPL size: 2176937 timestamp: 1727802346950 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda + sha256: 0fbacdfb31e55964152b24d5567e9a9996e1e7902fb08eb7d91b5fd6ce60803a + md5: fee3164ac23dfca50cfcc8b85ddefb81 + depends: + - mdurl >=0.1,<1 + - python >=3.9 + license: MIT + license_family: MIT + size: 64430 + timestamp: 1733250550053 - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda sha256: 7b1da4b5c40385791dbc3cc85ceea9fad5da680a27d5d3cb8bfaa185e304a89e md5: 5b5203189eb668f042ac2b0826244964 @@ -12873,6 +12913,16 @@ packages: license_family: MIT size: 6404225 timestamp: 1759587134851 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdit-py-plugins-0.5.0-pyhd8ed1ab_0.conda + sha256: 123cc004e2946879708cdb6a9eff24acbbb054990d6131bb94bca7a374ebebfc + md5: 1997a083ef0b4c9331f9191564be275e + depends: + - markdown-it-py >=2.0.0,<5.0.0 + - python >=3.10 + license: MIT + license_family: MIT + size: 43805 + timestamp: 1754946862113 - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 md5: 592132998493b3ff25fd7479396e8351 @@ -13246,6 +13296,21 @@ packages: license_family: MIT size: 11766 timestamp: 1745776666688 +- conda: https://conda.anaconda.org/conda-forge/noarch/myst-parser-4.0.1-pyhd8ed1ab_0.conda + sha256: f035d0ea623f63247f0f944eb080eaa2a45fb5b7fda8947f4ac94d381ef3bf33 + md5: b528795158847039003033ee0db20e9b + depends: + - docutils >=0.19,<0.22 + - jinja2 + - markdown-it-py >=3.0.0,<4.0.0 + - mdit-py-plugins >=0.4.1,<1 + - python >=3.10 + - pyyaml + - sphinx >=7,<9 + license: MIT + license_family: MIT + size: 73074 + timestamp: 1739381945342 - conda: https://conda.anaconda.org/conda-forge/noarch/nbclient-0.10.2-pyhd8ed1ab_0.conda sha256: a20cff739d66c2f89f413e4ba4c6f6b59c50d5c30b5f0d840c13e8c9c2df9135 md5: 6bb0d77277061742744176ab555b723c diff --git a/pixi.toml b/pixi.toml index 3a2d59cd..cb157e1c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -34,7 +34,7 @@ sphinx = "*" sphinx-copybutton = "*" sphinx_rtd_theme = "*" sphinxcontrib-apidoc = "*" - +myst-parser = "*" [feature.docs.tasks] docs = "cd docs && make html" readthedocs = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html" From b79e5eb9e62270e0a75bfbe0d58771805c9f1093 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 11:25:03 +0200 Subject: [PATCH 2/4] fix --- docs/index.md | 16 ++++++++-------- docs/sites/installation.md | 4 ++-- docs/sites/quickstart.md | 14 ++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/docs/index.md b/docs/index.md index fe461695..cd142b8f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # Dataframely -Dataframely is a Python package to validate the schema and content of `polars `\_ data frames. +Dataframely is a Python package to validate the schema and content of [polars](https://pola.rs/)\_ data frames. 
Its purpose is to make data pipelines more robust by ensuring that data meet expectations and more readable by adding schema information to data frame type hints. @@ -24,13 +24,13 @@ schema information to data frame type hints. :caption: Contents :maxdepth: 2 - Installation - Quickstart - Real-world Example - Features - FAQ - Development Guide - Versioning +sites/installation +sites/quickstart +sites/examples/real-world +sites/features/index.md +sites/faq.md +sites/development.md +sites/versioning.md ``` ## API Documentation diff --git a/docs/sites/installation.md b/docs/sites/installation.md index 5982fb33..c660c7a5 100644 --- a/docs/sites/installation.md +++ b/docs/sites/installation.md @@ -3,6 +3,6 @@ To install `dataframely`, use your favorite package manager, e.g., using `pixi` or `pip`: ```bash - pixi add dataframely - pip install dataframely +pixi add dataframely +pip install dataframely ``` diff --git a/docs/sites/quickstart.md b/docs/sites/quickstart.md index 45a9d203..52462157 100644 --- a/docs/sites/quickstart.md +++ b/docs/sites/quickstart.md @@ -44,7 +44,6 @@ bedrooms should not be too high. In `dataframely`, we can do this by adding a custom rule to our schema: ```python - import dataframely as dy @@ -113,7 +112,6 @@ To validate data against a schema, we can use the `validate` method of the schem the data set above as follows: ```python - import polars as pl df = pl.DataFrame({ @@ -161,7 +159,7 @@ If you call `isinstance()` checking for `dy.DataFrame`, it will *always* evaluat The run-time type of the data frame is still a `pl.DataFrame`. ``` -# Using soft-validation to introspect validation failures +## Using soft-validation to introspect validation failures While `validate` is useful for ensuring that the entire dataset meets expectations, it is not always useful in production systems where invalid rows should be ignored while all valid rows should be @@ -170,7 +168,7 @@ salvaged. To this end, `dataframely` provides the `filter` method that performs "soft-validation": ```python - # Filter the data and cast columns to expected types +# Filter the data and cast columns to expected types good, failure = HouseSchema.filter(df, cast=True) # Inspect the reasons for the failed rows @@ -232,15 +230,15 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2])) Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing -- `HouseSchema.sql_schema()` provides a list of `sqlalchemy `\_ columns that can be used to +- `HouseSchema.sql_schema()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to create SQL tables using types and constraints in line with the schema -- `HouseSchema.pyarrow_schema()` provides a `pyarrow `\_ schema with +- `HouseSchema.pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information - You can use `dy.DataFrame[HouseSchema]` (or the `LazyFrame` equivalent) as fields in - `pydantic `\_ models, including support for validation and serialization. Integration with + [pydantic](https://pydantic.dev) models, including support for validation and serialization. Integration with pydantic is unstable. ## Outlook This concludes the quickstart guide. For more information, please see the -`real-world example `\_\_ or dive into the API documentation. +[real-world example](examples/real-world.ipynb) or dive into the API documentation. 
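The external-tool integrations listed in the quickstart can be exercised roughly as follows. This is a sketch that mirrors the zero-argument call forms used in the text above; the actual methods may accept additional parameters (e.g., a SQL dialect) not shown here:

```python
import dataframely as dy


class HouseSchema(dy.Schema):
    zip_code = dy.String(nullable=False, min_length=3)
    price = dy.Float64(nullable=False)


# Empty but schema-conforming data frame, handy as a fixture in tests.
empty_df = HouseSchema.create_empty()

# sqlalchemy column definitions derived from the column types and constraints.
sql_columns = HouseSchema.sql_schema()

# pyarrow schema with matching dtypes and nullability information.
arrow_schema = HouseSchema.pyarrow_schema()
```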
From 34ae8bb5c347aba3cbd597ac9ce1c9282a5a9113 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 11:28:46 +0200 Subject: [PATCH 3/4] fix --- docs/sites/development.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/sites/development.md b/docs/sites/development.md index fd98a8ce..b37e5dd4 100644 --- a/docs/sites/development.md +++ b/docs/sites/development.md @@ -26,7 +26,12 @@ pixi run test You can adjust the `tests/` path to run tests in a specific directory or module. -## Building the Documentation +## Documentation + +We use [Sphinx](https://www.sphinx-doc.org/en/master/index.html) together +with [MyST](https://myst-parser.readthedocs.io/), and write user documentation in markdown. +If you are not yet familiar with this setup, +the [MyST docs for Sphinx](https://myst-parser.readthedocs.io/en/v0.17.2/sphinx/intro.html) are a good starting point. When updating the documentation, you can compile a localized build of the documentation and then open it in your web browser using the commands below: From 9206f4231437b7aabecab126504c16b64bc304ed Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Fri, 17 Oct 2025 12:04:24 +0200 Subject: [PATCH 4/4] fix --- dataframely/collection.py | 34 +++++++++++++++++----------------- docs/conf.py | 1 - 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/dataframely/collection.py b/dataframely/collection.py index 4b6b8c37..0916444b 100644 --- a/dataframely/collection.py +++ b/dataframely/collection.py @@ -63,7 +63,7 @@ class Collection(BaseCollection, ABC): represent "semantic objects" which cannot be represented in a single data frame due to 1-N relationships that are managed in separate data frames. - A collection must only have type annotations for :class:`~dataframely.LazyFrame`s + A collection must only have type annotations for :class:`~dataframely.LazyFrame` with known schema: .. code:: python @@ -786,20 +786,20 @@ def read_parquet( Parquet files may have been written with Hive partitioning. validation: The strategy for running validation when reading the data: - - ``"allow"`: The method tries to read the schema data from the parquet + - ``"allow"``: The method tries to read the schema data from the parquet files. If the stored collection schema matches this collection schema, the collection is read without validation. If the stored schema mismatches this schema no metadata can be found in the parquets, or the files have conflicting metadata, this method automatically runs :meth:`validate` with ``cast=True``. - - ``"warn"`: The method behaves similarly to ``"allow"``. However, + - ``"warn"``: The method behaves similarly to ``"allow"``. However, it prints a warning if validation is necessary. - ``"forbid"``: The method never runs validation automatically and only returns if the metadata stores a collection schema that matches this collection. - ``"skip"``: The method never runs validation and simply reads the - data, entrusting the user that the schema is valid. _Use this option - carefully_. + data, entrusting the user that the schema is valid. *Use this option + carefully*. kwargs: Additional keyword arguments passed directly to :meth:`polars.read_parquet`. @@ -849,20 +849,20 @@ def scan_parquet( Parquet files may have been written with Hive partitioning. validation: The strategy for running validation when reading the data: - - ``"allow"`: The method tries to read the schema data from the parquet + - ``"allow"``: The method tries to read the schema data from the parquet files. 
If the stored collection schema matches this collection schema, the collection is read without validation. If the stored schema mismatches this schema no metadata can be found in the parquets, or the files have conflicting metadata, this method automatically runs :meth:`validate` with ``cast=True``. - - ``"warn"`: The method behaves similarly to ``"allow"``. However, + - ``"warn"``: The method behaves similarly to ``"allow"``. However, it prints a warning if validation is necessary. - ``"forbid"``: The method never runs validation automatically and only returns if the metadata stores a collection schema that matches this collection. - ``"skip"``: The method never runs validation and simply reads the - data, entrusting the user that the schema is valid. _Use this option - carefully_. + data, entrusting the user that the schema is valid. *Use this option + carefully*. kwargs: Additional keyword arguments passed directly to :meth:`polars.scan_parquet` for all members. @@ -947,20 +947,20 @@ def scan_delta( source: The location or DeltaTable to read from. validation: The strategy for running validation when reading the data: - - ``"allow"`: The method tries to read the schema data from the parquet + - ``"allow"``: The method tries to read the schema data from the parquet files. If the stored collection schema matches this collection schema, the collection is read without validation. If the stored schema mismatches this schema no metadata can be found in the parquets, or the files have conflicting metadata, this method automatically runs :meth:`validate` with ``cast=True``. - - ``"warn"`: The method behaves similarly to ``"allow"``. However, + - ``"warn"``: The method behaves similarly to ``"allow"``. However, it prints a warning if validation is necessary. - ``"forbid"``: The method never runs validation automatically and only returns if the metadata stores a collection schema that matches this collection. - ``"skip"``: The method never runs validation and simply reads the - data, entrusting the user that the schema is valid. _Use this option - carefully_. + data, entrusting the user that the schema is valid. *Use this option + carefully*. kwargs: Additional keyword arguments passed directly to :meth:`polars.scan_delta`. @@ -1010,20 +1010,20 @@ def read_delta( source: The location or DeltaTable to read from. validation: The strategy for running validation when reading the data: - - ``"allow"`: The method tries to read the schema data from the parquet + - ``"allow"``: The method tries to read the schema data from the parquet files. If the stored collection schema matches this collection schema, the collection is read without validation. If the stored schema mismatches this schema no metadata can be found in the parquets, or the files have conflicting metadata, this method automatically runs :meth:`validate` with ``cast=True``. - - ``"warn"`: The method behaves similarly to ``"allow"``. However, + - ``"warn"``: The method behaves similarly to ``"allow"``. However, it prints a warning if validation is necessary. - ``"forbid"``: The method never runs validation automatically and only returns if the metadata stores a collection schema that matches this collection. - ``"skip"``: The method never runs validation and simply reads the - data, entrusting the user that the schema is valid. _Use this option - carefully_. + data, entrusting the user that the schema is valid. *Use this option + carefully*. kwargs: Additional keyword arguments passed directly to :meth:`polars.read_delta`. 
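
The four `validation` strategies documented in the hunks above read naturally at the call site. A minimal sketch, assuming a one-member collection and a parquet directory previously written by the collection itself; the class, schema, and directory argument shape are illustrative assumptions, while the strategy literals and the `dy.LazyFrame` annotation requirement come from the docstrings above:

```python
import dataframely as dy

class HouseSchema(dy.Schema):
    zip_code = dy.String(nullable=False)

class HousingData(dy.Collection):
    # Collections may only carry dy.LazyFrame annotations with known schemas.
    houses: dy.LazyFrame[HouseSchema]

# "allow": skip validation if the stored collection schema matches, else validate.
data = HousingData.read_parquet("housing/", validation="allow")

# "warn": like "allow", but emits a warning whenever validation has to run.
data = HousingData.read_parquet("housing/", validation="warn")

# "forbid": never validate implicitly; fail on any schema mismatch.
data = HousingData.read_parquet("housing/", validation="forbid")

# "skip": read without any validation; per the docstring, use this option carefully.
lazy = HousingData.scan_parquet("housing/", validation="skip")
```
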
diff --git a/docs/conf.py b/docs/conf.py index 0ada8e39..ad2e9bf0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,7 +43,6 @@ "sphinx.ext.autodoc", "sphinx.ext.linkcode", "sphinxcontrib.apidoc", - "sphinx.ext.intersphinx", "myst_parser", ]
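
The docstring edits in patch 4 all apply the same two RST rules: inline literals need double backticks on both sides, and emphasis is written with asterisks, since Markdown-style underscores carry no meaning in RST. Condensed into a single illustrative docstring (the function itself is hypothetical, not part of the library):

```python
def read_any(validation: str = "allow") -> None:
    """Read data with a configurable validation strategy.

    Args:
        validation: The strategy for running validation:

            - ``"allow"``: validate only when the stored schema mismatches.
            - ``"skip"``: never validate. *Use this option carefully*.
    """
```
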