From a9ec0bbdafa943bf1c6eba23d656190a6a972f7b Mon Sep 17 00:00:00 2001 From: julien Date: Tue, 12 May 2026 22:18:42 +0200 Subject: [PATCH] feat(anonymize): add NULL, NAME, PHONE_NUMBER, DATE_OF_BIRTH, TEXT, WORD types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `anonymize` previously supported only `EMAIL`. To cover more PII columns without falling back to custom SQL post-steps, extend `FieldType` with: - `NULL` — blank the column. Useful for hashes / tokens / anything that shouldn't carry information in a shared dump. Note YAML's bare `NULL` parses as null, so quote the type: `type: "NULL"`. - `FIRST_NAME`, `LAST_NAME`, `NAME` — `faker.first_name()` / `last_name()` / `name()`. - `PHONE_NUMBER` — `faker.phone_number()`. - `DATE_OF_BIRTH` — `faker.date_of_birth()`, accepts `minimum_age` / `maximum_age`. - `TEXT`, `WORD` — `faker.text()` / `faker.word()`. All Faker-backed types (every type except `NULL`, which ignores extra args) forward their extra args to the underlying Faker call, so the existing `EMAIL` `domain:` pattern works for the new types where Faker exposes options. Tests: 7 new parametrized cases over `_get_fake_value` (`DATE_OF_BIRTH` and `TEXT` are not covered yet) plus an explicit check that an unknown type raises `ValueError`. README: new "3. Anonymization" section with the type table + an example config; subsequent sections renumbered. --- README.md | 47 ++++++++++++++++++++++++++++++++++-- padmy/anonymize/anonymize.py | 19 +++++++++++++++ padmy/config.py | 12 ++++++++- tests/test_anonymize.py | 27 +++++++++++++++++++++ 4 files changed, 102 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 47af6cf..830ff57 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,50 @@ tables: sample: 10 ``` -## 3. Migration utils +## 3. Anonymization + +You can scrub PII from selected columns with `padmy anonymize`. Field types map +to Faker generators (or `NULL` to blank the column). 
Available types: + +| Type | Behavior | +| --- | --- | +| `EMAIL` | `faker.email()` — supports `domain:` extra arg | +| `NULL` | Sets the column to `NULL` (useful for hashes, tokens, anything you don't want to fake) | +| `FIRST_NAME` | `faker.first_name()` | +| `LAST_NAME` | `faker.last_name()` | +| `NAME` | `faker.name()` (full name) | +| `PHONE_NUMBER` | `faker.phone_number()` | +| `DATE_OF_BIRTH` | `faker.date_of_birth()` — supports `minimum_age:` / `maximum_age:` | +| `TEXT` | `faker.text()` — supports `max_nb_chars:` | +| `WORD` | `faker.word()` | + +Example config: + +```yaml +tables: + - schema: public + table: users + fields: + - column: email + type: EMAIL + domain: example.com # extra arg forwarded to faker.email + - column: password_hash + type: "NULL" # quoted: YAML's bare NULL parses as null + - column: first_name + type: FIRST_NAME + - column: birthdate + type: DATE_OF_BIRTH + minimum_age: 18 + maximum_age: 80 +``` + +Run with: + +```bash +uvx padmy anonymize --db test -f config.yml +``` + +## 4. Migration utils **Setting up** @@ -227,7 +270,7 @@ We are all good ! uvx padmy -vv migrate verify-files --sql-dir /tmp/migrations --no-raise ``` -## 4. Comparing databases schemas +## 5. Comparing databases schemas You can compare two databases by running: diff --git a/padmy/anonymize/anonymize.py b/padmy/anonymize/anonymize.py index d79e45f..7e74730 100644 --- a/padmy/anonymize/anonymize.py +++ b/padmy/anonymize/anonymize.py @@ -41,6 +41,25 @@ def _get_fake_value(faker: Faker, field: FieldType, extra_fields: dict | None = match field: case "EMAIL": return faker.email(**_extra_fields) + case "NULL": + # Set the column's value to NULL. Useful when no realistic placeholder is + # wanted (password hashes, tokens, secrets) — the column itself is + # kept (not dropped) but the value carries no information. 
+ return None + case "FIRST_NAME": + return faker.first_name(**_extra_fields) + case "LAST_NAME": + return faker.last_name(**_extra_fields) + case "NAME": + return faker.name(**_extra_fields) + case "PHONE_NUMBER": + return faker.phone_number(**_extra_fields) + case "DATE_OF_BIRTH": + return faker.date_of_birth(**_extra_fields) + case "TEXT": + return faker.text(**_extra_fields) + case "WORD": + return faker.word(**_extra_fields) case _: raise ValueError(f"Got unimplemented field type {field!r}") diff --git a/padmy/config.py b/padmy/config.py index 2f8d508..13d12ec 100644 --- a/padmy/config.py +++ b/padmy/config.py @@ -4,7 +4,17 @@ import yaml -FieldType = Literal["EMAIL"] +FieldType = Literal[ + "EMAIL", + "NULL", + "FIRST_NAME", + "LAST_NAME", + "NAME", + "PHONE_NUMBER", + "DATE_OF_BIRTH", + "TEXT", + "WORD", +] SampleType = float | int diff --git a/tests/test_anonymize.py b/tests/test_anonymize.py index 396c40f..9f6d7ce 100644 --- a/tests/test_anonymize.py +++ b/tests/test_anonymize.py @@ -78,6 +78,33 @@ async def test(): ] +@pytest.mark.parametrize( + "field_type, extra, predicate", + [ + pytest.param("EMAIL", None, lambda v: v and "@" in v, id="EMAIL"), + pytest.param("NULL", None, lambda v: v is None, id="NULL"), + pytest.param("FIRST_NAME", None, lambda v: isinstance(v, str) and v, id="FIRST_NAME"), + pytest.param("LAST_NAME", None, lambda v: isinstance(v, str) and v, id="LAST_NAME"), + pytest.param("NAME", None, lambda v: isinstance(v, str) and " " in v, id="NAME"), + pytest.param("PHONE_NUMBER", None, lambda v: isinstance(v, str) and v, id="PHONE_NUMBER"), + pytest.param("WORD", None, lambda v: isinstance(v, str) and v, id="WORD"), + ], +) +def test_get_fake_value(faker, field_type, extra, predicate): + """Each supported field type returns something matching its shape.""" + from padmy.anonymize.anonymize import _get_fake_value + + value = _get_fake_value(faker, field_type, extra) + assert predicate(value), f"unexpected value for {field_type}: {value!r}" + 
+ +def test_get_fake_value_unknown_type_raises(faker): + from padmy.anonymize.anonymize import _get_fake_value + + with pytest.raises(ValueError, match="unimplemented field type"): + _get_fake_value(faker, "DOES_NOT_EXIST") # type: ignore[arg-type] + + @pytest.mark.usefixtures("add_table_1_data") def test_anonymize_db(apool, engine, loop, faker): from padmy.anonymize import anonymize_db