Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 45 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,50 @@ tables:
sample: 10
```

## 3. Migration utils
## 3. Anonymization

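You can scrub PII from selected columns with `padmy anonymize`. Each field's
`type` selects a Faker generator, except `NULL`, which sets the column to `NULL`
instead of faking a value. Extra keys listed in the table below (such as
`domain:`) are forwarded to the generator as keyword arguments. Available types: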

| Type | Behavior |
| --- | --- |
| `EMAIL` | `faker.email()` — supports `domain:` extra arg |
| `NULL` | Sets the column to `NULL` (useful for hashes, tokens, anything you don't want to fake) |
| `FIRST_NAME` | `faker.first_name()` |
| `LAST_NAME` | `faker.last_name()` |
| `NAME` | `faker.name()` (full name) |
| `PHONE_NUMBER` | `faker.phone_number()` |
| `DATE_OF_BIRTH` | `faker.date_of_birth()` — supports `minimum_age:` / `maximum_age:` |
| `TEXT` | `faker.text()` — supports `max_nb_chars:` |
| `WORD` | `faker.word()` |

Example config:

```yaml
tables:
  - schema: public
    table: users
    fields:
      - column: email
        type: EMAIL
        domain: example.com # extra arg forwarded to faker.email
      - column: password_hash
        type: "NULL" # quoted: YAML's bare NULL parses as null
      - column: first_name
        type: FIRST_NAME
      - column: birthdate
        type: DATE_OF_BIRTH
        minimum_age: 18
        maximum_age: 80
```

Run with:

```bash
uvx padmy anonymize --db test -f config.yml
```

## 4. Migration utils

**Setting up**

Expand Down Expand Up @@ -227,7 +270,7 @@ We are all good !
uvx padmy -vv migrate verify-files --sql-dir /tmp/migrations --no-raise
```

## 4. Comparing databases schemas
## 5. Comparing database schemas

You can compare two databases by running:

Expand Down
19 changes: 19 additions & 0 deletions padmy/anonymize/anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,25 @@ def _get_fake_value(faker: Faker, field: FieldType, extra_fields: dict | None =
match field:
case "EMAIL":
return faker.email(**_extra_fields)
case "NULL":
# Set the value to NULL instead of generating a fake one. Useful when no
# realistic placeholder is wanted (password hashes, tokens, secrets) — the
# column itself is kept, but the value carries no information.
return None
case "FIRST_NAME":
return faker.first_name(**_extra_fields)
case "LAST_NAME":
return faker.last_name(**_extra_fields)
case "NAME":
return faker.name(**_extra_fields)
case "PHONE_NUMBER":
return faker.phone_number(**_extra_fields)
case "DATE_OF_BIRTH":
return faker.date_of_birth(**_extra_fields)
case "TEXT":
return faker.text(**_extra_fields)
case "WORD":
return faker.word(**_extra_fields)
case _:
raise ValueError(f"Got unimplemented field type {field!r}")

Expand Down
12 changes: 11 additions & 1 deletion padmy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,17 @@

import yaml

FieldType = Literal["EMAIL"]
# Closed set of anonymization field types. Each name is matched as a string by
# `_get_fake_value` in padmy/anonymize/anonymize.py: "NULL" yields None, every
# other entry is dispatched to a Faker generator. A name listed here without a
# matching `case` there ends in a ValueError at runtime, so keep the two in sync.
FieldType = Literal[
"EMAIL",
"NULL",
"FIRST_NAME",
"LAST_NAME",
"NAME",
"PHONE_NUMBER",
"DATE_OF_BIRTH",
"TEXT",
"WORD",
]

SampleType = float | int

Expand Down
27 changes: 27 additions & 0 deletions tests/test_anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,33 @@ async def test():
]


@pytest.mark.parametrize(
    "field_type, extra, predicate",
    [
        pytest.param("EMAIL", None, lambda v: v and "@" in v, id="EMAIL"),
        # The README documents `domain:` as an extra arg forwarded to faker.email.
        pytest.param(
            "EMAIL",
            {"domain": "example.com"},
            lambda v: isinstance(v, str) and v.endswith("@example.com"),
            id="EMAIL-domain",
        ),
        pytest.param("NULL", None, lambda v: v is None, id="NULL"),
        pytest.param("FIRST_NAME", None, lambda v: isinstance(v, str) and v, id="FIRST_NAME"),
        pytest.param("LAST_NAME", None, lambda v: isinstance(v, str) and v, id="LAST_NAME"),
        pytest.param("NAME", None, lambda v: isinstance(v, str) and " " in v, id="NAME"),
        pytest.param("PHONE_NUMBER", None, lambda v: isinstance(v, str) and v, id="PHONE_NUMBER"),
        # Duck-typed date check: avoids a module-level `datetime` import while
        # still rejecting None and plain strings.
        pytest.param(
            "DATE_OF_BIRTH",
            None,
            lambda v: callable(getattr(v, "isoformat", None)),
            id="DATE_OF_BIRTH",
        ),
        pytest.param("TEXT", None, lambda v: isinstance(v, str) and v, id="TEXT"),
        pytest.param("WORD", None, lambda v: isinstance(v, str) and v, id="WORD"),
    ],
)
def test_get_fake_value(faker, field_type, extra, predicate):
    """Check that every supported field type yields a value of the expected shape.

    One case per entry of ``FieldType`` (plus one case exercising a forwarded
    extra argument), so adding a type without a dispatch branch is caught here.

    Args:
        faker: Faker instance supplied by the test fixtures.
        field_type: Field type name passed to ``_get_fake_value``.
        extra: Extra keyword arguments forwarded to the generator, or ``None``.
        predicate: Callable returning a truthy value when the result is acceptable.
    """
    from padmy.anonymize.anonymize import _get_fake_value

    value = _get_fake_value(faker, field_type, extra)
    assert predicate(value), f"unexpected value for {field_type}: {value!r}"


def test_get_fake_value_unknown_type_raises(faker):
    """An unrecognised field type is rejected with a ``ValueError``."""
    from padmy.anonymize.anonymize import _get_fake_value

    expected_error = pytest.raises(ValueError, match="unimplemented field type")
    with expected_error:
        _get_fake_value(faker, "DOES_NOT_EXIST")  # type: ignore[arg-type]


@pytest.mark.usefixtures("add_table_1_data")
def test_anonymize_db(apool, engine, loop, faker):
from padmy.anonymize import anonymize_db
Expand Down