In [1]:
import polars as pl
import opendp.prelude as dp
import yaml
# Turn on OpenDP's "contrib" feature flag before any OpenDP API is used below.
dp.enable_features("contrib")

In [2]:
# Remote CSV with the penguins dataset (seaborn-data repository on GitHub).
data_path = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
# YAML file with dataset metadata; a relative path, so it is resolved against
# the notebook's working directory.
metadata_path = "penguin_metadata.yaml"

In [3]:
# Load the dataset metadata. Later cells read metadata["columns"],
# metadata["rows"] and metadata["max_ids"] from it.
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's locale default (which open() would otherwise use).
with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = yaml.safe_load(f)

In [4]:
# Lower/upper values for bill_length_mm as declared in the metadata file;
# they are passed as `bounds=` to the DP mean in the Framework API section.
bill_length_meta = metadata["columns"]["bill_length_mm"]
bl_lb, bl_ub = bill_length_meta["lower"], bill_length_meta["upper"]

In [25]:
# Scan the CSV, then materialise it immediately.
# NOTE(review): despite its name, `lf` is the collected result, not a lazy
# frame -- the lazy handle used by OpenDP is `lazy_lf`, derived from it.
penguins_scan = pl.scan_csv(data_path)
lf = penguins_scan.collect()
# Preview the first two rows.
lf.head(2)

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,i64,i64,str
"""Adelie""","""Torgersen""",39.1,18.7,181,3750,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186,3800,"""FEMALE"""


In [26]:
# Lazy view over the collected frame. This is the object handed to OpenDP:
# as `data=` for the contexts and as the input of the Framework API measurement.
lazy_lf = lf.lazy()

## Context API

In [46]:
# Context for the sum/len example: an epsilon budget of 1.0, split evenly
# over a single query (split_evenly_over=1).
sum_len_unit = dp.unit_of(contributions=2)
sum_len_budget = dp.loss_of(epsilon=1.0)
# One margin declaring max_length=500 for the frame.
sum_len_margins = [dp.polars.Margin(max_length=500)]

context = dp.Context.compositor(
    data=lazy_lf,
    privacy_unit=sum_len_unit,
    privacy_loss=sum_len_budget,
    split_evenly_over=1,
    margins=sum_len_margins,
)

### Just a sum and len

In [47]:
# NOTE(review): despite the name `mean_query`, this query releases a DP sum
# and a DP row count, not a mean (the name is kept because a later cell uses it).
# bill_length_mm is cast to int before being summed with bounds (30, 65).
bill_length_sum = pl.col.bill_length_mm.cast(int).dp.sum(bounds=(30, 65))
row_count = dp.len()

mean_query = context.query().select(bill_length_sum, row_count)

# Release the query and materialise the result for display.
mean_query.release().collect()

bill_length_mm,len
i64,u32
14707,335


In [67]:
# Show, for each released column, the aggregate, the noise distribution and
# the noise scale used by the query.
mean_query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bill_length_mm""","""Sum""","""Integer Laplace""",260.0
"""len""","""Frame Length""","""Integer Laplace""",4.0


### A synthetic dataset

In [55]:
# Rebinds `context` (the sum/len context defined earlier is replaced) with a
# (rho, delta) privacy loss for the synthetic-data query.
context = dp.Context.compositor(
    data=lazy_lf,
    privacy_unit=dp.unit_of(contributions=2),
    privacy_loss=dp.loss_of(rho=0.19, delta=1e-7),
    # NOTE: split_evenly_over is deliberately not passed here -- including
    # split_evenly_over=1 raised an error. The query built from this context
    # states its own budget (rho/delta) instead.
    margins=[
        dp.polars.Margin(max_length=500),
    ],
)

In [56]:
# Columns that take part in the contingency table.
aim_columns = [
    "sex",
    "species",
    "island",
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
]

# Listed values for each string column.
aim_keys = {
    "sex": ["MALE", "FEMALE"],
    "species": ["Adelie", "Chinstrap", "Gentoo"],
    "island": ["Dream", "Torgersen", "Biscoe"],
}

# Cut points for each numeric column (presumably bin edges -- confirm
# against the OpenDP contingency_table documentation).
aim_cuts = {
    "bill_length_mm": [30, 42, 54, 65],
    "bill_depth_mm": [13, 18, 23],
    "flipper_length_mm": [150, 200, 250],
}

# Spend rho=0.1 (delta=0.0) of the context's budget on a contingency table
# fitted with the AIM algorithm, and release it.
table_aim = (
    context.query(rho=0.1, delta=0.0)
    .select(*aim_columns)
    .contingency_table(keys=aim_keys, cuts=aim_cuts, algorithm=dp.mbi.AIM())
    .release()
)

In [57]:
# Generate 1000 synthetic rows from the released contingency table.
table_aim.synthesize(rows=1000)

sex,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm
str,str,str,f64,f64,i64
"""FEMALE""","""Adelie""","""Dream""",45.983527,14.643416,160
"""MALE""","""Gentoo""","""Biscoe""",52.31929,15.720447,181
"""MALE""","""Chinstrap""","""Torgersen""",52.705494,14.272297,204
,"""Gentoo""","""Dream""",45.069304,20.44747,179
"""MALE""","""Gentoo""","""Dream""",51.752809,21.426858,228
…,…,…,…,…,…
"""MALE""","""Gentoo""","""Dream""",31.739026,20.459406,223
"""MALE""","""Gentoo""","""Biscoe""",53.279086,18.11024,234
"""FEMALE""","""Adelie""","""Biscoe""",38.358699,18.355451,173
"""FEMALE""","""Adelie""","""Torgersen""",37.635967,16.052841,221


## Framework API

In [58]:
from opendp_helper import get_raw_lf_domain, add_global_margin
from opendp import measures as ms

In [59]:
# Margin whose max_length comes from the metadata's "rows" entry, with
# invariant="keys".
margin = dp.polars.Margin(max_length=metadata["rows"], invariant="keys")

# Build the frame domain from the metadata (get_raw_lf_domain lives in the
# local opendp_helper module; its behaviour is not visible in this notebook)
# and attach the margin to it.
lf_domain = dp.with_margin(get_raw_lf_domain(metadata), margin)

### Just a mean and len

In [65]:
# DP mean of bill_length_mm, bounded by the metadata values, with an explicit
# noise scale of 100, released together with a DP row count of scale 1.
bill_length_mean = pl.col("bill_length_mm").dp.mean(bounds=(bl_lb, bl_ub), scale=100)
plan = lazy_lf.select(bill_length_mean, dp.len(scale=1))

# Turn the plan into a measurement over `lf_domain`, using symmetric distance
# as the input metric and max-divergence as the privacy measure.
opendp_pipe = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), ms.max_divergence(), plan
)

# Privacy cost of the measurement for an input distance of metadata["max_ids"].
cost = opendp_pipe.map(d_in=int(metadata["max_ids"]))
print(f"Cost: {cost}")

# Run the measurement on the data and materialise the released frame.
release_data = opendp_pipe(lazy_lf).collect()
release_data

Cost: 1.6600000000341588


bill_length_mm,len
f64,u32
133.650555,334


### A synthetic dataset

In [66]:
# NOTE(review): as recorded in this cell's output, this raises
#   AttributeError: 'LazyFrame' object has no attribute 'contingency_table'
# Here `.contingency_table(...)` is chained onto `lazy_lf.select(...)`, whereas
# the working cell in the "Context API" section chains it onto
# `context.query(...).select(...)`.
# TODO: replace this with the Framework API way of building the contingency
# table measurement (to be confirmed in the OpenDP docs), or remove the cell so
# that Restart & Run All does not stop here.
plan_aim = (
    lazy_lf.select(
        "sex", "species", "island", 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'
    ).contingency_table(
        keys={
            "sex": ["MALE", "FEMALE"],
            "species": ['Adelie', 'Chinstrap', 'Gentoo'],
            "island": ['Dream', 'Torgersen', 'Biscoe']
             },
        cuts={
            "bill_length_mm": [30, 42, 54, 65],
            'bill_depth_mm': [13, 18, 23],
            'flipper_length_mm': [150, 200, 250]
        },
        algorithm=dp.mbi.AIM(),
    )
    .release()
)

AttributeError: 'LazyFrame' object has no attribute 'contingency_table'

In [None]:
# NOTE(review): this cell has never been executed and repeats the mean/len
# measurement cell above. It passes `plan`, not `plan_aim` -- confirm which
# plan was intended here.
opendp_pipe = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), ms.max_divergence(), plan
)

# Privacy cost of the measurement for an input distance of metadata["max_ids"].
cost = opendp_pipe.map(d_in=int(metadata["max_ids"]))
print(f"Cost: {cost}")

# Run the measurement on the data and materialise the released frame.
release_data = opendp_pipe(lazy_lf).collect()
release_data