In [1]:
import polars as pl
import opendp.prelude as dp
import yaml
dp.enable_features("contrib")

In [2]:
data_path = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
metadata_path = "penguin_metadata.yaml"

In [3]:
with open(metadata_path, "r") as f:
    metadata = yaml.safe_load(f)

In [4]:
bl_lb = metadata["columns"]["bill_length_mm"]["lower"]
bl_ub = metadata["columns"]["bill_length_mm"]["upper"]

In [5]:
lf = pl.scan_csv(data_path).collect()
lf.head(2)

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,i64,i64,str
"""Adelie""","""Torgersen""",39.1,18.7,181,3750,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186,3800,"""FEMALE"""


In [6]:
lazy_lf = lf.lazy()

## Context API

In [7]:
context = dp.Context.compositor(
    data=lazy_lf,
    privacy_unit=dp.unit_of(contributions=2),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=1,
    margins=[
        dp.polars.Margin(max_length=500),
    ],
)

### Just a sum and len

In [8]:
mean_query = context.query().select(
    pl.col.bill_length_mm.cast(int).dp.sum(bounds=(30, 65)),
    dp.len()
)

In [9]:
mean_query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bill_length_mm""","""Sum""","""Integer Laplace""",260.0
"""len""","""Frame Length""","""Integer Laplace""",4.0


In [10]:
mean_query.release().collect()

bill_length_mm,len
i64,u32
15095,351


### A synthetic dataset

In [11]:
context = dp.Context.compositor(
    data=lazy_lf,
    privacy_unit=dp.unit_of(contributions=2),
    privacy_loss=dp.loss_of(rho=0.19, delta=1e-7),
    # split_evenly_over=1, error if has this parameter
    margins=[
        dp.polars.Margin(max_length=500),
    ],
)

In [12]:
table_aim = (
    context.query(rho=0.1, delta=0.0)
    .select(
        "sex", "species", "island", 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'
    ).contingency_table(
        keys={
            "sex": ["MALE", "FEMALE"],
            "species": ['Adelie', 'Chinstrap', 'Gentoo'],
            "island": ['Dream', 'Torgersen', 'Biscoe']
             },
        cuts={
            "bill_length_mm": [30, 42, 54, 65],
            'bill_depth_mm': [13,18,23],
            'flipper_length_mm': [150,200,250]
        },
        algorithm=dp.mbi.AIM(),
    )
    .release()
)

In [13]:
table_aim.synthesize(rows=1000)

sex,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm
str,str,str,f64,f64,i64
"""MALE""","""Adelie""","""Biscoe""",42.938434,21.436842,177
,"""Adelie""","""Dream""",33.969006,12.319888,165
"""FEMALE""","""Adelie""","""Dream""",49.584048,15.541649,240
"""MALE""","""Adelie""","""Biscoe""",50.929077,16.395965,175
"""FEMALE""","""Gentoo""","""Biscoe""",43.158024,19.14188,156
…,…,…,…,…,…
"""MALE""","""Adelie""","""Torgersen""",44.9507,14.728648,184
"""FEMALE""","""Gentoo""","""Dream""",40.659908,18.910473,209
"""FEMALE""","""Gentoo""","""Dream""",46.507862,16.735932,199
"""FEMALE""","""Gentoo""","""Biscoe""",50.456128,16.750706,232


## Framework API

In [14]:
from opendp_helper import get_raw_lf_domain, add_global_margin
from opendp import measures as ms

In [15]:
lf_domain = get_raw_lf_domain(metadata)

margin = dp.polars.Margin(max_length=metadata["rows"], invariant="keys")
lf_domain = dp.with_margin(lf_domain, margin)

### Just a mean and len

In [16]:
plan = lazy_lf.select(
    pl.col("bill_length_mm").dp.mean(bounds=(bl_lb, bl_ub), scale=100), 
    dp.len(scale=1)
)
opendp_pipe = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), ms.max_divergence(), plan
)
cost = opendp_pipe.map(d_in=int(metadata["max_ids"]))
print(f"Cost: {cost}")

release_data = opendp_pipe(lazy_lf)
release_data = release_data.collect()
release_data

Cost: 1.6600000000341588


bill_length_mm,len
f64,u32
40.643018,346


In [17]:
lazy_lf.select(
    "sex", "species", "island", 
    'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'
)

### A synthetic dataset

In [26]:
from opendp.extras.mbi import make_contingency_table

In [31]:
lf_domain.columns

['species',
 'island',
 'bill_length_mm',
 'bill_depth_mm',
 'flipper_length_mm',
 'body_mass_g',
 'sex']

In [37]:
plan_aim = make_contingency_table(
    input_domain = lf_domain,
    input_metric = dp.symmetric_distance(),
    output_measure = ms.max_divergence(),
    d_in = 1,
    d_out = 1.0,
    keys={
        "sex": ["MALE", "FEMALE"],
        "species": ['Adelie', 'Chinstrap', 'Gentoo'],
        "island": ['Dream', 'Torgersen', 'Biscoe']
         },
    cuts={
        "bill_length_mm": [30, 42, 54, 65],
        "bill_depth_mm": [13, 18, 23],
        "flipper_length_mm": [150, 200, 250], 
        "body_mass_g": [2000, 4500, 7000]
    },
    algorithm=dp.mbi.AIM(),
)

In [39]:
opendp_pipe = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), ms.max_divergence(), plan_aim
)
cost = opendp_pipe.map(d_in=int(metadata["max_ids"]))
print(f"Cost: {cost}")

release_data = opendp_pipe(lazy_lf)
release_data = release_data.collect()
release_data

ValueError: expected Polars LazyFrame

In [24]:
plain_aim = make_contingency_table(
    keys={
        "sex": ["MALE", "FEMALE"],
        "species": ['Adelie', 'Chinstrap', 'Gentoo'],
        "island": ['Dream', 'Torgersen', 'Biscoe']
         },
    cuts={
        "bill_length_mm": [30, 42, 54, 65],
        "bill_depth_mm": [13, 18, 23],
        "flipper_length_mm": [150, 200, 250]
    },
    algorithm=dp.mbi.AIM(),
)

AttributeError: 'LazyFrame' object has no attribute 'contingency_table'