In [1]:
import polars as pl
import opendp.prelude as dp
import yaml
dp.enable_features("contrib")

In [2]:
data_path = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
metadata_path = "penguin_metadata.yaml"

In [3]:
with open(metadata_path, "r") as f:
    metadata = yaml.safe_load(f)

In [4]:
bl_lb = metadata["columns"]["bill_length_mm"]["lower"]
bl_ub = metadata["columns"]["bill_length_mm"]["upper"]

In [5]:
lf = pl.scan_csv(data_path)
lf.head(2)

In [6]:
lazy_lf = lf.lazy()

## Context API

In [7]:
context = dp.Context.compositor(
    data=lazy_lf,
    privacy_unit=dp.unit_of(contributions=2),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=1,
    margins=[
        dp.polars.Margin(
            max_length=500
        ),
    ],
)

In [None]:
query_num_responses = context.query().select(dp.len())
query_num_responses.release().collect().item()

### Just a mean

In [None]:
mean_query = context.query().select(
    pl.col.bill_length_mm.cast(int).dp.sum(bounds=(30, 65)),
    dp.len(),
)
mean_query.release().collect().with_columns(
    mean=pl.col.bill_length_mm / pl.col.len
)

### A synthetic dataset

In [6]:
table_aim = (
    context.query(rho=0.1, delta=0.0)
    # transformations/truncation may be applied here
    .select(
        "sex", "species", "island", 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'
    ).contingency_table(
        keys={"sex": ["MALE", "FEMALE"],
              "species": ['Adelie'],
              "island": ['Dream', 'Torgersen', 'Biscoe']
             },
        cuts={"bill_length_mm": [30,42,54,65],'bill_depth_mm':[13,18,23], 'flipper_length_mm': [150,200,250]},
        algorithm=dp.mbi.AIM(),
    )
    .release()
)

In [7]:
table_aim.synthesize(rows=1000)

sex,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm
enum,enum,enum,f64,f64,i64
"""FEMALE""","""Adelie""","""Biscoe""",34.431485,13.150151,209
"""MALE""",,"""Dream""",29.06146,18.027628,187
"""FEMALE""","""Adelie""","""Dream""",35.582999,13.472987,227
"""MALE""",,"""Torgersen""",30.942098,22.179769,191
"""MALE""",,"""Torgersen""",55.27336,17.370779,206
…,…,…,…,…,…
"""MALE""",,"""Dream""",48.480758,20.061722,212
"""MALE""","""Adelie""","""Biscoe""",43.049056,17.617474,190
"""FEMALE""",,"""Biscoe""",29.019981,16.490279,247
"""MALE""","""Adelie""","""Biscoe""",52.041217,13.545329,243


## Framework API

In [14]:
from opendp_helper import get_raw_lf_domain, add_global_margin
from opendp import measures as ms

In [21]:
lf_domain = get_raw_lf_domain(metadata)

margin = dp.polars.Margin(max_length=metadata["rows"], invariant="keys")
lf_domain = dp.with_margin(lf_domain, margin)

### Just a mean

In [28]:
plan = lazy_lf.select(
    pl.col("bill_length_mm").dp.mean(bounds=(bl_lb, bl_ub), scale=100_000), dp.len(scale=1)
)
opendp_pipe = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), ms.max_divergence(), plan
)
cost = opendp_pipe.map(d_in=int(metadata["max_ids"]))
print(f"Cost: {cost}")

release_data = opendp_pipe(lazy_lf)
release_data = release_data.collect()
release_data

Cost: 1.0006600000000343


bill_length_mm,len
f64,u32
inf,338


### A synthetic dataset

In [None]:
d_in = 1
input_metric = dp.symmetric_distance()
input_domain = dp.vector_domain(dp.atom_domain(T=float))

d_out = 1.0
privacy_measure = dp.max_divergence()

bounds = (0.0, 100.0)
imputed_value = 50.0

In [None]:
m_sc = dp.c.make_adaptive_composition(
    input_domain=input_domain,
    input_metric=input_metric,
    output_measure=privacy_measure,
    d_in=d_in,
    d_mids=[d_out / 3] * 3,
)

In [9]:
table_mst = (
    context.query(rho=0.1, delta=0.0)
    # transformations/truncation may be applied here
    .select(
        "sex", "species", "island", 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'
    ).contingency_table(
        keys={"sex": ["MALE", "FEMALE"],
              "species": ['Adelie'],
              "island": ['Dream', 'Torgersen', 'Biscoe']
             },
        cuts={"bill_length_mm": [30, 42, 54, 65], 'bill_depth_mm':[13, 18, 23], 'flipper_length_mm': [150, 200, 250]},
        algorithm=dp.mbi.MST(),
    )
    .release()
)

OpenDPException: 
  FailedFunction("unknown ordering between (0.19999999999999998, 0.0) and (0.19, 1e-7)")

In [None]:
table_mst.synthesize(rows=1000)