In [17]:
%%capture
pip install 'opendp[polars]'

# Explore opendp==0.14
https://docs.opendp.org/en/stable/getting-started/quickstart.html

In [1]:
# `dp` is the OpenDP prelude namespace and `pl` is Polars; both aliases are
# used throughout the notebook.
import opendp.prelude as dp
import polars as pl

# Opt in to OpenDP's "contrib" feature flag before any context/query is built.
dp.enable_features("contrib")

In [2]:
# Location of the CSV that is scanned into the LazyFrame `lf` in the next cell.
# NOTE(review): the remote seaborn file below is named "penguins.csv" (plural);
# confirm the local copy really is called "penguin.csv".
PATH = "penguin.csv"
# Alternative data sources kept for reference (uncomment exactly one to switch):
#PATH = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
#PATH = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
#PATH = dp.examples.get_france_lfs_path()

In [3]:
# Lazily scan the CSV at PATH; nothing is read until a query is collected.
# `ignore_errors=True` is passed through to the Polars CSV reader.
lf = pl.scan_csv(
    PATH,
    ignore_errors=True,
)
lf  # bare last expression: show the LazyFrame's repr

In [4]:
# Preview the first two rows. Applying `head(2)` to the LazyFrame *before*
# collecting lets Polars stop after two rows instead of materialising the
# whole file only to discard most of it; the displayed result is the same.
lf.head(2).collect()

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,f64,f64,str
"""Adelie""","""Torgersen""",39.1,18.7,181.0,3750.0,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE"""


# ESSENTIAL

## COUNT

In [138]:
# Context for the COUNT section.
# - privacy_unit: contribution bound of 3 declared for the privacy unit.
# - privacy_loss: total budget of epsilon = 1.0.
# - split_evenly_over=6: the budget is shared equally by the six releases
#   made in the cells of this section.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=6,
)

In [139]:
# DP count of all rows in the frame.
query = (
    context.query()
    .select(dp.len())
)
query.release().collect().item()

341

In [140]:
# DP length of the `sex` column: every row is counted, nulls included.
query = (
    context.query()
    .select(pl.col.sex.dp.len())
)
query.release().collect().item()

305

In [141]:
# DP count of the non-null entries of the `sex` column.
query = (
    context.query()
    .select(pl.col.sex.dp.count())
)
query.release().collect().item()

337

In [142]:
# DP count of the null entries of the `sex` column.
query = (
    context.query()
    .select(pl.col.sex.dp.null_count())
)
query.release().collect().item()

9

In [143]:
# DP number of distinct values in the `sex` column (nulls included).
query = (
    context.query()
    .select(pl.col.sex.dp.n_unique())
)
query.release().collect().item()

0

In [144]:
# DP number of distinct values in the `body_mass_g` column (nulls included).
query = (
    context.query()
    .select(pl.col.body_mass_g.dp.n_unique())
)
query.release().collect().item()

113

## SUM

In [8]:
# Context reused by the SUM, first MEAN, MEDIAN and QUANTILES queries below
# (four releases; the epsilon = 1.0 budget is split evenly over five).
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=5,
    margins=[ # take into account the potential for overflow and/or numerical instability
        dp.polars.Margin(
            max_length=1000 # known upper bound on how many records can be present in the data
        ),
    ],
)

In [9]:
# DP sum of flipper lengths for the rows whose island is "Torgersen".
query = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    # keep only rows at or above 185, the lower bound passed to dp.sum below
    .filter(pl.col("flipper_length_mm") >= 185.0)
    .select(
        pl.col.flipper_length_mm.cast(int)
        .fill_null(200)  # replace missing values with a constant inside the bounds
        .dp.sum(bounds=(185, 250))
    )
)

In [10]:
# Summarise the query: one row per aggregate with its noise distribution,
# scale and accuracy, requested here with alpha=0.05.
query.summarize(alpha=0.05)

column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""flipper_length_mm""","""Sum""","""Integer Laplace""",3750.0,11234.495992


In [11]:
# Release the DP sum and pull the single value out of the one-cell frame.
query.release().collect().item()

33493

## MEAN

In [12]:
# Mean, approach 1: release a DP sum and a DP row count in one query; the
# division happens as post-processing in the next cell.
# NOTE(review): `dp.len()` counts every row, nulls included, while the sum is
# taken over a column that is not null-filled here — confirm this is intended.
query = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    .select(
        pl.col.flipper_length_mm.cast(int).dp.sum(bounds=(150, 250)),
        dp.len(),
    )
)

In [13]:
# Post-process the release: mean = released sum / released row count.
(
    query.release()
    .collect()
    .with_columns(mean=pl.col.flipper_length_mm / pl.col.len)
)

flipper_length_mm,len,mean
i64,u32,f64
13782,6,2297.0


In [17]:
# Mean, approach 2: a separate context in which the dataset length is declared
# invariant, so `dp.mean` can be used directly in the next cell.
# NOTE(review): `contributions=36` differs from the 3 used in the contexts
# above — confirm it is the intended bound for this dataset.
context_bounded_dp = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=5,
    margins=[
        dp.polars.Margin(
            max_length=1000,            # upper bound on the number of records
            invariant="lengths", # don't protect the total number of records (bounded-DP)
        ),
    ],
)

In [18]:
# DP mean of flipper length over the whole frame, with bounds (150, 250),
# built on the bounded-DP context.
query = context_bounded_dp.query().select(
    pl.col.flipper_length_mm.cast(int).dp.mean(bounds=(150, 250))
)

In [19]:
# Release the DP mean and extract the single value.
query.release().collect().item()

182.12790697674419

## MEDIAN

In [20]:
# Candidate values passed to the median query: the integers 150..249
# (`range` excludes its upper end, so 250 itself is not a candidate).
# `candidates` is reused by the QUANTILES query further down.
candidates = list(range(150, 250))

# DP median of flipper length for the rows whose island is "Torgersen".
query = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    .select(
        pl.col.flipper_length_mm.cast(int).dp.median(candidates)
    )
)

In [21]:
# Release the DP median as a one-row frame.
query.release().collect()

flipper_length_mm
i64
164


## QUANTILES

In [22]:
# Three DP quantiles (0.25, 0.5, 0.75) of flipper length in a single query.
# One expression is generated per quantile level `a`, each aliased to
# "<a>-Quantile"; `candidates` comes from the MEDIAN cell above.
query_multi_quantiles = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    .select(
        pl.col.flipper_length_mm.cast(int)
        .dp.quantile(a, candidates)
        .alias(f"{a}-Quantile")
        for a in [0.25, 0.5, 0.75]
    )
)

In [23]:
# Release all three quantiles at once (one column per quantile).
(
    query_multi_quantiles
    .release()
    .collect()
)

0.25-Quantile,0.5-Quantile,0.75-Quantile
i64,i64,i64
199,214,223


# GROUPING

## STABLE KEYS (spend delta - only show groups big enough)

In [24]:
# Context for the stable-keys grouping: the privacy loss now carries a delta
# (1e-7) in addition to epsilon, and the whole budget goes to a single query.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=1,
)

In [25]:
# DP row count per (species, sex) group; no key set is supplied here.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(dp.len())
)

In [26]:
# Keep the released frame: its (species, sex) key columns are reused as the
# explicit key set in the EXPLICIT KEYS section below.
tmp_df = query.release().collect()
tmp_df.head(2)

species,sex,len
str,str,u32
"""Adelie""","""FEMALE""",67
"""Gentoo""","""FEMALE""",62


## EXPLICIT KEYS (does not spend delta)

In [33]:
# Context for the explicit-keys grouping: pure-epsilon budget (no delta),
# epsilon = 1.0 / 4, spent on a single query.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    split_evenly_over=1,
)

In [34]:
# DP row count per (species, sex) among the "Torgersen" rows, with the group
# keys supplied explicitly from the previously released `tmp_df`.
query = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    .group_by("species", "sex")
    .agg(dp.len())
    .with_keys(tmp_df["species", "sex"])
)

In [35]:
# Release and show the first two groups.
query.release().collect().head(2)

species,sex,len
str,str,u32
"""Gentoo""","""FEMALE""",0
"""Adelie""","""FEMALE""",27


## INVARIANT GROUP KEYS

In [25]:
# Context in which the (species, sex) group keys are declared invariant.
# NOTE(review): `contributions=36` differs from the 3 used in most other
# contexts of this notebook — confirm it is the intended bound.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    split_evenly_over=1,
    margins=[
        dp.polars.Margin(by=["species", "sex"], invariant="keys") # group keys when grouped by "species" and "sex" are invariant
    ],
)

In [26]:
# DP row count per (species, sex); the grouping matches the margin declared
# with invariant keys in the context above.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(dp.len())
)

In [27]:
# Release and show the first two groups.
query.release().collect().head(2)

species,sex,len
str,str,u32
"""Adelie""","""MALE""",34
"""Chinstrap""","""MALE""",91


## INVARIANT GROUP LENGTHS

In [44]:
# Filtering the data within the query invalidates the margin info. One way to
# work around this limitation is to preprocess the data before passing it
# into the context.
lf_preprocessed = lf.filter(pl.col("island") == "Torgersen")

In [45]:
# Context over the pre-filtered frame, with group lengths by `sex` declared
# invariant.
# NOTE(review): the null_count release on `sex` in the COUNT section is
# non-zero, so a null group may exist besides MALE/FEMALE — confirm that
# max_groups=2 really holds for this data.
context = dp.Context.compositor(
    data=lf_preprocessed,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=1,
    margins=[
        # total number of records when grouped by "sex" is public information
        dp.polars.Margin(
            by=["sex"],
            invariant="lengths",
            max_length=500,
            max_groups=2, # encoding is Male, Female
        )
    ],
)

In [46]:
# Grouped DP mean of flipper length per `sex`.
# NOTE(review): the name `query_work_hours` does not describe this query (it
# aggregates flipper_length_mm); it is read again by the next cell.
# NOTE(review): the bounds here are (200, 250) while the other sum/mean
# queries in this notebook use (150, 250) — confirm this is intended.
query_work_hours = (
    context.query()
    .group_by("sex")
    .agg(pl.col.flipper_length_mm.cast(int).dp.mean((200, 250))) # no budget is spent on the group length, which is assumed to be public
)

In [47]:
# Release the per-sex means and show the first two groups.
query_work_hours.release().collect().head(2)

sex,flipper_length_mm
str,f64
"""FEMALE""",277.333333
"""MALE""",131.26087


# MICRODATA

## WITH COLUMNS
Expressions passed into .with_columns must be row-by-row, meaning that the expression could be represented as a function applied to each row in the data.

Any new columns added by `.with_columns` do not (currently) have margin descriptors. For instance, in the query below, any margin descriptors related to `flipper_length_mm` would no longer apply to the new, shadowing, `flipper_length_mm` column after `.with_columns`.

In [86]:
# Context shared by the three releases of the MICRODATA section
# (with_columns, filter and private group_by examples); the
# (epsilon = 2.0, delta = 1e-6) budget is split evenly over three queries.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        dp.polars.Margin(max_length=1000)
    ],
)

In [45]:
# Replace flipper_length_mm with its binned version (left-closed intervals at
# the given breaks), then release a DP row count per bin.
query = (
    context.query()
    .with_columns(
        # the result keeps the name flipper_length_mm, shadowing the original column
        pl.col.flipper_length_mm.cut(
            breaks=[150, 175, 200, 225],
            left_closed=True,
        )
    )
    .group_by(pl.col.flipper_length_mm)
    .agg(dp.len())
)

In [46]:
# Release the per-bin counts, ordered by bin.
query.release().collect().sort("flipper_length_mm")

flipper_length_mm,len
cat,u32
"""[175, 200)""",189
"""[200, 225)""",129


## SELECT
`.select` resolves each passed expression to a column and then returns those columns.

## FILTER
.filter uses row-by-row expressions of booleans to mask rows.

Filtering discards all invariants about the group keys and group sizes. Margin descriptors are considered applicable for the input dataset, so a data-dependent filtering renders these invariants invalid.

Otherwise, filtering preserves all other margin descriptors, because filtering only ever removes rows.

In [52]:
# Cast flipper_length_mm to int, keep only the "Torgersen" rows, then take a
# DP sum with bounds (150, 250).
query = (
    context.query()
    .with_columns(pl.col.flipper_length_mm.cast(int))
    .filter(pl.col.island == "Torgersen")
    .select(pl.col.flipper_length_mm.dp.sum((150, 250)))
)

In [53]:
# Release the DP sum and print it with a label.
print(
    "sum:",
    query.release().collect().item(),
)

sum: 11164


## GROUP BY (PRIVATE)
`.group_by` also resolves each passed expression to a column, and then groups on those columns. The expressions must be row-by-row.

In [59]:
# Group directly on the binned flipper length (left-closed intervals at the
# given breaks) and release a DP row count per bin.
query = (
    context.query()
    .group_by(
        pl.col.flipper_length_mm.cut(
            breaks=[150, 175, 200, 225],
            left_closed=True,
        )
    )
    .agg(dp.len())
)

In [60]:
# Release the per-bin counts, ordered by bin.
(
    query.release()
    .collect()
    .sort("flipper_length_mm")
)

flipper_length_mm,len
cat,u32
"""[175, 200)""",185
"""[200, 225)""",132


## GROUP BY (STABLE)
`group_by`/`agg` can also be used earlier in the data pipeline, before the private `group_by`/`agg` or `select` aggregation (i.e. a multi-level group by — TODO: confirm).

This is appealing because arbitrary expressions can be used in the `agg` argument,

but a large amount of data is needed to get reasonable utility.

In [107]:
# Context for the stable group_by example: a global length bound plus
# invariant-length margins for grouping by species, by sex, and by both.
# NOTE(review): the COUNT section reports a non-zero null count for `sex`;
# confirm that max_groups=2 (sex) and max_groups=6 (sex, species) hold when
# nulls form their own group.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        # whole-frame margin: at most 1000 records
        dp.polars.Margin(max_length=1000),
        # grouped by species
        dp.polars.Margin(
            by=["species"],
            invariant="lengths",
            max_length=500,
            max_groups=3,
        ),
        # grouped by sex
        dp.polars.Margin(
            by=["sex"],
            invariant="lengths",
            max_length=500,
            max_groups=2,
        ),
        # grouped by sex and species together
        dp.polars.Margin(
            by=["sex", "species"],
            invariant="lengths",
            max_length=200,
            max_groups=6,
        )
    ],
)

In [108]:
# Two-stage aggregation: first a plain (non-DP) mean of flipper length per
# (species, sex), then a DP count and DP mean of those group means per sex.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(pl.col.flipper_length_mm.mean())
    .group_by(pl.col.sex)
    .agg(
        dp.len(), # required: the query raises an error without it
        pl.col.flipper_length_mm.cast(int).dp.mean((150, 250))
    )
)

In [109]:
# Release the result; the recorded output below contains no rows.
query.release().collect()

sex,len,flipper_length_mm
str,u32,f64


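Inform the context how many rows a user may contribute per group (`per_group`) and how many groups a user may appear in (`num_groups`), depending on the group-by columns.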

In [113]:
# Context whose privacy unit is described by a list of `dp.polars.Bound`
# objects (one per grouping) instead of a single contribution count.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(
        contributions=[
            dp.polars.Bound(per_group=3),
            # a penguin belongs to only one species
            dp.polars.Bound(
                by=[pl.col.species], num_groups=1
            ),
            # likewise, a single group when grouping by sex
            dp.polars.Bound(
                by=[pl.col.sex], num_groups=1
            ),
        ]
    ),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=4,
    margins=[dp.polars.Margin(max_length=1000)],
)

In [122]:
# Same two-stage aggregation as in the previous example, now run against the
# context built from `dp.polars.Bound` contribution bounds.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(pl.col.flipper_length_mm.mean())
    .group_by(pl.col.sex)
    .agg(
        dp.len(), # required; otherwise: MakeMeasurement("stable key release requires a `dp.len()` expression")
        pl.col.flipper_length_mm.cast(int).dp.mean((150, 250))
    )
)

In [123]:
# Release the result; the recorded output below contains no rows.
query.release().collect()

sex,len,flipper_length_mm
str,u32,f64


In [None]:
# NOTE(review): this cell was never executed (`In [None]`) and repeats the
# context configuration of the GROUP BY (STABLE) section verbatim. Running it
# would overwrite `context`; consider removing it or turning the shared
# configuration into a helper.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        # whole-frame margin: at most 1000 records
        dp.polars.Margin(max_length=1000),
        # grouped by species
        dp.polars.Margin(
            by=["species"],
            invariant="lengths",
            max_length=500,
            max_groups=3,
        ),
        # grouped by sex
        dp.polars.Margin(
            by=["sex"],
            invariant="lengths",
            max_length=500,
            max_groups=2,
        ),
        # grouped by sex and species together
        dp.polars.Margin(
            by=["sex", "species"],
            invariant="lengths",
            max_length=200,
            max_groups=6,
        )
    ],
)