In [1]:
%%capture
pip install 'opendp[polars]'

# Explore opendp==0.14
https://docs.opendp.org/en/stable/getting-started/quickstart.html

In [1]:
import opendp.prelude as dp
import polars as pl

# Opt in to OpenDP's "contrib" features before any context is built below.
dp.enable_features("contrib")

In [2]:
# Location of the dataset scanned by pl.scan_csv in the next cell.
# The commented-out lines are alternative sources that can be swapped in.
PATH = "penguin.csv"
#PATH = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
#PATH = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
#PATH = dp.examples.get_france_lfs_path()

In [3]:
# Lazily scan the CSV (nothing is read until .collect()); ignore_errors=True
# asks polars to keep reading when it meets values it cannot parse.
lf = pl.scan_csv(PATH, ignore_errors=True)
lf

In [4]:
# Preview the first two rows. Limiting before collect() lets polars stop early
# instead of materialising the whole file just to display two rows.
lf.head(2).collect()

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,f64,f64,str
"""Adelie""","""Torgersen""",39.1,18.7,181.0,3750.0,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE"""


# ESSENTIAL

## COUNT

In [5]:
# Context used by the six individual COUNT queries that follow.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),  # privacy unit: up to 3 contributed rows
    privacy_loss=dp.loss_of(epsilon=1.0),  # total epsilon for this context
    split_evenly_over=6,  # budget shared evenly between 6 queries
)

In [6]:
# Count the rows of the frame (DP release, so the displayed value is noisy).
query = context.query().select(dp.len())
query.release().collect().item()

341

In [7]:
# Count the rows of the `sex` column, nulls included.
query = context.query().select(pl.col.sex.dp.len())
query.release().collect().item()

344

In [8]:
# Count the rows of the `sex` column, nulls excluded.
query = context.query().select(pl.col.sex.dp.count())
query.release().collect().item()

330

In [9]:
# Count only the null rows of the `sex` column.
query = context.query().select(pl.col.sex.dp.null_count())
query.release().collect().item()

13

In [10]:
# Count the distinct values of the `sex` column (including nulls).
query = context.query().select(pl.col.sex.dp.n_unique())
query.release().collect().item()

0

In [11]:
# Count the distinct values of the `body_mass_g` column (including nulls).
query = context.query().select(pl.col.body_mass_g.dp.n_unique())
query.release().collect().item()

98

In [12]:
# Fresh context with the same settings as before: the previous one was split
# over 6 queries and six releases have already been made from it.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=6,
)

In [13]:
# The same six counts as above, bundled into one query with one aliased
# output column per statistic; summarize() lists the noise scale of each.
query_counts = context.query().select(
    [
        dp.len().alias("rows_in_frame"),
        pl.col("sex").dp.len().alias("rows_in_sex"),
        pl.col("sex").dp.count().alias("rows_in_sex_no_null"),
        pl.col("sex").dp.null_count().alias("rows_in_sex_null"),
        pl.col("sex").dp.n_unique().alias("rows_in_sex_unique"),
        pl.col("body_mass_g").dp.n_unique().alias("rows_in_mass_unique"),
    ]
)
query_counts.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""rows_in_frame""","""Frame Length""","""Integer Laplace""",108.0
"""rows_in_sex""","""Length""","""Integer Laplace""",108.0
"""rows_in_sex_no_null""","""Count""","""Integer Laplace""",108.0
"""rows_in_sex_null""","""Null Count""","""Integer Laplace""",108.0
"""rows_in_sex_unique""","""N Unique""","""Integer Laplace""",108.0
"""rows_in_mass_unique""","""N Unique""","""Integer Laplace""",108.0


In [14]:
# Release all six noisy counts at once (one row, one column per alias).
query_counts.release().collect()

rows_in_frame,rows_in_sex,rows_in_sex_no_null,rows_in_sex_null,rows_in_sex_unique,rows_in_mass_unique
u32,u32,u32,u32,u32,u32
257,664,177,0,0,251


## SUM

In [15]:
# Context for the SUM / MEAN / MEDIAN / QUANTILES queries below.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=5,
    margins=[ 
        dp.polars.Margin(
            max_length=1000 # upper bound on the number of rows (guards against overflow and/or numerical instability)
        ),
    ],
)

In [16]:
# DP sum of the integer-cast flipper lengths for Torgersen penguins whose
# flipper length is at least 185.
query = (
    context.query()
    .filter(pl.col("island") == "Torgersen")
    .filter(pl.col.flipper_length_mm >= 185.0)
    .select(
        pl.col("flipper_length_mm")
        .cast(int)
        .fill_null(200)  # nulls are imputed with 200 before summing
        .dp.sum(bounds=(185, 250))
    )
)

In [17]:
# Passing alpha adds the `accuracy` column to the summary shown below.
query.summarize(alpha=0.05)

column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""flipper_length_mm""","""Sum""","""Integer Laplace""",3750.0,11234.495992


In [18]:
# Release the noisy sum as a scalar.
query.release().collect().item()

1075

## MEAN

In [19]:
# Building blocks for a manual mean: a DP sum and a DP row count, both
# restricted to Torgersen and released together.
query = (
    context.query()
    .filter(pl.col("island") == "Torgersen")
    .select(
        pl.col("flipper_length_mm")
        .cast(int)
        .dp.sum(bounds=(150, 250))
        .alias("sum_fl"),
        dp.len().alias("nb_row"),
    )
)

In [20]:
# Show the noise distribution and scale of each of the two statistics.
query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""sum_fl""","""Sum""","""Integer Laplace""",7500.0
"""nb_row""","""Frame Length""","""Integer Laplace""",30.0


In [21]:
# Release both statistics, then derive the mean as noisy sum / noisy count.
query.release().collect().with_columns(
    (pl.col("sum_fl") / pl.col("nb_row")).alias("mean")
)

sum_fl,nb_row,mean
i64,u32,f64
9636,117,82.358974


In [22]:
# Bounded-DP context: the total number of records is declared invariant.
# NOTE(review): contributions=36 differs from the contributions=3 used by the
# other contexts in this notebook - confirm it is intended for this dataset.
context_bounded_dp = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0),
    split_evenly_over=5,
    margins=[
        dp.polars.Margin(
            max_length=1000,            
            invariant="lengths", # don't protect the total number of records (bounded-DP)
        ),
    ],
)

In [23]:
# DP mean of the integer-cast flipper length, using the bounded-DP context.
query = context_bounded_dp.query().select(
    pl.col("flipper_length_mm").cast(int).dp.mean(bounds=(150, 250))
)

In [24]:
# Open question from the author: why is the mean released through a noisy Sum,
# with so much more noise? In the summary below the Length row has scale 0.0
# (lengths are invariant in this context), so all of the noise sits on the Sum.
# Presumably the large Sum scale is driven by contributions=36 - TODO confirm.
query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""flipper_length_mm""","""Sum""","""Integer Laplace""",9000.0
"""flipper_length_mm""","""Length""","""Integer Laplace""",0.0


In [25]:
# Release the noisy mean as a scalar.
query.release().collect().item()

242.49418604651163

## MEDIAN

In [26]:
# Candidate values offered to the DP median: the integers 150..249.
# `candidates` is reused by the QUANTILES query further down.
candidates = [*range(150, 250)]

query = (
    context.query()
    .filter(pl.col("island") == "Torgersen")
    .select(pl.col("flipper_length_mm").dp.median(candidates))
)

In [27]:
# Release the DP median (one of the candidate values).
query.release().collect()

flipper_length_mm
i64
184


## QUANTILES

In [28]:
# Three DP quantiles in a single query, one aliased column per level,
# all drawn from the shared `candidates` list.
query_multi_quantiles = (
    context.query()
    .filter(pl.col("island") == "Torgersen")
    .select(
        [
            pl.col("flipper_length_mm")
            .cast(int)
            .dp.quantile(alpha, candidates)
            .alias(f"{alpha}-Quantile")
            for alpha in (0.25, 0.5, 0.75)
        ]
    )
)

In [29]:
# Release the three quantiles. Each is noisy on its own, so the released values
# are not guaranteed to be ordered (see the output below).
query_multi_quantiles.release().collect()

0.25-Quantile,0.5-Quantile,0.75-Quantile
i64,i64,i64
229,205,198


# GROUPING

## STABLE KEYS (spend delta - only show groups big enough)

In [30]:
# Context with a delta budget in addition to epsilon, used by the stable-key
# grouped release below; the whole budget goes to a single query.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=1,
)

In [31]:
# Noisy row count per (species, sex) group.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(dp.len())
)

In [32]:
# Keep the released frame: its (species, sex) keys are reused as explicit keys
# by the `.with_keys(...)` query in the EXPLICIT KEYS section.
tmp_df = query.release().collect()
tmp_df.head(2)

species,sex,len
str,str,u32
"""Gentoo""","""MALE""",62
"""Adelie""","""MALE""",66


## EXPLICIT KEYS (does not spend delta)

In [34]:
# Pure-epsilon context (no delta), spent on a single query.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    split_evenly_over=1,
)

In [35]:
# Grouped count on Torgersen, with the group keys supplied explicitly from the
# (species, sex) columns of the previously released `tmp_df`.
query = (
    context.query()
    .filter(pl.col.island == "Torgersen")
    .group_by("species", "sex")
    .agg(dp.len())
    .with_keys(tmp_df["species", "sex"])
)

In [36]:
# Release the grouped counts and show the first two groups.
query.release().collect().head(2)

species,sex,len
str,str,u32
"""Gentoo""","""MALE""",0
"""Adelie""","""MALE""",59


## INVARIANT GROUP KEYS

In [37]:
# Pure-epsilon context in which the (species, sex) group keys are declared
# invariant through a margin.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=1.0 / 4),
    split_evenly_over=1,
    margins=[
        dp.polars.Margin(by=["species", "sex"], invariant="keys") # the group keys obtained when grouping by "species" and "sex" are invariant
    ],
)

In [38]:
# Noisy row count per (species, sex) group; the grouping columns match the
# margin declared on the context.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(dp.len())
)

In [39]:
# Release the grouped counts and show the first two groups.
query.release().collect().head(2)

species,sex,len
str,str,u32
"""Adelie""","""FEMALE""",81
"""Chinstrap""","""FEMALE""",23


## INVARIANT GROUP LENGTHS

In [40]:
# Filtering the data inside the query invalidates the margin information.
# One way around this limitation is to preprocess the data before passing it
# into the context.
# Not possible in Lomas for now.
lf_preprocessed = lf.filter(pl.col("island") == "Torgersen")

In [41]:
# Context over the pre-filtered (Torgersen-only) frame, with the per-sex group
# sizes declared invariant.
context = dp.Context.compositor(
    data=lf_preprocessed,
    privacy_unit=dp.unit_of(contributions=36),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=1,
    margins=[
        # the number of rows in each "sex" group is treated as public information
        dp.polars.Margin(
            by=["sex"],
            invariant="lengths",
            max_length=500,
            # MALE, FEMALE and null: `sex` has missing values and group_by keeps
            # null as its own group, so 2 would understate the number of groups
            max_groups=3,
        )
    ],
)

In [42]:
# DP mean flipper length per sex.
# NOTE(review): the name `query_work_hours` looks like a leftover from another
# example; the query is about flipper length, not work hours.
query_work_hours = (
    context.query()
    .group_by("sex")
    .agg(pl.col.flipper_length_mm.cast(int).dp.mean((150, 250))) # no budget spent on the length, which is assumed to be known public information
)

In [43]:
# Release the per-sex means and show the first two groups.
query_work_hours.release().collect().head(2)

sex,flipper_length_mm
str,f64
"""MALE""",303.521739
,188.8


# MICRODATA

## WITH COLUMNS
Expressions passed into .with_columns must be row-by-row, meaning that the expression could be represented as a function applied to each row in the data.

Any new columns added by .with_columns do not (currently) have margin descriptors. For instance, if .with_columns overwrites flipper_length_mm, any margin descriptors related to flipper_length_mm no longer apply to the new, shadowing flipper_length_mm column.

In [44]:
# Context for the MICRODATA examples: epsilon and delta budget split over
# three queries, with an upper bound on the number of rows.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        dp.polars.Margin(max_length=1000)
    ],
)

In [45]:
# Bin flipper length into left-closed intervals in a new column, then release
# a noisy row count per bin.
query = (
    context.query()
    .with_columns(
        pl.col("flipper_length_mm")
        .cut(breaks=[150, 175, 200, 225], left_closed=True)
        .alias("binned_fl")
    )
    .group_by(pl.col("binned_fl"))
    .agg(dp.len())
)

In [46]:
# Release the per-bin counts, ordered by bin.
query.release().collect().sort("binned_fl")

binned_fl,len
cat,u32
"""[175, 200)""",189
"""[200, 225)""",132


## SELECT
resolves each passed expression to a column and then returns those columns

## FILTER
.filter uses row-by-row expressions of booleans to mask rows.

Filtering discards all invariants about the group keys and group sizes. Margin descriptors are considered applicable for the input dataset, so a data-dependent filtering renders these invariants invalid.

Otherwise, filtering preserves all other margin descriptors, because filtering only ever removes rows.

In [47]:
# Cast flipper length to integer, keep only Torgersen rows, then release a DP
# sum bounded to (150, 250).
query = (
    context.query()
    .with_columns(pl.col("flipper_length_mm").cast(int))
    .filter(pl.col("island") == "Torgersen")
    .select(pl.col("flipper_length_mm").dp.sum((150, 250)))
)

In [48]:
# Release the noisy sum and print it with a label.
print("sum:", query.release().collect().item())

sum: 9664


## GROUP BY (PRIVATE)
.group_by also resolves each passed expression to a column, and then groups on those columns. The expressions must be row-by-row.

In [49]:
# Group directly on a row-by-row expression (the binned flipper length) and
# release a noisy row count per bin.
query = (
    context.query()
    .group_by(
        pl.col("flipper_length_mm").cut(
            breaks=[150, 175, 200, 225],
            left_closed=True,
        )
    )
    .agg(dp.len())
)

In [50]:
# Release the per-bin counts, ordered by bin.
query.release().collect().sort("flipper_length_mm")

flipper_length_mm,len
cat,u32
"""[175, 200)""",187
"""[200, 225)""",135


## GROUP BY (STABLE)
group_by/agg can also be used earlier in the data pipeline, before the private group_by/agg or select aggregation --> multi group by ? 

appealing because arbitrary expressions can be used in the agg argument, 

but a large amount of data is needed to get reasonable utility.

In [51]:
# Context for the stable group_by example, with key-invariant margins for each
# grouping that may be used.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        dp.polars.Margin(max_length=1000),
        dp.polars.Margin(
            by=["species"],
            invariant="keys",
            max_length=500,
            max_groups=3,
        ),
        dp.polars.Margin(
            by=["sex"],
            invariant="keys",
            max_length=500,
            # MALE, FEMALE and null: `sex` has missing values, and group_by
            # keeps null as its own group
            max_groups=3,
        ),
        dp.polars.Margin(
            by=["sex", "species"],
            invariant="keys",
            max_length=200,
            # conservative upper bound: 3 species x (MALE, FEMALE, null) = 9
            max_groups=9,
        )
    ],
)

In [52]:
# Mean of means: a plain (non-DP) per-group mean first, then a DP mean taken
# over those per-group values.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(pl.col("flipper_length_mm").mean().alias("mean_fl_species_sex"))
    .select(
        pl.col("mean_fl_species_sex").cast(int).dp.mean((150, 250))
    )
)

In [53]:
# Show the noise scales of the Sum and Length that make up the DP mean.
query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""mean_fl_species_sex""","""Sum""","""Integer Laplace""",4500.0
"""mean_fl_species_sex""","""Length""","""Integer Laplace""",18.0


In [54]:
# Release the mean of means. The value shown below is `inf`; presumably the
# noisy Length came out as 0 (few groups versus a Length scale of 18.0 in the
# summary) - TODO confirm.
query.release().collect()

mean_fl_species_sex
f64
inf


Inform the context of how many groups an individual can contribute to, depending on the grouping columns.

In [55]:
# Context whose privacy unit is described by several contribution bounds
# instead of a single number.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(
        contributions=[
            dp.polars.Bound(per_group=3),
            # a penguin belongs to only one species
            dp.polars.Bound(
                by=[pl.col.species], num_groups=1
            ),
            # same declaration for the grouping by sex
            dp.polars.Bound(
                by=[pl.col.sex], num_groups=1
            ),
        ]
    ),
    privacy_loss=dp.loss_of(epsilon=1.0, delta=1e-7),
    split_evenly_over=4,
    margins=[dp.polars.Margin(max_length=1000)],
)

In [56]:
# Attempt at a group_by after a group_by: per (species, sex) mean first, then
# a DP aggregation per sex. This does not work - the release below is empty.
query = (
    context.query()
    .group_by("species", "sex")
    .agg(pl.col.flipper_length_mm.mean())
    .group_by(pl.col.sex)
    .agg(
        dp.len(), # required, otherwise: MakeMeasurement("stable key release requires a `dp.len()` expression")
        pl.col.flipper_length_mm.cast(int).dp.mean((150, 250))
    )
)

In [57]:
# A group_by after a group_by does not work: the released frame has no rows.
query.release().collect()

sex,len,flipper_length_mm
str,u32,f64


# MORE PROPERTIES

## BOOLEAN (null, nan, finite)

In [58]:
# Context for the boolean-predicate example (same settings as the MICRODATA one).
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        dp.polars.Margin(max_length=1000)
    ],
)

In [59]:
# Demonstrates the boolean predicates usable in .filter (comparison,
# is_not_null, is_not_nan, is_finite) ahead of a bounded DP sum.
query = (
    context.query()
    .filter(pl.col("island") == "Torgersen")
    .filter(pl.col.flipper_length_mm >= 185.0)
    .filter(pl.col.flipper_length_mm.is_not_null())
    .filter(pl.col.flipper_length_mm.is_not_nan())
    .filter(pl.col.flipper_length_mm.is_finite())
    .select(
        pl.col("flipper_length_mm")
        .cast(int)
        .fill_null(200)
        .dp.sum(bounds=(185, 250))
    )
)

In [60]:
# Release the noisy sum.
query.release().collect()

flipper_length_mm
i64
8206


## CAST

Cast expressions on grouping columns will void any margin descriptors for those columns.

One case where a cast is useful is when computing a float sum on a large dataset. OpenDP accounts for inexact floating-point arithmetic when computing the float sum, and on data with large bounds and hundreds of thousands of records, this term can dominate the sensitivity.

failed casts do not throw a (data-dependent) exception, instead returning a null. Therefore using this cast operation updates the output domain to indicate that there may potentially be nulls. 

You’ll probably need to apply .fill_null before computing statistics with casted data.

In [61]:
# Context for the CAST / FILL / REPLACE / TO PHYSICAL examples. Most of them
# only call summarize(), which is not a release.
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=3),
    privacy_loss=dp.loss_of(epsilon=2.0, delta=1e-6),
    split_evenly_over=3,
    margins=[
        dp.polars.Margin(max_length=1000)
    ],
)

In [62]:
# Sum on the original float column: the summary reports "Float Laplace".
context.query().select(pl.col.flipper_length_mm.dp.sum((0, 100))).summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""flipper_length_mm""","""Sum""","""Float Laplace""",450.0


In [63]:
# Same sum after casting to int: the summary reports "Integer Laplace".
context.query().select(pl.col.flipper_length_mm.cast(int).dp.sum((0, 100))).summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""flipper_length_mm""","""Sum""","""Integer Laplace""",450.0


## DROP/FILL (nan, nulls)

In [64]:
# Impute NaN/null bill lengths with a constant, then use that column to impute
# NaN/null flipper lengths, and inspect the resulting DP sum.
(
    context.query()
    .with_columns(pl.col("bill_length_mm").fill_nan(0.0).fill_null(0.0))
    .with_columns(
        pl.col("flipper_length_mm")
        .fill_nan(pl.col("bill_length_mm"))
        .fill_null(pl.col("bill_length_mm"))
    )
    .select(pl.col("flipper_length_mm").dp.sum((0, 100)))
    .summarize()
)

column,aggregate,distribution,scale
str,str,str,f64
"""flipper_length_mm""","""Sum""","""Float Laplace""",450.0


## REPLACE

In [65]:
# Inspect a DP sum computed after value replacement on the int-cast column.
(
    context.query()
    .select(
        pl.col.body_mass_g.cast(int)
        .replace(old=[5000, None], new=0) # replace 5000 and None with 0
        .dp.sum((2000, 7000))
    )
    .summarize()
)

column,aggregate,distribution,scale
str,str,str,f64
"""body_mass_g""","""Sum""","""Integer Laplace""",31500.0


In [66]:
# Same inspection using replace_strict with an explicit old -> new mapping.
(
    context.query()
    .select(
        pl.col("body_mass_g")
        .cast(int)
        .replace_strict({5000: 0, 6000: 0})
        .dp.sum((2000, 7000))
    )
    .summarize()
)

column,aggregate,distribution,scale
str,str,str,f64
"""body_mass_g""","""Sum""","""Integer Laplace""",31500.0


## TO PHYSICAL (underlying representation of categorical)

In [67]:
# Bin flipper length, replace each bin by its integer code, and count rows per
# code using explicit keys (one code per bin: len(breaks) + 1 of them).
breaks = [150, 175, 200, 225]
labels = pl.Series(
    name="flipper_length_mm",
    values=list(range(len(breaks) + 1)),
    dtype=pl.UInt32,
)

query = (
    context.query()
    .with_columns(pl.col("flipper_length_mm").cut(breaks=breaks).to_physical())
    .group_by("flipper_length_mm")
    .agg(dp.len())
    .with_keys(pl.LazyFrame([labels]))
)
query.release().collect().sort("flipper_length_mm")

flipper_length_mm,len
u32,u32
0,195
1,8
2,134
3,21
4,0


# SQL

In [68]:
# Context for the SQL-inspired examples: one contribution per individual,
# budget split over two queries (the DP mean and the variance query).
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=1),
    privacy_loss=dp.loss_of(epsilon=5.0, delta=1e-6),
    split_evenly_over=2,
    margins=[
        dp.polars.Margin(max_length=500)
    ],
)

In [69]:
# Translate a SQL aggregate into a polars expression, print that expression,
# and evaluate it on the raw frame as a non-private reference value.
expr = pl.sql_expr("AVG(bill_length_mm) AS avg_bill_length_mm")
print(expr)
lf.select(expr).collect().item() # polars only (no opendp)

col("bill_length_mm").mean().alias("avg_bill_length_mm")


43.92192982456141

In [70]:
# DP counterpart of the SQL AVG above, with bill length bounded to (30.0, 65.0).
query = context.query().select(
    pl.col("bill_length_mm").dp.mean((30.0, 65.0)).alias("avg_bill_length_mm")
)
query.release().collect().item()

44.56523833741533

### Variance
No var in opendp polars.

Wikipedia: https://fr.wikipedia.org/wiki/Variance_(math%C3%A9matiques)

$$
V(X) = E[(X-E(X))^2]
$$
but also
$$
V(X) = E(X^2) - (E(X))^2
$$

In [71]:
# Non-private reference variance, computed with plain polars from a SQL
# expression. The alias names the statistic actually computed (a variance).
expr = pl.sql_expr("VAR(bill_length_mm) AS var_bill_length_mm")
print(expr)
lf.select(expr).collect().item() # polars only (no opendp)

col("bill_length_mm").var().alias("avg_bill_length_mm")


29.80705432937182

In [72]:
# Ingredients for V(X) = E(X^2) - (E(X))^2: the DP mean of the int-cast bill
# length and the DP mean of its square, after imputing NaN/null with 43.0.
var_query = (
    context.query()
    .with_columns(pl.col("bill_length_mm").fill_nan(43.0).fill_null(43.0))
    .select(
        [
            pl.col("bill_length_mm").cast(int).dp.mean((30.0, 60.0)).alias("bl-µ"),
            (pl.col.bill_length_mm.cast(int) * pl.col.bill_length_mm.cast(int))
            .dp.mean(bounds=(30**2, 60**2))
            .alias("bl-s²"),
        ]
    )
)

In [73]:
# Each DP mean is reported as a Sum and a Length, each with its own noise scale.
var_query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bl-µ""","""Sum""","""Integer Laplace""",96.0
"""bl-µ""","""Length""","""Integer Laplace""",1.6
"""bl-s²""","""Sum""","""Integer Laplace""",5760.0
"""bl-s²""","""Length""","""Integer Laplace""",1.6


In [74]:
# Release both means; `res` is reused in the next cell to derive the variance.
res = var_query.release().collect()

In [75]:
# V(X) = E(X^2) - (E(X))^2, computed from the two released means.
# Both terms are noisy, so the raw difference can come out negative; a variance
# cannot, and clamping at 0 is pure post-processing of already released values.
# This estimates the population variance of the int-cast column, so it is not
# expected to match polars' `.var()` exactly.
bl_variance = max(0.0, res["bl-s²"].item() - res["bl-µ"].item() ** 2)
bl_variance

43.69544348296358