In [1]:
%%capture
pip install 'opendp[polars]'

# Explore opendp==0.14 FRAMEWORK API
https://docs.opendp.org/en/stable/getting-started/quickstart.html

In [2]:
import opendp.prelude as dp
import polars as pl

dp.enable_features("contrib")

In [3]:
# Location of the penguin CSV file scanned by the cells below.
PATH = "penguin.csv"

In [4]:
# Lazily scan the CSV; unparsable values are tolerated instead of raising.
lf = pl.scan_csv(PATH, ignore_errors=True)
# Limit to two rows *before* collecting so the preview does not materialise the whole file.
lf.head(2).collect()

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,f64,f64,str
"""Adelie""","""Torgersen""",39.1,18.7,181.0,3750.0,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE"""


In [5]:
# Re-read the CSV (this time without ignore_errors), materialise it once, and wrap the
# in-memory frame back into a LazyFrame. This rebinds `lf` for every cell below.
lf = pl.scan_csv(PATH).collect().lazy()

## Framework API (currently in Lomas)

In [6]:
from opendp_helper import get_raw_lf_domain, add_global_margin, add_group_by_margin
from opendp import measures as ms
import yaml

In [7]:
# Load the dataset metadata (YAML) used to build the OpenDP domain.
metadata_path = "penguin_metadata.yaml"
# Explicit encoding so the file is decoded the same way on every platform.
with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = yaml.safe_load(f)

In [8]:
# Margin without a `by` list: bounds the frame length with the "rows" entry of the metadata.
margin = dp.polars.Margin(
    max_length=metadata["rows"],
    invariant="keys",
)
# Domain derived from the metadata, with the margin above attached.
lf_domain = dp.with_margin(get_raw_lf_domain(metadata), margin)

In [9]:
def opdp_full_pipe(plan, lf, lf_domain):
    """Build a private-LazyFrame measurement for `plan`, then print its cost and one release.

    Args:
        plan: query plan handed to ``dp.m.make_private_lazyframe``.
        lf: data the measurement is invoked on.
        lf_domain: input domain given to the measurement constructor.

    Returns:
        None. The cost (for ``d_in=1``) and the collected release are only printed.
    """
    measurement = dp.m.make_private_lazyframe(
        lf_domain,
        dp.symmetric_distance(),
        ms.max_divergence(),
        plan,
    )

    # Privacy cost of the measurement for an input distance of 1.
    cost = measurement.map(d_in=1)
    print(f"Cost: {cost}")

    # Invoke the measurement on the data and materialise the result.
    release_data = measurement(lf).collect()
    print(f"Release: {release_data}")

In [10]:
# Display the domain built above (column types and margins).
lf_domain

FrameDomain(species: str, island: str, bill_length_mm: f64, bill_depth_mm: f64, flipper_length_mm: f64, body_mass_g: f64, sex: str; margins=[{}])

In [11]:
# Noisy row count for every (species, sex) group.
plan = lf.group_by("species", "sex").agg(dp.len(scale=1))

In [12]:
# Attach a margin over (species, sex) whose keys are declared invariant.
# NOTE: the name contains a typo ("goup"); it is kept because the next cell references it.
lf_domain_goup = dp.with_margin(
    lf_domain,
    dp.polars.Margin(by=["species", "sex"], invariant="keys"),
)

In [13]:
# Run the grouped count against the domain that carries the (species, sex) margin.
opdp_full_pipe(plan, lf, lf_domain_goup)

Cost: 1.0
Release: shape: (8, 3)
┌───────────┬────────┬─────┐
│ species   ┆ sex    ┆ len │
│ ---       ┆ ---    ┆ --- │
│ str       ┆ str    ┆ u32 │
╞═══════════╪════════╪═════╡
│ Gentoo    ┆ MALE   ┆ 62  │
│ Adelie    ┆ null   ┆ 5   │
│ Adelie    ┆ MALE   ┆ 70  │
│ Adelie    ┆ FEMALE ┆ 74  │
│ Chinstrap ┆ FEMALE ┆ 34  │
│ Gentoo    ┆ null   ┆ 7   │
│ Gentoo    ┆ FEMALE ┆ 58  │
│ Chinstrap ┆ MALE   ┆ 34  │
└───────────┴────────┴─────┘


# ESSENTIAL

## COUNT

In [14]:
# Count rows in frame
plan = lf.select(
    dp.len(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────┐
│ len │
│ --- │
│ u32 │
╞═════╡
│ 344 │
└─────┘


In [15]:
# Count rows in the `sex` column (nulls included)
plan = lf.select(
    pl.col.sex.dp.len(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────┐
│ sex │
│ --- │
│ u32 │
╞═════╡
│ 345 │
└─────┘


In [16]:
# Count rows in the `sex` column (nulls excluded)
plan = lf.select(
    pl.col.sex.dp.count(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────┐
│ sex │
│ --- │
│ u32 │
╞═════╡
│ 332 │
└─────┘


In [17]:
# Count only the null rows of the `sex` column
plan = lf.select(
    pl.col.sex.dp.null_count(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────┐
│ sex │
│ --- │
│ u32 │
╞═════╡
│ 12  │
└─────┘


In [18]:
# Count distinct values in the `sex` column (nulls included)
plan = lf.select(
    pl.col.sex.dp.n_unique(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────┐
│ sex │
│ --- │
│ u32 │
╞═════╡
│ 7   │
└─────┘


In [19]:
# Count distinct values in the `body_mass_g` column (nulls included)
plan = lf.select(
    pl.col.body_mass_g.dp.n_unique(scale=1.0),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.0
Release: shape: (1, 1)
┌─────────────┐
│ body_mass_g │
│ ---         │
│ u32         │
╞═════════════╡
│ 94          │
└─────────────┘


In [20]:
# All six counting queries from the cells above in a single release.
plan = lf.select(
    dp.len(scale=1.0).alias("rows_in_frame"),
    pl.col.sex.dp.len(scale=1.0).alias("rows_in_sex"),
    pl.col.sex.dp.count(scale=1.0).alias("rows_in_sex_no_null"),
    pl.col.sex.dp.null_count(scale=1.0).alias("rows_in_sex_null"),
    pl.col.sex.dp.n_unique(scale=1.0).alias("rows_in_sex_unique"),
    pl.col.body_mass_g.dp.n_unique(scale=1.0).alias("rows_in_mass_unique"),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 6.0
Release: shape: (1, 6)
┌───────────────┬─────────────┬─────────────────┬────────────────┬────────────────┬────────────────┐
│ rows_in_frame ┆ rows_in_sex ┆ rows_in_sex_no_ ┆ rows_in_sex_nu ┆ rows_in_sex_un ┆ rows_in_mass_u │
│ ---           ┆ ---         ┆ null            ┆ ll             ┆ ique           ┆ nique          │
│ u32           ┆ u32         ┆ ---             ┆ ---            ┆ ---            ┆ ---            │
│               ┆             ┆ u32             ┆ u32            ┆ u32            ┆ u32            │
╞═══════════════╪═════════════╪═════════════════╪════════════════╪════════════════╪════════════════╡
│ 344           ┆ 347         ┆ 333             ┆ 11             ┆ 3              ┆ 95             │
└───────────────┴─────────────┴─────────────────┴────────────────┴────────────────┴────────────────┘


## SUM

In [21]:
# Noisy sum of flipper lengths (>= 185 mm) on Torgersen island.
torgersen_long_flippers = (
    lf
    .filter(pl.col.island == "Torgersen")
    .filter(pl.col.flipper_length_mm >= 185.0)
)
plan = torgersen_long_flippers.select(
    # A scale always has to be supplied here.
    pl.col.flipper_length_mm.cast(int).fill_null(200).dp.sum(bounds=(185, 250), scale=100)
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 2.5
Release: shape: (1, 1)
┌───────────────────┐
│ flipper_length_mm │
│ ---               │
│ i64               │
╞═══════════════════╡
│ 8567              │
└───────────────────┘


## MEAN

In [22]:
# Ingredients of a mean: a noisy sum and a noisy row count, released together.
plan = lf.filter(pl.col.island == "Torgersen").select(
    pl.col.flipper_length_mm.cast(int)
    .dp.sum(bounds=(150, 250), scale=100)
    .alias("sum_fl"),
    dp.len(scale=1).alias("nb_row"),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 3.5
Release: shape: (1, 2)
┌────────┬────────┐
│ sum_fl ┆ nb_row │
│ ---    ┆ ---    │
│ i64    ┆ u32    │
╞════════╪════════╡
│ 9901   ┆ 50     │
└────────┴────────┘


In [23]:
# Same statistic through the built-in DP mean, over the whole frame.
plan = lf.select(pl.col.flipper_length_mm.cast(int).dp.mean(bounds=(150, 250), scale=100))
opdp_full_pipe(plan, lf, lf_domain)

Cost: 2.5100000000000002
Release: shape: (1, 1)
┌───────────────────┐
│ flipper_length_mm │
│ ---               │
│ f64               │
╞═══════════════════╡
│ 203.805882        │
└───────────────────┘


## MEDIAN

In [24]:
# Candidate values the DP median may return: the integers 150..249.
candidates = list(range(150, 250))

plan = lf.filter(pl.col.island == "Torgersen").select(
    pl.col.flipper_length_mm.dp.median(candidates, scale=10)
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 0.2
Release: shape: (1, 1)
┌───────────────────┐
│ flipper_length_mm │
│ ---               │
│ i64               │
╞═══════════════════╡
│ 191               │
└───────────────────┘


## QUANTILES

In [25]:
# One DP quantile per level; `candidates` comes from the median cell above.
plan = lf.filter(pl.col.island == "Torgersen").select(
    [
        pl.col.flipper_length_mm.cast(int)
        .dp.quantile(a, candidates, scale=10)
        .alias(f"{a}-Quantile")
        for a in [0.25, 0.5, 0.75]
    ]
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 1.4000000000000004
Release: shape: (1, 3)
┌───────────────┬──────────────┬───────────────┐
│ 0.25-Quantile ┆ 0.5-Quantile ┆ 0.75-Quantile │
│ ---           ┆ ---          ┆ ---           │
│ i64           ┆ i64          ┆ i64           │
╞═══════════════╪══════════════╪═══════════════╡
│ 187           ┆ 212          ┆ 193           │
└───────────────┴──────────────┴───────────────┘


# GROUPING

## STABLE KEYS 
--> not possible with the Framework API? (spends delta: only groups that are big enough are shown)

## EXPLICIT KEYS (does not spend delta) 
--> not sure how to do this in the Framework API

## INVARIANT GROUP KEYS
--> OK with the Framework API, but the code from the previous OpenDP version needs to be adapted (its group-by margin is not compatible)

In [26]:
# Noisy row count for every (species, sex) group.
plan = lf.group_by("species", "sex").agg(dp.len(scale=1))

In [27]:
# Attach a margin over (species, sex) whose keys are declared invariant.
lf_domain_group = dp.with_margin(
    lf_domain,
    dp.polars.Margin(by=["species", "sex"], invariant="keys"),
)

In [28]:
opdp_full_pipe(plan, lf, lf_domain_group) # should work: the margin declares the group-by keys as public information (invariant="keys")

Cost: 1.0
Release: shape: (8, 3)
┌───────────┬────────┬─────┐
│ species   ┆ sex    ┆ len │
│ ---       ┆ ---    ┆ --- │
│ str       ┆ str    ┆ u32 │
╞═══════════╪════════╪═════╡
│ Adelie    ┆ null   ┆ 7   │
│ Adelie    ┆ FEMALE ┆ 74  │
│ Adelie    ┆ MALE   ┆ 76  │
│ Chinstrap ┆ FEMALE ┆ 34  │
│ Gentoo    ┆ FEMALE ┆ 60  │
│ Gentoo    ┆ null   ┆ 4   │
│ Chinstrap ┆ MALE   ┆ 36  │
│ Gentoo    ┆ MALE   ┆ 61  │
└───────────┴────────┴─────┘


## INVARIANT GROUP LENGTHS
Not possible to preprocess in Lomas for now anyway

# MICRODATA

## WITH COLUMNS
--> I don't see how it would be possible in the Framework API to set the margins for this,
because a margin has to refer to a column that is known beforehand, so we cannot group by columns created during the pipeline.

## SELECT
`.select` resolves each passed expression to a column and then returns those columns.

## FILTER
.filter uses row-by-row expressions of booleans to mask rows.

Filtering discards all invariants about the group keys and group sizes. Margin descriptors are considered applicable for the input dataset, so a data-dependent filtering renders these invariants invalid.

Otherwise, filtering preserves all other margin descriptors, because filtering only ever removes rows.

In [29]:
# Cast via with_columns, filter on island, then take a noisy sum.
flipper_as_int = pl.col.flipper_length_mm.cast(int)
plan = (
    lf
    .with_columns(flipper_as_int)
    .filter(pl.col.island == "Torgersen")
    .select(pl.col.flipper_length_mm.dp.sum(bounds=(150, 250), scale=10))
)

In [30]:
# Print the privacy cost and one release of the plan defined in the previous cell.
opdp_full_pipe(plan, lf, lf_domain)

Cost: 25.0
Release: shape: (1, 1)
┌───────────────────┐
│ flipper_length_mm │
│ ---               │
│ i64               │
╞═══════════════════╡
│ 9948              │
└───────────────────┘


## GROUP BY (PRIVATE)
--> As before, I don't see how it would be possible in the Framework API to set the margins for this,
because a margin has to refer to a column that is known beforehand, so we cannot group by columns created during the pipeline.

## GROUP BY (STABLE)

In [36]:
# Exact (non-DP) mean per (species, sex) group, followed by a DP mean over those group means.
group_means = lf.group_by("species", "sex").agg(
    pl.col.flipper_length_mm.mean().alias("mean_fl_species_sex")
)
plan = group_means.select(
    pl.col.mean_fl_species_sex.cast(int).dp.mean(bounds=(150, 250), scale=10)
)

In [37]:
# Print the privacy cost and one release of the plan defined in the previous cell.
opdp_full_pipe(plan, lf, lf_domain)

Cost: 50.2
Release: shape: (1, 1)
┌─────────────────────┐
│ mean_fl_species_sex │
│ ---                 │
│ f64                 │
╞═════════════════════╡
│ 69.391304           │
└─────────────────────┘


Inform the context of the number of users in each group, depending on the group-by.

In [48]:
# Exact mean per (species, sex) group, then a DP count and DP mean per sex over those rows.
species_sex_means = lf.group_by("species", "sex").agg(pl.col.flipper_length_mm.mean())
plan = species_sex_means.group_by(pl.col.sex).agg(
    dp.len(scale=1),
    pl.col.flipper_length_mm.cast(int).dp.mean(bounds=(150, 250), scale=10),
)

In [54]:
# Attach a margin over `sex` whose keys are declared invariant (rebinds lf_domain_group).
lf_domain_group = dp.with_margin(
    lf_domain,
    dp.polars.Margin(by=["sex"], invariant="keys"),
)

In [55]:
opdp_full_pipe(plan, lf, lf_domain_group) # TODO: raises with the Framework API (the key-set of "sex" is reported as private) although the same query works with the Context API; cause not yet understood

OpenDPException: 
  MakeMeasurement("The key-set of {col("sex")} is private and cannot be released without a filter or join. Please pass a filtering threshold into make_private_lazyframe or conduct a join against a public key-set.")

# MORE PROPERTIES

## BOOLEAN (null, nan, finite)

In [56]:
# Boolean predicates (not-null, not-NaN, finite) used as row filters ahead of a noisy sum.
flipper = pl.col.flipper_length_mm
plan = (
    lf
    .filter(pl.col.island == "Torgersen")
    .filter(flipper >= 185.0)
    .filter(flipper.is_not_null())
    .filter(flipper.is_not_nan())
    .filter(flipper.is_finite())
    .select(flipper.cast(int).fill_null(200).dp.sum(bounds=(185, 250), scale=10))
)

In [57]:
# Print the privacy cost and one release of the plan defined in the previous cell.
opdp_full_pipe(plan, lf, lf_domain)

Cost: 25.0
Release: shape: (1, 1)
┌───────────────────┐
│ flipper_length_mm │
│ ---               │
│ i64               │
╞═══════════════════╡
│ 8491              │
└───────────────────┘


## CAST

Cast expressions on grouping columns will void any margin descriptors for those columns.

A cast is useful, for example, when computing a float sum on a large dataset. OpenDP accounts for inexact floating-point arithmetic when computing the float sum, and on data with large bounds and hundreds of thousands of records, this term can dominate the sensitivity.

Failed casts do not throw a (data-dependent) exception; they return a null instead. Therefore, using this cast operation updates the output domain to indicate that there may be nulls.

You’ll probably need to apply .fill_null before computing statistics with casted data.

In [58]:
# Sum on the raw (f64) column. This only builds the query; it is not run through opdp_full_pipe.
lf.select(pl.col.flipper_length_mm.dp.sum((0, 100), scale=10))

In [60]:
# Same sum after casting the column to int. This only builds the query; it is not run through opdp_full_pipe.
lf.select(pl.col.flipper_length_mm.cast(int).dp.sum((0, 100), scale=10))

## DROP/FILL (nan, nulls)

In [61]:
(
    lf
    # Replace NaNs and nulls in bill_length_mm with a constant.
    .with_columns(pl.col.bill_length_mm.fill_nan(0.0).fill_null(0.0))
    # Replace NaNs and nulls in flipper_length_mm with the (already imputed) bill_length_mm.
    .with_columns(pl.col.flipper_length_mm.fill_nan(pl.col.bill_length_mm).fill_null(pl.col.bill_length_mm))
    # `scale` belongs to dp.sum; passed to .select it would become a literal column named "scale".
    .select(pl.col.flipper_length_mm.dp.sum((0, 100), scale=10))
)

## REPLACE

In [62]:
(
    lf
    .select(
        pl.col.body_mass_g.cast(int)
        .replace(old=[5000, None], new=0) # replace 5000 and None with 0
        .dp.sum((2000, 7000), scale=10)
    )
)

In [63]:
# Map 5000 and 6000 to 0 with replace_strict, then take a noisy sum (query is only built here).
lf.select(
    pl.col.body_mass_g.cast(int)
    .replace_strict({5000: 0, 6000: 0})
    .dp.sum(bounds=(2000, 7000), scale=10)
)

## TO PHYSICAL (underlying representation of categorical)
Again, not sure how to do this with the Framework API.

# SQL

In [65]:
# Translate a SQL aggregate into a Polars expression and show what it was parsed into.
expr = pl.sql_expr("AVG(bill_length_mm) AS avg_bill_length_mm")
print(expr)
# Exact (non-private) reference value for comparison with the DP mean below.
lf.select(expr).collect().item() # polars only (no opendp)

col("bill_length_mm").mean().alias("avg_bill_length_mm")


43.92192982456141

In [66]:
# DP counterpart of the SQL AVG above.
dp_avg = pl.col.bill_length_mm.dp.mean(bounds=(30.0, 65.0), scale=10)
plan = lf.select(dp_avg.alias("avg_bill_length_mm"))
opdp_full_pipe(plan, lf, lf_domain)

Cost: 6.600000000341587
Release: shape: (1, 1)
┌────────────────────┐
│ avg_bill_length_mm │
│ ---                │
│ f64                │
╞════════════════════╡
│ 44.929169          │
└────────────────────┘


### Variance
No var in opendp polars.

Wikipedia: https://fr.wikipedia.org/wiki/Variance_(math%C3%A9matiques)

$$
V(X) = E[(X-E(X))^2]
$$
but also
$$
V(X) = E(X^2) - (E(X))^2
$$

In [67]:
# Translate SQL VAR into a Polars expression and show what it was parsed into.
# NOTE(review): the alias says "avg" but the expression is a variance; consider renaming it.
expr = pl.sql_expr("VAR(bill_length_mm) AS avg_bill_length_mm")
print(expr)
# Exact (non-private) reference value.
lf.select(expr).collect().item() # polars only (no opendp)

col("bill_length_mm").var().alias("avg_bill_length_mm")


29.80705432937182

In [68]:
# Building blocks for V(X) = E(X^2) - (E(X))^2: a DP mean of X and a DP mean of X*X.
# NOTE(review): "bl-s²" holds the mean of the squares, not the variance itself.
plan = (
    lf
    # Impute NaNs and nulls with a constant before casting to int.
    .with_columns(pl.col.bill_length_mm.fill_nan(43.0).fill_null(43.0))
    .select([
        pl.col.bill_length_mm.cast(int).dp.mean((30.0, 60.0), scale=10).alias("bl-µ"),
        (
            pl.col("bill_length_mm").cast(int)*pl.col("bill_length_mm").cast(int)
        ).dp.mean(bounds=(30**2, 60**2), scale=10).alias('bl-s²')
    ])
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 366.20000000000005
Release: shape: (1, 2)
┌───────────┬────────────┐
│ bl-µ      ┆ bl-s²      │
│ ---       ┆ ---        │
│ f64       ┆ f64        │
╞═══════════╪════════════╡
│ 43.080692 ┆ 2002.39697 │
└───────────┴────────────┘


In [69]:
# DP sum of bill_length_mm on the raw float column.
dp_bill_sum = pl.col.bill_length_mm.dp.sum(bounds=(30.0, 60.0), scale=10)
plan = lf.select(dp_bill_sum.alias("bl-µ-sum"))
opdp_full_pipe(plan, lf, lf_domain)

Cost: 6.0000000003153104
Release: shape: (1, 1)
┌──────────────┐
│ bl-µ-sum     │
│ ---          │
│ f64          │
╞══════════════╡
│ 15118.900959 │
└──────────────┘


In [70]:
# DP sum of X and DP sum of X*X on the imputed float column.
bill_filled = lf.with_columns(pl.col.bill_length_mm.fill_nan(43.0).fill_null(43.0))
plan = bill_filled.select(
    pl.col.bill_length_mm.dp.sum(bounds=(30.0, 60.0), scale=10).alias("bl-µ-sum"),
    (pl.col.bill_length_mm * pl.col.bill_length_mm)
    .dp.sum(bounds=(30**2, 60**2), scale=10)
    .alias('bl-s²-sum'),
)
opdp_full_pipe(plan, lf, lf_domain)

Cost: 366.00000001923394
Release: shape: (1, 2)
┌──────────────┬──────────────┐
│ bl-µ-sum     ┆ bl-s²-sum    │
│ ---          ┆ ---          │
│ f64          ┆ f64          │
╞══════════════╪══════════════╡
│ 15113.905619 ┆ 673628.37793 │
└──────────────┴──────────────┘
