In [1]:
%%capture
pip install 'opendp[polars]'

# Variance

In [2]:
import opendp.prelude as dp
import polars as pl

dp.enable_features("contrib")

In [3]:
PATH = "penguin.csv"

In [4]:
lf = pl.scan_csv(PATH, ignore_errors=True)
lf

In [5]:
lf.collect().head(2)

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
str,str,f64,f64,f64,f64,str
"""Adelie""","""Torgersen""",39.1,18.7,181.0,3750.0,"""MALE"""
"""Adelie""","""Torgersen""",39.5,17.4,186.0,3800.0,"""FEMALE"""


No var in opendp polars.

Wikipedia: https://fr.wikipedia.org/wiki/Variance_(math%C3%A9matiques)

$$
V(X) = E[(X-E(X))^2)]
$$
but also
$$
V(X) = E(X^2) - (E(X))^2
$$

In [6]:
expr = pl.sql_expr("VAR(bill_length_mm) AS avg_bill_length_mm")
print(expr)
lf.select(expr).collect().item() # polars only (no opendp)

col("bill_length_mm").var().alias("avg_bill_length_mm")


29.80705432937182

In [9]:
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=1),
    privacy_loss=dp.loss_of(epsilon=5.0, delta=1e-6),
    split_evenly_over=2,
    margins=[
        dp.polars.Margin(max_length=500)
    ],
)

In [10]:
var_query = (
    context.query()
    .with_columns(pl.col.bill_length_mm.fill_nan(43.0).fill_null(43.0))
    .select([
        pl.col.bill_length_mm.cast(int).dp.mean((30.0, 60.0)).alias("bl-µ"),
        (
            pl.col("bill_length_mm").cast(int)*pl.col("bill_length_mm").cast(int)
        ).dp.mean(bounds=(30**2, 60**2)).alias('bl-s²')
    ])
)

In [11]:
var_query.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bl-µ""","""Sum""","""Integer Laplace""",96.0
"""bl-µ""","""Length""","""Integer Laplace""",1.6
"""bl-s²""","""Sum""","""Integer Laplace""",5760.0
"""bl-s²""","""Length""","""Integer Laplace""",1.6


In [12]:
res = var_query.release().collect()

In [13]:
bl_variance = res["bl-s²"].item() - res["bl-µ"].item()**2
bl_variance

15.56016520091589

In [14]:
context = dp.Context.compositor(
    data=lf,
    privacy_unit=dp.unit_of(contributions=1),
    privacy_loss=dp.loss_of(epsilon=5.0, delta=1e-6),
    split_evenly_over=1,
    margins=[
        dp.polars.Margin(max_length=500)
    ],
)

In [15]:
query_sum = (
    context.query()
    .select(pl.col.bill_length_mm.dp.sum((30.0, 60.0)).alias("bl-µ-sum"))
)
query_sum.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bl-µ-sum""","""Sum""","""Float Laplace""",12.0


In [16]:
60/5 # not (60 - 30)/5. check input distance

12.0

In [17]:
var_query_sum = (
    context.query()
    .with_columns(pl.col.bill_length_mm.fill_nan(43.0).fill_null(43.0))
    .select([
        pl.col.bill_length_mm.dp.sum((30.0, 60.0)).alias("bl-µ-sum"),
        (
            pl.col("bill_length_mm")*pl.col("bill_length_mm")
        ).dp.sum(bounds=(30**2, 60**2)).alias('bl-s²-sum')
    ])
)
var_query_sum.summarize()

column,aggregate,distribution,scale
str,str,str,f64
"""bl-µ-sum""","""Sum""","""Float Laplace""",24.0
"""bl-s²-sum""","""Sum""","""Float Laplace""",1440.0


In [18]:
60/(5/2), (60**2)/(5/2)

(24.0, 1440.0)

In [19]:
res = var_query_sum.release().collect()
res

bl-µ-sum,bl-s²-sum
f64,f64
15149.251747,674640.840514


In [20]:
NB_ROW = 342
bl_variance = res["bl-s²-sum"].item()/NB_ROW - (res["bl-µ-sum"].item()/NB_ROW)**2
bl_variance

10.493305290777016

Issue with variance: noise scales to $upper^2$ (opendp) or $(upper - lower)^2$ (diffprivlib).

opendp: splits in sub queries (like: sum + length). 
- sum has a bigger sensitivity 
- but then added noise also divided by N.

snsql: https://github.com/opendp/smartnoise-sdk/blob/main/sql/snsql/sql/private_rewriter.py#L72 
- also doing $V(X) = E(X^2) - (E(X))^2$ still not sure how they add the noise but i guess like opendp due to query rewriting

diffprivlib: https://github.com/IBM/differential-privacy-library/blob/main/diffprivlib/tools/utils.py#L450
- sensitivity=((upper - lower) / array.size) ** 2 * (array.size - 1), # no subquery
- lower=0,
- upper=((upper - lower) ** 2) / 4 # why /4?

google dp: https://github.com/google/differential-privacy/blob/main/go/dpagg/variance.go#L30  idk where they add the noise and how for now..
- https://github.com/google/differential-privacy/blob/main/go/dpagg/variance.go#L30 $(upper - lower)^2/4$

R DPpack: they divide by N https://github.com/cran/DPpack/blob/master/R/DataAccess.R#L67C39-L67C40  # no subquery

Statistical Properties of Sanitized Results from DifferentiallyPrivate Laplace Mechanism with Univariate Bounding Constraints
Fang Liu - https://arxiv.org/pdf/1607.08554

Page 6: For example, the global sensitivity of the sample mean of a variable the value of which is bounded within $[c0, c1]$ is $(c1 − c0)/n$, and that of the sample variance is $(c1 −c0)^2/n$.