# Mes premières requêtes solo

In [None]:
%pip install opendp
%pip install 'opendp[polars]'
%pip install hvplot

In [166]:
import matplotlib.pyplot as plt
import opendp.prelude as dp
import pandas as pd
import polars as pl
import seaborn as sns

# Opt in to OpenDP's "contrib" feature set before any query is built.
dp.enable_features("contrib")

# Lazily scan the California PUMS example CSV shipped with OpenDP. The first
# line is treated as data (has_header=False) and parsing errors are ignored.
pums_csv_path = dp.examples.get_california_pums_path()
df = pl.scan_csv(pums_csv_path, has_header=False, ignore_errors=True)

In [167]:
# Public metadata about the dataset, stated up front without inspecting any record.

# the greatest number of records that any one individual can influence in the dataset
max_influence = 1
# establish public information: the column names, in file order
# (they are matched positionally against the columns of the header-less CSV)
col_names = ["age", "sex", "educ", "race", "income", "married"]
# we can also reasonably intuit that age and income will be numeric,
#     as well as bounds for them, without looking at the data
# each pair is (lower bound, upper bound)
age_bounds = (0, 100)
income_bounds = (0, 150_000)

In [168]:
# Materialise the scan once so the positional column names can be read.
pums_eager = df.collect()

# Fail loudly on a column-count mismatch: zip() stops at the shorter sequence,
# which would silently leave some columns with their placeholder names.
if len(pums_eager.columns) != len(col_names):
    raise ValueError(
        f"expected {len(col_names)} columns, found {len(pums_eager.columns)}"
    )

# Rename the columns positionally, then go back to a LazyFrame, which is what
# the OpenDP context built below takes as its data.
df = pums_eager.rename(dict(zip(pums_eager.columns, col_names))).lazy()

df.collect_schema()

Schema([('age', Int64),
        ('sex', Int64),
        ('educ', Int64),
        ('race', Int64),
        ('income', Int64),
        ('married', Int64)])

In [None]:
# Collect once and reuse the frame for both outputs. A bare expression is only
# rendered when it is the last statement of a cell, so the shape is printed
# explicitly instead of being silently discarded.
pums_preview = df.collect()
print(pums_preview.shape)
pums_preview.head()

# OpenDP

In [169]:
# `data` must be a Polars LazyFrame.
# NOTE: epsilon=10.0 is above the range OpenDP recommends (it warns that epsilon
# should be <= 5 and is typically <= 1). The value is kept unchanged here;
# lower it for anything other than experimentation.
context = dp.Context.compositor(
    data=df,
    # reuse the public bound declared with the rest of the dataset metadata
    privacy_unit=dp.unit_of(contributions=max_influence),
    privacy_loss=dp.loss_of(epsilon=10.0, delta=1e-7),
    # the total budget is divided equally between at most 10 releases
    split_evenly_over=10,
    margins={
        # when data is not grouped (empty tuple)...
        (): dp.polars.Margin(
            # ...the biggest (and only) partition is no larger than this
            #    public upper bound on the number of rows in the dataset
            max_partition_length=10_000
        ),
        # partition keys when grouped by "race" are invariant
        ("race",): dp.polars.Margin(
            public_info="keys",
        ),
    },
)


epsilon should be less than or equal to 5, and is typically less than or equal to 1


In [None]:
query = (
    context.query()
    .select(
        pl.col("age")
        .cast(int)
        .fill_null(0)
        # clamp with the public age bounds declared alongside the column names
        .dp.sum(bounds=age_bounds)
        .alias("Somme des ages"),

        dp.len()
        .alias("Taille dataset"),
    )
)

# Both aggregates are released together, so they count as a single query
# against the privacy budget (that query's epsilon is shared between them).
query.summarize(alpha=0.05)

column,aggregate,distribution,scale,accuracy
str,str,str,f64,f64
"""Somme des ages""","""Sum""","""Integer Laplace""",200.0,599.64583
"""Taille dataset""","""Frame Length""","""Integer Laplace""",2.0,6.429605


In [None]:
# DP release of the sum and the count; the mean is derived from the two
# released values afterwards.
noisy_stats = query.release().collect()
noisy_stats = noisy_stats.with_columns(
    Moyenne=pl.col("Somme des ages") / pl.col("Taille dataset")
)
print("ages bruitées :", noisy_stats)

# The same two aggregates computed directly on the data, without noise.
exact_stats = df.select(pl.col("age").sum(), pl.len()).collect()
print("ages :", exact_stats)

In [None]:
# Candidate values among which each DP quantile is selected.
candidates = list(range(20, 60))
# Quantile levels to estimate: first quartile, median, third quartile.
quantile_levels = [0.25, 0.5, 0.75]

# One DP quantile expression per level, all released in the same query.
quantile_exprs = [
    pl.col.age.fill_null(0).dp.quantile(level, candidates).alias(f"{level}-Quantile")
    for level in quantile_levels
]
query = context.query().select(quantile_exprs)

query.summarize(alpha=0.05)

In [None]:
# Run the DP release for the `query` currently bound (the quantile query when
# the cells are executed in order) and collect the result for display.
query.release().collect()

In [None]:
# DP count of records for each value of "race".
query = context.query().group_by("race").agg(dp.len())

# Print the noise scale / accuracy summary at alpha=0.05, then perform the
# release and keep the collected frame for the cells that follow.
summary = query.summarize(alpha=0.05)
print(summary)
result = query.release().collect()

In [None]:
# Compare the DP counts per "race" value with the counts in the original data.

# seaborn works on pandas objects, so convert the released Polars frame.
noisy_counts = result.to_pandas()

fig, axes = plt.subplots(2, 1, figsize=(10, 8))

# Top panel: the released counts are already aggregated, so they are drawn as
# bars directly instead of being expanded into one row per counted record and
# counted a second time.
sns.barplot(x='race', y='len', data=noisy_counts, ax=axes[0])
axes[0].set_ylabel('count')  # same y-axis label as the count plot below
axes[0].set_title('Barplot - race (comptes bruités)')

# Bottom panel: counts taken from the original, non-private data.
sns.countplot(x='race', data=df.collect().to_pandas(), ax=axes[1])
axes[1].set_title('Barplot - race (original)')

plt.tight_layout()
plt.show()


In [None]:
query = (
    context.query()
    .group_by("race")
    .agg(dp.len())
)

print(query.summarize(alpha=0.05))
# Release once and keep the collected frame: every call to release() spends
# another share of the privacy budget and draws fresh noise, so the stored
# frame is what any later cell should reuse.
result = query.release().collect()
result

In [None]:
# Compare the DP counts per "race" value with the counts in the original data.

# Reuse the stored release `result` rather than calling query.release() again:
# a new release would spend another share of the privacy budget and draw fresh
# noise, so the plot would no longer match the counts already obtained.
noisy_counts = result.to_pandas()

fig, axes = plt.subplots(2, 1, figsize=(10, 8))

# Top panel: the released counts are already aggregated, so they are drawn as
# bars directly instead of being expanded into one row per counted record and
# counted a second time.
sns.barplot(x='race', y='len', data=noisy_counts, ax=axes[0])
axes[0].set_ylabel('count')  # same y-axis label as the count plot below
axes[0].set_title('Barplot - race (comptes bruités)')

# Bottom panel: counts taken from the original, non-private data.
sns.countplot(x='race', data=df.collect().to_pandas(), ax=axes[1])
axes[1].set_title('Barplot - race (original)')

plt.tight_layout()
plt.show()

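Pour passer du mécanisme de Laplace au mécanisme gaussien, remplacer `dp.loss_of(epsilon=1.0)` par `dp.loss_of(rho=0.1)` dans la définition du contexte (la perte de confidentialité est alors mesurée en zCDP).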