# Differential privacy for online advertising

## Prologue

Below, you will find a form of pseudo code we used to pull the data underlying the blog post Criteo published about k-thresholding.

The raw data we base our analysis upon is stored in one Hadoop table (`logs.displays_clicks_sales`) with the following structure:

| field name | data type | description |
|------------|---------- |-------------|
| day | DATE | Date on which the display was shown |
| environment | STRING | environment of the display (web, app, etc.) |
| advertiser | STRING | Advertiser name|
| publisher | STRING | Publisher name |
| ad_size | [INT, INT] | the size of the ad (display height and width) |
| device | STRING | The device the user is on (mobile, desktop, InApp..) |
| displays | BIGINT | Number of displays |
| clicks | BIGINT | Number of clicks |
| sales | BIGINT | Number of sales |

--------------------------

## Setup

In [None]:
# Import statements

import numpy as np
import pandas as pd
import criteopy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import laplace

## Partner data

Pseudo query:

In [None]:
# Pseudo query pulling one day of traffic for a single advertiser, aggregated per
# (day, advertiser, publisher, ad size, device).
# `day` is selected and grouped on because the aggregation cells further down group
# the result by a 'day' column; without it those group-bys cannot find the column.
query = """
    SELECT
        d.day AS day
        , d.advertiser AS advertiser
        , d.publisher AS publisher_domain
        , d.ad_size as ad_size
        , d.device as device
        , SUM(d.displays) as nb_displays
        , SUM(d.clicks) as nb_clicks
        , SUM(d.sales) as nb_sales
    FROM
        logs.displays_clicks_sales d
    WHERE
        d.day ='2020-06-11'
        AND d.advertiser = 'large_advertiser'
    GROUP BY
        d.day
        , d.advertiser
        , d.publisher
        , d.ad_size
        , d.device
"""

In [None]:
# Some sanity checks: show the first five rows of the pulled data.
# NOTE(review): `data_partner` is never assigned in the visible code; presumably it is
# the result of running `query`, as in the commented-out line below -- confirm.
# data_partner = get_query_result(query)
data_partner.head(5)

In [None]:
# Descriptive statistics of the pulled data, as a second sanity check.
data_partner.describe()

## Differential privacy - overall

In [None]:
# Parameters for differential privacy.
# Each value below is the maximum impact a single user can have on the result of the
# query: here, the maximum number of displays, clicks and sales generated by one user.
# The values were chosen arbitrarily; outliers (bots, for instance) would strongly
# increase these numbers.
delta_f_dict = {
    'nb_displays': 20,
    'nb_clicks': 3,
    'nb_sales': 1,
}

# Privacy budget: the higher it is, the more accurate the answer, but the weaker the
# privacy preservation.
epsilon_privacy = 1

In [None]:
# Function to add Laplace noise


def add_noise_to_stat(delta_f, epsilon_privacy):
    """
    Draw one random noise value from a Laplace distribution centred on zero.

    The scale of the distribution is delta_f / epsilon_privacy, i.e. the
    parameter used to ensure differential privacy for a statistic on which one
    user has an impact of at most delta_f, under a privacy budget of
    epsilon_privacy.
    """
    laplace_scale = delta_f / epsilon_privacy
    return np.random.laplace(loc=0.0, scale=laplace_scale)

def add_noise_to_stat_percentile(delta_f, epsilon_privacy, percentile):
    """
    Return the requested percentile of the Laplace noise instead of a random draw.

    The noise distribution is centred on zero with scale delta_f / epsilon_privacy;
    `percentile` is a probability in [0, 1] (e.g. 0.05 or 0.95).
    """
    laplace_scale = delta_f / epsilon_privacy
    return laplace.ppf(percentile, scale=laplace_scale)

In [None]:
def noisy_sum(delta_f, epsilon_privacy):
    """
    Build an aggregation function that sums a KPI and adds Laplace noise.

    The returned callable takes the values of one group and returns their sum
    plus a fresh random noise value drawn with scale delta_f / epsilon_privacy.
    """
    def sum_with_noise(x):
        exact_total = np.sum(x)
        # A new noise value is drawn on every call, so each group gets its own noise.
        noise = add_noise_to_stat(delta_f, epsilon_privacy)
        return exact_total + noise

    return sum_with_noise

def sum_with_precentile_noise(delta_f, epsilon_privacy, percentile):
    """
    Build an aggregation function that sums a KPI and adds a fixed noise percentile.

    The offset is the `percentile` quantile of the Laplace noise with scale
    delta_f / epsilon_privacy. It is computed once, when the aggregation function
    is built, and the same offset is then added to the sum of every group.
    """
    noise_offset = add_noise_to_stat_percentile(delta_f, epsilon_privacy, percentile)

    def precentile_sum_with_noise(x):
        exact_total = np.sum(x)
        return exact_total + noise_offset

    return precentile_sum_with_noise

In [None]:
kpi_of_interest = ['nb_displays', 'nb_clicks', 'nb_sales']
aggregation_level = ['day']

# For every KPI, compute the exact total side by side with its noisy counterpart.
agg_spec = {}
for kpi in kpi_of_interest:
    agg_spec[kpi] = [
        ('Actual_Value', np.sum),
        ('TURTLEDOVE_value', noisy_sum(delta_f_dict[kpi], epsilon_privacy)),
    ]
data_aggregated = data_partner.groupby(aggregation_level).agg(agg_spec)

# Relative error (in percent) of the noisy value with respect to the exact one.
for kpi in kpi_of_interest:
    ratio = data_aggregated[(kpi, 'TURTLEDOVE_value')].div(data_aggregated[(kpi, 'Actual_Value')])
    data_aggregated[(kpi, 'error (%)')] = (ratio - 1) * 100

In [None]:
# Display the overall results with two decimals.
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `Styler.format(precision=...)` is its replacement.
data_aggregated[kpi_of_interest].style.format(precision=2)

## Differential privacy - per domain

In [None]:
kpi_of_interest = ['nb_displays', 'nb_clicks', 'nb_sales']
aggregation_level = ['day', 'publisher_domain']

# Per (day, publisher domain): exact total of each KPI next to its noisy counterpart.
data_domain= data_partner.groupby(aggregation_level).agg(
    {kpi: [('Actual_Value', np.sum), 
           ('TURTLEDOVE_value', noisy_sum(delta_f_dict[kpi], epsilon_privacy))] 
     for kpi in kpi_of_interest})

# Relative error (in percent) of the noisy value with respect to the exact one.
for kpi in kpi_of_interest:
    data_domain[(kpi, 'error (%)')] = 100 * (data_domain[(kpi, 'TURTLEDOVE_value')] / data_domain[(kpi, 'Actual_Value')] -1)

# Show a few domains of decreasing size, ranked by their exact number of displays.
# NOTE(review): positions are zero-based, so 1 is the second-largest domain -- confirm
# this is intended.
row_to_return = [1,10,100,1000]
data_domain_ranked = data_domain[kpi_of_interest].sort_values(by=('nb_displays', 'Actual_Value'), ascending=False)
# `.iloc` raises IndexError on out-of-bounds positions, so only keep the positions
# that exist when there are fewer domains than the largest requested rank.
rows_in_range = [position for position in row_to_return if position < len(data_domain_ranked)]
# `Styler.set_precision` was removed in pandas 2.0; `Styler.format(precision=...)` replaces it.
data_domain_ranked.iloc[rows_in_range,:].style.format(precision=1)

## Differential privacy - per domain - with confidence intervals

In [None]:
# Differentially private reporting relies on noise. Before, we were generating the noise
# on the fly. It is useful to see confidence intervals of the noise we are adding to the
# results. Here we go with a 90% confidence interval (i.e. [0.05, 0.95]).
kpi_of_interest = ['nb_displays', 'nb_clicks', 'nb_sales']
aggregation_level = ['day', 'publisher_domain']

# Per (day, publisher domain): exact total of each KPI, plus the total shifted by the
# 5th and 95th percentiles of the noise.
data_domain= data_partner.groupby(aggregation_level).agg(
    {kpi: [('Actual_Value', np.sum), 
           ('TURTLEDOVE_value_down', sum_with_precentile_noise(delta_f_dict[kpi], epsilon_privacy, 0.05)),
           ('TURTLEDOVE_value_up', sum_with_precentile_noise(delta_f_dict[kpi], epsilon_privacy, 0.95))
          ] for kpi in kpi_of_interest})

# Width of the interval, expressed as the upper bound relative to the lower bound (in percent).
for kpi in kpi_of_interest:
    data_domain[(kpi, 'confidence interval error (%)')] = 100 * (data_domain[(kpi, 'TURTLEDOVE_value_up')] / data_domain[(kpi, 'TURTLEDOVE_value_down')] -1)

# Show a few domains of decreasing size, ranked by their exact number of displays.
row_to_return = [1, 10, 100, 1000]
data_domain_ranked = data_domain[kpi_of_interest].sort_values(by=('nb_displays', 'Actual_Value'), ascending=False)
# `.iloc` raises IndexError on out-of-bounds positions, so only keep the positions
# that exist when there are fewer domains than the largest requested rank.
rows_in_range = [position for position in row_to_return if position < len(data_domain_ranked)]
# `Styler.set_precision` was removed in pandas 2.0; `Styler.format(precision=...)` replaces it.
data_domain_ranked.iloc[rows_in_range,:].style.format(precision=1)