# Global Differential Privacy Assessment: Bivariate Case

In this notebook, we further explore the bivariate case of Global Differential Privacy (GDP). After running this notebook, you will be able to:
- Understand and apply Gaussian-based Global Differential Privacy in the bivariate case.

We also run a Monte Carlo simulation to validate the GDP implementation.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, wget, shutil, math
from math import sqrt
from scipy.stats import pearsonr
from itertools import combinations
from rich import print as pprint
import warnings, random
warnings.filterwarnings("ignore")
from scipy.stats import multivariate_normal


# Bivariate Differential Privacy
Here we extend the previous 1D (univariate) version to 2D data, going from Height alone to Height and Weight. The rest of the setup is the same as in the previous notebooks.

In [2]:

def generate_dp_noise_cholesky(chol_factor, sensitivity, max_attempts=10000):
    """Draw correlated Gaussian noise by rejection sampling.

    Each attempt draws a standard-normal vector ``z``, maps it through
    ``chol_factor`` (``noise = chol_factor @ z``) and accepts the draw as soon
    as every component of ``noise`` is greater than or equal to the matching
    entry of ``sensitivity``.

    Parameters
    ----------
    chol_factor : array_like, shape (d, d)
        Matrix applied to the standard-normal draw (the callers in this
        notebook pass the Cholesky factor of the noise covariance, d = 2).
    sensitivity : float or array_like broadcastable to shape (d,)
        Component-wise lower bound an accepted draw must reach.
    max_attempts : int, default 10000
        Maximum number of draws before giving up.

    Returns
    -------
    noise : numpy.ndarray, shape (d,)
        The accepted draw, or the last rejected draw if none was accepted.
    success : bool
        ``True`` if a draw was accepted within ``max_attempts``.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1 (there would be no draw to
        return).
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    chol_factor = np.asarray(chol_factor)
    # Size the draw from the factor instead of hard-coding two dimensions.
    dim = chol_factor.shape[-1]

    # NOTE(review): the acceptance test is signed, so accepted noise is never
    # negative when `sensitivity` is non-negative. Confirm whether a magnitude
    # test (np.abs(noise) >= sensitivity) was intended.
    for _ in range(max_attempts):
        noise = chol_factor @ np.random.standard_normal(dim)
        if np.all(noise >= sensitivity):
            return noise, True

    # Best effort: hand back the last draw and let the caller see the failure.
    return noise, False

def bivariate_dp(data, original_output):
    """Perturb a multivariate mean with correlated, sensitivity-bounded noise.

    The sensitivity of each dimension is the largest absolute change of the
    mean when a single observation is left out. The noise covariance is the
    data covariance rescaled so that each dimension's noise standard
    deviation is twice the standard deviation of its leave-one-out means.

    Parameters
    ----------
    data : array_like, shape (d, n_obs)
        One row per variable and one column per observation; this is the
        layout ``np.cov`` assumes and the one the callers in this notebook
        use (d = 2). At least two observations are required.
    original_output : array_like with d entries along axis 0
        The un-noised query result, e.g. shape ``(d,)`` or ``(d, 1)``.

    Returns
    -------
    noisy_outputs : numpy.ndarray
        ``original_output`` with one noise component added per dimension
        (entries stacked along axis 0).
    sensitivities : numpy.ndarray, shape (d,)
        Leave-one-out sensitivity of the mean in each dimension.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two observations.
    """
    data = np.asarray(data)
    n_obs = data.shape[1]
    if n_obs < 2:
        raise ValueError("bivariate_dp needs at least two observations")

    reference = np.asarray(original_output).reshape(-1)

    # Leave-one-OBSERVATION-out means, shape (n_obs, d): row i is the mean of
    # every column except column i, obtained as (total - x_i) / (n_obs - 1).
    # The previous version deleted along axis 0, which removed a variable
    # rather than an observation.
    loo_output = (data.sum(axis=1, keepdims=True) - data).T / (n_obs - 1)

    sensitivities = np.max(np.abs(loo_output - reference), axis=0)
    data_cov = np.cov(data)
    loo_scale = np.std(loo_output, axis=0)

    # Rescale each dimension so the noise std equals 2 * loo_scale while the
    # correlation structure of the data is preserved.
    scale_matrix = np.diag(2 * loo_scale / np.sqrt(np.diag(data_cov)))
    noise_cov = scale_matrix @ data_cov @ scale_matrix

    # noise_cov is symmetric, so eigvalsh returns real eigenvalues; add a
    # small jitter when the matrix is numerically singular so that the
    # Cholesky factorisation below succeeds.
    if np.min(np.linalg.eigvalsh(noise_cov)) <= 1e-10:
        noise_cov += np.eye(noise_cov.shape[0]) * 1e-8

    chol_factor = np.linalg.cholesky(noise_cov)
    # The success flag is discarded on purpose (best effort): when no draw is
    # accepted, the sampler's last draw is used.
    noise, _ = generate_dp_noise_cholesky(chol_factor, sensitivities)
    noisy_outputs = np.stack(
        [original_output[k] + noise[k] for k in range(len(noise))]
    )

    return noisy_outputs, sensitivities

def user_pipeline(data):
    """Example user query: the mean of ``data`` along axis 1.

    The means are wrapped in a leading axis and transposed, so a 2-D input of
    shape ``(d, n)`` yields a column of shape ``(d, 1)``.
    """
    row_means = np.asarray(np.mean(data, axis=1))
    return row_means[np.newaxis, ...].T

def iqr_bounds(data, axis=0):
    """Return the interquartile range and the 1.5 * IQR fences of ``data``.

    Parameters
    ----------
    data : array_like
        Values to summarise.
    axis : int, default 0
        Axis along which the quartiles are computed.

    Returns
    -------
    iqr, lower_bounds, upper_bounds
        ``Q3 - Q1``, ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    first_q, third_q = np.percentile(np.asarray(data), [25, 75], axis=axis)
    spread = third_q - first_q
    return spread, first_q - 1.5 * spread, third_q + 1.5 * spread

def outlier_(value, lower_bounds, upper_bounds):
    """Flag entries of ``value`` lying strictly outside the given bounds.

    All three arguments are converted to arrays and compared element-wise
    (with broadcasting); the result is ``True`` where ``value`` is below
    ``lower_bounds`` or above ``upper_bounds``.
    """
    value, lower_bounds, upper_bounds = (
        np.asarray(arg) for arg in (value, lower_bounds, upper_bounds)
    )
    too_low = np.less(value, lower_bounds)
    too_high = np.greater(value, upper_bounds)
    return np.logical_or(too_low, too_high)

# Validation

We use Monte Carlo simulations to show that the 2D results are differentially private.
Since GDP depends on the influence of a single observation, we run 1000 Monte Carlo replicates for each correlation level (0.0, 0.4, and 0.9), drawing datasets of size 20, 50, 80, 100, and 200, once with one injected outlier and once without. For each replicate, the query is the 2D leave-one-out mean, which without noise would allow reconstruction of individual data points (see notebook 1). By adding multivariate noise, we prevent such reconstruction and make outlier identification impossible.

We then evaluate:

- The ability (or inability) to detect the outlier in 2D space.
- The accuracy of reconstructed 2D means relative to sensitivity.

In [None]:
def run_single_mc_iteration_bivariate(MC, sample_sizes, correlations, outlier=False):
    """Monte Carlo study of leave-one-out reconstruction under bivariate noise.

    For every Monte Carlo iteration and every correlation, a zero-mean,
    unit-variance bivariate normal sample of size ``max(sample_sizes)`` is
    drawn. For each sample size the first ``sample_size`` points are taken,
    every leave-one-out subset is passed through ``user_pipeline`` and
    ``bivariate_dp``, and the individual points are reconstructed from the
    noisy leave-one-out means by solving the linear system
    ``sum_{j != i} x_j = (n - 1) * noisy_mean_i``.

    Parameters
    ----------
    MC : int
        Number of Monte Carlo iterations.
    sample_sizes : sequence of int
        Sample sizes to evaluate.
    correlations : sequence of float
        Correlation between the two variables, one simulation per value.
    outlier : bool, default False
        If true, the first observation of every dataset is overwritten with
        an injected outlier.

    Returns
    -------
    id_dr : dict[int, numpy.ndarray]
        Per sample size, how often each index was flagged as an outlier of
        the first reconstructed dimension (IQR rule). Counts accumulate over
        ``MC * len(correlations)`` runs.
    reconstructed : dict[int, list[numpy.ndarray]]
        Per sample size, the reconstructed ``(sample_size, 2)`` data per run.
    sensitivities_ : dict[int, list[list[numpy.ndarray]]]
        Per sample size and run, the sensitivities returned by
        ``bivariate_dp`` for each leave-one-out subset.
    error_ : dict[int, list[float]]
        Per sample size and run, the mean Euclidean distance between the
        reconstructed and the true points.
    """
    id_dr = {size: np.zeros(size) for size in sample_sizes}
    reconstructed = {size: [] for size in sample_sizes}
    sensitivities_ = {size: [] for size in sample_sizes}
    error_ = {size: [] for size in sample_sizes}

    means = np.zeros(2)
    largest = max(sample_sizes)

    for _ in range(MC):
        for correlation in correlations:
            cov_matrix = np.array([[1.0, correlation],
                                   [correlation, 1.0]])
            master_data = np.random.multivariate_normal(means, cov_matrix, largest).T
            for sample_size in sample_sizes:
                # Copy the slice: a plain slice is a view, so injecting the
                # outlier would also overwrite master_data and leak into the
                # bounds computed for the following sample sizes.
                data = master_data[:, :sample_size].copy()
                if outlier:
                    # NOTE(review): iqr_bounds runs with its default axis=0 on
                    # a (2, n) array, i.e. across the two variables of each
                    # observation; confirm whether axis=1 was intended.
                    _, lower_b, _ = iqr_bounds(data)
                    data[:, 0] = [np.max(lower_b) + 10, np.mean(lower_b) + 10]
                data = data.T  # (sample_size, 2): one row per observation

                ######## PIPELINE ########
                noisy_outputs = []
                sens_outputs = []
                for left_out in range(sample_size):
                    loo_subset = np.delete(data, left_out, axis=0).T  # (2, n - 1)
                    noisy, sens = bivariate_dp(loo_subset, user_pipeline(loo_subset))
                    noisy_outputs.append(noisy.flatten())
                    sens_outputs.append(sens.flatten())
                noisy_outputs = np.array(noisy_outputs)

                # Row i of coeff_mat sums every point except i, which is what
                # (n - 1) * (leave-one-out mean i) equals. Both dimensions are
                # solved in one call (one right-hand-side column each).
                coeff_mat = np.ones((sample_size, sample_size)) - np.eye(sample_size)
                recons = np.linalg.solve(coeff_mat, (sample_size - 1) * noisy_outputs)

                reconstructed[sample_size].append(recons)
                sensitivities_[sample_size].append(sens_outputs)
                error_[sample_size].append(
                    np.mean(np.linalg.norm(recons - data, axis=1))
                )

                # Outlier detection uses the first reconstructed dimension only.
                _, lower_b, upper_b = iqr_bounds(recons[:, 0])
                id_dr[sample_size] += outlier_(recons[:, 0], lower_b, upper_b)
    return id_dr, reconstructed, sensitivities_, error_
           
# Simulation settings shared by the cells below.
MC = 1000
subsample_sizes = [20, 50, 80, 100, 200]
correlations_bivariate = [0.0, 0.4, 0.9]

# Outlier run first, clean baseline second (same order as before, so the
# random stream is consumed identically).
(
    id_dr_OL,
    reconstructed_OL,
    sensitivities_OL,
    error_OL,
) = run_single_mc_iteration_bivariate(
    MC=MC,
    sample_sizes=subsample_sizes,
    correlations=correlations_bivariate,
    outlier=True,
)
(
    id_dr,
    reconstructed,
    sensitivities_,
    error_,
) = run_single_mc_iteration_bivariate(
    MC=MC,
    sample_sizes=subsample_sizes,
    correlations=correlations_bivariate,
    outlier=False,
)




In [None]:
# One colour per sample size.
color_map = dict(zip(
    (20, 50, 80, 100, 200),
    ('#FF6B6B', '#4ECDC4', '#650021', '#FFA07A', '#98FB98'),
))

fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111, projection='3d')

for size in subsample_sizes:
    # Average the sensitivities over axis 1 (the leave-one-out subsets of a
    # run): one (dim 0, dim 1) pair per run.
    run_sens = np.mean(sensitivities_[size], 1)
    run_err = np.array(error_[size])
    ax.scatter(
        run_sens[:, 0], run_sens[:, 1], run_err,
        color=color_map[size],
        s=60,
        alpha=0.7,
        edgecolors='k',
        label=f'n={size}',
    )

ax.set_xlabel("Sensitivity Weight")
ax.set_ylabel("Sensitivity Height")
ax.set_zlabel("MAE")
ax.set_title("Bivariate Sensitivity vs. Error")
ax.legend(title="Subsample Size")
plt.show()


# Sensitivity Vs. MAE

This 3D plot shows the relationship between bivariate sensitivity (for weight and height) and the reconstruction error across different sample sizes.

- X-axis (Sensitivity Weight) and Y-axis (Sensitivity Height) show how sensitive each dimension is to the removal of one individual (leave-one-out).
- Z-axis (MAE) measures the reconstruction error after noise is added: the average Euclidean distance between the reconstructed and the true data points.


### What we see:

- Smaller datasets, like n=20, have higher sensitivity and higher reconstruction error, meaning outliers have a bigger influence.
- As in the 1D case, sensitivity in both dimensions shrinks as the sample size increases, and the reconstruction error stabilizes (clusters get tighter and lower).

Overall, the plot shows that in the 2D case, larger datasets protect privacy better (lower sensitivity) while keeping reconstruction error bounded.

In [None]:
# Pool every sensitivity value (all runs, subsets and dimensions) per sample
# size, then summarise as mean and standard error.
pooled_sens = [np.array(sensitivities_[size]).flatten() for size in subsample_sizes]
mean_sensitivities = [np.mean(values) for values in pooled_sens]
std_errors = [np.std(values) / np.sqrt(len(values)) for values in pooled_sens]

plt.figure(figsize=(14, 8))

plt.errorbar(
    subsample_sizes,
    mean_sensitivities,
    yerr=std_errors,
    fmt='o-',
    capsize=5,
    color='blue',
    label='Mean Sensitivity +/- SE (NO OUTLIER)',
)

# Label every point with its value.
annotation_style = dict(
    textcoords="offset points",
    xytext=(0, 10),
    ha='right',
    fontsize=12,
    color='blue',
)
for size, sens, err in zip(subsample_sizes, mean_sensitivities, std_errors):
    plt.annotate(f"{sens:.2f}+/-{err:.2f}", (size, sens), **annotation_style)

plt.xlabel('Sample Size', fontsize=12)
plt.ylabel('Average Sensitivities of Weight & Height', fontsize=12)
plt.title('Sensitivity vs. Sample Size (+/- Standard Error)', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(subsample_sizes)
plt.legend(fontsize=10, loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(14, 8))

mean_sens_w, mean_sens_h = [], []
std_err_w, std_err_h = [], []

# Last axis of the sensitivity array: index 0 -> "Weight", index 1 -> "Height".
for size in subsample_sizes:
    sens_arr = np.array(sensitivities_[size])
    for dim, (mean_acc, se_acc) in enumerate(
        ((mean_sens_w, std_err_w), (mean_sens_h, std_err_h))
    ):
        dim_values = sens_arr[:, :, dim].flatten()
        mean_acc.append(dim_values.mean())
        se_acc.append(np.std(dim_values) / np.sqrt(len(dim_values)))

# Draw each series followed by its point labels (Weight first, then Height).
series_specs = (
    (mean_sens_w, std_err_w, 'blue', 'Mean Sensitivity +/- SE Weight'),
    (mean_sens_h, std_err_h, 'red', 'Mean Sensitivity +/- SE Height'),
)
for series_mean, series_se, series_color, series_label in series_specs:
    plt.errorbar(
        subsample_sizes, series_mean, yerr=series_se,
        fmt='o-', capsize=5, color=series_color,
        label=series_label,
    )
    for size, sens, err in zip(subsample_sizes, series_mean, series_se):
        plt.annotate(
            f"{sens:.4f}+/-{err:.4f}",
            (size, sens),
            textcoords="offset points",
            xytext=(0, 10),
            ha='right',
            fontsize=12,
            color=series_color,
        )

plt.xlabel('Sample Size', fontsize=12)
plt.ylabel('Seperated Sensitivities of Weight & Height', fontsize=12)
plt.title('Sensitivity vs. Sample Size (+/- Standard Error)', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(subsample_sizes)
plt.legend(fontsize=10, loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
mean_sens1 = []
mean_sens2 = []
std_err1 = []
std_err2 = []

for size in subsample_sizes:
    # Layout is (run, leave-one-out subset, dimension), as produced by
    # run_single_mc_iteration_bivariate.
    sens = np.array(sensitivities_[size])
    # Select the DIMENSION on the last axis. Indexing `sens[:, 0]` would pick
    # leave-one-out subset 0 (both dimensions mixed) instead of dimension 0.
    for dim, (mean_acc, se_acc) in enumerate(
        ((mean_sens1, std_err1), (mean_sens2, std_err2))
    ):
        dim_values = sens[..., dim].ravel()
        mean_acc.append(np.mean(dim_values))
        # Standard error over every pooled value, not just the number of runs.
        se_acc.append(np.std(dim_values) / np.sqrt(dim_values.size))

plt.figure(figsize=(14, 8))

plt.errorbar(
    subsample_sizes, mean_sens1, yerr=std_err1,
    fmt='o-', capsize=5, color='blue',
    label='Mean Sensitivity Weight +/- SE'
)

plt.errorbar(
    subsample_sizes, mean_sens2, yerr=std_err2,
    fmt='s--', capsize=5, color='green',
    label='Mean Sensitivity Height +/- SE'
)

# Weight labels go above the point, Height labels below, to limit overlap.
for size, s1, e1, s2, e2 in zip(subsample_sizes, mean_sens1, std_err1, mean_sens2, std_err2):
    plt.annotate(f"{s1:.4f}+/-{e1:.4f}", (size, s1),
                 textcoords="offset points", xytext=(0, 8),
                 ha='center', fontsize=14, color='blue')
    plt.annotate(f"{s2:.4f}+/-{e2:.4f}", (size, s2),
                 textcoords="offset points", xytext=(0, -12),
                 ha='center', fontsize=14, color='green')

plt.xlabel("Sample Size", fontsize=12)
plt.ylabel("Average Sensitivity", fontsize=12)
plt.title("Sensitivity vs. Sample Size +/- SE", fontsize=14)
plt.grid(True, linestyle="--", alpha=0.7)
plt.xticks(subsample_sizes)
plt.legend(fontsize=10, loc="best")

plt.tight_layout()
plt.show()


In [None]:
# id_dr counts are accumulated once per (Monte Carlo iteration, correlation)
# pair, so that product is the number of runs behind every count.
n_replicates = MC * len(correlations_bivariate)

# NOTE(review): this cell reads `id_dr` (the run WITHOUT an injected outlier).
# If the plot is meant to show detection of the injected outlier at index 0,
# `id_dr_OL` is presumably the dictionary to use; confirm.

# Detection rate (%) of index 0 for each sample size.
detection_rates = (
    np.array([id_dr[size][0] for size in subsample_sizes], dtype=float)
    / n_replicates
) * 100
prc = detection_rates.copy()

means = []
errors = []
pop_size = []

# Mean detection rate (%) over all indices of a sample, with its standard error.
for size in subsample_sizes:
    percentages = (id_dr[size] / n_replicates) * 100
    mean_val = np.mean(percentages)
    se_val = np.std(percentages, ddof=1) / np.sqrt(len(percentages))
    means.append(mean_val)
    errors.append(se_val)
    pop_size.append(size)

plt.figure(figsize=(14, 8))
plt.plot(subsample_sizes, detection_rates,
         marker='o', linestyle='-', color='blue', label='Index 0', linewidth=2, markersize=8)

# detection_rates is positional (one entry per sample size), so pair it with
# the sizes instead of indexing it by the size value.
for size, rate in zip(subsample_sizes, detection_rates):
    plt.annotate(f"{rate:.4f}%", (size, rate),
                 textcoords="offset points", xytext=(0, 10),
                 ha='right', fontsize=12, color='blue')

plt.errorbar(pop_size, means, yerr=errors, fmt='-o', capsize=5,
             color='red', label='Sample', linewidth=2, markersize=8, elinewidth=2)

for x, y, err in zip(pop_size, means, errors):
    plt.annotate(f"{y:.4f}% ± {err:.4f}",
                 xy=(x, y), xytext=(0, 10),
                 textcoords='offset points', ha='left',
                 fontsize=12, color='red',
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

plt.xlabel('Sample Size', fontsize=12, fontweight='bold')
plt.ylabel('Outlier detection rate (%)', fontsize=12, fontweight='bold')
plt.title(f'Outlier detection rates over {n_replicates} runs', fontsize=14, pad=20)
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(subsample_sizes)
plt.legend(loc='best', fontsize=12)
plt.tight_layout()
plt.show()