## 3. Privacy

In [None]:
import pandas as pd
import numpy as np
from aif360.datasets import AdultDataset
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import math
import random

# Load and preprocess the dataset

In [70]:
# Load Dataset
dataset = AdultDataset()

# Convert to DataFrame
df = dataset.convert_to_dataframe()[0]

# Binarize age
median_age = df['age'].median()
df['age_binary'] = df['age'].apply(lambda x: 0 if x <= median_age else 1)
df = df.drop('age', axis=1)

# Convert categorical variables to dummies (binary)
df = pd.get_dummies(df, drop_first=True)



# Compute Cross-Tabulation with sensitive data

In [71]:
# original Cross-tabulation 
cross_tab_original = pd.crosstab(df['age_binary'], df['sex'])
print("original Cross-tabulation :\n", cross_tab_original)


original Cross-tabulation :
 sex          0.0    1.0
age_binary             
0           8196  14831
1           6499  15696


# Apply local differential privacy and create private data set

Local Differential Privacy (LDP) protects data directly at the individuals (data subjects).

LDP Equation:
$$
\frac{\mathsf{Pr}[F(x) = y]}{\mathsf{Pr}[F(x') =y]} \leq e^\epsilon.
$$

Here, $F$ is an algorithm that randomizes the data.
$\epsilon$ controls the level of privacy:
- A low $\epsilon$ means better privacy but more noise.
- A high $\epsilon$ reduces privacy but improves accuracy.

Objective: Prevent an observer from precisely identifying the original data ($x$) from the randomized data ($y$).

In [73]:
def get_epsilon(p=0.75, q=0.75):
    return math.log( max(q/(1-p), p/(1-q)) )

def rand_resp(x, p=0.75, q=0.75):
    toss = random.random()
    if x == 0:
        y = 0 if toss <= q else 1
    else:
        y = 1 if toss <= p else 0
    return y

In [74]:
# Create a copy of the original DataFrame
df_private = df.copy()

epsilon = get_epsilon()

# Convert variables to private
df_private['age_binary_private'] = df_private['age_binary'].apply(lambda x: rand_resp(x))
df_private['sex_private'] = df_private['sex'].apply(lambda x: rand_resp(x))

print("Data protected with epsilon = ", epsilon)
#print("Data protected : ", df_private.head())
print("age_binary :", df['age_binary'].head(5), "age_binary_private :", df_private['age_binary_private'].head(5))
print("sex :", df['sex'].head(5), "sex_private", df_private['sex_private'].head(5))

Data protected with epsilon =  1.0986122886681098
age_binary : 0    0
1    1
2    0
3    1
5    0
Name: age_binary, dtype: int64 age_binary_private : 0    0
1    1
2    1
3    1
5    1
Name: age_binary_private, dtype: int64
sex : 0    1.0
1    1.0
2    1.0
3    1.0
5    1.0
Name: sex, dtype: float64 sex_private 0    1
1    1
2    1
3    1
5    1
Name: sex_private, dtype: int64


# Compute Cross-Tabulation with private data and error on the estimation 

In [75]:
# Private Cross-tabulation
cross_tab_private = pd.crosstab(df_private['age_binary_private'], df_private['sex_private'])
print("Private Cross-tabulation :\n", cross_tab_private, "\n")

# For better visualization we compute again the first cross-tabulation
# original Cross-tabulation 
cross_tab_original = pd.crosstab(df['age_binary'], df['sex'])
print("Original Cross-tabulation :\n", cross_tab_original)


Private Cross-tabulation :
 sex_private            0      1
age_binary_private             
0                   9617  13343
1                   9027  13235 

Original Cross-tabulation :
 sex          0.0    1.0
age_binary             
0           8196  14831
1           6499  15696


In [76]:
# Calculate the mean absolute error between the original and private cross-tabulations
error = np.abs(cross_tab_original - cross_tab_private).mean().mean()
print(f"Mean absolute error between the original and private cross-tabulations : {error}")

total_original = cross_tab_original.values.sum()
relative_error = (error / total_original) * 100
print(f"Relative Error : {relative_error:.2f}%")


Mean absolute error between the original and private cross-tabulations : 1974.5
Relative Error : 4.37%


Understanding what the error represents :
The mean absolute error measures the average difference between the two cross-tabulations (original and private):

- A **low error** means that the protected data is close to the original data, which is good for accuracy but may compromise privacy.
- A **high error** indicates that privacy is better preserved, but the data is less accurate, which can affect the performance of analyses or models.

With an error of 1986, this means that on average, the values in the private cross-tabulation differ from the original values by 1986 occurrences.

Here we can see a code where epsilon is varied to observe its impact on the mean absolute error and, consequently, on the relative error.

In [77]:
errors = []
relative_errors = []
epsilons = [0.1, 0.5, 1.0, 2.0]
df_private_test = df.copy()

for eps in epsilons:
    df_private_test['Age_private'] = df['age_binary'].apply(lambda x: rand_resp(x, p=np.exp(eps)/(1+np.exp(eps)), q=np.exp(eps)/(1+np.exp(eps))))
    df_private_test['Sex_private'] = df['sex'].apply(lambda x: rand_resp(x, p=np.exp(eps)/(1+np.exp(eps)), q=np.exp(eps)/(1+np.exp(eps))))
    cross_tab_private = pd.crosstab(df_private_test['Age_private'], df_private_test['Sex_private'])
    error = np.abs(cross_tab_original - cross_tab_private).mean().mean()
    errors.append(error)
    total_original = cross_tab_original.values.sum()
    relative_error = (error / total_original) * 100
    relative_errors.append(relative_error)
    #print(f"Relative Error : {relative_error:.2f}%")

for eps, err, rela_err in zip(epsilons, errors, relative_errors):
    print(f"Epsilon: {eps}, Mean Absolute Error: {err}")
    print(f"Relative Error : {rela_err:.2f}%")


Epsilon: 0.1, Mean Absolute Error: 3767.0
Relative Error : 8.33%
Epsilon: 0.5, Mean Absolute Error: 3021.5
Relative Error : 6.68%
Epsilon: 1.0, Mean Absolute Error: 2115.0
Relative Error : 4.68%
Epsilon: 2.0, Mean Absolute Error: 962.0
Relative Error : 2.13%


Estimate how many people exist in value combinations of the two sensitive attributes, and quantify the errors in the estimation. 

In [78]:
n_rep_sex = np.sum(df_private['sex_private'])
n_rep_age = np.sum(df_private['age_binary_private'])

n_people = len(df['sex'])
# Step 4: Estimate the number of people in each value combination of the two sensitive attributes
n_est_sex = 2*n_rep_sex - 0.5*n_people
n_est_age = 2*n_rep_age - 0.5*n_people

print(f"Estimation corrigée pour Sex : {n_est_sex}")
print(f"Estimation corrigée pour Age : {n_est_age}")

# Step 5: Quantify the errors in the estimation
error = np.sum(cross_tab_private, axis=0) - np.sum(cross_tab_original, axis=0)
print("Total error in the estimation : ", error)

Estimation corrigée pour Sex : 30545.0
Estimation corrigée pour Age : 21913.0
Total error in the estimation :  Sex_private
0    1924
1   -1924
dtype: int64


Pas trop compris avec l'erreur ??
J'ai essayé quelque chose mais je vous laisse voir

In [79]:
# Comptes réels dans les données originales
n_real_sex = df['sex'].sum()
n_real_age = df['age_binary'].sum()

# Calcul des erreurs absolues
error_sex = abs(n_est_sex - n_real_sex)
error_age = abs(n_est_age - n_real_age)

print(f"Erreur absolue pour Sex : {error_sex}")
print(f"Erreur absolue pour Age : {error_age}")


Erreur absolue pour Sex : 18.0
Erreur absolue pour Age : 282.0


# Split and Train private Data

In [81]:
# Divide into features (X) and labels (y)
X = df_private.drop('income-per-year', axis=1)
y = df_private['income-per-year']

# Divide into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train an XGBoost classifier
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Test the Model

In [82]:
# Predictions
y_pred = model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8733692046878455
Classification Report:
               precision    recall  f1-score   support

         0.0       0.90      0.94      0.92     10241
         1.0       0.79      0.66      0.72      3326

    accuracy                           0.87     13567
   macro avg       0.84      0.80      0.82     13567
weighted avg       0.87      0.87      0.87     13567

