## **PARTE A) Cálculo de sensibilidade**

### **A.1) Preparando dados**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
dataset = pd.read_csv('infringement_dataset.csv')

In [3]:
dataset.head(3)

Unnamed: 0,loan_id,infringed,contract_type,gender,has_own_car,has_own_realty,num_children,annual_income,credit_amount,credit_annuity,...,SK_ID_CURR,avg_days_decision,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_approved,past_loans_refused,past_loans_canceled,past_loans_unused,past_loans_total
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,100002.0,606.0,9251.775,179055.0,179055.0,1.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,100003.0,1305.0,56553.99,435436.5,484191.0,3.0,0.0,0.0,0.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,100004.0,815.0,5357.25,24282.0,20106.0,1.0,0.0,0.0,0.0,1.0


In [4]:
# Criando função de ruído
def add_laplace_noise(s, sensitivity, epsilon):
    return s + np.random.laplace(loc=0, scale=sensitivity/epsilon)

# Criando função de cálculo de erro
def percentage_error(original, estimated):
    if original == 0:
        return 1
    else:
        return ((original - estimated) / original) * 100

### **A.2) Testando diferentes epsilons e sensibilidades**

#### Como exemplo, vamos usar a variável "age" e o método DP-count

In [5]:
# Criando diferentes parâmetros para teste
lista_sensibility = [1, 0.5, 0.1]
lista_epsilon = [0.001, 0.01, 0.1]

In [6]:
# Mediana de age, para usar o corte de 50% no cálculo do DP-count
v0 = dataset.age.median()
print(v0)

43.0


In [7]:
lista_age = []
orig_count = len(dataset.query(f'age > {v0}'))

for i in lista_sensibility:
    for j in lista_epsilon:
        dp_count = add_laplace_noise(len(dataset.query(f'age > {v0}')), i, j)
        lista_age.append('age')
        lista_age.append(round(orig_count,2))
        lista_age.append(round(dp_count,2))
        lista_age.append(round(percentage_error(orig_count, dp_count),3))
        lista_age.append(i)
        lista_age.append(j)

In [8]:
df1 = pd.DataFrame()

df1['atributo_sensivel']=""
df1['valor_original']=""
df1['valor_mascarado']=""
df1['erro_percentual']=""
df1['sensibility']=""
df1['epsilon']=""

df1.loc[len(df1)] = lista_age[0:6]
df1.loc[len(df1)] = lista_age[6:12]
df1.loc[len(df1)] = lista_age[12:18]
df1.loc[len(df1)] = lista_age[18:24]
df1.loc[len(df1)] = lista_age[24:30]
df1.loc[len(df1)] = lista_age[30:36]
df1.loc[len(df1)] = lista_age[36:42]
df1.loc[len(df1)] = lista_age[42:48]
df1.loc[len(df1)] = lista_age[48:54]

df1

Unnamed: 0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual,sensibility,epsilon
0,age,146615,146940.63,-0.222,1.0,0.001
1,age,146615,146458.09,0.107,1.0,0.01
2,age,146615,146612.75,0.002,1.0,0.1
3,age,146615,146470.47,0.099,0.5,0.001
4,age,146615,146652.34,-0.025,0.5,0.01
5,age,146615,146618.78,-0.003,0.5,0.1
6,age,146615,147015.21,-0.273,0.1,0.001
7,age,146615,146618.2,-0.002,0.1,0.01
8,age,146615,146615.59,-0.0,0.1,0.1


##### CONCLUSÃO 1: Quanto maior o valor de epsilon, menor [em módulo] o erro percentual e vice-versa.
##### CONCLUSÃO 2: Alterar o valor de sensibility não afeta tanto o erro percentual.

### **A.3) DP-count calculation com variáveis sensíveis**

In [9]:
epsilon_count = 0.01

v1 = dataset.infringed.mean()
v2 = dataset.annual_income.median()
v3 = dataset.credit_amount.median()
v4 = dataset.goods_valuation.median()
v5 = dataset.days_employed.median()
v6 = dataset.past_avg_amount_annuity.median()
v7 = dataset.past_loans_approved.median()

print(v1)
print(v2)
print(v3)
print(v4)
print(v5)
print(v6)
print(v7)

0.08072881945686496
147150.0
513531.0
450000.0
-1213.0
11984.355
3.0


In [10]:
# Infringed
lista_count_inf = []
orig_count1 = len(dataset.query(f'infringed > {v1}'))
dp_count1 = add_laplace_noise(len(dataset.query(f'infringed > {v1}')), 1, epsilon_count)
lista_count_inf.append('infringed')
lista_count_inf.append(round(orig_count1,2))
lista_count_inf.append(round(dp_count1,2))
lista_count_inf.append(round(percentage_error(orig_count1, dp_count1),3))


# Annual income
lista_count_ain = []
orig_count2 = len(dataset.query(f'annual_income > {v2}'))
dp_count2 = add_laplace_noise(len(dataset.query(f'annual_income > {v2}')), 1, epsilon_count)
lista_count_ain.append('annual_income')
lista_count_ain.append(round(orig_count2,2))
lista_count_ain.append(round(dp_count2,2))
lista_count_ain.append(round(percentage_error(orig_count2, dp_count2),3))


# Credit ammount
lista_count_cam = []
orig_count3 = len(dataset.query(f'credit_amount > {v3}'))
dp_count3 = add_laplace_noise(len(dataset.query(f'credit_amount > {v3}')), 1, epsilon_count)
lista_count_cam.append('credit_amount')
lista_count_cam.append(round(orig_count3,2))
lista_count_cam.append(round(dp_count3,2))
lista_count_cam.append(round(percentage_error(orig_count3, dp_count3),3))


# Goods valuation
lista_count_gva = []
orig_count4 = len(dataset.query(f'goods_valuation > {v4}'))
dp_count4 = add_laplace_noise(len(dataset.query(f'goods_valuation > {v4}')), 1, epsilon_count)
lista_count_gva.append('goods_valuation')
lista_count_gva.append(round(orig_count4,2))
lista_count_gva.append(round(dp_count4,2))
lista_count_gva.append(round(percentage_error(orig_count4, dp_count4),3))


# Days employed
lista_count_dem = []
orig_count5 = len(dataset.query(f'days_employed > {v5}'))
dp_count5 = add_laplace_noise(len(dataset.query(f'days_employed > {v5}')), 1, epsilon_count)
lista_count_dem.append('days_employed')
lista_count_dem.append(round(orig_count5,2))
lista_count_dem.append(round(dp_count5,2))
lista_count_dem.append(round(percentage_error(orig_count5, dp_count5),3))


# Past avg amount annuity
lista_count_paa = []
orig_count6 = len(dataset.query(f'past_avg_amount_annuity > {v6}'))
dp_count6 = add_laplace_noise(len(dataset.query(f'past_avg_amount_annuity > {v6}')), 1, epsilon_count)
lista_count_paa.append('past_avg_amount_annuity')
lista_count_paa.append(round(orig_count6,2))
lista_count_paa.append(round(dp_count6,2))
lista_count_paa.append(round(percentage_error(orig_count6, dp_count6),3))


# Past loans approved
lista_count_pla = []
orig_count7 = len(dataset.query(f'past_loans_approved > {v7}'))
dp_count7 = add_laplace_noise(len(dataset.query(f'past_loans_approved > {v7}')), 1, epsilon_count)
lista_count_pla.append('past_loans_approved')
lista_count_pla.append(round(orig_count7,2))
lista_count_pla.append(round(dp_count7,2))
lista_count_pla.append(round(percentage_error(orig_count7, dp_count7),3))

In [11]:
df2 = pd.DataFrame()

df2['atributo_sensivel']=""
df2['valor_original']=""
df2['valor_mascarado']=""
df2['erro_percentual']=""

df2.loc[len(df2)] = lista_count_inf
df2.loc[len(df2)] = lista_count_ain
df2.loc[len(df2)] = lista_count_cam
df2.loc[len(df2)] = lista_count_gva
df2.loc[len(df2)] = lista_count_dem
df2.loc[len(df2)] = lista_count_paa
df2.loc[len(df2)] = lista_count_pla

df2['sensibility']=""
df2['epsilon']=""
df2['metodo']=""

df2['sensibility'] = 1
df2['epsilon'] = epsilon_count
df2['metodo'] = 'dp-count'

df2.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual,sensibility,epsilon
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dp-count,infringed,24825,24912.22,-0.351,1,0.01
dp-count,annual_income,153751,153727.23,0.015,1,0.01
dp-count,credit_amount,153662,153609.19,0.034,1,0.01
dp-count,goods_valuation,149254,149309.66,-0.037,1,0.01
dp-count,days_employed,153689,153745.32,-0.037,1,0.01
dp-count,past_avg_amount_annuity,145319,145201.31,0.081,1,0.01
dp-count,past_loans_approved,93799,94071.9,-0.291,1,0.01


### **A.4) Calculando DP-sum com variáveis sensíveis**

In [12]:
epsilon_sum = 0.01

In [13]:
# Infringed
lista_sum_inf = []
orig_sum1 = dataset['infringed'].sum()
dp_sum1 = add_laplace_noise(dataset['infringed'].sum(), dataset['infringed'].max(), epsilon_sum)
lista_sum_inf.append('infringed')
lista_sum_inf.append(round(orig_sum1,2))
lista_sum_inf.append(round(dp_sum1,2))
lista_sum_inf.append(round(percentage_error(orig_sum1, dp_sum1),3))


# Annual income
lista_sum_ain = []
orig_sum2 = dataset['annual_income'].sum()
dp_sum2 = add_laplace_noise(dataset['annual_income'].sum(), dataset['annual_income'].max(), epsilon_sum)
lista_sum_ain.append('annual_income')
lista_sum_ain.append(round(orig_sum2,2))
lista_sum_ain.append(round(dp_sum2,2))
lista_sum_ain.append(round(percentage_error(orig_sum2, dp_sum2),3))


# Credit ammount
lista_sum_cam = []
orig_sum3 = dataset['credit_amount'].sum()
dp_sum3 = add_laplace_noise(dataset['credit_amount'].sum(), dataset['credit_amount'].max(), epsilon_sum)
lista_sum_cam.append('credit_amount')
lista_sum_cam.append(round(orig_sum3,2))
lista_sum_cam.append(round(dp_sum3,2))
lista_sum_cam.append(round(percentage_error(orig_sum3, dp_sum3),3))


# Goods valuation
lista_sum_gva = []
orig_sum4 = dataset['goods_valuation'].sum()
dp_sum4 = add_laplace_noise(dataset['goods_valuation'].sum(), dataset['goods_valuation'].max(), epsilon_sum)
lista_sum_gva.append('goods_valuation')
lista_sum_gva.append(round(orig_sum4,2))
lista_sum_gva.append(round(dp_sum4,2))
lista_sum_gva.append(round(percentage_error(orig_sum4, dp_sum4),3))


# Days employed
lista_sum_dem = []
orig_sum5 = dataset['days_employed'].sum()
dp_sum5 = add_laplace_noise(dataset['days_employed'].sum(), dataset['days_employed'].max(), epsilon_sum)
lista_sum_dem.append('days_employed')
lista_sum_dem.append(round(orig_sum5,2))
lista_sum_dem.append(round(dp_sum5,2))
lista_sum_dem.append(round(percentage_error(orig_sum5, dp_sum5),3))


# Past avg amount annuity
lista_sum_paa = []
orig_sum6 = dataset['past_avg_amount_annuity'].sum()
dp_sum6 = add_laplace_noise(dataset['past_avg_amount_annuity'].sum(), dataset['past_avg_amount_annuity'].max(), epsilon_sum)
lista_sum_paa.append('past_avg_amount_annuity')
lista_sum_paa.append(round(orig_sum6,2))
lista_sum_paa.append(round(dp_sum6,2))
lista_sum_paa.append(round(percentage_error(orig_sum6, dp_sum6),3))


# Past loans approved
lista_sum_pla = []
orig_sum7 = dataset['past_loans_approved'].sum()
dp_sum7 = add_laplace_noise(dataset['past_loans_approved'].sum(), dataset['past_loans_approved'].max(), epsilon_sum)
lista_sum_pla.append('past_loans_approved')
lista_sum_pla.append(round(orig_sum7,2))
lista_sum_pla.append(round(dp_sum7,2))
lista_sum_pla.append(round(percentage_error(orig_sum7, dp_sum7),3))

In [14]:
df3 = pd.DataFrame()

df3['atributo_sensivel']=""
df3['valor_original']=""
df3['valor_mascarado']=""
df3['erro_percentual']=""

df3.loc[len(df3)] = lista_sum_inf
df3.loc[len(df3)] = lista_sum_ain
df3.loc[len(df3)] = lista_sum_cam
df3.loc[len(df3)] = lista_sum_gva
df3.loc[len(df3)] = lista_sum_dem
df3.loc[len(df3)] = lista_sum_paa
df3.loc[len(df3)] = lista_sum_pla

df3['sensibility']=""
df3['epsilon']=""
df3['metodo']=""

df3['sensibility'] = 1
df3['epsilon'] = epsilon_sum
df3['metodo'] = 'dp-sum'

df3.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual,sensibility,epsilon
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dp-sum,infringed,24825.0,24700.45,0.502,1,0.01
dp-sum,annual_income,51907220000.0,30035490000.0,42.136,1,0.01
dp-sum,credit_amount,184207100000.0,184265100000.0,-0.031,1,0.01
dp-sum,goods_valuation,165413100000.0,165509600000.0,-0.058,1,0.01
dp-sum,days_employed,19623830000.0,19661290000.0,-0.191,1,0.01
dp-sum,past_avg_amount_annuity,4223596000.0,4240329000.0,-0.396,1,0.01
dp-sum,past_loans_approved,886099.0,885914.3,0.021,1,0.01


### **A.5) Calculando DP-mean com variáveis sensíveis**

In [15]:
epsilon_mean = 0.01

In [32]:
# Infringed
filtered_wh1 = dataset.query(f'infringed > {v1}')
orig_mean1 = filtered_wh1['infringed'].mean()
dp_mean1 = add_laplace_noise(filtered_wh1['infringed'].sum(), filtered_wh1['infringed'].max(), 0.01) / add_laplace_noise(len(filtered_wh1), 1, epsilon_mean)

lista_mean_inf = []
lista_mean_inf.append('infringed')
lista_mean_inf.append(round(orig_mean1,2))
lista_mean_inf.append(round(dp_mean1,2))
lista_mean_inf.append(round(percentage_error(orig_mean1, dp_mean1),2))


# Annual income
filtered_wh2 = dataset.query(f'annual_income > {v2}')
orig_mean2 = filtered_wh2['annual_income'].mean()
dp_mean2 = add_laplace_noise(filtered_wh2['annual_income'].sum(), filtered_wh2['annual_income'].max(), 0.01) / add_laplace_noise(len(filtered_wh2), 1, epsilon_mean)

lista_mean_ain = []
lista_mean_ain.append('annual_income')
lista_mean_ain.append(round(orig_mean2,2))
lista_mean_ain.append(round(dp_mean2,2))
lista_mean_ain.append(round(percentage_error(orig_mean2, dp_mean2),2))


# Credit amount
filtered_wh3 = dataset.query(f'credit_amount > {v3}')
orig_mean3 = filtered_wh3['credit_amount'].mean()
dp_mean3 = add_laplace_noise(filtered_wh3['credit_amount'].sum(), filtered_wh3['credit_amount'].max(), 0.01) / add_laplace_noise(len(filtered_wh3), 1, epsilon_mean)

lista_mean_cra = []
lista_mean_cra.append('credit_amount')
lista_mean_cra.append(round(orig_mean3,2))
lista_mean_cra.append(round(dp_mean3,2))
lista_mean_cra.append(round(percentage_error(orig_mean3, dp_mean3),2))


# Goods valuation
filtered_wh4 = dataset.query(f'goods_valuation > {v4}')
orig_mean4 = filtered_wh4['goods_valuation'].mean()
dp_mean4 = add_laplace_noise(filtered_wh4['goods_valuation'].sum(), filtered_wh4['goods_valuation'].max(), 0.01) / add_laplace_noise(len(filtered_wh4), 1, epsilon_mean)

lista_mean_gva = []
lista_mean_gva.append('goods_valuation')
lista_mean_gva.append(round(orig_mean4,2))
lista_mean_gva.append(round(dp_mean4,2))
lista_mean_gva.append(round(percentage_error(orig_mean4, dp_mean4),2))


# Days employed
filtered_wh5 = dataset.query(f'days_employed > {v5}')
orig_mean5 = filtered_wh5['days_employed'].mean()
dp_mean5 = add_laplace_noise(filtered_wh5['days_employed'].sum(), filtered_wh5['days_employed'].max(), 0.01) / add_laplace_noise(len(filtered_wh5), 1, epsilon_mean)

lista_mean_dem = []
lista_mean_dem.append('days_employed')
lista_mean_dem.append(round(orig_mean5,2))
lista_mean_dem.append(round(dp_mean5,2))
lista_mean_dem.append(round(percentage_error(orig_mean5, dp_mean5),2))


# Past avg amount annuity
filtered_wh6 = dataset.query(f'past_avg_amount_annuity > {v6}')
orig_mean6 = filtered_wh6['past_avg_amount_annuity'].mean()
dp_mean6 = add_laplace_noise(filtered_wh6['past_avg_amount_annuity'].sum(), filtered_wh6['past_avg_amount_annuity'].max(), 0.01) / add_laplace_noise(len(filtered_wh6), 1, epsilon_mean)

lista_mean_paa = []
lista_mean_paa.append('past_avg_amount_annuity')
lista_mean_paa.append(round(orig_mean6,2))
lista_mean_paa.append(round(dp_mean6,2))
lista_mean_paa.append(round(percentage_error(orig_mean6, dp_mean6),2))


# Past avg amount credit
filtered_wh7 = dataset.query(f'past_loans_approved > {v7}')
orig_mean7 = filtered_wh7['past_loans_approved'].mean()
dp_mean7 = add_laplace_noise(filtered_wh7['past_loans_approved'].sum(), filtered_wh7['past_loans_approved'].max(), 0.01) / add_laplace_noise(len(filtered_wh7), 1, epsilon_mean)

lista_mean_pac = []
lista_mean_pac.append('past_loans_approved')
lista_mean_pac.append(round(orig_mean7,2))
lista_mean_pac.append(round(dp_mean7,2))
lista_mean_pac.append(round(percentage_error(orig_mean7, dp_mean7),2))

In [33]:
df4 = pd.DataFrame()

df4['atributo_sensivel']=""
df4['valor_original']=""
df4['valor_mascarado']=""
df4['erro_percentual']=""

df4.loc[len(df4)] = lista_mean_inf
df4.loc[len(df4)] = lista_mean_ain
df4.loc[len(df4)] = lista_mean_cra
df4.loc[len(df4)] = lista_mean_gva
df4.loc[len(df4)] = lista_mean_dem
df4.loc[len(df4)] = lista_mean_paa
df4.loc[len(df4)] = lista_mean_pac

df4['sensibility']=""
df4['epsilon']=""
df4['metodo']=""

df4['sensibility'] = 1
df4['epsilon'] = epsilon_mean
df4['metodo'] = 'dp-mean'

df4.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual,sensibility,epsilon
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dp-mean,infringed,1.0,1.0,-0.24,1,0.01
dp-mean,annual_income,232632.55,129480.36,44.34,1,0.01
dp-mean,credit_amount,906497.68,902771.04,0.41,1,0.01
dp-mean,goods_valuation,823491.71,820058.29,0.42,1,0.01
dp-mean,days_employed,131202.91,131070.54,0.1,1,0.01
dp-mean,past_avg_amount_annuity,21339.82,21642.04,-1.42,1,0.01
dp-mean,past_loans_approved,5.52,5.48,0.84,1,0.01


### **A.6) Comparando count, sum e mean**

In [124]:
# Criando tablea comparativa
df2_ = df2[['atributo_sensivel','metodo','erro_percentual']]
df3_ = df3[['metodo','atributo_sensivel','erro_percentual']]
df4_ = df4[['metodo','atributo_sensivel','erro_percentual']]

xx = df2_.merge(df3_, how='left', on='atributo_sensivel').merge(df4_, how='left', on='atributo_sensivel')
xx.set_index('atributo_sensivel')

Unnamed: 0_level_0,metodo_x,erro_percentual_x,metodo_y,erro_percentual_y,metodo,erro_percentual
atributo_sensivel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
infringed,dp-count,-0.351,dp-sum,0.502,dp-mean,-0.24
annual_income,dp-count,0.015,dp-sum,42.136,dp-mean,44.34
credit_amount,dp-count,0.034,dp-sum,-0.031,dp-mean,0.41
goods_valuation,dp-count,-0.037,dp-sum,-0.058,dp-mean,0.42
days_employed,dp-count,-0.037,dp-sum,-0.191,dp-mean,0.1
past_avg_amount_annuity,dp-count,0.081,dp-sum,-0.396,dp-mean,-1.42
past_loans_approved,dp-count,-0.291,dp-sum,0.021,dp-mean,0.84


In [85]:
# Comparando o melhor indicador
print("Count:", round(xx.erro_percentual_x.abs().mean(),2))
print("Sum:", (round(xx.erro_percentual_y.abs().mean(),2)))
print("Mean:", (round(xx.erro_percentual.abs().mean(),2)))
print("Sum s/ outlier:", round((xx.erro_percentual_y.abs().mean()*7-42.136)/6,2))
print("Mean s/ outlier:", round((xx.erro_percentual.abs().mean()*7-44.34)/6,2))

Count: 0.12
Sum: 6.19
Mean: 6.82
Sum s/ outlier: 0.2
Mean s/ outlier: 0.57


##### CONCLUSÃO 3) O melhor método de anonomização via adição de ruído de Laplace é pela **média**.

## **PARTE B) Usando PyDP**

In [87]:
! pip install python-dp



In [88]:
import pydp as dp
from pydp.algorithms.laplacian import BoundedSum, BoundedMean, Count

### **B.1) Usando contagem**

In [89]:
x = Count(0.01)

In [129]:
# Infringed
pydp_count_inf = len(dataset.query(f'infringed > {v1}')['infringed'])
lista_count_age = []
lista_count_age.append('infringed')
lista_count_age.append(round(orig_count1,2))
lista_count_age.append(round(pydp_count_inf,2))
lista_count_age.append(round(percentage_error(orig_count1, pydp_count_inf),2))


# Annual income
pydp_count_ain = len(dataset.query(f'annual_income > {v2}')['annual_income'])
lista_count_ain = []
lista_count_ain.append('annual_income')
lista_count_ain.append(round(orig_count2,2))
lista_count_ain.append(round(pydp_count_ain,2))
lista_count_ain.append(round(percentage_error(orig_count2, pydp_count_ain),3))


# Credit ammount
pydp_count_cam = len(dataset.query(f'credit_amount > {v3}')['credit_amount'])
lista_count_cam = []
lista_count_cam.append('credit_amount')
lista_count_cam.append(round(orig_count3,2))
lista_count_cam.append(round(pydp_count_cam,2))
lista_count_cam.append(round(percentage_error(orig_count3, pydp_count_cam),3))


# Goods valuation
pydp_count_gva = len(dataset.query(f'goods_valuation > {v4}')['goods_valuation'])
lista_count_gva = []
lista_count_gva.append('goods_valuation')
lista_count_gva.append(round(orig_count4,2))
lista_count_gva.append(round(pydp_count_gva,2))
lista_count_gva.append(round(percentage_error(orig_count4, pydp_count_gva),3))


# Days employed
pydp_count_dem = len(dataset.query(f'days_employed > {v5}')['days_employed'])
lista_count_dem = []
lista_count_dem.append('days_employed')
lista_count_dem.append(round(orig_count5,2))
lista_count_dem.append(round(pydp_count_dem,2))
lista_count_dem.append(round(percentage_error(orig_count5, pydp_count_dem),3))


# Past avg amount annuity
pydp_count_paa = len(dataset.query(f'past_avg_amount_annuity > {v6}')['past_avg_amount_annuity'])
lista_count_paa = []
lista_count_paa.append('past_avg_amount_annuity')
lista_count_paa.append(round(orig_count6,2))
lista_count_paa.append(round(pydp_count_paa,2))
lista_count_paa.append(round(percentage_error(orig_count6, pydp_count_paa),3))


# Past loans approved
pydp_count_pla = len(dataset.query(f'past_loans_approved > {v7}')['past_loans_approved'])
lista_count_pla = []
lista_count_pla.append('past_loans_approved')
lista_count_pla.append(round(orig_count7,2))
lista_count_pla.append(round(pydp_count_pla,2))
lista_count_pla.append(round(percentage_error(orig_count7, pydp_count_pla),3))

In [130]:
df5 = pd.DataFrame()

df5['atributo_sensivel']=""
df5['valor_original']=""
df5['valor_mascarado']=""
df5['erro_percentual']=""

df5.loc[len(df5)] = lista_count_age
df5.loc[len(df5)] = lista_count_ain
df5.loc[len(df5)] = lista_count_cam
df5.loc[len(df5)] = lista_count_gva
df5.loc[len(df5)] = lista_count_dem
df5.loc[len(df5)] = lista_count_paa
df5.loc[len(df5)] = lista_count_pla

df5['metodo']=""
df5['metodo'] = 'dp-count'

df5.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dp-count,infringed,24825,24825,0.0
dp-count,annual_income,153751,153751,0.0
dp-count,credit_amount,153662,153662,0.0
dp-count,goods_valuation,149254,149254,0.0
dp-count,days_employed,153689,153689,0.0
dp-count,past_avg_amount_annuity,145319,145319,0.0
dp-count,past_loans_approved,93799,93799,0.0


### **B.2) Usando soma**

In [105]:
y = BoundedSum(epsilon=0.2, delta=0, dtype="float")

In [108]:
# Infringed
pydp_sum_inf = y.quick_result(list(dataset['infringed']))
lista_sum_age = []
lista_sum_age.append('infringed')
lista_sum_age.append(round(orig_sum1,2))
lista_sum_age.append(round(pydp_sum_inf,2))
lista_sum_age.append(round(percentage_error(orig_sum1, pydp_sum_inf),2))


# Annual income
pydp_sum_ain = y.quick_result(list(dataset['annual_income']))
lista_sum_ain = []
lista_sum_ain.append('annual_income')
lista_sum_ain.append(round(orig_sum2,2))
lista_sum_ain.append(round(pydp_sum_ain,2))
lista_sum_ain.append(round(percentage_error(orig_sum2, pydp_sum_ain),3))


# Credit ammount
pydp_sum_cam = y.quick_result(list(dataset['credit_amount']))
lista_sum_cam = []
lista_sum_cam.append('credit_amount')
lista_sum_cam.append(round(orig_sum3,2))
lista_sum_cam.append(round(pydp_sum_cam,2))
lista_sum_cam.append(round(percentage_error(orig_sum3, pydp_sum_cam),3))


# Goods valuation
pydp_sum_gva = y.quick_result(list(dataset['goods_valuation']))
lista_sum_gva = []
lista_sum_gva.append('goods_valuation')
lista_sum_gva.append(round(orig_sum4,2))
lista_sum_gva.append(round(pydp_sum_gva,2))
lista_sum_gva.append(round(percentage_error(orig_sum4, pydp_sum_gva),3))


# Days employed
pydp_sum_dem = y.quick_result(list(dataset['days_employed']))
lista_sum_dem = []
lista_sum_dem.append('days_employed')
lista_sum_dem.append(round(orig_sum5,2))
lista_sum_dem.append(round(pydp_sum_dem,2))
lista_sum_dem.append(round(percentage_error(orig_sum5, pydp_sum_dem),3))


# Past avg amount annuity
pydp_sum_paa = y.quick_result(list(dataset['past_avg_amount_annuity']))
lista_sum_paa = []
lista_sum_paa.append('past_avg_amount_annuity')
lista_sum_paa.append(round(orig_sum6,2))
lista_sum_paa.append(round(pydp_sum_paa,2))
lista_sum_paa.append(round(percentage_error(orig_sum6, pydp_sum_paa),3))


# Past loans approved
pydp_sum_pla = y.quick_result(list(dataset['past_loans_approved']))
lista_sum_pla = []
lista_sum_pla.append('past_loans_approved')
lista_sum_pla.append(round(orig_sum7,2))
lista_sum_pla.append(round(pydp_sum_pla,2))
lista_sum_pla.append(round(percentage_error(orig_sum7, pydp_sum_pla),3))

In [109]:
df6 = pd.DataFrame()

df6['atributo_sensivel']=""
df6['valor_original']=""
df6['valor_mascarado']=""
df6['erro_percentual']=""

df6.loc[len(df6)] = lista_sum_age
df6.loc[len(df6)] = lista_sum_ain
df6.loc[len(df6)] = lista_sum_cam
df6.loc[len(df6)] = lista_sum_gva
df6.loc[len(df6)] = lista_sum_dem
df6.loc[len(df6)] = lista_sum_paa
df6.loc[len(df6)] = lista_sum_pla

df6['metodo']=""
df6['metodo'] = 'dp-sum'

df6.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dp-sum,infringed,24825.0,24817.53,0.03
dp-sum,annual_income,51907220000.0,51638930000.0,0.517
dp-sum,credit_amount,184207100000.0,184180100000.0,0.015
dp-sum,goods_valuation,165413100000.0,165282500000.0,0.079
dp-sum,days_employed,19623830000.0,19651110000.0,-0.139
dp-sum,past_avg_amount_annuity,4223596000.0,4221755000.0,0.044
dp-sum,past_loans_approved,886099.0,885208.7,0.1


### **B.3) Usando média**

In [110]:
z = BoundedMean(0.1)

In [115]:
# Days employed
pydp_mean_dem = z.quick_result(list(dataset['days_employed']))
lista_mean_dem = []
lista_mean_dem.append('days_employed')
lista_mean_dem.append(round(orig_mean5,2))
lista_mean_dem.append(round(pydp_mean_dem,2))
lista_mean_dem.append(round(percentage_error(orig_mean5, pydp_mean_dem),3))

In [121]:
df7 = pd.DataFrame()

df7['atributo_sensivel']=""
df7['valor_original']=""
df7['valor_mascarado']=""
df7['erro_percentual']=""

df7.loc[len(df7)] = lista_sum_dem

df7['metodo']=""
df7['metodo'] = 'dp-mean'

df7.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dp-mean,days_employed,19623828581,19651110000.0,-0.139


### **B.4) Comparando count, sum e mean**

In [122]:
df5_ = df5[['atributo_sensivel','metodo','erro_percentual']]
df6_ = df6[['metodo','atributo_sensivel','erro_percentual']]
df7_ = df7[['metodo','atributo_sensivel','erro_percentual']]

In [123]:
yy = df5_.merge(df6_, how='left', on='atributo_sensivel').merge(df7_, how='left', on='atributo_sensivel')
yy.set_index('atributo_sensivel')

Unnamed: 0_level_0,metodo_x,erro_percentual_x,metodo_y,erro_percentual_y,metodo,erro_percentual
atributo_sensivel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
infringed,dp-count,0.0,dp-sum,0.03,,
annual_income,dp-count,0.0,dp-sum,0.517,,
credit_amount,dp-count,0.0,dp-sum,0.015,,
goods_valuation,dp-count,0.0,dp-sum,0.079,,
days_employed,dp-count,0.0,dp-sum,-0.139,dp-mean,-0.139
past_avg_amount_annuity,dp-count,0.0,dp-sum,0.044,,
past_loans_approved,dp-count,0.0,dp-sum,0.1,,


##### Não foi possível computar os os valores de DP_mean por conta de demasiados erros, à exceção da variável "days_employed", cujo valor foi exatamente o mesmo de dp-sum. Além disso, dp-count não trouxe divergência para os dados, de maneira que o indicador mais interessante para anonimizar os dados, usando a biblioteca PyDP é dp-sum.

##### CONCLUSÃO 4) O melhor método de anonomização via adição de ruído pelo PyDP é pela **soma**.

In [148]:
# Analisando o melhor indicador
print("[PyDP] Sum:", (round(yy.erro_percentual_y.abs().mean(),2)))

[PyDP] Sum: 0.13


In [146]:
# Recordando os indicadores de Laplace
print("Count:", round(xx.erro_percentual_x.abs().mean(),2))
print("Sum:", (round(xx.erro_percentual_y.abs().mean(),2)))
print("Mean:", (round(xx.erro_percentual.abs().mean(),2)))
print("Sum s/ outlier:", round((xx.erro_percentual_y.abs().mean()*7-42.136)/6,2))
print("Mean s/ outlier:", round((xx.erro_percentual.abs().mean()*7-44.34)/6,2))

Count: 0.12
Sum: 6.19
Mean: 6.82
Sum s/ outlier: 0.2
Mean s/ outlier: 0.57


#### **CONCLUSÃO FINAL) O melhor método de anonomização via adição de ruído ocorre com Laplace e usando agregação média**.

#### Usar a biblioteca PyDP ajuda a trazer mais anonimidade, pois todos os valores de erros foram maiores. Logo, os dados estão mais distantes dos reais, o que é desejável em casa de um ataque bem-sucedido aos dados.
#### Vejamos alguns exemplos:

In [138]:
# Age
df1['metodo'] = 'add laplace noise'
df4['metodo'] = 'biblioteca pydp'

xx = pd.concat([df1[['metodo','atributo_sensivel','valor_original','valor_mascarado','erro_percentual']], df4])
xx.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
add laplace noise,age,146615.0,146550.91,0.044
add laplace noise,age,146615.0,146555.4,0.041
add laplace noise,age,146615.0,146613.23,0.001
add laplace noise,age,146615.0,146645.58,-0.021
add laplace noise,age,146615.0,146609.14,0.004
add laplace noise,age,146615.0,146604.84,0.007
add laplace noise,age,146615.0,146458.8,0.107
add laplace noise,age,146615.0,146626.47,-0.008
add laplace noise,age,146615.0,146612.89,0.001
biblioteca pydp,age,273228.25,146547.0,46.36


In [139]:
# Num children
df2['metodo'] = 'add laplace noise'
df5['metodo'] = 'biblioteca pydp'

xx = pd.concat([df2[['metodo','atributo_sensivel','valor_original','valor_mascarado','erro_percentual']], df5])
xx.set_index('metodo')

Unnamed: 0_level_0,atributo_sensivel,valor_original,valor_mascarado,erro_percentual
metodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
add laplace noise,num_children,128248.0,128707.14,-0.36
biblioteca pydp,num_children,273228.25,128098.11,53.12
