# Evaluations done in Privugger-AG
*MADE BY: Mathias Oliver Valdbjørn Jørgensen mathiasoliverjorgensen@hotmail.com*


This document contains (almost) all the evaluations performed in Privugger-AG.
The document does not contain the evaluation of the different Derivative-Free Optimization techniques; these can be found in the `DFO.ipynb` file. 

Additionally, not all figures are included in this file; the remaining ones can be seen in `Figure Generation Report.ipynb`.

# Imports

In [None]:
import privuggerag as attacker
import numpy as np
import opendp.smartnoise.core as sn
from sklearn.feature_selection import mutual_info_regression
from typing import List, Tuple
from scipy import stats as st
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import re
import subprocess
from entropy_estimators import continuous

Privugger-AG analyses a Privacy Preserving Mechanism (PPM) with respect to a leakage measurement.

In general, an estimate is obtained by specifying a domain of constraints, a PPM, and a leakage measurement.

# PPMs

In [None]:
####### AVERAGE AGE #######
def mean_float(a: List[float]) -> float:
    """Return the arithmetic mean of the ages in ``a`` (via ``np.mean``)."""
    average = np.mean(a)
    return average

####### SUM OF AGES ######
def SumOfAges(a: List[int]) -> int:
    """Return the total of all ages in ``a`` using the built-in ``sum``."""
    total = sum(a)
    return total

####### MEDIAN AGE ######
def median_float(a: List[float]) -> float:
    """Return the median of the ages in ``a`` (via ``np.median``)."""
    middle = np.median(a)
    return middle

####### AVERAGE AGE DIFFERENTIAL PRIVACY ######
def dp_program_cont(age: List[float]) -> float:
    """Release a differentially private mean of ``age`` through SmartNoise.

    The ages are written to ``temp.csv`` in the working directory, loaded
    back with ``sn.Dataset`` and passed to ``sn.dp_mean`` with epsilon 0.1,
    ``data_lower=10``, ``data_upper=50`` and ``data_rows=10``.

    NOTE(review): ``data_rows`` is fixed at 10 regardless of ``len(age)`` --
    confirm that callers always supply ten ages.
    NOTE(review): ``to_csv`` is called without ``index=False``, so the file
    also contains the index column and a header row -- confirm that
    ``sn.Dataset`` still maps ``age`` to the intended column.

    Returns:
        The released value converted to ``np.float64``.
    """
    csv_path = 'temp.csv'
    columns = ["age"]
    frame = pd.DataFrame({"age": age}, columns=columns)
    frame.to_csv(csv_path)

    with sn.Analysis() as analysis:
        # load the CSV that was just written
        dataset = sn.Dataset(path=csv_path, column_names=columns)

        # differentially private mean of the age column
        released_mean = sn.dp_mean(
            data=sn.to_float(dataset['age']),
            privacy_usage={'epsilon': .1},
            data_lower=10.,  # min age
            data_upper=50.,  # max age
            data_rows=10,
        )

    # release() is invoked after the with-block has closed, as before
    analysis.release()
    return np.float64(released_mean.value)

###### AVERAGE AGE BOTH AS ATTACKER DIFFERENTIAL PRIVACY ######
def dp_program_two_individuals(a: float, b: float) -> float:
    """Release a differentially private mean of the two ages ``a`` and ``b``.

    The two values are written to ``temp.csv`` in the working directory,
    loaded back with ``sn.Dataset`` and passed to ``sn.dp_mean`` with
    epsilon 0.1, ``data_lower=10``, ``data_upper=50`` and ``data_rows=2``.

    NOTE(review): ``to_csv`` is called without ``index=False``, so the file
    also contains the index column and a header row -- confirm that
    ``sn.Dataset`` still maps ``age`` to the intended column.

    Returns:
        The released value converted to ``np.float64``.
    """
    csv_path = 'temp.csv'
    columns = ["age"]
    frame = pd.DataFrame({"age": [a, b]}, columns=columns)
    frame.to_csv(csv_path)

    with sn.Analysis() as analysis:
        # load the CSV that was just written
        dataset = sn.Dataset(path=csv_path, column_names=columns)

        # differentially private mean of the age column
        released_mean = sn.dp_mean(
            data=sn.to_float(dataset['age']),
            privacy_usage={'epsilon': .1},
            data_lower=10.,  # min age
            data_upper=50.,  # max age
            data_rows=2,
        )

    # release() is invoked after the with-block has closed, as before
    analysis.release()
    return np.float64(released_mean.value)


###### SUM OF AGES DIFFERENTIAL PRIVACY DISCRETE ######
def dp_program_discrete(a: int, b: int) -> float:
    """Release a differentially private sum of the two ages ``a`` and ``b``.

    The two values are written to ``temp.csv`` in the working directory,
    loaded back with ``sn.Dataset`` and passed to ``sn.dp_sum`` (a sum, not
    a mean) with epsilon 0.1, ``data_lower=10``, ``data_upper=50`` and
    ``data_rows=2``.

    NOTE(review): ``to_csv`` is called without ``index=False``, so the file
    also contains the index column and a header row -- confirm that
    ``sn.Dataset`` still maps ``age`` to the intended column.

    Returns:
        The released value converted to ``np.float64``.
    """
    csv_path = 'temp.csv'
    columns = ["age"]
    frame = pd.DataFrame({"age": [a, b]}, columns=columns)
    frame.to_csv(csv_path)

    with sn.Analysis() as analysis:
        # load the CSV that was just written
        dataset = sn.Dataset(path=csv_path, column_names=columns)

        # differentially private sum of the age column
        released_sum = sn.dp_sum(
            data=sn.to_float(dataset['age']),
            privacy_usage={'epsilon': .1},
            data_lower=10.,  # min age
            data_upper=50.,  # max age
            data_rows=2,
        )

    # release() is invoked after the with-block has closed, as before
    analysis.release()
    return np.float64(released_sum.value)

###### IDENTITY DISCRETE ######
def identity_disc(a: int) -> int:
    """Identity program for a discrete secret: hand ``a`` back unchanged."""
    unchanged = a
    return unchanged

###### IDENTITY CONTINUOUS ######
def identity_cont(a: float) -> float:
    """Identity program for a continuous secret: hand ``a`` back unchanged."""
    unchanged = a
    return unchanged

###### KANONYMITY #######
def generalize_age(age):
    """Map an age to the midpoint of its ten-year bucket.

    Buckets are [10, 20) -> 15, [20, 30) -> 25, [30, 40) -> 35 and
    [40, 50] -> 45 (the last bucket includes 50). Any other value,
    including NaN, yields 0.
    """
    # Half-open decades first; the closing bucket is handled separately
    # because its upper bound is inclusive.
    for lower, midpoint in ((10, 15), (20, 25), (30, 35)):
        if lower <= age < lower + 10:
            return midpoint
    return 45 if 40 <= age <= 50 else 0
    
def kanon(age, k=4, outcome=(15, 25, 35, 45, 0)):
    """Generalise ages into buckets and pad each occupied bucket up to ``k``.

    Every age is mapped through ``generalize_age``. For each label in
    ``outcome`` that occurs at least once but fewer than ``k`` times, extra
    copies of that label are appended until it occurs exactly ``k`` times.
    Labels that do not occur at all are left absent.

    Args:
        age: Iterable of ages.
        k: Minimum number of occurrences for every bucket that is present.
        outcome: Bucket labels to check. The default is a tuple rather than
            a list so the default object cannot be mutated between calls.

    Returns:
        A new list with the generalised ages followed by any padding.
    """
    out = [generalize_age(a) for a in age]
    for label in outcome:
        count = out.count(label)
        if 0 < count < k:
            # Pad with k - count copies so the bucket reaches exactly k.
            out.extend([label] * (k - count))
    return out

def kanonAge(age: List[float]) -> int:
    """Return the mean of the padded age buckets produced by ``kanon(age)``.

    NOTE(review): the return annotation says ``int`` but ``np.mean`` yields
    a float; the annotation is left untouched because the analysis framework
    may read it -- confirm which one is intended.
    """
    anonymised = kanon(age)
    return np.mean(anonymised)

# Leakage Measurements

In [None]:
###### DISCRETE MUTUAL INFORMATION ######
def mi_DISCRETE(trace):
    """Negated mutual information between a discrete secret and the output.

    Estimates the mutual information between ``trace["Alice_a"]`` (reshaped
    to a single feature column) and ``trace["out"]`` with scikit-learn's
    ``mutual_info_regression``, using a fixed ``RandomState(12345)``.

    ``discrete_features`` is set to ``True`` because this measure is the
    discrete variant; it was previously ``False``, which made the estimator
    treat the secret as continuous.

    Returns:
        The estimate with its sign flipped (presumably so that a minimiser
        maximises leakage -- confirm against the optimiser).
    """
    secret = trace["Alice_a"].reshape(-1, 1)
    estimate = mutual_info_regression(
        secret,
        trace["out"],
        discrete_features=True,
        random_state=np.random.RandomState(12345),
    )[0]
    return -estimate

###### CONTINUOUS MUTUAL INFORMATION ######
def mi_CONT(trace):
    """Negated mutual information between a continuous secret and the output.

    Estimates the mutual information between ``trace["Alice_a"]`` (reshaped
    to a single feature column) and ``trace["out"]`` with scikit-learn's
    ``mutual_info_regression``, using a fixed ``RandomState(12345)``.

    ``discrete_features`` is set to ``False`` because this measure is the
    continuous variant; it was previously ``True``, which made the estimator
    treat every distinct float of the secret as its own class label.

    Returns:
        The estimate with its sign flipped (presumably so that a minimiser
        maximises leakage -- confirm against the optimiser).
    """
    secret = trace["Alice_a"].reshape(-1, 1)
    estimate = mutual_info_regression(
        secret,
        trace["out"],
        discrete_features=False,
        random_state=np.random.RandomState(12345),
    )[0]
    return -estimate

###### ENTROPY DISCRETE ######
def entropy_disc(t):
    """Return the negated Shannon entropy (base 2) of the values in ``t["a"]``.

    The empirical distribution is built from the relative frequency of each
    distinct value and handed to ``scipy.stats.entropy``.

    NOTE(review): this reads key ``"a"`` whereas the other leakage measures
    read ``"Alice_a"`` -- confirm the trace really exposes ``"a"``.
    """
    samples = t["a"]
    _, frequencies = np.unique(samples, return_counts=True)
    probabilities = frequencies / len(samples)
    return -st.entropy(probabilities, base=2)

###### ENTROPY CONTINUOUS ######
def entropy_cont(t):
    """Return the negated entropy estimate of ``t["a"]``.

    The estimate comes from ``continuous.get_h`` (``entropy_estimators``
    package) called with ``k=2``.

    NOTE(review): this reads key ``"a"`` whereas the mutual-information
    measures read ``"Alice_a"`` -- confirm the trace really exposes ``"a"``.
    """
    estimate = continuous.get_h(t["a"], k=2)
    return -estimate


###### FBLEAU BAYES RISK ######
def fbleau_bayes_risk(t):
    """Run fbleau on the trace and return its reported minimum estimate.

    The pairs ``(t['Alice_a'], t['out'])`` are split 80/20 into a training
    and a test set. Test rows whose secret never occurs in the training set
    are dropped. The full, training and test sets are written to
    ``fbleau.csv``, ``fbleau_train.csv`` and ``fbleau_test.csv`` in the
    working directory, and ``fbleau knn --knn-strategy ln`` is run on the
    training and test files.

    Args:
        t: Trace indexable by ``'Alice_a'`` and ``'out'``.

    Returns:
        The number following ``Minimum estimate:`` in fbleau's output.

    Raises:
        subprocess.CalledProcessError: If fbleau exits with a non-zero status.
        ValueError: If no minimum estimate can be parsed from the output.
    """
    import os  # local import: only needed to expand '~' without a shell

    # Creating datasets: 's' is the secret, 'o' the observed output
    df = pd.DataFrame(zip(t['Alice_a'], t['out']), columns=['s', 'o'])
    split = int(0.8 * len(df))
    df_train = df.iloc[:split]
    df_test = df.iloc[split:]
    # Keep only test rows whose secret also appears in the training split
    df_test = df_test[df_test['s'].isin(df_train['s'])]

    # Save datasets as csv
    train_data = 'fbleau_train.csv'
    test_data = 'fbleau_test.csv'
    df.to_csv('fbleau.csv', header=False, index=False)
    df_train.to_csv(train_data, header=False, index=False)
    df_test.to_csv(test_data, header=False, index=False)

    # Execute fbleau with an argument list instead of a shell command string;
    # '~' is expanded explicitly because no shell is involved any more.
    fbleau_exec = os.path.expanduser('~/.cargo/bin/fbleau')
    cmd = [fbleau_exec, 'knn', '--knn-strategy', 'ln', train_data, test_data]
    stdout = subprocess.check_output(cmd, text=True)

    match = re.search(r'Minimum estimate: (.*)\nMul', stdout)
    if match is None:
        raise ValueError(
            "could not find 'Minimum estimate' in fbleau output:\n" + stdout
        )
    return float(match.group(1))

# Average Age (Normals)

In [None]:
# Average age with a fixed prior for Alice.
# Domain: a single float variable "a" with bounds 10..50; the "alice" entry
# passes a scipy Normal(25, 15) distribution
# (presumably pinning Alice's distribution -- TODO confirm in privuggerag).
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
        "alice": st.norm(25,15)
    }
]
# Build the analysis for the mean-age program with the mi_CONT leakage measure.
AgeNormal = attacker.construct_analysis(mean_float, 
                            domain, 
                            mi_CONT,
                            cores=1)
# NOTE(review): presumably runs the search over distributions -- confirm.
AgeNormal.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = AgeNormal.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots of the three columns; density histograms with KDE are drawn on the diagonal.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=False, color="blue", stat='density')

# Average Age (Continuous)

In [None]:
# Average age, continuous domain without a fixed prior for Alice.
# Domain: a single float variable "a" with bounds 10..50.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    }
]
# Build the analysis for the mean-age program with the mi_CONT leakage measure.
AgeALL = attacker.construct_analysis(mean_float, 
                            domain, 
                            mi_CONT,
                            cores=1)
# NOTE(review): presumably runs the search over distributions -- confirm.
AgeALL.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = AgeALL.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots of the three columns; density histograms with KDE are drawn on the diagonal.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=False, color="blue", stat='density')

# Median age

In [None]:
# Median age on a continuous domain.
# Domain: a single float variable "a" with bounds 10..50.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    }
]
# Build the analysis for the median-age program with the mi_CONT leakage measure.
medianAge = attacker.construct_analysis(median_float, 
                            domain, 
                            mi_CONT,
                            cores=1)
# NOTE(review): presumably runs the search over distributions -- confirm.
medianAge.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = medianAge.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots of the three columns; density histograms with KDE are drawn on the diagonal.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=False, color="blue", stat='density')

# Sum of ages

In [None]:
# Sum of ages on a discrete domain, measured with mutual information.
# Domain: a single int variable "a" with bounds 0..100.
domain = [
    {
        "name": "a",
        "lower": 0,
        "upper": 100,
        "type": "int",
    }
]
# Build the analysis for the sum program with the mi_DISCRETE leakage measure.
sumMI = attacker.construct_analysis(SumOfAges, domain, mi_DISCRETE)
# NOTE(review): presumably runs the search over distributions -- confirm.
sumMI.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = sumMI.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots; the diagonal uses discrete=True histograms for the integer data.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

# Bayes Risk

In [None]:
# Sum of ages on a discrete domain, measured with the fbleau Bayes-risk estimate.
# Domain: a single int variable "a" with bounds 0..100.
domain = [
    {
        "name": "a",
        "lower": 0,
        "upper": 100,
        "type": "int",
    }
]
# Build the analysis for the sum program with fbleau_bayes_risk as the measure.
br = attacker.construct_analysis(SumOfAges, domain, fbleau_bayes_risk)
# NOTE(review): presumably runs the search over distributions -- confirm.
br.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = br.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots; the diagonal uses discrete=True histograms for the integer data.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

# Entropy
## Discrete


In [None]:
# Entropy of a discrete secret passed through the identity program.
# Domain: a single int variable "a" with bounds 0..100.
domain = [
    {
        "name": "a",
        "lower": 0,
        "upper": 100,
        "type": "int",
    }
]
# Build the analysis for identity_disc with the entropy_disc measure.
ent = attacker.construct_analysis(identity_disc, domain, entropy_disc)
# NOTE(review): presumably runs the search over distributions -- confirm.
ent.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = ent.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots; the diagonal uses discrete=True histograms for the integer data.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

## Continuous

In [None]:
# Entropy of a continuous secret passed through the identity program.
# Domain: a single float variable "a" with bounds 0..100.
domain = [
    {
        "name": "a",
        "lower": 0,
        "upper": 100,
        "type": "float",
    }
]
# Build the analysis for identity_cont with the entropy_cont measure.
# This rebinds `ent`, replacing the analysis from the discrete cell.
ent = attacker.construct_analysis(identity_cont, domain, entropy_cont)
# NOTE(review): presumably runs the search over distributions -- confirm.
ent.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = ent.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots with histogram + KDE on the diagonal.
# NOTE(review): discrete=True is used although the domain type is "float";
# the other float-domain cells above use discrete=False -- confirm intent.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

# Differential Privacy
## List of individuals

In [None]:
# Differentially private mean over a list of individuals.
# Domain: a single float variable "a" with bounds 10..50.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    }
]
# Build the analysis for dp_program_cont with the mi_CONT leakage measure.
dp1 = attacker.construct_analysis(dp_program_cont, domain, mi_CONT)
# NOTE(review): presumably runs the search over distributions -- confirm.
dp1.best_dist()
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = dp1.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots with histogram + KDE on the diagonal.
# NOTE(review): discrete=True is used although the domain type is "float" -- confirm intent.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

## Two individuals cont
Trying to find maximum

In [None]:
# Differentially private mean of two individuals, continuous domain.
# Domain: two float entries, each with bounds 10..50.
# NOTE(review): both entries are named "a" while dp_program_two_individuals
# takes parameters `a` and `b` -- confirm the duplicate name is intended.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    },
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    }
]
# Build the analysis for dp_program_two_individuals with the mi_CONT leakage measure.
dp2 = attacker.construct_analysis(dp_program_two_individuals, domain, mi_CONT)
# NOTE(review): presumably runs the search over distributions -- confirm.
dp2.best_dist()
#Choose any I
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = dp2.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots with histogram + KDE on the diagonal.
# NOTE(review): discrete=True is used although the domain type is "float" -- confirm intent.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

## Two individuals discrete

In [None]:
# Differentially private sum of two individuals, discrete domain, measured with fbleau.
# Domain: two int entries, each with bounds 10..50.
# NOTE(review): both entries are named "a" while dp_program_discrete takes
# parameters `a` and `b` -- confirm the duplicate name is intended.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "int",
    },
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "int",
    }
]
# Build the analysis for dp_program_discrete with fbleau_bayes_risk as the measure.
dp3 = attacker.construct_analysis(dp_program_discrete, domain, fbleau_bayes_risk)
# NOTE(review): presumably runs the search over distributions -- confirm.
dp3.best_dist()
#Choose any I
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = dp3.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots; the diagonal uses discrete=True histograms with KDE.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')

# K-Anonymity

In [None]:
# K-anonymity: mean of the generalised and padded age buckets.
# Domain: a single float variable "a" with bounds 10..50.
domain = [
    {
        "name": "a",
        "lower": 10,
        "upper": 50,
        "type": "float",
    }
]
# Build the analysis for kanonAge with the mi_CONT leakage measure.
kPPM = attacker.construct_analysis(kanonAge, domain, mi_CONT)
# NOTE(review): presumably runs the search over distributions -- confirm.
kPPM.best_dist()
#Choose any I
# Request the trace from run(0, ...); it is indexed below by "Alice_a", "Rest_a" and "out".
t = kPPM.run(0,return_trace=True)
# $s$: Alice's values, $n$: first entry of "Rest_a", $o$: program output.
dist = {"$s$": t["Alice_a"], "$n$": t["Rest_a"][0], "$o$": t["out"]}
# Chained assignment: both `d` and `data` are bound to the same DataFrame.
d = data=pd.DataFrame(dist)

# Pairwise plots with histogram + KDE on the diagonal.
# NOTE(review): discrete=True is used although the domain type is "float" -- confirm intent.
g = sns.pairplot(d)
g.map_diag(sns.histplot, kde=True, discrete=True, color="blue", stat='density')