# 06 - Differential Privacy

In [1]:
import syft as sy
print(sy.__version__)
assert sy.__version__ == '0.7.0-beta.57'

0.7.0-beta.57


## `DP Tensors` in `Syft`

### `Syft` Tensor

Normal `torch` tensor

In [2]:
import torch
torch.tensor([1, 2, 3, 4])

tensor([1, 2, 3, 4])

A normal `numpy` array

In [3]:
import numpy as np
np.array([1, 2, 3, 4]) + 2

array([3, 4, 5, 6])

The `tensors` in `syft` are similar to `torch` and `numpy` 

In [4]:
import syft as sy
print(sy.Tensor([1, 2, 3, 4]))
print(sy.Tensor([1, 2, 3, 4]) + 2)


Tensor(child=[1 2 3 4])
Tensor(child=[3 4 5 6])


However, they can also carry metadata that helps us figure out how much noise we should add to protect privacy:

In [5]:
sy.Tensor([1, 2, 3, 4]).private(min_val=0, 
                                max_val=5, 
                                data_subjects=["Ishan"])

Tensor(child=PhiTensor(child=[1 2 3 4], min_vals=<lazyrepeatarray data: [0] -> shape: (4,)>, max_vals=<lazyrepeatarray data: [5] -> shape: (4,)>))

`min_val` and `max_val` are the theoretical lower and upper bounds of the data (they help calculate the right amount of `noise` to add during `publish`). The `data_subjects` are the people whose data is stored in the tensor.

Too much noise will skew the result so that the real result is mostly lost:

In [6]:
# A small true signal that is swamped by a much larger amount of noise.
signal = sum((1, 2, 3, 4, 5, 3, 4, 2, 1, 8))
noise = 300
result = signal + noise

# Emit the three report lines from one table of (label, value) rows.
report_rows = (
    ("Signal:                =", signal),
    ("Noise:                 =", noise),
    ("Final result after DP: =", result),
)
for row_label, row_value in report_rows:
    print(row_label, row_value)

Signal:                = 33
Noise:                 = 300
Final result after DP: = 333


However, if the noise is too small, then it is not enough to protect privacy

In [7]:
# A very large true signal: the same amount of noise barely changes it.
signal = sum((1e6, 2e6, 3e6, 4e6, 5e6, 3e6, 4e6, 2e6, 1e6, 8e6))
noise = 50
result = signal + noise

# Emit the three report lines from one table of (label, value) rows.
report_rows = (
    ("Signal:                =", signal),
    ("Noise:                 =", noise),
    ("Final result after DP: =", result),
)
for row_label, row_value in report_rows:
    print(row_label, row_value)

Signal:                = 33000000.0
Noise:                 = 50
Final result after DP: = 33000050.0


Constructing the `syft` tensor with `numpy ndarray` as its child

In [8]:
import numpy as np
# first make the numpy array: a 5x5 grid of random integers drawn from [1, 7),
# i.e. 1..6 because `high` is exclusive. No seed is set, so the values differ
# on every run.
data = np.random.randint(low=1, high=7, size=(5,5))
print(f"numpy array: \n{data}")
# convert it into a syft tensor
tensor = sy.Tensor(data)
print(f"syft tensor: \n{tensor}")

numpy array: 
[[5 2 1 3 3]
 [6 6 6 2 5]
 [3 1 5 6 1]
 [2 3 3 6 5]
 [4 4 5 2 6]]
syft tensor: 
Tensor(child=[[5 2 1 3 3]
 [6 6 6 2 5]
 [3 1 5 6 1]
 [2 3 3 6 5]
 [4 4 5 2 6]])


### `Phi Tensor`

With `Phi Tensor`, there is only one `data subject` per data point in the Tensor

In [9]:
private_tensor = tensor.private(min_val=0,
                                max_val=10,
                                data_subjects=["Ishan"])
private_tensor

Tensor(child=PhiTensor(child=[[5 2 1 3 3]
 [6 6 6 2 5]
 [3 1 5 6 1]
 [2 3 3 6 5]
 [4 4 5 2 6]], min_vals=<lazyrepeatarray data: [0] -> shape: (5, 5)>, max_vals=<lazyrepeatarray data: [10] -> shape: (5, 5)>))

Each tensor wraps its data in a chain of `child` attributes, with a `numpy.ndarray` as the bottom object:

In [10]:
print(type(private_tensor))
print(type(private_tensor.child))
print(type(private_tensor.child.child))

<class 'syft.core.tensor.tensor.Tensor'>
<class 'syft.core.tensor.autodp.phi_tensor.PhiTensor'>
<class 'numpy.ndarray'>


The `min_vals` and `max_vals` attributes

In [11]:
private_tensor.child.min_vals

<lazyrepeatarray data: [0] -> shape: (5, 5)>

In [12]:
private_tensor.child.max_vals

<lazyrepeatarray data: [10] -> shape: (5, 5)>

In [13]:
private_tensor.child.min_vals.to_numpy()

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [14]:
private_tensor.child.max_vals.to_numpy()

array([[10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10]])

In [15]:
assert private_tensor.child.min_vals.shape == private_tensor.child.shape

In [16]:
print(private_tensor.child.data_subjects)

[[DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'}]
 [DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'}]
 [DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'}]
 [DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'}]
 [DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'} DataSubjectArray: {'Ishan'}
  DataSubjectArray: {'Ishan'}]]


In [17]:
private_tensor.child.data_subjects.shape

(5, 5)

In [18]:
assert private_tensor.child.data_subjects.shape == private_tensor.child.child.shape

### `Gamma Tensor`

In [19]:
second_tensor = tensor.private(min_val=0,
                               max_val=10,
                               data_subjects="Ivy")
second_tensor

Tensor(child=PhiTensor(child=[[5 2 1 3 3]
 [6 6 6 2 5]
 [3 1 5 6 1]
 [2 3 3 6 5]
 [4 4 5 2 6]], min_vals=<lazyrepeatarray data: [0] -> shape: (5, 5)>, max_vals=<lazyrepeatarray data: [10] -> shape: (5, 5)>))

In [20]:
gamma_tensor = private_tensor + second_tensor
print(type(gamma_tensor.child))
print(type(gamma_tensor.child.child))

<class 'syft.core.tensor.autodp.gamma_tensor.GammaTensor'>
<class 'numpy.ndarray'>


In [21]:
print(gamma_tensor.child.data_subjects)

[[DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'}]
 [DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'}]
 [DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'}]
 [DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'}]
 [DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'} DataSubjectArray: {'Ivy', 'Ishan'}
  DataSubjectArray: {'Ivy', 'Ishan'}]]


In [22]:
# Both inputs were created with min_val=0, so the lower bound of their sum
# is expected to be 0 + 0.
new_lower = 0 + 0
assert (gamma_tensor.child.min_vals.to_numpy() == new_lower).all()
print(gamma_tensor.child.min_vals)

<lazyrepeatarray data: [0] -> shape: (5, 5)>


In [23]:
# Both inputs were created with max_val=10, so the upper bound of their sum
# is expected to be 10 + 10.
new_upper = 10 + 10
assert (gamma_tensor.child.max_vals.to_numpy() == new_upper).all()
print(gamma_tensor.child.max_vals)

<lazyrepeatarray data: [20] -> shape: (5, 5)>


In [24]:
first_private_tensor_ds = private_tensor.child.data_subjects[0][0].data_subjects
print(first_private_tensor_ds, type(first_private_tensor_ds))

first_second_tensor_ds = second_tensor.child.data_subjects[0][0].data_subjects
print(first_second_tensor_ds, type(first_second_tensor_ds))
combined = first_private_tensor_ds.union(first_second_tensor_ds)
print(combined, type(combined))

assert gamma_tensor.child.data_subjects[0][0].data_subjects == combined

{'Ishan'} <class 'set'>
{'Ivy'} <class 'set'>
{'Ivy', 'Ishan'} <class 'set'>


#### Provenance

We need to keep track of every subsequent operation that occurs with a `GammaTensor`. The inputs of an operation are stored in a dictionary called `sources`. Each key in the `sources` dictionary is an ID string which maps to the corresponding Tensor that was used as input.

In [25]:
gamma_tensor.child.sources.keys()

dict_keys(['493397408', '355316939'])

In [26]:
gamma_tensor.child.sources

{'493397408': GammaTensor(child=array([[5, 2, 1, 3, 3],
        [6, 6, 6, 2, 5],
        [3, 1, 5, 6, 1],
        [2, 3, 3, 6, 5],
        [4, 4, 5, 2, 6]]), data_subjects=array([[DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}],
        [DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}],
        [DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}],
        [DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}],
        [DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
         DataSubjectArray: {'Ishan'}, DataSubjectArray: {'Ishan'},
   

In [27]:
gamma_tensor.child.func_str

'add'

## Publishing

#### Launch a `domain`

In [28]:
!hagrid launch test_domain domain to docker:8081 --build-src=padawan_trial_dp --build

[2K✅ Updated HAGrid from branch: padawan_trial_dp padawan_trial_dp[0m0m
[2K[32m⠦[0m [1;34mUpdating HAGrid from branch: padawan_trial_dp[0m
[2K[32m⠇[0m [1;34mChecking for Docker Service[0m   ice[0m   
[1A[2K✅ Docker service is running
✅ Git 2.34.1
✅ Docker 23.0.1
✅ Docker Compose 2.15.1


 _   _       _     _                 _   _                       _
| | | |     | |   | |               | | | |                     | |
| |_| | ___ | | __| |   ___  _ __   | |_| | __ _ _ __ _ __ _   _| |
|  _  |/ _ \| |/ _` |  / _ \| '_ \  |  _  |/ _` | '__| '__| | | | |
| | | | (_) | | (_| | | (_) | | | | | | | | (_| | |  | |  | |_| |_|
\_| |_/\___/|_|\__,_|  \___/|_| |_| \_| |_/\__,_|_|  |_|   \__, (_)
                                                            __/ |
                                                           |___/
        
Launching a PyGrid Domain node on port 8081!

  - NAME: test_domain
  - RELEASE: production
  - ARCH: linux/amd64
  - TYPE: domain
  - DOCKER_TAG: la

In [29]:
import hagrid
hagrid.check("localhost:8081")

#### Private Data

In [30]:
import numpy as np
# Ten ages and ten single-letter names; the assert checks there is one name per age.
ages = np.array((25, 35, 21, 19, 40, 55, 31, 18, 27, 33))
names = list("ABCDEFGHIJ")
assert len(names) == len(ages)

In [31]:
phi_tensor = sy.Tensor(ages).private(min_val=0, 
                                    max_val=122, 
                                    data_subjects=names)
phi_tensor

Tensor(child=GammaTensor(child=array([25, 35, 21, 19, 40, 55, 31, 18, 27, 33]), data_subjects=array([DataSubjectArray: {'A'}, DataSubjectArray: {'B'},
       DataSubjectArray: {'C'}, DataSubjectArray: {'D'},
       DataSubjectArray: {'E'}, DataSubjectArray: {'F'},
       DataSubjectArray: {'G'}, DataSubjectArray: {'H'},
       DataSubjectArray: {'I'}, DataSubjectArray: {'J'}], dtype=object), min_vals=<lazyrepeatarray data: [0] -> shape: (10,)>, max_vals=<lazyrepeatarray data: [122] -> shape: (10,)>, is_linear=True, func_str='noop', id='1976055677', sources={}))

#### Upload the `private data` to the `domain`

In [32]:
domain = sy.login(port=8081, 
                  email="info@openmined.org", 
                  password="changethis")


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to localhost... done! 	 Logging into test_domain... done!


In [33]:
domain.load_dataset(
    assets={"ages_tensor": phi_tensor},
    name="ages_dataset",
    description="Ages of a group of people"
)



Loading dataset... uploading...🚀                                                                                                                                             

Uploading `ages_tensor`: 100%|[32m████████████████████████████████████████[0m| 1/1 [00:00<00:00,  4.50it/s][0m

Dataset is uploaded successfully !!! 🎉

Run `<your client variable>.datasets` to see your new dataset loaded into your machine!





#### Create Data Scientist Account

In [34]:
starting_budget = 999999
ds_details = {
    "name": "Rey Skywalker",
    "email": "rey@skywalker.net",
    "password": "jakku",
    "budget": starting_budget,
}
domain.users.create(**ds_details)

User created successfully!


In [35]:
skywalker = sy.login(port=8081, 
                     email=ds_details["email"], 
                     password=ds_details["password"])

Connecting to localhost... done! 	 Logging into test_domain... done!


`test_domain` UI will look like this

![](./assets/6-test_domain_with_ds.png)

In [36]:
age_dataset_prt = skywalker.datasets[-1]
age_dataset_prt

Dataset: ages_dataset
Description: Ages of a group of people



Asset Key,Type,Shape
"[""ages_tensor""]",Tensor,"(10,)"


Get the `pointer` to the private data in the domain

In [37]:
phi_ptr = age_dataset_prt["ages_tensor"]
phi_ptr

array([ 89,  14,  68,  34,  61,  36,  23, 118,   4,  98])

 (The data printed above is synthetic - it's an imitation of the real data.)

We can see that the values displayed for `phi_ptr` are synthetic and do not contain the real data. The real data we created was `ages = np.array([25, 35, 21, 19, 40, 55, 31, 18, 27, 33])`.

#### Private Mean

In [38]:
mean_ptr = phi_ptr.mean()
mean_ptr

array([57.01814779])

 (The data printed above is synthetic - it's an imitation of the real data.)

#### Publish

In [40]:
result_ptr = mean_ptr.publish(sigma=1.5)

If we manage to `publish`, the resulting pointer `result_ptr` will point to the result that is available for download with `.get()`

In [42]:
remote_mean = result_ptr.get(delete_obj=False)
remote_mean

27.58036544861064

#### Compare

In [43]:
# Compare the true mean with the published (noised) mean.
local_mean = ages.mean()
# Relative error of the published value, as a percentage of the true mean.
relative_error_pct = abs(local_mean - remote_mean) / local_mean * 100
print("Real Mean:      =", local_mean)
print("Published Mean: =", remote_mean)
print("Difference:     =", round(relative_error_pct, 2), "%")

Real Mean:      = 30.4
Published Mean: = 27.58036544861064
Difference:     = 9.28 %


#### `Epsilon` Spent

In [44]:
# The amount spent by the publish is the drop in the data scientist's budget.
current_budget = skywalker.privacy_budget
budget_spent = starting_budget - current_budget
print("Starting Budget:", starting_budget)
print("Current Budget: ", current_budget)
print("Cost:           ", budget_spent)

Starting Budget: 999999
Current Budget:  999690.0357041412
Cost:            308.9642958587501


#### RDP (Renyi Differential Privacy) Parameters

In [45]:
from syft.core.adp.vectorized_publish import calculate_bounds_for_mechanism
l2_norms, l2_norm_bounds, sigmas, coeffs = calculate_bounds_for_mechanism(
    value_array=ages.mean(),
    min_val_array=0,
    max_val_array=122,
    sigma=1.5,
)
l2_norms, l2_norm_bounds, sigmas, coeffs



(DeviceArray([30.4], dtype=float64),
 DeviceArray([122.], dtype=float64),
 DeviceArray([1.5], dtype=float64),
 DeviceArray([1.], dtype=float64))

#### Calculate `epsilon`

For a given RDP order $\alpha$, $\varepsilon = \alpha \cdot \text{rdp\_constant}$, where

$$\text{rdp\_constant} = \frac{\lVert q(X_i) \rVert_2^2}{2\sigma^2}$$

##### rdp constant

In [46]:
from syft.core.adp.data_subject_ledger import RDPParams
rdp_params = RDPParams(
    sigmas=sigmas,
    l2_norms=l2_norms,
    l2_norm_bounds=l2_norm_bounds,
    Ls=coeffs.copy(), # is_linear == True so we can cheat here
    coeffs=coeffs
)
rdp_params

RDPParams:
 sigmas:[1.5]
 l2_norms:[30.4]
 l2_norm_bounds:[122.]
 Ls:[1.]
 coeffs:[1.]

In [47]:
from syft.core.adp.vectorized_publish import compute_rdp_constant
rdp_constants = compute_rdp_constant(rdp_params, private=True)
rdp_constants

DeviceArray([205.36888889], dtype=float64)

##### calculate `epsilon`

In [48]:
# Take the value computed by `compute_rdp_constant` in the previous cell
# rather than a hand-copied literal, so it cannot go stale if the data or
# sigma change (for the data above this is 205.36888889).
rdp_constant = float(rdp_constants[0])

In [49]:
from scipy.optimize import minimize_scalar

# delta used by the objective built in `search_alpha` below; only its
# logarithm appears in that expression, so it is computed once here.
DELTA = 1e-6
log_delta = np.log(DELTA)

def partial_alpha(constant: float):
    """Return the function ``alpha -> alpha * constant``.

    Args:
        constant: The RDP constant. It is a real number in general (the
            notebook passes 205.36888889), hence ``float`` rather than ``int``.

    Returns:
        A one-argument callable that multiplies its input by ``constant``.
    """
    def scaled(alpha: float) -> float:
        return alpha * constant

    return scaled

def search_alpha(partial, delta=None):
    """Build the objective that is minimised over the RDP order ``alpha``.

    For ``alpha > 1`` the returned function evaluates

        partial(alpha) + log((alpha - 1) / alpha)
            - (log(delta) + log(alpha)) / (alpha - 1)

    and clips the result below at 0.

    Args:
        partial: Callable mapping ``alpha`` to the first term of the
            expression above.
        delta: Optional delta to use in the expression. When omitted, the
            notebook-level ``log_delta`` is read each time the returned
            function is called, which is the previous behaviour.

    Returns:
        A function ``fun(alpha)`` that returns ``np.inf`` for ``alpha <= 1``
        and the clipped value described above otherwise.
    """
    # Resolve an explicitly supplied delta once; None means "use the global".
    explicit_log_delta = None if delta is None else np.log(delta)

    def fun(alpha: float) -> float:  # the input is the RDP's \alpha
        if alpha <= 1:
            # (alpha - 1) appears in a log and a denominator, so reject alpha <= 1
            return np.inf

        if explicit_log_delta is None:
            current_log_delta = log_delta
        else:
            current_log_delta = explicit_log_delta

        alpha_minus_1 = alpha - 1
        return np.maximum(
            partial(alpha)
            + np.log(alpha_minus_1 / alpha)
            - (current_log_delta + np.log(alpha)) / alpha_minus_1,
            0,
        )

    return fun


def find_alpha_and_eps(
    constant: float = 3
):
    """Minimise the `search_alpha` objective for a given RDP constant.

    Args:
        constant: The RDP constant passed to `partial_alpha`. It is a real
            number in general (the notebook passes 205.36888889).

    Returns:
        A tuple ``(results.x, results.fun)`` from `minimize_scalar`: the
        minimising alpha and the objective value there, which the notebook
        reports as epsilon.
    """
    search_func = search_alpha(partial_alpha(constant))
    # The objective is np.inf at alpha <= 1, so the left end of the bracket
    # evaluates to infinity; Brent's method searches from this initial bracket.
    results = minimize_scalar(
        search_func, method="Brent", bracket=(1, 2)
    )

    return results.x, results.fun

In [50]:
# ⚔️ Runnable Code
alpha, eps = find_alpha_and_eps(
    constant=rdp_constant
)
print(f"The solution for RDP Constant={rdp_constant} is alpha={alpha}, epsilon={eps}")

The solution for RDP Constant=205.36888889 is alpha=1.2572102721433043, epsilon=309.4281090883548


##### RDP Cache

In [51]:
from syft.core.adp.data_subject_ledger import load_cache
cache = load_cache("constant2epsilon_1200k.npy")

In [52]:
cache

array([5.37271206e-02, 7.77359737e-02, 9.64575076e-02, ...,
       7.06261395e+05, 7.06262399e+05, 7.06263404e+05])

In [53]:
len(cache)

1200000

In [54]:
from syft.core.adp.data_subject_ledger import convert_constants_to_indices
cache_indexes = convert_constants_to_indices(rdp_constants)
cache_indexes

DeviceArray([500154], dtype=int64)

In [55]:
cached_epsilon = cache[cache_indexes[0]]
cached_epsilon

308.9642958587137

In [56]:
# Relative gap between the optimiser's epsilon and the cached one, in percent.
epsilon_gap_pct = abs(eps - cached_epsilon) * 100 / eps
# Budget consumed by the earlier publish call, for comparison with the cache.
previous_spend = starting_budget - current_budget
print("Calculated Epsilon: ", eps)
print("Cached Epsilon:     ", cached_epsilon)
print("Difference:         ", round(epsilon_gap_pct, 2), "%")
print("Previous Spend:     ", previous_spend)

Calculated Epsilon:  309.4281090883548
Cached Epsilon:      308.9642958587137
Difference:          0.15 %
Previous Spend:      308.9642958587501
