# Federated Learning with Secure Aggregation and Diff Privacy using PySyft

This is an example of using our new Secure Multi-Party Computation tensor (SPDZTensor) to perform an encrypted average of gradients across multiple data owners.

Before starting with this notebook, we recommend looking at `Boston_Housing_Federated_Training.ipynb` which is located in the same folder


# Setting Up

In [1]:
! URL="https://github.com/openmined/PySyft.git" && FOLDER="PySyft" && if [ ! -d $FOLDER ]; then git clone $URL; else (cd $FOLDER && git pull $URL && cd ..); fi;




# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision  > /dev/null

!cd PySyft; python setup.py install  > /dev/null



0.3.0.post4


In [None]:
! URL="https://github.com/LaRiffle/differential-privacy.git" && FOLDER="differential_privacy" && if [ ! -d $FOLDER ]; then git clone $URL $FOLDER; else (cd $FOLDER && git pull $URL && cd ..); fi;
! pip install --upgrade --force-reinstall websockets

In [None]:
import os
import sys
module_path = os.path.abspath(os.path.join('./PySyft'))
if module_path not in sys.path:
    sys.path.append(module_path)
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

print(torch.__version__)
print(torch.cuda.is_available())

# Training settings
parser = argparse.ArgumentParser(description='PyTorch Example')
parser.add_argument('--batch-size', type=int, default=8, metavar='N',
                    help='input batch size for training (default: 8)')
parser.add_argument('--test-batch-size', type=int, default=8, metavar='N',
                    help='input batch size for testing (default: 8)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                    help='learning rate (default: 0.001)')
parser.add_argument('--momentum', type=float, default=0.0, metavar='M',
                    help='SGD momentum (default: 0.0)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args([])

torch.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

### Loading the dataset

In [2]:
import pickle
f = open('../other/data/boston_housing.pickle','rb')
((X, y), (X_test, y_test)) = pickle.load(f)
f.close()

X = torch.from_numpy(X).type(torch.FloatTensor)
y = torch.from_numpy(y).type(torch.FloatTensor)
X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
y_test = torch.from_numpy(y_test).type(torch.FloatTensor)
# preprocessing
mean = X.mean(0, keepdim=True)
dev = X.std(0, keepdim=True)
mean[:, 3] = 0. # the feature at column 3 is binary,
dev[:, 3] = 1.  # so I'd rather not standardize it
X = (X - mean) / dev
X_test = (X_test - mean) / dev
train = TensorDataset(X, y)
test = TensorDataset(X_test, y_test)
train_loader = DataLoader(train, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = DataLoader(test, batch_size=args.test_batch_size, shuffle=True, **kwargs)


#  Neural Network Structure

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(13, 32)
        self.fc2 = nn.Linear(32, 24)
        self.fc3 = nn.Linear(24, 1)

    def forward(self, x):
        x = x.view(-1, 13)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def divide_clip_grads(self):
        for key, param in self.named_parameters():
            param.grad /= n_batch
            gradient_clip(param)
            
    def add_noise_to_grads(self):
        for key, param in self.named_parameters():
            noise = 1/LOT_SIZE * gaussian_noise(param.grad)
            param.grad += noise
model = Net()
model_params = list(model.parameters())

bobs_model = Net()
alices_model = Net()

if args.cuda:
    model.cuda()


# Hooking into Pytorch

In [4]:
import syft
import syft as sy
from syft.core import utils
import torch
import torch.nn.functional as F
import json
import random
from syft.core.frameworks.torch import utils as torch_utils
from torch.autograd import Variable
hook = sy.TorchHook(verbose=False)
me = hook.local_worker
bob = sy.VirtualWorker(id="bob",hook=hook, is_client_worker=False)
alice = sy.VirtualWorker(id="alice",hook=hook, is_client_worker=False)
me.is_client_worker = False

compute_nodes = [bob, alice]

me.add_workers([bob, alice])
bob.add_workers([me, alice])
alice.add_workers([me, bob])




**Send data to the worker** <br>
Usually they would already have it, this is just for demo

In [5]:
remote_dataset = (list(),list())

for batch_idx, (data,target) in enumerate(train_loader):
    data = Variable(data)
    target = Variable(target.float())
    data.send(compute_nodes[batch_idx % len(compute_nodes)])
    target.send(compute_nodes[batch_idx % len(compute_nodes)])
    remote_dataset[batch_idx % len(compute_nodes)].append((data, target))

In [6]:
def update(data, target, model, optimizer):
    model.send(data.location)
    optimizer.zero_grad()
    pred = model(data)
    loss = F.mse_loss(pred, target.float())
    loss.backward()
    bobs_optimizer.step()
    return model

In [7]:
bobs_optimizer = optim.SGD(bobs_model.parameters(), lr=args.lr, momentum=args.momentum)
alices_optimizer = optim.SGD(alices_model.parameters(), lr=args.lr, momentum=args.momentum)

models = [bobs_model, alices_model]
params = [list(bobs_model.parameters()), list(alices_model.parameters())]
optimizers = [bobs_optimizer, alices_optimizer]

## Diff Privacy

In [None]:
"""
    Inspired from Abadi et al., Deep Learning with Differential Privacy, 
    Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications
    Security, 2016
"""
from differential_privacy.privacy_accountant.pytorch import accountant

n_batch = 3
NUM_TRAINING_IMAGES = X.size()[0]
LOT_SIZE = n_batch * args.batch_size
N_LOTS = 100
T = N_LOTS # number of samplings

bound = 10
epsilon = 0.5
delta = 10**(-5)
sigma = np.sqrt(2 * np.log(1.25/delta))/epsilon 

def sum_batch(grads):
    n_items = len(grads)
    return grads.view(n_items, -1).sum(dim=1)

def gradient_clip(param):
    """Clip gradient to ensure ||param.grad||2 < bound"""
    nn.utils.clip_grad_norm([param], bound)

def gaussian_noise(grads):
    """Add gaussian noise to gradients"""
    shape = grads.shape
    noise = Variable(torch.zeros(shape))
    noise.data.normal_(0.0, std=bound*sigma)
    return noise

q = LOT_SIZE / NUM_TRAINING_IMAGES
spent_epsilon = q * epsilon * np.sqrt(T)
spent_delta = delta
print('sigma =', sigma)
print('The mechanism is (O(%f), %f)-differentially private' % (spent_epsilon, spent_delta))

In [None]:
priv_accountant = accountant.GaussianMomentsAccountant(NUM_TRAINING_IMAGES)

In [None]:
train_distributed_dataset = []

for item_idx in range(len(X)):
    data = Variable(X[item_idx])
    target = Variable(sy.FloatTensor([y[item_idx]]))
    data.send(compute_nodes[item_idx % len(compute_nodes)])
    target.send(compute_nodes[item_idx % len(compute_nodes)])
    train_distributed_dataset.append((data, target))
    
def setattr_nested(base, path, value):
    """Accept a dotted path to a nested attribute to set."""
    path, _, target = path.rpartition('.')
    for attrname in path.split('.'):
        base = getattr(base, attrname)
    setattr(base, target, value)
    
def select_lot():
    """
    Build the lof by sampling over the dataset
    """
    #- choose a worker
    worker_idx = randint(len(compute_nodes)) 
    worker = compute_nodes[worker_idx]
    item_ids = np.arange(len(train_distributed_dataset)) 
    #- select all indexes in train_distributed_dataset of tensors sent to worker
    valid_ids = item_ids[item_ids % 2 == worker_idx] 
    #- Select indexes and reshape into batches
    batches_ids = np.random.choice(valid_ids,size=LOT_SIZE, replace=False).reshape(-1, args.batch_size)
    #- Build lot
    lot = []
    for batch_ids in batches_ids:
        batch_data = []
        batch_target = []
        for batch_id in batch_ids:
            data, target = train_distributed_dataset[batch_id]
            batch_data.append(data)
            batch_target.append(target)
        lot.append((torch.stack(batch_data), torch.stack(batch_target)))
        
    return lot, worker

# Training Function (To be combined)

In [8]:
## Diff privacy training
def train(lot_idx):
    model.train()
    # Build the lot by sampling over the dataset
    lot, worker = select_lot()
        
    optimizer.zero_grad()
    
    model.send(worker)
    
    # Iterate on the lot batch per batch
    for batch_idx, (data,target) in enumerate(lot):
        
        # update the model
        pred = model(data)
        loss = F.mse_loss(pred, target.float())
        # Note that because we apply backward() several times without resetting 
        # the grads (optimizer.zero_grad()), we sum the gradients 
        loss.backward() 
        
        if batch_idx == 0:
            loss.get()
            print('Train Lot: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                lot_idx, batch_idx * args.batch_size, LOT_SIZE,
                100. * batch_idx * args.batch_size / LOT_SIZE, loss.data[0]))
            print(priv_accountant.get_privacy_spent(target_deltas=[spent_delta]))
       

    
    optimizer.step()
        
    priv_accountant.accumulate_privacy_spending(bound * sigma, LOT_SIZE)



## Federated Learning training
def train():

    for data_index in range(len(remote_dataset[0])-1):
        # update remote models
        for remote_index in range(len(compute_nodes)):
            data, target = remote_dataset[remote_index][data_index]
            models[remote_index] = update(data, target, models[remote_index], optimizers[remote_index])

        new_params = list()

        for param_i in range(len(params[0])):

            spdz_params = list()
            for remote_index in range(len(compute_nodes)):
                spdz_params.append((params[remote_index][param_i].data+0).fix_precision().share(bob, alice).get())

            new_param = (spdz_params[0] + spdz_params[1]).get().decode()/2
            new_params.append(new_param)

        for model in params:
            for param in model:
                param.data *= 0

        for model in models:
            model.get()
           
            model.divide_clip_grads()
            model.add_noise_to_grads()

        for remote_index in range(len(compute_nodes)):
            for param_index in range(len(params[remote_index])):
                params[remote_index][param_index].data.set_(new_params[param_index])

# Testing Function

In [9]:
def test():
    models[0].eval()
    test_loss = 0
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        output = models[0](data)
        test_loss += F.mse_loss(output, target.float(), size_average=False).data[0] # sum up batch loss
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
        
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))



# Training The Dataset

In [10]:
%%time

for epoch in range(1, args.epochs + 1):
    print(epoch)
    for lot_idx in range(1, N_LOTS):
        train(lot_idx)
    test()

1

Test set: Average loss: 459.1721

2

Test set: Average loss: 38.0229

3

Test set: Average loss: 23.7089

4

Test set: Average loss: 21.1038

5

Test set: Average loss: 19.9490

6

Test set: Average loss: 18.7076

7

Test set: Average loss: 17.9800

8

Test set: Average loss: 17.4651

9

Test set: Average loss: 17.1163

10

Test set: Average loss: 16.8523

CPU times: user 55.6 s, sys: 347 ms, total: 56 s
Wall time: 55.9 s
