## Import all necessary modules

We additionally import `Iterable` from `typing`, which we use as a type hint for the collection of model parameters that the helper functions loop over. For visualization, we also import pyplot and seaborn.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from typing import Iterable

from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# function to plot the training's performance progression
def plot_performance(train_losses, train_accs, test_losses, test_accs):
    """Show the per-epoch training and test curves in two side-by-side panels.

    The left panel plots the losses, the right panel plots the accuracies
    with the y-axis fixed to the range [0, 1].

    Args:
        train_losses: sequence with one training loss per epoch.
        train_accs: sequence with one training accuracy per epoch.
        test_losses: sequence with one test loss per epoch.
        test_accs: sequence with one test accuracy per epoch.
    """
    figure, (loss_axis, acc_axis) = plt.subplots(nrows=1, ncols=2)
    figure.set_size_inches(w=15, h=5)

    # left panel: loss curves
    for curve, curve_label in ((train_losses, "Train Loss"), (test_losses, "Test Loss")):
        loss_axis.plot(curve, label=curve_label)
    loss_axis.set_xlabel("Epoch")
    loss_axis.set_ylabel("Loss")
    loss_axis.legend()

    # right panel: accuracy curves
    for curve, curve_label in ((train_accs, "Train Accuracy"), (test_accs, "Test Accuracy")):
        acc_axis.plot(curve, label=curve_label)
    acc_axis.set_xlabel("Epoch")
    acc_axis.set_ylabel("Accuracy")
    acc_axis.set_ylim(0, 1)
    acc_axis.legend()

    # drop the top and right spines for a cleaner look, then render
    sns.despine(figure)
    plt.show()

## Programming Exercise: DP-SGD

This programming exercise implements the differentially private version of _Stochastic Gradient Descent_ called _DP-SGD_. 

Complete the code by filling in the TODOs:
- Implement the clipping function that applies the norm clipping. 
- Implement the addition of adequately scaled noise to the gradient that is used to update the network's parameters. 

If successful, the training takes a few minutes per epoch.

In [None]:
# define a simple model with 2 fully-connected layers
class Model(nn.Module):
    """Small classifier: flatten -> Linear(784, 512) -> ReLU -> Linear(512, 10) -> log-softmax."""

    def __init__(self):
        super().__init__()
        # hidden layer followed by the 10-way output layer
        self.fc1 = nn.Linear(in_features=784, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=10)

    def forward(self, x):
        """Return the log-probabilities over the 10 classes for each sample in `x`.

        `x` is flattened to (batch, -1) first, so every sample must contain
        784 values in total to match the first layer.
        """
        batch_size = x.size(0)
        flattened = x.view(batch_size, -1)
        hidden = F.relu(self.fc1(flattened))
        logits = self.fc2(hidden)

        return F.log_softmax(logits, dim=1)

In [None]:
# calculate the l2-norm of the model's gradient.
# as the model consists of several layers, we have to aggregate the gradients of all layers and calculate
# the overall norm.
def calc_grad_norm(parameters: Iterable[torch.Tensor], device):
    """Return the overall l2-norm of the gradients of all given parameters.

    The norm of each parameter's gradient is computed separately and moved to
    `device`; the norm of the stacked per-parameter norms is the result.
    Every parameter passed in must have a populated `.grad`.
    """
    per_layer_norms = []
    for param in parameters:
        layer_norm = torch.norm(param.grad.detach())
        per_layer_norms.append(layer_norm.to(device))

    return torch.norm(torch.stack(per_layer_norms))

The next two cells contain the actual TODOs, each marked as "TODO".

In [None]:
# clip the model's gradient to max_grad_norm
def clip_(parameters: Iterable[torch.Tensor], max_grad_norm: float, device):
    """Clip the overall gradient norm of `parameters` in-place to `max_grad_norm`.

    All gradients are multiplied by the common factor
    min(1, max_grad_norm / ||g||), where ||g|| is the l2-norm over the
    gradients of all layers. Gradients whose overall norm is already within
    the bound are left unchanged.

    Args:
        parameters: model parameters; those without a gradient are ignored.
        max_grad_norm: the clipping bound C.
        device: device on which the overall norm is aggregated.

    Returns:
        None. The `.grad` tensors are modified in-place.
    """
    parameters = [p for p in parameters if p.grad is not None]
    if not parameters:
        # nothing to clip (and torch.stack would fail on an empty list)
        return

    total_norm = calc_grad_norm(parameters, device)

    # scaling factor min(1, C / ||g||); the small epsilon avoids a division by
    # zero for an all-zero gradient and can only make the result smaller,
    # so the norm bound still holds
    clip_coef = torch.clamp(max_grad_norm / (total_norm + 1e-6), max=1.0)

    for p in parameters:
        p.grad.detach().mul_(clip_coef.to(p.grad.device))



In [None]:
def train(model, device, train_loader, optimizer, epoch, noise_multiplier, max_grad_norm):
    """Train `model` for one epoch with DP-SGD.

    For every batch, the gradient of each single sample is computed and
    clipped to `max_grad_norm`, the clipped gradients are averaged, Gaussian
    noise with standard deviation `noise_multiplier * max_grad_norm` (scaled
    by 1 / batch size, like the averaged gradients) is added, and the
    optimizer takes a step using the noisy gradient.

    Args:
        model: the network to train.
        device: device on which the computation runs.
        train_loader: iterable yielding (data, target) batches.
        optimizer: optimizer that updates the parameters of `model`.
        epoch: epoch number, only used for the progress message.
        noise_multiplier: ratio sigma between the noise's standard deviation
            and the clipping bound.
        max_grad_norm: clipping bound C for each per-sample gradient.

    Returns:
        Tuple (mean_loss, mean_acc): the loss and top-1 accuracy averaged
        over all batches of this epoch.
    """
    model.train()

    # we use reduction='none' so that we get the loss per sample in our batch
    criterion = nn.NLLLoss(reduction='none')

    losses = []
    top1_acc = []

    # only parameters that take part in training receive a gradient
    trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]

    # standard deviation of the noise added to the SUM of the clipped gradients
    noise_std = noise_multiplier * max_grad_norm

    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        # prepare a dict to store the accumulated clipped gradients by layer name
        clipped_grads = {name: torch.zeros_like(param, device=device) for name, param in trainable}

        optimizer.zero_grad()

        output = model(data)

        loss = criterion(output, target)
        batch_size = loss.size(0)

        pred = output.argmax(dim=1, keepdim=True)
        correct = pred.eq(target.view_as(pred)).sum().item()
        top1_acc.append(correct / len(data))

        for i in range(batch_size):
            # all samples share one graph, so keep it until the last sample is done
            loss[i].backward(retain_graph=(i < batch_size - 1))

            # bound the influence of this single sample on the update
            clip_(model.parameters(), max_grad_norm, device)

            for name, param in trainable:
                if param.grad is not None:
                    clipped_grads[name] += param.grad.detach().clone() / batch_size

            # reset so that the next backward pass yields a pure per-sample gradient
            optimizer.zero_grad()

        for key in clipped_grads:
            # 1. create appropriately scaled noise
            noise = torch.normal(
                mean=0.0,
                std=noise_std,
                size=clipped_grads[key].shape,
                dtype=clipped_grads[key].dtype,
                device=device,
            )

            # 2. add the noise to our accumulated gradients; they are averaged
            #    over the batch, so the noise is divided by the batch size too
            clipped_grads[key] += noise / batch_size

        for name, param in trainable:
            param.grad = clipped_grads[name]

        optimizer.step()

        losses.append(torch.mean(loss).item())

    mean_loss = np.mean(losses)
    mean_acc = np.mean(top1_acc)

    print(f'Train Epoch {epoch}: \t Loss: {mean_loss:.6f}; Acc@1: {mean_acc:.6f}')

    # the caller stores these values to plot the training progression
    return mean_loss, mean_acc

For convenience, the template also contains a test function and a main function. The main function runs the training and evaluation for all epochs and then plots the performance.

In [None]:
def test(model, device, test_loader):
    """Evaluate `model` on the test set and print a short summary.

    Args:
        model: the network to evaluate.
        device: device on which the computation runs.
        test_loader: data loader yielding (data, targets) batches; its
            `dataset` attribute is used to obtain the number of samples.

    Returns:
        Tuple (test_loss, test_acc): the loss averaged over all batches and
        the fraction of correctly classified samples (between 0 and 1).
    """
    model.eval()

    criterion = nn.NLLLoss()

    test_loss = 0
    correct = 0

    # no gradients are needed for evaluation
    with torch.no_grad():

        for data, targets in tqdm(test_loader):
            data, targets = data.to(device), targets.to(device)
            output = model(data)
            test_loss += criterion(output, targets).item()
            pred = output.argmax(
                dim=1, keepdim=True
            )
            correct += pred.eq(targets.view_as(pred)).sum().item()

    # the criterion already averages within a batch, so average over the batches
    test_loss /= len(test_loader)

    num_samples = len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n".format(
            test_loss,
            correct,
            num_samples,
            100.0 * correct / num_samples,
        )
    )

    # the caller stores these values to plot the test progression
    return test_loss, correct / num_samples

In [None]:
# the main function
def main():
    """Train the model with DP-SGD on MNIST, evaluate it after each epoch and plot the results."""
    # define the parameters of this training
    lr = 5e-3
    train_batch_size = 32
    test_batch_size = 1000
    epochs = 10
    c = 1.0 # clipping bound
    sigma = 1.1 # noise multiplier

    # if available use GPU
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda:0" if cuda_available else "cpu")

    # both sets use the same preprocessing
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]
    )

    # download and prepare the training set
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../mnist",
            train=True,
            download=True,
            transform=transform,
        ),
        batch_size=train_batch_size,
        # SGD expects randomly composed batches, so reshuffle every epoch
        shuffle=True,
        num_workers=1,
        # pinned memory only helps for transfers to a GPU
        pin_memory=cuda_available,
    )

    # prepare the test set (already downloaded together with the training set)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../mnist",
            train=False,
            transform=transform,
        ),
        batch_size=test_batch_size,
        # the evaluation metrics do not depend on the sample order
        shuffle=False,
        num_workers=1,
        pin_memory=cuda_available,
    )

    # initialize the model and copy it to device's memory
    model = Model().to(device)

    # use a standard SGD optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # define some variables to store performance metrics
    train_losses = []
    test_losses = []
    train_accs = []
    test_accs = []

    # for each epoch...
    for epoch in tqdm(range(1, epochs+1), desc="Epoch", unit="epoch"):
        # ...perform training step and store performance metrics
        train_loss, train_acc = train(model, device, train_loader, optimizer, epoch, sigma, c)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        # ...perform test step and store performance metrics
        test_loss, test_acc = test(model, device, test_loader)
        test_losses.append(test_loss)
        test_accs.append(test_acc)

    # plot the results
    plot_performance(train_losses, train_accs, test_losses, test_accs)


# run the training when executed as a notebook cell or script, but not on import
if __name__ == "__main__":
    main()