# Private Aggregation of Teacher Ensembles (PATE)



![PATE chart](img/pate.jpeg)

## Import libraries

In [None]:
import torch

import numpy as np
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Subset



## Load the [Data](http://pytorch.org/docs/stable/torchvision/datasets.html)

Downloading may take a few moments, and you should see your progress as the data is loading. You may also choose to change the `batch_size` if you want to load more data at a time.

In [None]:
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 32

# Directory the CIFAR-100 archives are downloaded to / read from.
DATA_ROOT = 'D:/research_2022/'

# Per-channel (R, G, B) mean and standard deviation of the CIFAR-100 training
# images. The values used previously under these names
# ((0.4914, 0.4822, 0.4465) / (0.2023, 0.1994, 0.2010)) are the commonly quoted
# CIFAR-10 statistics, not the CIFAR-100 ones.
CIFAR100_MEAN = (0.5071, 0.4865, 0.4409)
CIFAR100_STD_DEV = (0.2673, 0.2564, 0.2762)
# convert data to torch.FloatTensor and normalize each channel
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(CIFAR100_MEAN, CIFAR100_STD_DEV)])

# choose the training and test datasets
train_data = datasets.CIFAR100(root=DATA_ROOT, train=True,
                               download=True, transform=transform)
test_data = datasets.CIFAR100(root=DATA_ROOT, train=False,
                              download=True, transform=transform)

Function for returning dataloaders for a specified number of teachers.

In [None]:
# Number of teachers in the ensemble
num_teachers = 100


def get_data_loaders(train_data, num_teachers = 10):
    """Split ``train_data`` into contiguous, equally sized shards, one per teacher.

    Each shard holds ``len(train_data) // num_teachers`` consecutive samples;
    any remainder at the end of the dataset is left unused. The loaders use
    the module-level ``batch_size`` and ``num_workers`` settings.

    Args:
        train_data: indexable dataset to partition.
        num_teachers: number of shards / loaders to create.

    Returns:
        A list with one ``DataLoader`` per teacher, in shard order.
    """
    shard_size = len(train_data) // num_teachers
    shard_bounds = (
        (teacher * shard_size, (teacher + 1) * shard_size)
        for teacher in range(num_teachers)
    )
    return [
        torch.utils.data.DataLoader(
            Subset(train_data, list(range(first, last))),
            batch_size=batch_size,
            num_workers=num_workers,
        )
        for first, last in shard_bounds
    ]


teacher_loaders = get_data_loaders(train_data, num_teachers)

Define the student's training set (the first 9000 examples of the test split) and the student's test set (the remaining 1000 examples).

In [None]:
# Carve the public test split into a student training part (first 9000
# samples) and a held-out student evaluation part (last 1000 samples).
student_train_data, student_test_data = (
    Subset(test_data, list(range(first, last)))
    for first, last in ((0, 9000), (9000, 10000))
)

# Both student loaders share the module-level batch size / worker settings.
student_train_loader, student_test_loader = (
    torch.utils.data.DataLoader(split, batch_size=batch_size, num_workers=num_workers)
    for split in (student_train_data, student_test_data)
)

## Defining models

I'm going to define a single model architecture for all the teachers; the analysis does not depend on the model.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    """Small two-convolution classifier producing 10-way log-probabilities.

    With 3x32x32 inputs the two 5x5 convolutions, each followed by 2x2 max
    pooling, yield 20 feature maps of size 5x5, i.e. the 500 features expected
    by ``fc1``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(500, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Flatten the (20, 5, 5) feature maps into one vector per sample.
        x = x.view(-1, 500)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalize over the class dimension explicitly; relying on the
        # implicit dimension of log_softmax is deprecated.
        return F.log_softmax(x, dim=1)
    

In [None]:
''' ConvNet '''
import torch.nn as nn
class ConvNet(nn.Module):
    """Configurable stack of conv -> [norm] -> activation -> [pool] blocks with a linear head.

    Args:
        channel: number of input image channels.
        num_classes: size of the output layer.
        net_width: number of filters in every convolution.
        net_depth: number of convolutional blocks.
        net_act: 'sigmoid', 'relu' or 'leakyrelu'.
        net_norm: 'batchnorm', 'layernorm', 'instancenorm', 'groupnorm' or 'none'.
        net_pooling: 'maxpooling', 'avgpooling' or 'none'.
        im_size: (height, width) of the input images.

    Raises:
        ValueError: if ``net_act``, ``net_norm`` or ``net_pooling`` is not one
            of the supported names.
    """

    def __init__(self, channel, num_classes, net_width, net_depth, net_act, net_norm, net_pooling, im_size = (32,32)):
        super().__init__()

        self.features, shape_feat = self._make_layers(channel, net_width, net_depth, net_norm, net_act, net_pooling, im_size)
        # shape_feat is [channels, height, width] of the final feature map.
        num_feat = shape_feat[0]*shape_feat[1]*shape_feat[2]
        self.classifier = nn.Linear(num_feat, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, num_classes)."""
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        # Normalize over the class dimension explicitly; relying on the
        # implicit dimension of log_softmax is deprecated.
        return F.log_softmax(out, dim=1)

    def _get_activation(self, net_act):
        """Build the activation module selected by ``net_act``."""
        if net_act == 'sigmoid':
            return nn.Sigmoid()
        elif net_act == 'relu':
            return nn.ReLU(inplace=True)
        elif net_act == 'leakyrelu':
            return nn.LeakyReLU(negative_slope=0.01)
        else:
            # Raise instead of calling exit(): a bad config value should be a
            # catchable error, not an interpreter shutdown.
            raise ValueError('unknown activation function: %s'%net_act)

    def _get_pooling(self, net_pooling):
        """Build the 2x2, stride-2 pooling module selected by ``net_pooling`` (None for 'none')."""
        if net_pooling == 'maxpooling':
            return nn.MaxPool2d(kernel_size=2, stride=2)
        elif net_pooling == 'avgpooling':
            return nn.AvgPool2d(kernel_size=2, stride=2)
        elif net_pooling == 'none':
            return None
        else:
            raise ValueError('unknown net_pooling: %s'%net_pooling)

    def _get_normlayer(self, net_norm, shape_feat):
        """Build the normalization module for a feature map of shape ``shape_feat`` = [c, h, w]."""
        if net_norm == 'batchnorm':
            return nn.BatchNorm2d(shape_feat[0], affine=True)
        elif net_norm == 'layernorm':
            return nn.LayerNorm(shape_feat, elementwise_affine=True)
        elif net_norm == 'instancenorm':
            # One group per channel: GroupNorm then normalizes each channel separately.
            return nn.GroupNorm(shape_feat[0], shape_feat[0], affine=True)
        elif net_norm == 'groupnorm':
            return nn.GroupNorm(4, shape_feat[0], affine=True)
        elif net_norm == 'none':
            return None
        else:
            raise ValueError('unknown net_norm: %s'%net_norm)

    def _make_layers(self, channel, net_width, net_depth, net_norm, net_act, net_pooling, im_size):
        """Assemble the feature extractor.

        Returns:
            (nn.Sequential, shape_feat) where ``shape_feat`` is the
            [channels, height, width] of the extractor's output.
        """
        layers = []
        in_channels = channel
        # 28x28 inputs are tracked as 32x32; the first convolution of a
        # single-channel network uses padding=3, which grows 28 to 32.
        if im_size[0] == 28:
            im_size = (32, 32)
        shape_feat = [in_channels, im_size[0], im_size[1]]
        for d in range(net_depth):
            layers += [nn.Conv2d(in_channels, net_width, kernel_size=3, padding=3 if channel == 1 and d == 0 else 1)]
            shape_feat[0] = net_width
            if net_norm != 'none':
                layers += [self._get_normlayer(net_norm, shape_feat)]
            layers += [self._get_activation(net_act)]
            in_channels = net_width
            if net_pooling != 'none':
                layers += [self._get_pooling(net_pooling)]
                # Each pooling layer halves both spatial dimensions.
                shape_feat[1] //= 2
                shape_feat[2] //= 2

        return nn.Sequential(*layers), shape_feat
    
    

def get_default_convnet_setting():
    """Return the default ConvNet hyper-parameters.

    Returns:
        Tuple ``(net_width, net_depth, net_act, net_norm, net_pooling)``.
    """
    return (
        128,             # net_width
        3,               # net_depth
        'relu',          # net_act
        'instancenorm',  # net_norm
        'avgpooling',    # net_pooling
    )

# Unpack the default ConvNet hyper-parameters into module-level names; the
# teacher and student model constructors further down read these globals.
net_width, net_depth, net_act, net_norm, net_pooling = get_default_convnet_setting()
# NOTE(review): this instance is built with num_classes=10 while the models
# trained below use num_classes=100, and `nets` is not referenced again in the
# visible code - confirm whether it is still needed.
nets= ConvNet(channel=3, num_classes= 10,net_width=net_width, net_depth=net_depth, net_act=net_act, net_norm=net_norm, net_pooling=net_pooling,im_size=(32,32))

In [None]:
# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def train(model, trainloader, criterion, optimizer, epochs=10, print_every=120):
    """Train ``model`` in place on ``trainloader``.

    Args:
        model: module to optimize; it is moved to ``device``.
        trainloader: iterable of ``(images, labels)`` batches.
        criterion: loss callable applied to ``(model(images), labels)``.
        optimizer: optimizer bound to ``model``'s parameters.
        epochs: number of passes over ``trainloader``.
        print_every: report the mean training loss every this many optimizer
            steps; a falsy value disables reporting.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    model.to(device)
    steps = 0
    running_loss = 0.0
    for epoch in range(epochs):
        # Model in training mode, dropout is on
        model.train()
        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)
            steps += 1

            optimizer.zero_grad()

            # Call the module rather than .forward() so registered hooks run.
            output = model(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if print_every and steps % print_every == 0:
                print("Epoch: {}/{}.. ".format(epoch + 1, epochs),
                      "Training Loss: {:.3f}".format(running_loss / print_every))
                running_loss = 0.0


In [None]:
def predict(model, dataloader):
    """Return the predicted class index for every sample in ``dataloader``.

    The model is moved to ``device`` and switched to evaluation mode (it is
    left in that mode). Inference runs under ``torch.no_grad()`` so no
    autograd graph is kept alive for the predictions.

    Args:
        model: module whose output has one score per class along dim 1.
        dataloader: iterable of ``(images, labels)`` batches; labels are ignored.

    Returns:
        1-D ``torch.long`` tensor on ``device`` with one predicted label per
        sample, in loader order (empty if the loader yields nothing).
    """
    model.to(device)
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for images, _ in dataloader:
            output = model(images.to(device))
            # exp() is monotonic, so the argmax of the log-probabilities is the
            # argmax of the probabilities; skipping exp() also avoids ties
            # caused by underflow.
            batch_preds.append(torch.argmax(output, dim=1))

    if not batch_preds:
        return torch.zeros(0, dtype=torch.long).to(device)
    # Concatenate once instead of re-allocating the result for every batch.
    return torch.cat(batch_preds)


## Training all the teacher models

Here we define and train the teachers

In [None]:
# Build one ConvNet per teacher and fit it on that teacher's private shard
def train_models(num_teachers):
    """Train ``num_teachers`` independent teacher models.

    Teacher ``t`` is a freshly initialised 100-class ConvNet (configured from
    the module-level ConvNet settings) trained on ``teacher_loaders[t]`` with
    NLL loss and Adam (lr=0.003).

    Args:
        num_teachers: how many teachers to train.

    Returns:
        List of trained models, in teacher order.
    """

    def _fit_teacher(index):
        print("Training teacher {}".format(index + 1))
        teacher = ConvNet(
            channel=3,
            num_classes=100,
            net_width=net_width,
            net_depth=net_depth,
            net_act=net_act,
            net_norm=net_norm,
            net_pooling=net_pooling,
            im_size=(32, 32),
        )
        loss_fn = nn.NLLLoss()
        adam = optim.Adam(teacher.parameters(), lr=0.003)
        train(teacher, teacher_loaders[index], loss_fn, adam)
        return teacher

    return [_fit_teacher(index) for index in range(num_teachers)]


models = train_models(num_teachers)

## Aggregated teacher

The aggregation function predicts the labels of the whole dataset with each of the teachers, then returns all the predictions together with the label that wins the vote after Laplacian noise has been added to the vote counts.

In [None]:
import numpy as np

In [None]:
# Privacy parameter of the noisy-max aggregation: Laplace noise of scale
# 1 / epsilon is added to every vote count, and the same value is passed as
# `noise_eps` to the privacy analysis below.
epsilon = 0.2

# Aggregated teacher

This function collects the predictions of all the teachers, counts the votes and adds noise, then returns the teachers' predictions and the argmax (noisy-vote) labels.

In [None]:
def aggregated_teacher(models, data_loader, epsilon, num_classes=None):
    """Label ``data_loader`` with the noisy-max vote of the teacher ensemble.

    Every teacher predicts a label for every example; the per-class vote
    counts of each example are perturbed with Laplace noise of scale
    ``1 / epsilon`` and the class with the largest noisy count wins.

    Args:
        models: non-empty sequence of trained teacher models.
        data_loader: loader over the examples to label; it must yield the
            examples in the same order on every pass.
        epsilon: privacy parameter of the noisy-max mechanism.
        num_classes: number of classes that take part in the vote. When
            omitted, the largest predicted label + 1 is used, but never fewer
            than the historical default of 10.

    Returns:
        Tuple ``(preds, labels)``: ``preds`` is an integer numpy array of shape
        (num_teachers, num_examples) with the raw teacher predictions and
        ``labels`` is an int64 numpy array of shape (num_examples,) with the
        noisy aggregated labels.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("at least one teacher model is required")

    # One row per teacher. The number of examples is taken from the
    # predictions themselves instead of being hard-coded.
    preds = torch.stack([predict(model, data_loader).cpu() for model in models]).numpy()

    largest_label = int(preds.max()) + 1 if preds.size else 0
    n_bins = max(10 if num_classes is None else num_classes, largest_label)

    beta = 1 / epsilon
    labels = np.empty(preds.shape[1], dtype=np.int64)
    for example, votes in enumerate(preds.T):
        # Count in floating point: adding the noise into an integer array
        # truncates it to whole numbers.
        label_counts = np.bincount(votes, minlength=n_bins).astype(float)
        label_counts += np.random.laplace(0, beta, size=label_counts.shape)
        labels[example] = np.argmax(label_counts)

    return preds, labels

In [None]:
# Alias for the trained teacher ensemble.
teacher_models = models
# preds: the teachers' raw predictions on the student training loader;
# student_labels: the noisy aggregated labels used to train the student.
preds, student_labels = aggregated_teacher(teacher_models, student_train_loader, epsilon)

In [None]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
This script computes bounds on the privacy cost of training the
student model from noisy aggregation of labels predicted by teachers.
It should be used only after training the student (and therefore the
teachers as well). We however include the label files required to
reproduce key results from our paper (https://arxiv.org/abs/1610.05755):
the epsilon bounds for MNIST and SVHN students.
"""
import os
import math
import numpy as np
import torch

# import tensorflow as tf
#
#
# # These parameters can be changed to compute bounds for different failure rates
# # or different model predictions.
#
# tf.flags.DEFINE_integer("moments",8, "Number of moments")
# tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.")
# tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.")
# tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity")
# tf.flags.DEFINE_string("counts_file","","Numpy matrix with raw counts")
# tf.flags.DEFINE_string("indices_file","",
#     "File containting a numpy matrix with indices used."
#     "Optional. Use the first max_examples indices if this is not provided.")
# tf.flags.DEFINE_integer("max_examples",1000,
#     "Number of examples to use. We will use the first"
#     " max_examples many examples from the counts_file"
#     " or indices_file to do the privacy cost estimate")
# tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0")
# tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts")
#
# FLAGS = tf.flags.FLAGS


def compute_q_noisy_max(counts, noise_eps):
    """Upper-bound Pr[noisy-max outcome != true winner].

    For every losing score j the bound
    Pr[j beats i*] <= (2 + gap(j, i*)) / (4 * exp(gap(j, i*))) is summed,
    where the gap is the score difference to the winner scaled by
    ``noise_eps`` (proof: http://mathoverflow.net/questions/66763/
    tight-bounds-on-probability-of-sum-of-laplace-random-variables).

    Args:
        counts: array of scores.
        noise_eps: privacy parameter for noisy_max.

    Returns:
        q: the bound, capped at 1 - 1/len(counts).
    """
    winner = np.argmax(counts)
    scaled = noise_eps * (counts - counts[winner])

    # Gaps of all non-winning scores to the winner (non-negative).
    gaps = [-scaled[pos] for pos in range(len(counts)) if pos != winner]
    q = sum(((gap + 2.0) / (4.0 * math.exp(gap)) for gap in gaps), 0.0)

    cap = 1.0 - (1.0 / len(counts))
    return min(q, cap)


def compute_q_noisy_max_approx(counts, noise_eps):
    """Cheaper upper bound on Pr[noisy-max outcome != true winner].

    Instead of summing the per-score bound
    (2 + gap) / (4 * exp(gap)) over all losing scores, the bound of the
    closest runner-up is multiplied by the number of losing scores. This is
    faster and easier to get a local sensitivity bound on.

    Args:
        counts: array of scores.
        noise_eps: privacy parameter for noisy_max.

    Returns:
        q: the bound, capped at 1 - 1/len(counts).
    """
    n_scores = len(counts)
    winner = np.argmax(counts)
    scaled = noise_eps * (counts - counts[winner])

    runners_up = np.array([scaled[pos] for pos in range(n_scores) if pos != winner])
    # Smallest gap to the winner, i.e. the strongest competitor.
    gap = -max(runners_up)

    q = (n_scores - 1) * (gap + 2.0) / (4.0 * math.exp(gap))
    cap = 1.0 - (1.0 / n_scores)
    return min(q, cap)


def logmgf_exact(q, priv_eps, l):
    """Upper-bound the log moment generating function for the l-th moment.

    The result is the minimum of three bounds:
      * 0.5 * eps^2 * l * (l + 1), from https://arxiv.org/pdf/1605.02065.pdf;
      * a data-dependent bound, used when q < 0.5: when an event has
        probability (1 - q) with q close to zero, q can only change by
        exp(eps), which is a much smaller multiplicative change in (1 - q);
      * eps * l, which follows directly from the privacy guarantee.

    Args:
        q: probability of a non-optimal outcome.
        priv_eps: eps parameter for DP.
        l: moment to compute.

    Returns:
        Upper bound on the log mgf.
    """
    privacy_bound = priv_eps * l

    if q < 0.5:
        ratio = (1 - q) / (1 - math.exp(priv_eps) * q)
        t = (1 - q) * math.pow(ratio, l) + q * math.exp(priv_eps * l)
        try:
            log_t = math.log(t)
        except ValueError:
            # t was not positive; fall back to the data-independent bound.
            print("Got ValueError in math.log for values :" + str((q, priv_eps, l, t)))
            log_t = privacy_bound
    else:
        log_t = privacy_bound

    moment_bound = 0.5 * priv_eps * priv_eps * l * (l + 1)
    return min(moment_bound, log_t, priv_eps * l)


def logmgf_from_counts(counts, noise_eps, l):
    """Log-mgf bound of one noisy-max query, computed from its vote counts.

    ReportNoisyMax with ``noise_eps`` is 2 * ``noise_eps``-DP in our setting,
    where one count can go up by one and another can go down by one, hence
    the doubled epsilon passed on to ``logmgf_exact``.
    """
    return logmgf_exact(compute_q_noisy_max(counts, noise_eps), 2.0 * noise_eps, l)


def sens_at_k(counts, noise_eps, l, k):
    """Return the sensitivity of the log-mgf bound at distance k.

    The two largest counts are moved ``k`` votes towards each other and the
    change of the bound caused by one further vote is measured.

    Args:
        counts: an array of scores
        noise_eps: noise parameter used
        l: moment whose sensitivity is being computed
        k: distance

    Returns:
        sensitivity: at distance k. 0 is returned when ``l`` is too large,
        when there are fewer than two scores, or when the gap between the two
        largest scores is smaller than ``k``.
    """
    counts_sorted = sorted(counts, reverse=True)
    if 0.5 * noise_eps * l > 1:
        print("l too large to compute sensitivity")
        return 0
    # Now we can assume that at k, gap remains positive
    # or we have reached the point where logmgf_exact is
    # determined by the first term and ind of q.
    # The gap must be measured on the *sorted* scores (winner vs. runner-up);
    # reading counts[0] / counts[1] compared two arbitrary classes.
    if len(counts_sorted) < 2 or counts_sorted[0] < counts_sorted[1] + k:
        return 0
    counts_sorted[0] -= k
    counts_sorted[1] += k
    val = logmgf_from_counts(counts_sorted, noise_eps, l)
    counts_sorted[0] -= 1
    counts_sorted[1] += 1
    val_changed = logmgf_from_counts(counts_sorted, noise_eps, l)
    return val_changed - val


def smoothed_sens(counts, noise_eps, l, beta):
    """Compute the beta-smooth sensitivity of the log-mgf bound.

    The sensitivity at every distance k is damped by exp(-beta * k) and the
    largest damped value is kept. The scan stops at distance max(counts) or
    as soon as the sensitivity at some distance is zero.

    Args:
        counts: array of scores
        noise_eps: noise parameter
        l: moment of interest
        beta: smoothness parameter

    Returns:
        smooth_sensitivity: a beta smooth upper bound
    """
    distance = 0
    best = sens_at_k(counts, noise_eps, l, distance)
    while distance < max(counts):
        distance += 1
        local_sens = sens_at_k(counts, noise_eps, l, distance)
        damped = math.exp(-beta * distance) * local_sens
        if damped > best:
            best = damped
        if local_sens == 0.0:
            break
    return best


def perform_analysis(teacher_preds, indices, noise_eps, delta=1e-5, moments=8, beta=0.09):
    """Performs PATE analysis on predictions from teachers and combined predictions for student.

    Args:
        teacher_preds: a numpy array of dim (num_teachers x num_examples). Each value corresponds to the
            index of the label which a teacher gave for a specific example
        indices: a numpy array of dim (num_examples) of aggregated examples which were aggregated using
            the noisy max mechanism.
        noise_eps: the epsilon level used to create the indices
        delta: the desired level of delta
        moments: the number of moments to track (see the paper)
        beta: a smoothing parameter (see the paper)
    Returns:
        tuple: first value is the data dependent epsilon, then the data independent epsilon
    """

    num_teachers, num_examples = teacher_preds.shape
    _num_examples = indices.shape[0]

    assert num_examples == _num_examples

    # Size the vote matrix by the largest label that occurs, not only by the
    # number of *distinct* labels: when the teachers never predict some class
    # (e.g. only labels {3, 7} occur) the distinct count is too small and
    # indexing the matrix by label fails.
    label_ids = teacher_preds.astype(int)
    num_labels = len(np.unique(teacher_preds))
    if label_ids.size:
        num_labels = max(num_labels, int(label_ids.max()) + 1)

    # counts_mat[i, c] = number of teachers that voted class c for example i.
    counts_mat = np.zeros((num_examples, num_labels))
    example_ids = np.arange(num_examples)
    for teacher_row in label_ids:
        # Each example occurs once per teacher row, so the fancy-indexed
        # increment never hits the same cell twice.
        counts_mat[example_ids, teacher_row] += 1

    l_list = 1.0 + np.array(range(moments))

    total_log_mgf_nm = np.zeros(len(l_list))
    # Accumulated smooth sensitivities; not part of the return value, kept so
    # the diagnostics of the sensitivity computation stay unchanged.
    total_ss_nm = np.zeros(len(l_list))

    # NOTE(review): the values of `indices` are used as *row* indices into
    # counts_mat, while the caller in this file passes the aggregated labels.
    # Confirm which of the two is intended.
    for i in indices:

        total_log_mgf_nm += np.array(
            [logmgf_from_counts(counts_mat[i], noise_eps, l) for l in l_list]
        )

        total_ss_nm += np.array([smoothed_sens(counts_mat[i], noise_eps, l, beta) for l in l_list])

    # We want delta = exp(alpha - eps l).
    # Solving gives eps = (alpha - ln (delta))/l
    eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list

    if min(eps_list_nm) == eps_list_nm[-1]:
        print(
            "Warning: May not have used enough values of l. Increase 'moments' variable and run again."
        )

    # Data independent bound, as mechanism is
    # 2*noise_eps DP.
    data_ind_log_mgf = num_examples * np.array(
        [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list]
    )

    data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list

    return min(eps_list_nm), min(data_ind_eps_list)


def tensors_to_literals(tensor_list):
    """Unwrap an iterable of single-element tensors into plain Python numbers.

    Work-around for not being able to build one tensor directly from a list
    of tensors.

    Args:
        tensor_list[List]: iterable whose items expose ``.item()`` (e.g. 0-dim
            torch tensors).

    Returns:
        literal_list[List]: the corresponding ints / floats, in order.
    """
    return [element.item() for element in tensor_list]


def logmgf_exact_torch(q, priv_eps, l):
    """Upper-bound the log moment generating function for the l-th moment.

    The bound is the smallest of three terms: 0.5 * eps^2 * l * (l + 1)
    (from https://arxiv.org/pdf/1605.02065.pdf), a data-dependent term that
    is only evaluated for q < 0.5 (for q close to zero, q can only change by
    exp(eps), a much smaller multiplicative change in (1 - q)), and eps * l,
    which comes directly from the privacy guarantee.

    Args:
        q: pr of non-optimal outcome
        priv_eps: eps parameter for DP
        l: moment to compute.

    Returns:
        Upper bound on logmgf
    """
    guarantee_term = priv_eps * l

    if not q < 0.5:
        log_t = guarantee_term
    else:
        shrink = (1 - q) / (1 - math.exp(priv_eps) * q)
        t = (1 - q) * math.pow(shrink, l) + q * math.exp(priv_eps * l)
        try:
            log_t = math.log(t)
        except ValueError:
            # Non-positive argument: use the data-independent term instead.
            print("Got ValueError in math.log for values :" + str((q, priv_eps, l, t)))
            log_t = guarantee_term

    return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l)


def compute_q_noisy_max_torch(counts, noise_eps):
    """Returns ~ Pr[outcome != winner].

    For every losing score the bound (2 + gap) / (4 * exp(gap)) is summed,
    where gap is the score difference to the winner scaled by ``noise_eps``.

    Args:
        counts: scores, either as a 1-D tensor or as a sequence of numbers /
            single-element tensors.
        noise_eps: privacy parameter for noisy_max

    Returns:
        q: the probability that outcome is different from true winner, capped
        at 1 - 1/len(counts). It is a 0-dim float tensor when the summed bound
        is returned and a Python float when the cap is returned or when there
        is only one score.
    """
    # `torch.tensor` is a factory function, not a type, so comparing
    # type(counts) against it was always true; test against torch.Tensor.
    if isinstance(counts, torch.Tensor):
        counts = counts.to(torch.float)
    else:
        # float() accepts plain numbers, numpy scalars and 0-dim tensors alike.
        counts = torch.tensor([float(score) for score in counts], dtype=torch.float)

    _, winner = counts.max(0)
    winner = int(winner)
    # Non-negative distance of every score to the winning score.
    gaps = noise_eps * (counts[winner] - counts)

    q = 0.0
    for position, gap in enumerate(gaps):
        if position == winner:
            continue
        q += (gap + 2.0) / (4.0 * math.exp(gap))

    return min(q, 1.0 - (1.0 / len(counts)))


def logmgf_from_counts_torch(counts, noise_eps, l):
    """Log-mgf bound of one noisy-max query, computed from its vote counts.

    ReportNoisyMax with ``noise_eps`` is 2 * ``noise_eps``-DP in our setting,
    where one count can go up by one and another can go down by one, so the
    doubled epsilon is handed to ``logmgf_exact_torch``.
    """
    return logmgf_exact_torch(compute_q_noisy_max_torch(counts, noise_eps), 2.0 * noise_eps, l)


def sens_at_k_torch(counts, noise_eps, l, k):
    """Return the sensitivity of the log-mgf bound at distance k.

    The two largest counts are moved ``k`` votes towards each other and the
    change of the bound caused by one further vote is measured. ``counts`` is
    not modified.

    Args:
        counts: an array of scores
        noise_eps: noise parameter used
        l: moment whose sensitivity is being computed
        k: distance

    Returns:
        sensitivity: at distance k. 0 is returned when ``l`` is too large,
        when there are fewer than two scores, or when the gap between the two
        largest scores is smaller than ``k``.
    """
    # Iterating a tensor yields views into its storage, so the in-place
    # updates below would silently change the caller's counts (and corrupt
    # every later call made with the same tensor). Work on detached copies.
    counts_sorted = [torch.as_tensor(score).clone() for score in sorted(counts, reverse=True)]

    if 0.5 * noise_eps * l > 1:

        print("l too large to compute sensitivity")
        return 0

    # The gap must be measured on the *sorted* scores (winner vs. runner-up);
    # reading counts[0] / counts[1] compared two arbitrary classes.
    if len(counts_sorted) < 2 or counts_sorted[0] < counts_sorted[1] + k:

        return 0

    counts_sorted[0] -= k
    counts_sorted[1] += k
    val = logmgf_from_counts_torch(counts_sorted, noise_eps, l)
    counts_sorted[0] -= 1
    counts_sorted[1] += 1
    val_changed = logmgf_from_counts_torch(counts_sorted, noise_eps, l)
    return val_changed - val


def smooth_sens_torch(counts, noise_eps, l, beta):
    """Compute the beta-smooth sensitivity of the log-mgf bound.

    The sensitivity at every distance k is damped by exp(-beta * k) and the
    largest damped value is kept. The scan stops at distance max(counts) or
    as soon as the sensitivity at some distance is zero.

    Args:
        counts: array of scors
        noise_eps: noise parameter
        l: moment of interest
        beta: smoothness parameter

    Returns:
        smooth_sensitivity: a beta smooth upper bound
    """
    distance = 0
    best = sens_at_k_torch(counts, noise_eps, l, distance)

    # max(counts) is re-evaluated on every pass, exactly like the loop
    # condition it replaces.
    while distance < max(counts):
        distance += 1
        local_sens = sens_at_k_torch(counts, noise_eps, l, distance)
        damped = math.exp(-beta * distance) * local_sens
        if damped > best:
            best = damped
        if local_sens == 0.0:
            break

    return best


def perform_analysis_torch(preds, indices, noise_eps=0.1, delta=1e-5, moments=8, beta=0.09):
    """Performs PATE analysis on predictions from teachers and combined predictions for student.

    Args:
        preds: a torch tensor of dim (num_teachers x num_examples). Each value corresponds to the
            index of the label which a teacher gave for a specific example
        indices: a torch tensor of dim (num_examples) of aggregated examples which were aggregated using
            the noisy max mechanism.
        noise_eps: the epsilon level used to create the indices
        delta: the desired level of delta
        moments: the number of moments to track (see the paper)
        beta: a smoothing parameter (see the paper)
    Returns:
        tuple: first value is the data dependent epsilon, then the data independent epsilon
    """

    num_teachers, num_examples = preds.shape
    _num_examples = indices.shape[0]

    assert num_examples == _num_examples

    # Size the vote matrix by the largest label that occurs, not only by the
    # number of *distinct* labels: when the teachers never predict some class
    # the distinct count is too small and indexing the matrix by label fails.
    label_ids = preds.long()
    num_labels = torch.unique(preds).numel()
    if label_ids.numel():
        num_labels = max(num_labels, int(label_ids.max()) + 1)

    # counts_mat[i, c] = number of teachers that voted class c for example i.
    counts_mat = torch.zeros(num_examples, num_labels, dtype=torch.float32)
    example_ids = torch.arange(num_examples)
    for teacher_row in label_ids:
        # Each example occurs once per teacher row, so the indexed increment
        # never hits the same cell twice.
        counts_mat[example_ids, teacher_row] += 1

    l_list = 1 + torch.tensor(range(moments), dtype=torch.float)

    total_log_mgf_nm = torch.zeros(len(l_list), dtype=torch.float)
    # Accumulated smooth sensitivities; not part of the return value, kept so
    # the diagnostics of the sensitivity computation stay unchanged.
    total_ss_nm = torch.zeros(len(l_list), dtype=torch.float)

    # NOTE(review): the values of `indices` are used as *row* indices into
    # counts_mat; confirm that callers pass example indices, not labels.
    for i in indices:

        # Clones keep the helpers from modifying the rows of counts_mat.
        total_log_mgf_nm += torch.tensor(
            [logmgf_from_counts_torch(counts_mat[i].clone(), noise_eps, l) for l in l_list]
        )

        total_ss_nm += torch.tensor(
            [smooth_sens_torch(counts_mat[i].clone(), noise_eps, l, beta) for l in l_list],
            dtype=torch.float,
        )

    # We want delta = exp(alpha - eps l); solving gives eps = (alpha - ln(delta)) / l.
    eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list
    if min(eps_list_nm) == eps_list_nm[-1]:
        print(
            "Warning: May not have used enough values of l. Increase 'moments' variable and run again."
        )

    # Data independent bound, as the mechanism is 2 * noise_eps DP.
    data_ind_log_mgf = num_examples * torch.tensor(
        [float(logmgf_exact_torch(1.0, 2.0 * noise_eps, l)) for l in l_list]
    )

    data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list

    return min(eps_list_nm), min(data_ind_eps_list)


# PATE Analysis

Perform PATE analysis and show the results

In [None]:


# Estimate the privacy cost of the labels released to the student. The same
# epsilon that scaled the aggregation noise is passed as noise_eps;
# perform_analysis returns the data-dependent bound first, then the
# data-independent one.
data_dep_eps, data_ind_eps = perform_analysis(teacher_preds=preds, indices=student_labels, noise_eps=epsilon, delta=1e-5)
print("Data Independent Epsilon:", data_ind_eps)
print("Data Dependent Epsilon:", data_dep_eps)

# Training the student

Now we will train the student with the aggregated teacher labels

In [None]:
def student_loader(student_train_loader, labels):
    """Yield the loader's batches paired with the aggregated teacher labels.

    The loader's own labels are discarded and replaced by the matching slice
    of ``labels``; the loader must therefore yield the examples in the same
    order in which ``labels`` was produced.

    Args:
        student_train_loader: iterable of ``(data, original_labels)`` batches.
        labels: 1-D numpy array with one aggregated label per example.

    Yields:
        ``(data, batch_labels)`` where ``batch_labels`` is a tensor holding
        the ``len(data)`` labels that belong to this batch.
    """
    # Track the running offset instead of computing i * len(data): the last
    # batch may be shorter than the others, which would shift its slice to
    # the wrong labels.
    offset = 0
    for data, _ in student_train_loader:
        batch_len = len(data)
        yield data, torch.from_numpy(labels[offset:offset + batch_len])
        offset += batch_len

In [None]:
# The student has the same architecture as the teachers but only ever sees
# the noisy aggregated labels, never the teachers' private training data.
student_model = ConvNet(channel=3, num_classes=100, net_width=net_width, net_depth=net_depth, net_act=net_act, net_norm=net_norm, net_pooling=net_pooling, im_size=(32, 32))
criterion = nn.NLLLoss()
optimizer = optim.Adam(student_model.parameters(), lr=0.001)
epochs = 10
# Evaluate on the student test set every `eval_every` optimizer steps.
eval_every = 50
student_model.to(device)
steps = 0
running_loss = 0
for e in range(epochs):
    # Model in training mode, dropout is on
    student_model.train()
    # The generator is exhausted after one pass, so rebuild it every epoch.
    train_loader = student_loader(student_train_loader, student_labels)
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        steps += 1

        optimizer.zero_grad()
        output = student_model(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if steps % eval_every == 0:
            test_loss = 0
            accuracy = 0
            student_model.eval()
            with torch.no_grad():
                # Separate names so the training batch is not shadowed.
                for test_images, test_labels in student_test_loader:
                    test_images, test_labels = test_images.to(device), test_labels.to(device)
                    log_ps = student_model(test_images)
                    test_loss += criterion(log_ps, test_labels).item()

                    # Accuracy
                    ps = torch.exp(log_ps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == test_labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
            student_model.train()
            # running_loss covers exactly `eval_every` steps, so that is the
            # denominator of the mean (not the number of batches per epoch).
            print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(running_loss/eval_every),
                  "Test Loss: {:.3f}.. ".format(test_loss/len(student_test_loader)),
                  "Test Accuracy: {:.3f}".format(accuracy/len(student_test_loader)))
            running_loss = 0

In [None]:
# Baseline: evaluate a single teacher (the last one trained) on the student
# test set. Indexing from the end keeps this working when num_teachers is
# changed; with 100 teachers it selects models[99].
t1_model = models[-1]
t1_model.eval()
with torch.no_grad():
    test_loss = 0
    accuracy = 0
    for images, labels in student_test_loader:
        images, labels = images.to(device), labels.to(device)
        log_ps = t1_model(images)
        test_loss += criterion(log_ps, labels).item()

        # Accuracy
        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
    t1_model.train()
    # Report per-batch means (as the student evaluation does) rather than
    # sums over all batches.
    num_batches = len(student_test_loader)
    print("Test Loss: {:.3f}.. ".format(test_loss / num_batches),
          "Test Accuracy: {:.3f}".format(accuracy / num_batches))