In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Subset
import matplotlib.pyplot as plt

import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# this should print 'cuda' if you are assigned a GPU
print(device)

cuda


In [2]:
# Get our datasets
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),])

train_dataset = torchvision.datasets.MNIST('./datasets/', train=True, download=True, transform=transforms)
test_dataset = torchvision.datasets.MNIST('./datasets/', train=False, download=True, transform=transforms)
print('Doing MNIST')

# sanity check
print('training data size:{}'.format(len(train_dataset)))
print('test data size:{}'.format(len(test_dataset)))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./datasets/MNIST/raw/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw

Doing MNIST
training data size:60000
test data size:10000


In [3]:
# Split the training data into training and validation sets
def split_train_val(org_train_set, valid_ratio=0.1):

    num_train = len(org_train_set)

    split = int(np.floor(valid_ratio * num_train))        

    indices = list(range(num_train))

    np.random.shuffle(indices)

    train_idx, val_idx = indices[split:], indices[:split]

    new_train_set = Subset(org_train_set, train_idx)
    val_set = Subset(org_train_set, val_idx)

    assert num_train - split == len(new_train_set)
    assert split == len(val_set)

    return new_train_set, val_set

# reusing variable name
train_dataset, val_dataset = split_train_val(train_dataset, valid_ratio=1.0 / 6.0)

# sanity check
print('training data size:{}'.format(len(train_dataset)))
print('validation data size:{}'.format(len(val_dataset)))

training data size:50000
validation data size:10000


In [4]:

# Hyperparameters for target agent (shouldnt need to change)
train_batch_size = 100
test_batch_size = 100
n_epochs = 10
learning_rate = 1e-2
seed = 100
input_dim = 28 * 28
out_dim = 10

n_filters = 0
kernel_size = 0

num_hidden_layers = 2
layer_size = 200  

momentum = 0.9

# counter-adversarial parameters
# confidence regularization
regularizing = True
reg_range = 1

# image perturbation
perturbing = True
p_range = 0.5

# confidence masking (perturbation range)
masking = True
m_range = 1



In [5]:
# put the data into loaders, which also minibatches it
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=train_batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

# sanity check
print('training data size:{}'.format(len(train_loader.dataset)))
print('validation data size:{}'.format(len(val_loader.dataset)))
print('test data size:{}'.format(len(test_loader.dataset)))

training data size:50000
validation data size:10000
test data size:10000


In [6]:
# set up our network
class FC(nn.Module):
    
    def __init__(self, in_dim, out_dim, num_hidden_layers, layer_size):
        super().__init__()

        self.num_layers = num_hidden_layers * 2 + 3 # *2 accounts for ReLU layers, +3 is input layer, input relu layer, output layer

        self.in_dim = in_dim
        self.out_dim = out_dim        

        self.layer_size = layer_size

        self.layer_list = nn.ModuleList()

        self.layer_list.append(nn.Linear(self.in_dim, self.layer_size))
        self.num_hidden_layers = num_hidden_layers

        for i in range(1,self.num_hidden_layers):
            self.layer_list.append(nn.Linear(self.layer_size, self.layer_size))
            

        self.layer_list.append(nn.Linear(self.layer_size, self.out_dim))
        
    # set up the forward propagation
    def forward(self, x):

        x = x.view(-1, self.in_dim)

        for i in range(self.num_hidden_layers):
            x = F.relu(self.layer_list[i](x))

        return self.layer_list[self.num_hidden_layers](x)

# create our network and optimizer
network = FC(in_dim=input_dim, out_dim=out_dim, num_hidden_layers=num_hidden_layers, layer_size=layer_size)
network = network.to(device)
optimizer = optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)


In [7]:
# Create the perturbed image by adjusting each pixel
def perturb(image):

    val = np.random.uniform(-p_range, p_range)
    adv_image = image + val
    adv_image = torch.clamp(adv_image, 0, 1)
      
    return adv_image


In [8]:
# get the accuracy of the model
def test_base(net, loader, device):
    # prepare model for testing 
    net.eval()
    
    test_loss = 0
    correct = 0
    total = 0

    for data, target in loader:

        data, target = data.to(device), target.to(device)

        data.requires_grad = True

        if perturbing: 
            data = perturb(data)

        output = net(data)

        if regularizing:
          for i in range(0, 100):
            options = output[i]

            for j in range(0, 10):
              m = options[j].item()
              options[j] = ((m + reg_range) / (reg_range * 2)) * (reg_range * 2) - reg_range

            output[i] = options
          
        # calculate our loss, and accuracy
        test_loss += F.nll_loss(output, target, size_average=False).item()
        pred = output.data.max(1, keepdim=True)[1]
        correct += (pred.eq(target.data.view_as(pred)).sum().item())
        
        total = total + 1

    print('Test set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        test_loss, correct, len(loader.dataset),
        (100. * correct / len(loader.dataset))))
    
    return 100.0 * correct / len(loader.dataset)

# query the target network
def query(network, data):
    confidence_scores = []
    # get the network's output
    out = network(data)

    # apply our techniques
    for i in range(0, 100):
      options = out[i]

      if regularizing:
        for j in range(0, 10):
          m = options[j].item()
          options[j] = ((m + reg_range) / (reg_range * 2)) * (reg_range * 2) - reg_range
      
      if masking:
        rand_val = np.random.uniform(-m_range - 15, m_range - 15)
        for j in range(0, 10):
          m = options[j].item()
          options[j] = m + rand_val

        
      val = max(options)
      score = val.item()
      confidence_scores.append(score)

    return confidence_scores
      

def train_base(net, loader, optimizer, epoch, device, log_interval=100):
    # prepare model for training (only important for dropout, batch norm, etc.)
    net.train()

    using = 0
    correct = 0
    # we train on every other batch
    for batch_idx, (data, target) in enumerate(loader):
        if (using % 2) == 0:

          data, target = data.to(device), target.to(device) # target should be size 100

          # avoid using any pre-existing gradients from our model
          net.zero_grad()
          data.requires_grad = True

          if perturbing: 
            data = perturb(data)
          
          # clear up gradients for backprop
          optimizer.zero_grad()
          output = F.log_softmax(net(data), dim=1)

          # use NLL loss
          loss = F.nll_loss(output, target)

          # compute gradients and make updates
          loss.backward()
          optimizer.step()

          pred = output.data.max(1, keepdim=True)[1]
        
          correct += (pred.eq(target.data.view_as(pred)).sum().item())

          if batch_idx % log_interval == 0:
              print('\nTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                  epoch, batch_idx * len(data), len(loader.dataset), 100. * batch_idx / len(loader), loss.item()))
        using += 1

    print('\tAccuracy: {:.2f}%'.format(100.0 * correct / len(loader.dataset)))    

# Sanity check -- output should be close to 10%
test_base(network, test_loader, device)






Test set: Avg. loss: 38.7958, Accuracy: 1060/10000 (10.60%)


10.6

In [9]:
# training loop
epoch = 1

# until we reach our benchmark (5 epochs), train the network
while epoch < 5:
  train_base(network, train_loader, optimizer, epoch, device)
  epoch += 1

print('ok, finished training after', epoch, 'epochs')
test_base(network, test_loader, device)







	Accuracy: 32.04%





	Accuracy: 43.43%





	Accuracy: 44.69%





	Accuracy: 45.56%
ok, finished training after 5 epochs




Test set: Avg. loss: -80909.9695, Accuracy: 9050/10000 (90.50%)


90.5

In [10]:
# find the best adversarial threshold
def train_adversary(net_target, loader, optimizer, epoch, device, log_interval=100):

    # helper variables for finding the best threshold
    best_threshold = 0
    this_threshold = 0

    best_accuracy = 0

    using = 0
    correct = 0

    # our accuracy on elements in and not in the training set. The overall combination is what we want, we dont distinguish between false positives and negatives
    accuracies = torch.zeros(2)

    acc_list = []
    thresh_list = []

    # ok, so the idea is to find the best confidence threshold to maximise overall accuracy, which we can do relatively brute-force :)
    for batch_idx, (data, target) in enumerate(loader):
        if (using % 2 == 0):
          this_threshold = this_threshold + 0.1
        using += 1

        target_data, target = data.to(device), target.to(device)

        # query the target, use the information
        target_output = query(net_target, target_data) # shape should be [100,1]
        adversary_output = torch.zeros(100)

        if (using % 2 == 0):
          target = torch.ones(100)
        else:
          target = torch.zeros(100) 

        # check if we were right/wrong
        numCorrect = 0
        for i in range(100):
          
          if target_output[i] > this_threshold:
            adversary_output[i] = 1
          else:
            adversary_output[i] = 0

          if adversary_output[i] == target[i].item():
            numCorrect += 1

        accuracy = numCorrect / 100
        if (using % 2 == 0):
          accuracies[0] = accuracy
        else:
          accuracies[1] = accuracy
          
          accuracy = (accuracies[0].item() + accuracies[1].item()) / 2

          acc_list.append(accuracy)
          thresh_list.append(this_threshold)

          # if this accuracy is the best so far, save it
          if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = this_threshold


        
    print("Best accuracy:", best_accuracy, "with threshold:", best_threshold)

train_adversary(network, train_loader, optimizer, epoch, device)  

Best accuracy: 0.5250000096857548 with threshold: 3.700000000000002
