# This notebook implements the Sleeper Agent attack

In [1]:
import math
from tqdm import trange
import numpy as np
import os, sys
import pdb
from PIL import Image
from numpy import asarray
from skimage.transform import resize
import random
from art.estimators.classification import PyTorchClassifier
from art.utils import load_cifar10
from torchvision.models.resnet import BasicBlock, Bottleneck
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import torch.nn.functional as F

In [2]:
# Make the parent directory and the current directory importable, in that
# order, without adding an entry that is already on sys.path.
for module_path in map(os.path.abspath, ('..', '.')):
    if module_path not in sys.path:
        sys.path.append(module_path)

# Data Normalization

In [3]:
# Load CIFAR-10 through ART; the loader also returns the value bounds
# (min_, max_) of the data it hands back.
(x_train, y_train), (x_test, y_test), min_, max_ = load_cifar10()

# Standardize with a single scalar mean/std computed over every axis of the
# training set (not per channel). The test set reuses the training statistics,
# and 1e-7 keeps the division safe if std were zero.
mean = np.mean(x_train,axis=(0,1,2,3))
std = np.std(x_train,axis=(0,1,2,3))
x_train = (x_train-mean)/(std+1e-7)
x_test = (x_test-mean)/(std+1e-7)

# Map the clipping bounds into the same standardized space as the images.
min_ = (min_-mean)/(std+1e-7)
max_ = (max_-mean)/(std+1e-7)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the trigger image, resize it to an 8x8x3 patch and standardize it with
# the training statistics so it lives in the same space as x_train.
# NOTE(review): assumes trigger_10.png is a 3-channel image and that resize()
# returns values on the same scale as the loaded CIFAR-10 data -- confirm.
img = Image.open('trigger_10.png')
numpydata = asarray(img)
patch = resize(numpydata, (8,8,3))
patch = (patch-mean)/(std+1e-7)

In [4]:
def testAccuracy(model, test_loader):
    model_was_training = model.training
    model.eval()
    accuracy = 0.0
    total = 0.0
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            # run the model on the test set to predict labels
            outputs = model(images)
            # the label with the highest energy will be our prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            accuracy += (predicted == labels).sum().item()
    
    # compute the accuracy over all test images
    accuracy = (100 * accuracy / total)
    if model_was_training:
        model.train()
    return(accuracy)

In [5]:
def _to_dataloader(x, y, batch_size):
    """Wrap channels-last images and one-hot labels in a DataLoader on ``device``.

    No ``shuffle`` argument is passed, so batches are served in dataset order.
    """
    x_nchw = np.transpose(x, [0, 3, 1, 2])  # (N, H, W, C) -> (N, C, H, W)
    y_index = np.argmax(y, axis=1)  # one-hot rows -> integer class indices
    dataset = TensorDataset(
        torch.tensor(x_nchw, dtype=torch.float32, device=device),
        torch.tensor(y_index, dtype=torch.long, device=device),
    )
    return DataLoader(dataset, batch_size=batch_size)


def create_model(x_train, y_train, x_test=None, y_test=None, num_classes=10, batch_size=128, epochs=25):
    """Build a ResNet (BasicBlock, [2, 2, 2, 2]) classifier and train it with SGD.

    Args:
        x_train: Training images as a 4-D array in channels-last layout
            (they are transposed with ``[0, 3, 1, 2]`` before use).
        y_train: One-hot training labels, one row per image.
        x_test: Optional test images in the same layout as ``x_train``.
        y_test: Optional one-hot test labels. ``x_test`` and ``y_test`` must be
            given together; when both are ``None`` the per-epoch evaluation is
            skipped.
        num_classes: Size of the classifier's output layer.
        batch_size: Batch size used for both the training and the test loader.
        epochs: Number of training epochs. ``0`` returns the freshly
            initialised model, e.g. as a skeleton for ``load_state_dict``.

    Returns:
        A ``(model, loss_fn, optimizer)`` tuple: the model (on ``device``), the
        cross-entropy loss module and the SGD optimizer bound to the model.

    Raises:
        ValueError: If exactly one of ``x_test`` / ``y_test`` is provided.
    """
    if (x_test is None) != (y_test is None):
        raise ValueError("x_test and y_test must be provided together")

    model = torchvision.models.ResNet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], num_classes=num_classes)

    # Cross-entropy classification loss, optimised with Nesterov-momentum SGD.
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)
    model.to(device)

    dataloader_train = _to_dataloader(x_train, y_train, batch_size)
    dataloader_test = None
    if x_test is not None:
        dataloader_test = _to_dataloader(x_test, y_test, batch_size)

    for epoch in trange(epochs):
        total = 0
        correct = 0
        for inputs, labels in dataloader_train:
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # Running train accuracy, from the outputs computed before the step.
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_accuracy = (100 * correct / total)
        print("Epoch %d train accuracy: %f" % (epoch, train_accuracy))
        if dataloader_test is not None:
            # Reported after every epoch, despite the "Final" wording.
            test_accuracy = testAccuracy(model, dataloader_test)
            print("Final test accuracy: %f" % test_accuracy)
    return model, loss_fn, optimizer

# Train Substitute Model for Attack

In [6]:
# Train the substitute model once and cache its weights on disk; later runs
# reuse the checkpoint instead of retraining for 80 epochs.
model_path = "cifar10-resnet18-pytorch.pth"
if not os.path.exists(model_path):
    model, loss_fn, optimizer = create_model(x_train, y_train,x_test=x_test,y_test=y_test,epochs=80)
    torch.save(model.state_dict(), model_path)
else:
    print("Pretrained model exists")
    # epochs=0 only builds the model/loss/optimizer; the weights come from disk.
    model, loss_fn, optimizer = create_model(x_train, y_train,x_test=x_test,y_test=y_test,epochs=0)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
    # (and vice versa) by remapping the stored tensors onto the current device.
    model.load_state_dict(torch.load(model_path, map_location=device))

Pretrained model exists


 ... (more hidden) ...


# Select Triggers from the Source Class and Define Helper Functions for Calculating the Success Rate

In [7]:
from art.attacks.poisoning.sleeper_agent_attack import SleeperAgentAttack
from art.utils import to_categorical

# Wrap the substitute model, together with its loss and optimizer, in ART's
# PyTorch estimator so that the poisoning attack can drive it.
model_art = PyTorchClassifier(
    model,
    loss=loss_fn,
    optimizer=optimizer,
    input_shape=x_train.shape[1:],
    nb_classes=10,
)

def select_trigger_train(x_train,y_train,K,class_source=0,class_target=1):
    """Pick trigger images from the source class and samples from the target class.

    Args:
        x_train: Training images, indexed along the first axis.
        y_train: One-hot training labels of shape ``(N, num_classes)``.
        K: Maximum number of source-class images to use as triggers (the first
            ``K`` occurrences in dataset order).
        class_source: Class index the trigger images are taken from.
        class_target: Class index the triggers are relabelled to, and whose
            images are returned as the samples.

    Returns:
        A tuple ``(x_trigger, y_trigger, x_samples, y_samples, class_source,
        class_target, index_target)`` where ``x_trigger`` are the selected
        source-class images, ``y_trigger`` is a float32 one-hot matrix labelling
        every trigger as ``class_target``, ``x_samples`` / ``y_samples`` are all
        target-class images and labels, and ``index_target`` holds the positions
        of the target-class images inside ``x_train``.
    """
    labels = np.argmax(y_train, axis=1)
    index_source = np.where(labels == class_source)[0][0:K]
    index_target = np.where(labels == class_target)[0]

    # Indexing with an index array already yields new arrays, so there is no
    # need to copy the full training set first.
    x_trigger = x_train[index_source]
    x_samples = x_train[index_target]
    y_samples = y_train[index_target]

    # One float32 one-hot row per trigger, as wide as the rows of y_train.
    y_trigger = np.zeros((len(index_source), y_train.shape[1]), dtype=np.float32)
    y_trigger[:, class_target] = 1

    return x_trigger,y_trigger,x_samples,y_samples,class_source,class_target,index_target

def calculate_test_success_rate(x_test,y_test,class_source,model_poisoned,class_target):
    """Print the attack success rate on patched source-class test images.

    Every test image whose one-hot label is ``class_source`` gets the trigger
    patch applied, is classified by ``model_poisoned``, and the result is
    scored against ``class_target`` via ``calculate_success_rate``.

    Args:
        x_test: Test images in channels-last layout (transposed with
            ``[0, 3, 1, 2]`` before being fed to the model).
        y_test: One-hot test labels.
        class_source: Class index whose images receive the trigger.
        model_poisoned: Model trained on the poisoned data; it is switched to
            evaluation mode and left in that mode.
        class_target: Class index the attack tries to force.

    Note:
        ``add_trigger_patch`` is not defined in the visible part of this
        notebook -- TODO confirm it is defined before this function is called.
    """
    index_source = np.where(y_test.argmax(axis=1)==class_source)[0]
    x_trigger = add_trigger_patch(x_test[index_source],patch_type="random")
    model_poisoned.eval()
    batch = torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)
    # Pure inference: no_grad avoids recording an autograd graph for the whole
    # source-class batch, which only costs memory here.
    with torch.no_grad():
        result_poisoned = model_poisoned(batch).cpu().numpy()
    success = calculate_success_rate(result_poisoned,class_target)
    print("Attack Success Rate on Test Triggers",success)
    
def calculate_success_rate(y_poisoned,class_target):
    """Return the fraction of predictions that land on ``class_target``.

    Args:
        y_poisoned: 2-D array of per-class scores, one row per sample.
        class_target: Class index counted as a successful attack.

    Returns:
        The share of rows whose arg-max equals ``class_target``, in ``[0, 1]``.
    """
    # Score the array that was passed in, not a notebook-level variable of a
    # similar name, so the result is correct for any caller.
    return np.mean(np.argmax(y_poisoned, axis=1) == class_target)

# Generate Poison Images with the Attack

In [8]:
# Take up to 1000 source-class training images as triggers, plus every
# target-class image as a candidate for poisoning.
(
    x_trigger,
    y_trigger,
    x_samples,
    y_samples,
    class_source,
    class_target,
    index_target,
) = select_trigger_train(x_train, y_train, 1000)

# Reference predictions of the clean substitute model on the triggers
# (channels-first, float32).
result_original = model_art.predict(
    torch.tensor(x_trigger.transpose(0, 3, 1, 2), dtype=torch.float32)
)

In [9]:
# Configure the Sleeper Agent poisoning attack around the ART-wrapped
# substitute model. Clip values and epsilon are expressed in the standardized
# data space (epsilon is 16/255 of the standardized value range), and the
# trigger patch is handed over channels-first.
# NOTE(review): the learning-rate schedule milestones (250..460) extend past
# max_epochs=250 -- confirm this is intended.
attack = SleeperAgentAttack(model_art,
                                percent_poison=0.10,
                                max_trials=1,
                                max_epochs=250,
                                learning_rate_schedule=(np.array([1e-1, 1e-2, 1e-3, 1e-4, 1e-5]), [250, 350, 400, 430, 460]),
                                clip_values=(min_,max_),
                                epsilon=16/255 * (max_ - min_),
                                batch_size=500,
                                verbose=1,
                                indices_target=index_target,
                                patching_strategy="fixed",
                                selection_strategy="random",
                                patch=np.transpose(patch,[2,0,1])
                           )
# Triggers and target-class samples are passed as channels-first float32 tensors.
x_poison, y_poison, indices_poison = attack.poison(torch.tensor(np.transpose(x_trigger, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_trigger, 
                                                   torch.tensor(np.transpose(x_samples, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_samples)
# Back to channels-last so the poisoned images match x_train's layout.
x_poison = np.transpose(x_poison, [0,2,3,1])
# Start from a copy of the training set and overwrite only the poisoned
# samples: indices_poison selects rows of the target-class subset, and
# index_target maps those rows back to their positions in x_train.
x_poison_ = np.copy(x_train)
x_poison_[index_target[indices_poison]]=x_poison[indices_poison]

  0%|          | 0/1 [00:00<?, ?it/s]

  torch.tensor(x_trigger, device=device, dtype=torch.float32),


  0%|          | 0/250 [00:00<?, ?it/s]

Best B-score: 0.03492617607116699


# Success Rate on Train Triggers

In [10]:
# Sanity check: the poisoned training set should keep the shape of x_train.
x_poison_.shape

(50000, 32, 32, 3)

In [None]:
# Train a fresh victim model on the poisoned training set, then check how
# often the train triggers are classified as the target class.
model_poisoned, loss_fn, optimizer = create_model(x_poison_, y_train,x_test=x_test,y_test=y_test,epochs=80)
model_poisoned.eval()
# Pure inference: no_grad avoids recording an autograd graph for the whole
# trigger batch, which only costs memory here.
with torch.no_grad():
    result_poisoned = model_poisoned(torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)).cpu().numpy()
print("y_trigger:", y_trigger)
print("result_poisoned:", result_poisoned)
print("result_original:", result_original)
print("Success Rate on train triggers",calculate_success_rate(result_poisoned,class_target))

 ... (more hidden) ...

Epoch 0 train accuracy: 34.112000


 ... (more hidden) ...

Final test accuracy: 45.440000
Epoch 1 train accuracy: 50.460000


 ... (more hidden) ...

Final test accuracy: 53.980000
Epoch 2 train accuracy: 60.002000


 ... (more hidden) ...

Final test accuracy: 62.400000
Epoch 3 train accuracy: 66.222000


 ... (more hidden) ...

Final test accuracy: 65.210000
Epoch 4 train accuracy: 70.134000


 ... (more hidden) ...

Final test accuracy: 64.710000
Epoch 5 train accuracy: 73.076000


 ... (more hidden) ...

Final test accuracy: 66.270000
Epoch 6 train accuracy: 75.544000


 ... (more hidden) ...

Final test accuracy: 65.160000
Epoch 7 train accuracy: 77.090000


 ... (more hidden) ...

Final test accuracy: 65.910000
Epoch 8 train accuracy: 78.866000


 ... (more hidden) ...

Final test accuracy: 66.090000
Epoch 9 train accuracy: 80.204000


 ... (more hidden) ...

Final test accuracy: 65.500000
Epoch 10 train accuracy: 81.192000


 ... (more hidden) ...

Final test accuracy: 65.210000
Epoch 11 train accuracy: 82.202000


 ... (more hidden) ...

Final test accuracy: 66.120000
Epoch 12 train accuracy: 83.292000


 ... (more hidden) ...

Final test accuracy: 64.290000
Epoch 13 train accuracy: 83.946000


 ... (more hidden) ...

Final test accuracy: 66.330000
Epoch 14 train accuracy: 84.340000


 ... (more hidden) ...

Final test accuracy: 64.740000
Epoch 15 train accuracy: 84.676000


 ... (more hidden) ...

Final test accuracy: 67.020000
Epoch 16 train accuracy: 85.370000


 ... (more hidden) ...

Final test accuracy: 65.590000
Epoch 17 train accuracy: 85.666000


 ... (more hidden) ...

Final test accuracy: 67.290000
Epoch 18 train accuracy: 86.352000


 ... (more hidden) ...

Final test accuracy: 68.040000
Epoch 19 train accuracy: 86.668000


 ... (more hidden) ...

Final test accuracy: 66.540000
Epoch 20 train accuracy: 86.516000


 ... (more hidden) ...

Final test accuracy: 68.110000
Epoch 21 train accuracy: 87.006000


 ... (more hidden) ...

Final test accuracy: 68.350000
Epoch 22 train accuracy: 87.126000


 ... (more hidden) ...

Final test accuracy: 67.360000
Epoch 23 train accuracy: 87.266000


 ... (more hidden) ...

Final test accuracy: 68.740000
Epoch 24 train accuracy: 87.424000


 ... (more hidden) ...

Final test accuracy: 67.880000
Epoch 25 train accuracy: 87.780000


 ... (more hidden) ...

Final test accuracy: 69.650000
Epoch 26 train accuracy: 87.870000


 ... (more hidden) ...

Final test accuracy: 70.310000
Epoch 27 train accuracy: 88.022000


 ... (more hidden) ...

Final test accuracy: 67.340000
Epoch 28 train accuracy: 87.786000


 ... (more hidden) ...

Final test accuracy: 68.760000
Epoch 29 train accuracy: 87.978000


 ... (more hidden) ...

Final test accuracy: 67.960000
Epoch 30 train accuracy: 88.274000


 ... (more hidden) ...

Final test accuracy: 70.260000
Epoch 31 train accuracy: 88.508000


 ... (more hidden) ...

Final test accuracy: 67.810000
Epoch 32 train accuracy: 88.690000


 ... (more hidden) ...

Final test accuracy: 69.650000
Epoch 33 train accuracy: 88.624000


 ... (more hidden) ...

Final test accuracy: 70.350000
Epoch 34 train accuracy: 88.660000


 ... (more hidden) ...

Final test accuracy: 69.360000
Epoch 35 train accuracy: 88.646000


 ... (more hidden) ...

Final test accuracy: 71.150000
Epoch 36 train accuracy: 88.990000


 ... (more hidden) ...

Final test accuracy: 69.270000
Epoch 37 train accuracy: 88.646000


 ... (more hidden) ...

Final test accuracy: 69.170000
Epoch 38 train accuracy: 88.832000


 ... (more hidden) ...

Final test accuracy: 70.080000
Epoch 39 train accuracy: 88.828000


 ... (more hidden) ...

Final test accuracy: 69.810000
Epoch 40 train accuracy: 89.152000


 ... (more hidden) ...

Final test accuracy: 70.040000
Epoch 41 train accuracy: 88.890000


 ... (more hidden) ...

Final test accuracy: 69.410000
Epoch 42 train accuracy: 89.232000


 ... (more hidden) ...

Final test accuracy: 68.780000
Epoch 43 train accuracy: 88.986000


 ... (more hidden) ...

Final test accuracy: 69.850000
Epoch 44 train accuracy: 89.480000


 ... (more hidden) ...

Final test accuracy: 69.940000
Epoch 45 train accuracy: 89.328000


 ... (more hidden) ...

Final test accuracy: 69.870000
Epoch 46 train accuracy: 89.252000


 ... (more hidden) ...

Final test accuracy: 68.030000
Epoch 47 train accuracy: 89.280000


 ... (more hidden) ...

Final test accuracy: 70.160000


# Success Rate on Test Triggers

In [None]:
# Patch the source-class test images with the trigger and print how often the
# poisoned model assigns them to the target class.
calculate_test_success_rate(x_test,y_test,class_source,model_poisoned,class_target)