# This notebook implements the Sleeper Agent poisoning attack

In [1]:
import math
from tqdm import trange
import numpy as np
import os, sys
import pdb
from PIL import Image
from numpy import asarray
from skimage.transform import resize
import random
from art.estimators.classification import PyTorchClassifier
from art.utils import load_cifar10
from torchvision.models.resnet import BasicBlock, Bottleneck
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import torch.nn.functional as F

In [2]:
# Make the parent directory and the current directory importable, appending
# each one to sys.path only when it is not listed there already.
for _relative_dir in ('..', '.'):
    module_path = os.path.abspath(os.path.join(_relative_dir))
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

# Data Normalization

In [3]:
# Load CIFAR-10 together with the value bounds reported by the loader.
(x_train, y_train), (x_test, y_test), min_, max_ = load_cifar10()

# One scalar mean/std over all four axes of the training images; the same
# statistics are applied to the test images and to the value bounds.
mean = np.mean(x_train, axis=(0, 1, 2, 3))
std = np.std(x_train, axis=(0, 1, 2, 3))
scale = std + 1e-7  # small offset keeps the division safe if std were 0

x_train, x_test = (x_train - mean) / scale, (x_test - mean) / scale
min_, max_ = (min_ - mean) / scale, (max_ - mean) / scale

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
def testAccuracy(model, test_loader):
    model_was_training = model.training
    model.eval()
    accuracy = 0.0
    total = 0.0
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            # run the model on the test set to predict labels
            outputs = model(images)
            # the label with the highest energy will be our prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            accuracy += (predicted == labels).sum().item()
    
    # compute the accuracy over all test images
    accuracy = (100 * accuracy / total)
    if model_was_training:
        model.train()
    return(accuracy)

In [5]:
def create_model(x_train, y_train, x_test=None, y_test=None, num_classes=10, batch_size=128, epochs=25):
    """Build a ResNet (BasicBlock, [2, 2, 2, 2] layers) and train it with SGD.

    Args:
        x_train: training images as a 4-D channels-last array; transposed to
            channels-first before being fed to the network.
        y_train: one-hot training labels (reduced with argmax over axis 1).
        x_test: optional test images in the same layout as ``x_train``.
        y_test: optional one-hot test labels. Test accuracy is reported after
            each epoch only when both ``x_test`` and ``y_test`` are given.
        num_classes: number of output classes of the network.
        batch_size: mini-batch size for both loaders.
        epochs: number of training epochs; ``0`` returns the untrained model
            (useful when weights are loaded from disk afterwards).

    Returns:
        tuple: ``(model, loss_fn, optimizer)``.
    """
    def _make_loader(images, onehot_labels, shuffle):
        # channels-last -> channels-first, one-hot -> class indices
        images = np.transpose(images, [0, 3, 1, 2])
        labels = np.argmax(onehot_labels, axis=1)
        x_tensor = torch.tensor(images, dtype=torch.float32, device=device)
        y_tensor = torch.tensor(labels, dtype=torch.long, device=device)
        return DataLoader(TensorDataset(x_tensor, y_tensor), batch_size=batch_size, shuffle=shuffle)

    model = torchvision.models.ResNet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], num_classes=num_classes)

    # Cross-entropy loss optimised with Nesterov-momentum SGD
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)
    model.to(device)

    # Reshuffle the training samples every epoch so SGD does not see the
    # batches in the same fixed order each time.
    dataloader_train = _make_loader(x_train, y_train, shuffle=True)

    # The test set is optional: callers may train without evaluating.
    dataloader_test = None
    if x_test is not None and y_test is not None:
        dataloader_test = _make_loader(x_test, y_test, shuffle=False)

    for epoch in trange(epochs):
        total = 0
        correct = 0
        for inputs, labels in dataloader_train:
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # running train accuracy from the pre-update outputs of this batch
            _, predicted = torch.max(outputs.detach(), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_accuracy = (100 * correct / total) if total else 0.0
        print("Epoch %d train accuracy: %f" % (epoch, train_accuracy))
        if dataloader_test is not None:
            test_accuracy = testAccuracy(model, dataloader_test)
            print("Final test accuracy: %f" % test_accuracy)
    return model, loss_fn, optimizer

# Train Substitute Model for Attack

In [6]:
model_path = "cifar10-resnet18-pytorch.pth"
if not os.path.exists(model_path):
    # No cached weights: train the substitute model and cache its state dict.
    model, loss_fn, optimizer = create_model(x_train, y_train, x_test=x_test, y_test=y_test, epochs=80)
    torch.save(model.state_dict(), model_path)
else:
    print("Pretrained model exists")
    # epochs=0 only builds the model, loss and optimizer; the weights come from disk.
    model, loss_fn, optimizer = create_model(x_train, y_train, x_test=x_test, y_test=y_test, epochs=0)
    # map_location lets weights saved on a GPU load on a CPU-only machine (and vice versa).
    model.load_state_dict(torch.load(model_path, map_location=device))

Pretrained model exists


 ... (more hidden) ...


# Select triggers from the source class and define helper functions for calculating the success rate

In [7]:
from art.attacks.poisoning.sleeper_agent_attack import SleeperAgentAttack
from art.utils import to_categorical

# Wrap the substitute model, its loss and its optimizer in an ART classifier.
# NOTE(review): input_shape is taken from the channels-last x_train, while the
# network itself is fed channels-first tensors elsewhere — confirm this is intended.
model_art = PyTorchClassifier(
    model,
    input_shape=x_train.shape[1:],
    loss=loss_fn,
    optimizer=optimizer,
    nb_classes=10,
)

def add_trigger_patch(x_trigger,patch_type="fixed"):
    """Stamp the trigger patch onto every image of ``x_trigger`` in place.

    The patch is read from ``trigger_10.png``, resized to 8x8x3 and normalised
    with the module-level ``mean``/``std``.

    Args:
        x_trigger: 4-D channels-last batch of images (indexed as
            ``[sample, row, column, channel]``); modified in place.
        patch_type: ``"fixed"`` writes the patch into the bottom-right corner
            of every image; any other value picks an independent random
            position per image such that the patch fits entirely inside it.

    Returns:
        The same ``x_trigger`` array, with the patch applied.
    """
    print("patch_type",patch_type)
    # close the image file as soon as its pixels have been copied out
    with Image.open('trigger_10.png') as img:
        numpydata = asarray(img)
    patch = resize(numpydata, (8,8,3))
    patch = (patch-mean)/(std+1e-7)
    patch_h, patch_w = patch.shape[0], patch.shape[1]
    if patch_type == "fixed":
        x_trigger[:, -patch_h:, -patch_w:, :] = patch
    else:
        for image in x_trigger:
            # `image` is a single (row, column, channel) view, so the spatial
            # extents are shape[0] and shape[1]; shape[2] is the channel count.
            row = random.randrange(0, image.shape[0] - patch_h + 1)
            col = random.randrange(0, image.shape[1] - patch_w + 1)
            image[row:row + patch_h, col:col + patch_w, :] = patch

    return x_trigger

def select_tigger_train(x_train,y_train,K,class_source=0,class_target=1):
    """Pick trigger images from the source class and samples from the target class.

    Args:
        x_train: training images (channels-last array); left unmodified.
        y_train: one-hot training labels.
        K: maximum number of source-class images to turn into triggers.
        class_source: class whose images receive the trigger patch.
        class_target: class the triggers are labelled as; its training images
            are returned as the samples.

    Returns:
        tuple: ``(x_trigger, y_trigger, x_samples, y_samples, class_source,
        class_target, index_target)`` where ``x_trigger`` are the patched
        source images, ``y_trigger`` the one-hot target label repeated once
        per trigger, ``x_samples``/``y_samples`` the target-class training
        data and ``index_target`` their positions in ``x_train``.
    """
    labels = y_train.argmax(axis=1)
    index_source = np.where(labels==class_source)[0][0:K]
    index_target = np.where(labels==class_target)[0]
    # Integer-array indexing already returns fresh copies, so patching
    # x_trigger in place cannot touch x_train; no full-dataset copy is needed.
    x_trigger = x_train[index_source]
    x_trigger = add_trigger_patch(x_trigger,patch_type="random")
    # one-hot width follows the label matrix instead of a hard-coded 10
    y_trigger = to_categorical([class_target], nb_classes=y_train.shape[1])
    y_trigger = np.tile(y_trigger,(len(index_source),1))
    x_samples = x_train[index_target]
    y_samples = y_train[index_target]
    return x_trigger,y_trigger,x_samples,y_samples,class_source,class_target,index_target

def calculate_test_success_rate(x_test,y_test,class_source,model_poisoned,class_target):
    """Print and return the attack success rate on patched source-class test images.

    Args:
        x_test: test images (channels-last array); left unmodified.
        y_test: one-hot test labels.
        class_source: class whose test images receive the trigger patch.
        model_poisoned: network evaluated on the patched images; it is put in
            eval mode.
        class_target: class the patched images are expected to be assigned to.

    Returns:
        The fraction of patched images classified as ``class_target``.
    """
    index_source = np.where(y_test.argmax(axis=1)==class_source)[0]
    # integer-array indexing copies, so the in-place patching leaves x_test intact
    x_trigger = x_test[index_source]
    x_trigger = add_trigger_patch(x_trigger,patch_type="random")
    model_poisoned.eval()
    inputs = torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)
    # inference only: skip building the autograd graph
    with torch.no_grad():
        result_poisoned = model_poisoned(inputs).cpu().numpy()
    success = calculate_success_rate(result_poisoned,class_target)
    print("Attack Success Rate on Test Triggers",success)
    return success
    
def calculate_success_rate(y_poisoned,class_target):
    """Return the fraction of rows of ``y_poisoned`` whose argmax is ``class_target``.

    Args:
        y_poisoned: 2-D array of per-class scores, one row per sample.
        class_target: class index counted as a successful attack.

    Returns:
        The success rate in ``[0, 1]``, or ``0.0`` when ``y_poisoned`` is empty.
    """
    # no samples: report 0 rather than dividing by zero
    if len(y_poisoned) == 0:
        return 0.0
    # use the scores that were passed in, not a notebook-level global
    success = ((np.argmax(y_poisoned,axis=1)==class_target).sum()/len(y_poisoned))
    return success

# Generate Poison Images through attack 

In [8]:
# Take up to 1000 source-class training images as triggers, plus every
# target-class training image as the pool of samples.
(x_trigger, y_trigger,
 x_samples, y_samples,
 class_source, class_target,
 index_target) = select_tigger_train(x_train, y_train, 1000)

# Predictions of the clean substitute model on the patched triggers
# (channels-first layout), kept for comparison with the poisoned model.
x_trigger_channels_first = torch.tensor(np.transpose(x_trigger, [0, 3, 1, 2]), dtype=torch.float32)
result_original = model_art.predict(x_trigger_channels_first)

patch_type random


In [9]:
# Configure the attack on the ART-wrapped substitute model.
# clip_values are the dataset bounds after normalisation, and epsilon is
# 16/255 of that normalised value range (max_ - min_).
# NOTE(review): the recorded output of this cell is an AttributeError raised
# inside SleeperAgentAttack, so it did not complete with the installed ART
# version — confirm the library version and its poison() signature.
attack = SleeperAgentAttack(model_art,
                                percent_poison=0.10,
                                max_trials=1,
                                max_epochs=250,
                                learning_rate_schedule=(np.array([1e-1, 1e-2, 1e-3, 1e-4, 1e-5]), [250, 350, 400, 430, 460]),
                                clip_values=(min_,max_),
                                epsilon=16/255 * (max_ - min_),
                                batch_size=500,
                                verbose=1,
                                indices_target=index_target,
                                patching_strategy="fixed",
                                selection_strategy="random"
                           )
# Triggers and target-class samples are passed as channels-first float32 tensors.
x_poison, y_poison, indices_poison = attack.poison(torch.tensor(np.transpose(x_trigger, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_trigger, 
                                                   torch.tensor(np.transpose(x_samples, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_samples)
# Back to channels-last so the poisons can be written into x_train's layout.
x_poison = np.transpose(x_poison, [0,2,3,1])
# Work on a copy so the clean x_train stays available.
x_poison_ = np.copy(x_train)
# presumably indices_poison are positions within x_samples; index_target maps
# them back to positions in the full training set — verify against the ART API.
x_poison_[index_target[indices_poison]]=x_poison[indices_poison]

AttributeError: 'SleeperAgentAttack' object has no attribute '_SleeperAgentAttack__poison__pytorch'

# Success Rate on Train Triggers

In [None]:
# Retrain from scratch on the poisoned training set. The test set is passed
# explicitly: create_model transposes x_test/y_test, so leaving them at their
# None defaults would fail before any training happens.
model_poisoned, loss_fn, optimizer = create_model(x_poison_, y_train, x_test=x_test, y_test=y_test, epochs=80)
model_poisoned.eval()
# inference only: skip building the autograd graph
with torch.no_grad():
    result_poisoned = model_poisoned(torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)).cpu().numpy()
print("y_trigger:", y_trigger)
print("result_poisoned:", result_poisoned)
print("result_original:", result_original)
print("Success Rate on train triggers",calculate_success_rate(result_poisoned,class_target))

# Success Rate on Test Triggers

In [None]:
# Patch the source-class test images and report how often the poisoned model
# assigns them to the target class.
calculate_test_success_rate(
    x_test,
    y_test,
    class_source=class_source,
    model_poisoned=model_poisoned,
    class_target=class_target,
)