# This notebook implements sleeper agent attack

In [1]:
import math
from tqdm import trange
import numpy as np
import os, sys
import pdb
from PIL import Image
from numpy import asarray
from skimage.transform import resize
import random
from art.estimators.classification import PyTorchClassifier
from art.utils import load_cifar10
from torchvision.models.resnet import BasicBlock, Bottleneck
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchvision
import torch.nn.functional as F

In [2]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('.'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Data Normalization

In [3]:
(x_train, y_train), (x_test, y_test), min_, max_ = load_cifar10()

mean = np.mean(x_train,axis=(0,1,2,3))
std = np.std(x_train,axis=(0,1,2,3))
x_train = (x_train-mean)/(std+1e-7)
x_test = (x_test-mean)/(std+1e-7)

min_ = (min_-mean)/(std+1e-7)
max_ = (max_-mean)/(std+1e-7)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
img = Image.open('trigger_10.png')
numpydata = asarray(img)
patch = resize(numpydata, (8,8,3))
patch = (patch-mean)/(std+1e-7)

In [4]:
def testAccuracy(model, test_loader):
    model_was_training = model.training
    model.eval()
    accuracy = 0.0
    total = 0.0
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            # run the model on the test set to predict labels
            outputs = model(images)
            # the label with the highest energy will be our prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            accuracy += (predicted == labels).sum().item()
    
    # compute the accuracy over all test images
    accuracy = (100 * accuracy / total)
    if model_was_training:
        model.train()
    return(accuracy)

In [5]:
def create_model(x_train, y_train, x_test=None, y_test=None, num_classes=10, batch_size=128, epochs=25):
    initial_conv = [3, 1, 1]
    model = torchvision.models.ResNet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
    
    # Define the loss function with Classification Cross-Entropy loss and an optimizer with Adam optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)
    model.to(device)

    x_train = np.transpose(x_train, [0, 3,1,2])
    y_train = np.argmax(y_train, axis=1)
    x_tensor = torch.tensor(x_train, dtype=torch.float32, device=device) # transform to torch tensor
    y_tensor = torch.tensor(y_train, dtype=torch.long, device=device)
    
    x_test = np.transpose(x_test, [0, 3,1,2])
    y_test = np.argmax(y_test, axis=1)
    x_tensor_test = torch.tensor(x_test, dtype=torch.float32, device=device) # transform to torch tensor
    y_tensor_test = torch.tensor(y_test, dtype=torch.long, device=device)

    dataset_train = TensorDataset(x_tensor,y_tensor) # create your datset
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size)

    dataset_test = TensorDataset(x_tensor_test,y_tensor_test) # create your datset
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size)

    for epoch in trange(epochs):
        running_loss = 0.0
        total = 0
        accuracy = 0
        for i, data in enumerate(dataloader_train, 0):
            inputs, labels = data
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            # _, predicted = torch.max(outputs.data, 1)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            accuracy += (predicted == labels).sum().item()
            running_loss += loss.item()
        train_accuracy = (100 * accuracy / total)
        print("Epoch %d train accuracy: %f" % (epoch, train_accuracy))
        test_accuracy = testAccuracy(model, dataloader_test)
        print("Final test accuracy: %f" % test_accuracy)
    return model, loss_fn, optimizer

# Train Substitute Model for Attack

In [6]:
model_path = "cifar10-resnet18-pytorch.pth"
if not os.path.exists(model_path):
    model, loss_fn, optimizer = create_model(x_train, y_train,x_test=x_test,y_test=y_test,epochs=80)
    torch.save(model.state_dict(), model_path)
else:
    print("Pretrained model exists")
    model, loss_fn, optimizer = create_model(x_train, y_train,x_test=x_test,y_test=y_test,epochs=0)
    model.load_state_dict(torch.load(model_path))

Pretrained model exists


 ... (more hidden) ...


# Select Triggers from Source Class and helper functions for calculating Success Rate

In [7]:
model_art = PyTorchClassifier(model, input_shape=x_train.shape[1:], loss=loss_fn, optimizer=optimizer, nb_classes=10)
from art.utils import to_categorical
from art.attacks.poisoning.sleeper_agent_attack import SleeperAgentAttack

def add_trigger_patch(x_trigger,patch_type="fixed"):
    print("patch_type",patch_type)
    # pdb.set_trace()
    img = Image.open('trigger_10.png')
    numpydata = asarray(img)
    patch = resize(numpydata, (8,8,3))
    patch = (patch-mean)/(std+1e-7)
    if patch_type == "fixed":
        x_trigger[:,-8:,-8:,:] = patch
    else:
        for x in x_trigger:
            # x_cord = random.randint(0, 24)
            # y_cord = random.randint(0, 24)
            x_cord = random.randrange(0,x.shape[1] - patch.shape[1] + 1)
            y_cord = random.randrange(0,x.shape[2] - patch.shape[2] + 1)
            x[x_cord:x_cord+8,y_cord:y_cord+8,:]=patch

    return x_trigger

def select_tigger_train(x_train,y_train,K):
    x_train_ = np.copy(x_train)
    class_source = 0
    class_target = 1
    index_source = np.where(y_train.argmax(axis=1)==class_source)[0][0:K]
    index_target = np.where(y_train.argmax(axis=1)==class_target)[0]
    x_trigger = x_train_[index_source]
    y_trigger  = to_categorical([class_target], nb_classes=10)
    x_trigger = add_trigger_patch(x_trigger,patch_type="random")
    y_trigger = np.tile(y_trigger,(len(index_source),1))
    x_samples = x_train_[index_target]
    y_samples = y_train[index_target]
    return x_trigger,y_trigger,x_samples,y_samples,class_source,class_target,index_target

def calculate_test_success_rate(x_test,y_test,class_source,model_poisoned,class_target):
    index_source = np.where(y_test.argmax(axis=1)==class_source)[0]
    x_trigger = x_test[index_source]
    x_trigger = add_trigger_patch(x_trigger,patch_type="random")
    model_poisoned.eval()
    result_poisoned = model_poisoned(torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)).detach().cpu().numpy()
    success = calculate_success_rate(result_poisoned,class_target)
    print("Attack Success Rate on Test Triggers",success)
    
def calculate_success_rate(y_poisoned,class_target):
    pdb.set_trace()
    success = ((np.argmax(result_poisoned,axis=1)==class_target).sum()/len(y_poisoned))
    return success    

# Generate Poison Images through attack 

In [8]:
x_trigger,y_trigger,x_samples,y_samples,class_source,class_target,index_target = select_tigger_train(x_train,y_train,1000)
result_original = model_art.predict(torch.tensor(np.transpose(x_trigger, [0, 3,1,2]), dtype=torch.float32))

patch_type random


In [9]:
attack = SleeperAgentAttack(model_art,
                                percent_poison=0.10,
                                max_trials=1,
                                max_epochs=250,
                                learning_rate_schedule=(np.array([1e-1, 1e-2, 1e-3, 1e-4, 1e-5]), [250, 350, 400, 430, 460]),
                                clip_values=(min_,max_),
                                epsilon=16/255 * (max_ - min_),
                                batch_size=500,
                                verbose=1,
                                indices_target=index_target,
                                patching_strategy="fixed",
                                selection_strategy="random",
                                patch=patch
                           )
x_poison, y_poison, indices_poison = attack.poison(torch.tensor(np.transpose(x_trigger, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_trigger, 
                                                   torch.tensor(np.transpose(x_samples, [0, 3,1,2]), dtype=torch.float32), 
                                                   y_samples)
x_poison = np.transpose(x_poison, [0,2,3,1])
x_poison_ = np.copy(x_train)
x_poison_[index_target[indices_poison]]=x_poison[indices_poison]

  0%|          | 0/1 [00:00<?, ?it/s]

  torch.tensor(x_trigger, device=device, dtype=torch.float32),


  0%|          | 0/250 [00:00<?, ?it/s]

Best B-score: 0.02851790189743042


# Success Rate on Train Triggers

In [14]:
x_poison_.shape

(50000, 32, 32, 3)

In [15]:
model_poisoned, loss_fn, optimizer = create_model(x_poison_, y_train,x_test=x_test,y_test=y_test,epochs=80)
model_poisoned.eval()
result_poisoned = model_poisoned(torch.tensor(np.transpose(x_trigger, [0,3,1,2]), device=device, dtype=torch.float)).detach().cpu().numpy()
print("y_trigger:", y_trigger)
print("result_poisoned:", result_poisoned)
print("result_original:", result_original)
print("Success Rate on train triggers",calculate_success_rate(result_poisoned,class_target))

 ... (more hidden) ...

Epoch 0 train accuracy: 37.124000


 ... (more hidden) ...

Final test accuracy: 50.050000
Epoch 1 train accuracy: 56.566000


 ... (more hidden) ...

Final test accuracy: 59.100000
Epoch 2 train accuracy: 65.138000


 ... (more hidden) ...

Final test accuracy: 66.320000
Epoch 3 train accuracy: 70.282000


 ... (more hidden) ...

Final test accuracy: 67.200000
Epoch 4 train accuracy: 73.660000


 ... (more hidden) ...

Final test accuracy: 67.590000
Epoch 5 train accuracy: 76.286000


 ... (more hidden) ...

Final test accuracy: 67.340000
Epoch 6 train accuracy: 78.436000


 ... (more hidden) ...

Final test accuracy: 67.810000
Epoch 7 train accuracy: 80.126000


 ... (more hidden) ...

Final test accuracy: 68.290000
Epoch 8 train accuracy: 81.100000


 ... (more hidden) ...

Final test accuracy: 67.870000
Epoch 9 train accuracy: 82.390000


 ... (more hidden) ...

Final test accuracy: 67.950000
Epoch 10 train accuracy: 83.438000


 ... (more hidden) ...

Final test accuracy: 69.630000
Epoch 11 train accuracy: 84.310000


 ... (more hidden) ...

Final test accuracy: 69.490000
Epoch 12 train accuracy: 84.938000


 ... (more hidden) ...

Final test accuracy: 70.370000
Epoch 13 train accuracy: 85.710000


 ... (more hidden) ...

Final test accuracy: 68.470000
Epoch 14 train accuracy: 85.802000


 ... (more hidden) ...

Final test accuracy: 69.580000
Epoch 15 train accuracy: 86.488000


 ... (more hidden) ...

Final test accuracy: 68.410000
Epoch 16 train accuracy: 86.544000


 ... (more hidden) ...

Final test accuracy: 69.680000
Epoch 17 train accuracy: 87.094000


 ... (more hidden) ...

Final test accuracy: 69.840000
Epoch 18 train accuracy: 87.056000


 ... (more hidden) ...

Final test accuracy: 71.160000
Epoch 19 train accuracy: 87.288000


 ... (more hidden) ...

Final test accuracy: 69.230000
Epoch 20 train accuracy: 87.756000


 ... (more hidden) ...

Final test accuracy: 68.250000
Epoch 21 train accuracy: 87.890000


 ... (more hidden) ...

Final test accuracy: 70.570000
Epoch 22 train accuracy: 88.002000


 ... (more hidden) ...

Final test accuracy: 70.180000
Epoch 23 train accuracy: 88.124000


 ... (more hidden) ...

Final test accuracy: 69.740000
Epoch 24 train accuracy: 88.260000


 ... (more hidden) ...

Final test accuracy: 69.450000
Epoch 25 train accuracy: 88.312000


 ... (more hidden) ...

Final test accuracy: 71.000000
Epoch 26 train accuracy: 88.522000


 ... (more hidden) ...

Final test accuracy: 71.380000
Epoch 27 train accuracy: 88.784000


 ... (more hidden) ...

Final test accuracy: 70.200000
Epoch 28 train accuracy: 88.904000


 ... (more hidden) ...

Final test accuracy: 70.670000
Epoch 29 train accuracy: 88.562000


 ... (more hidden) ...

Final test accuracy: 72.430000
Epoch 30 train accuracy: 88.806000


 ... (more hidden) ...

Final test accuracy: 71.700000
Epoch 31 train accuracy: 88.728000


 ... (more hidden) ...

Final test accuracy: 70.580000
Epoch 32 train accuracy: 88.730000


 ... (more hidden) ...

Final test accuracy: 69.870000
Epoch 33 train accuracy: 88.854000


 ... (more hidden) ...

Final test accuracy: 70.760000
Epoch 34 train accuracy: 88.876000


 ... (more hidden) ...

Final test accuracy: 71.000000
Epoch 35 train accuracy: 89.020000


 ... (more hidden) ...

Final test accuracy: 70.930000
Epoch 36 train accuracy: 89.210000


 ... (more hidden) ...

Final test accuracy: 70.610000
Epoch 37 train accuracy: 89.116000


 ... (more hidden) ...

Final test accuracy: 71.150000
Epoch 38 train accuracy: 89.280000


 ... (more hidden) ...

Final test accuracy: 70.590000
Epoch 39 train accuracy: 89.038000


 ... (more hidden) ...

Final test accuracy: 72.690000
Epoch 40 train accuracy: 89.242000


 ... (more hidden) ...

Final test accuracy: 69.810000
Epoch 41 train accuracy: 89.498000


 ... (more hidden) ...

Final test accuracy: 70.480000
Epoch 42 train accuracy: 89.402000


 ... (more hidden) ...

Final test accuracy: 69.600000
Epoch 43 train accuracy: 89.452000


 ... (more hidden) ...

Final test accuracy: 71.440000
Epoch 44 train accuracy: 89.468000


 ... (more hidden) ...

Final test accuracy: 71.300000
Epoch 45 train accuracy: 89.244000


 ... (more hidden) ...

Final test accuracy: 70.620000
Epoch 46 train accuracy: 89.524000


 ... (more hidden) ...

Final test accuracy: 71.360000
Epoch 47 train accuracy: 89.466000


 ... (more hidden) ...

Final test accuracy: 71.110000
Epoch 48 train accuracy: 89.676000


 ... (more hidden) ...

Final test accuracy: 71.120000
Epoch 49 train accuracy: 89.516000


 ... (more hidden) ...

Final test accuracy: 69.710000
Epoch 50 train accuracy: 89.666000


 ... (more hidden) ...

Final test accuracy: 71.740000
Epoch 51 train accuracy: 89.336000


 ... (more hidden) ...

Final test accuracy: 70.420000
Epoch 52 train accuracy: 89.732000


 ... (more hidden) ...

Final test accuracy: 70.150000
Epoch 53 train accuracy: 89.756000


 ... (more hidden) ...

Final test accuracy: 69.870000
Epoch 54 train accuracy: 89.738000


 ... (more hidden) ...

Final test accuracy: 71.170000
Epoch 55 train accuracy: 89.934000


 ... (more hidden) ...

Final test accuracy: 71.870000
Epoch 56 train accuracy: 89.826000


 ... (more hidden) ...

Final test accuracy: 70.900000
Epoch 57 train accuracy: 89.736000


 ... (more hidden) ...

Final test accuracy: 71.170000
Epoch 58 train accuracy: 89.940000


 ... (more hidden) ...

Final test accuracy: 72.420000
Epoch 59 train accuracy: 89.800000


 ... (more hidden) ...

Final test accuracy: 72.220000
Epoch 60 train accuracy: 89.952000


 ... (more hidden) ...

Final test accuracy: 72.320000
Epoch 61 train accuracy: 89.728000


 ... (more hidden) ...

Final test accuracy: 70.690000
Epoch 62 train accuracy: 89.718000


 ... (more hidden) ...

Final test accuracy: 71.850000
Epoch 63 train accuracy: 90.092000


 ... (more hidden) ...

Final test accuracy: 72.450000
Epoch 64 train accuracy: 89.998000


 ... (more hidden) ...

Final test accuracy: 70.820000
Epoch 65 train accuracy: 89.622000


 ... (more hidden) ...

Final test accuracy: 69.660000
Epoch 66 train accuracy: 90.118000


 ... (more hidden) ...

Final test accuracy: 71.790000
Epoch 67 train accuracy: 90.008000


 ... (more hidden) ...

Final test accuracy: 71.160000
Epoch 68 train accuracy: 90.000000


 ... (more hidden) ...

Final test accuracy: 72.420000
Epoch 69 train accuracy: 90.126000


 ... (more hidden) ...

Final test accuracy: 71.740000
Epoch 70 train accuracy: 90.042000


 ... (more hidden) ...

Final test accuracy: 71.420000
Epoch 71 train accuracy: 90.010000


 ... (more hidden) ...

Final test accuracy: 71.650000
Epoch 72 train accuracy: 89.858000


 ... (more hidden) ...

Final test accuracy: 71.650000
Epoch 73 train accuracy: 90.012000


 ... (more hidden) ...

Final test accuracy: 70.330000
Epoch 74 train accuracy: 90.164000


 ... (more hidden) ...

Final test accuracy: 71.420000
Epoch 75 train accuracy: 90.248000


 ... (more hidden) ...

Final test accuracy: 71.490000
Epoch 76 train accuracy: 90.186000


 ... (more hidden) ...

Final test accuracy: 71.570000
Epoch 77 train accuracy: 90.198000


 ... (more hidden) ...

Final test accuracy: 71.550000
Epoch 78 train accuracy: 90.090000


 ... (more hidden) ...

Final test accuracy: 71.280000
Epoch 79 train accuracy: 90.148000


 ... (more hidden) ...


Final test accuracy: 72.390000
y_trigger: [[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
result_poisoned: [[ 0.51899326  3.9887588  -1.4592278  ... -0.85852355  0.3914941
  -0.716725  ]
 [ 2.075882    7.732507   -0.8096837  ... -3.4516542  -1.1188114
   3.3259437 ]
 [ 1.5955156   3.338845   -1.8898367  ... -0.528798    2.6612613
  -1.2923902 ]
 ...
 [ 2.1658378  -0.45022777  0.94831884 ...  2.5546253  -2.9016945
   0.05941949]
 [ 3.3019934   3.408172    0.2580985  ... -3.9830513   0.24453928
   1.8002127 ]
 [ 3.283595   -2.0856228   0.5534661  ... -0.30813837  0.15407844
  -2.4754667 ]]
result_original: [[ 2.5735478   0.49481758 -2.1197486  ...  0.07565714 -0.37897563
  -2.742059  ]
 [ 5.841366    0.74095315 -0.1774171  ... -2.2029302  -3.937088
  -0.31930134]
 [ 5.719744   -0.70140207  0.04929375 ...  0.936998    2.0879834
  -0.4902166 ]
 ...
 [ 6.333068   -2.3333967   2.307414

# Success Rate on Test Triggers

In [16]:
calculate_test_success_rate(x_test,y_test,class_source,model_poisoned,class_target)

patch_type random
> [0;32m/tmp/ipykernel_8894/3979609671.py[0m(49)[0;36mcalculate_success_rate[0;34m()[0m
[0;32m     46 [0;31m[0;34m[0m[0m
[0m[0;32m     47 [0;31m[0;32mdef[0m [0mcalculate_success_rate[0m[0;34m([0m[0my_poisoned[0m[0;34m,[0m[0mclass_target[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     48 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 49 [0;31m    [0msuccess[0m [0;34m=[0m [0;34m([0m[0;34m([0m[0mnp[0m[0;34m.[0m[0margmax[0m[0;34m([0m[0mresult_poisoned[0m[0;34m,[0m[0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m==[0m[0mclass_target[0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m/[0m[0mlen[0m[0;34m([0m[0my_poisoned[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     50 [0;31m    [0;32mreturn[0m [0msuccess[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c
Attack Success Rate on Test Trigg