# This notebook implements the Sleeper Agent attack

In this notebook, we will learn how to use ART to run the Sleeper Agent attack, a hidden-trigger backdoor poisoning attack, on a neural network trained with TensorFlow. We will train our model on a subset of the CIFAR-10 dataset. The methods described are derived from [this paper](https://arxiv.org/pdf/2106.08970.pdf) by Hossein Souri et al. (2021).

# Data Normalization and Substitute Model Training 

In [1]:
import os, sys

# Make the parent directory and then the current directory importable,
# appending each absolute path to sys.path only if it is not already listed.
for _relative_dir in ('..', '.'):
    module_path = os.path.abspath(os.path.join(_relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D

from art.attacks.evasion import FeatureAdversariesTensorFlowV2
from art.estimators.classification import TensorFlowV2Classifier
from art.utils import load_mnist
from art.utils import load_cifar10

# Step 1: Load the CIFAR-10 dataset (train split, test split and the value range)

train_split, test_split, min_, max_ = load_cifar10()
x_train, y_train = train_split
x_test, y_test = test_split

# Step 1a: Cast the images to np.float32
x_train, x_test = (images.astype(np.float32) for images in (x_train, x_test))

# Statistics are reduced over axes 0-3 of the training images; for a 4-D image
# batch this yields one global mean and one global standard deviation.
mean = np.mean(x_train, axis=(0, 1, 2, 3))
std = np.std(x_train, axis=(0, 1, 2, 3))


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm.keras import TqdmCallback

# Set the TensorFlow Python logger to the ERROR level so that lower-severity
# messages from that logger are not emitted.
tf.get_logger().setLevel('ERROR')

# Tweaked the model from https://github.com/calmisential/TensorFlow2.0_ResNet
# MIT License
def basic_block(x, filter_num, stride=1):
    """Apply one residual block (two 3x3 convolutions plus a shortcut) to `x`.

    Args:
        x: input tensor the block is applied to.
        filter_num: number of filters used by every convolution in the block.
        stride: stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution. When it equals 1 the shortcut is an identity mapping.

    Returns:
        relu(shortcut(x) + bn2(conv2(relu(bn1(conv1(x)))))).
    """
    layers = tf.keras.layers

    # Main-path layers are instantiated first, then the shortcut; the layers are
    # applied further down (shortcut first, then the main path).
    first_conv = layers.Conv2D(filters=filter_num,
                               kernel_size=(3, 3),
                               strides=stride,
                               padding="same")
    first_bn = layers.BatchNormalization()
    second_conv = layers.Conv2D(filters=filter_num,
                                kernel_size=(3, 3),
                                strides=1,
                                padding="same")
    second_bn = layers.BatchNormalization()

    if stride == 1:
        # No spatial change on the main path: pass the input through untouched.
        shortcut = layers.Lambda(lambda x: x)
    else:
        # Strided 1x1 convolution + batch norm so the shortcut matches the main path.
        shortcut = tf.keras.Sequential()
        shortcut.add(layers.Conv2D(filters=filter_num,
                                   kernel_size=(1, 1),
                                   strides=stride))
        shortcut.add(layers.BatchNormalization())

    residual = shortcut(x)
    main_path = first_bn(first_conv(x))
    main_path = second_bn(second_conv(tf.nn.relu(main_path)))
    return tf.nn.relu(layers.add([residual, main_path]))

def basic_block_layer(x, filter_num, blocks, stride=1):
    """Chain residual blocks: the first uses `stride`, the remaining ones use stride 1.

    Args:
        x: input tensor.
        filter_num: number of filters passed to every `basic_block`.
        blocks: total number of blocks in the stage; at least one block is
            always applied.
        stride: stride of the first block only.

    Returns:
        The tensor produced by the last block.
    """
    per_block_strides = [stride] + [1] * (blocks - 1)
    for block_stride in per_block_strides:
        x = basic_block(x, filter_num, stride=block_stride)
    return x

def resnet(x, num_classes, layer_params):
    """Build the residual network graph on top of the input tensor `x`.

    The stem is zero-padding (1 pixel) -> 3x3 conv (64 filters) -> batch norm
    -> ReLU, followed by four stages with 64/128/256/512 filters (the last
    three with stride 2), global average pooling and a softmax dense layer.

    Args:
        x: input tensor.
        num_classes: number of units of the final softmax layer.
        layer_params: indexable with at least four entries; entry i is the
            number of basic blocks in stage i.

    Returns:
        The softmax output tensor.
    """
    layers = tf.keras.layers

    stem_pad = layers.ZeroPadding2D(padding=1)
    stem_conv = layers.Conv2D(filters=64,
                              kernel_size=(3, 3),
                              strides=1,
                              padding="same")
    stem_bn = layers.BatchNormalization()

    pool = layers.GlobalAveragePooling2D()
    classifier = layers.Dense(units=num_classes, activation=tf.keras.activations.softmax)

    x = tf.nn.relu(stem_bn(stem_conv(stem_pad(x))))

    # (filters, stride of the first block) for each of the four stages.
    stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
    for stage_idx, (stage_filters, stage_stride) in enumerate(stage_specs):
        x = basic_block_layer(x, filter_num=stage_filters,
                              blocks=layer_params[stage_idx],
                              stride=stage_stride)

    return classifier(pool(x))

def resnet_18(x, num_classes):
    """Build the `resnet` graph with two basic blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return resnet(x, num_classes, layer_params=blocks_per_stage)


# # # Step 2: Create the model
# inputs = tf.keras.layers.Input(shape=x_train.shape[1:])  # Specify the dimensions
# outputs = resnet_18(inputs, 10)
# model = tf.keras.models.Model(inputs, outputs)


# # # Step 2a: Define the loss function and optimizer

# optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# def train_step(model, images, labels):
#     with tf.GradientTape() as tape:
#         predictions = model(images, training=True)
#         loss = loss_object(labels, predictions)
#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Step 3: Create the ART classifier

# model_art = TensorFlowV2Classifier(
#     model=model,
#     loss_object=loss_object,
#     train_step=train_step,
#     nb_classes=10,
#     input_shape=x_train.shape[1:],
#     clip_values=(min_, max_),
#     preprocessing=(mean,std)
# )

# Step 4: Train the ART classifier

# model_art.fit(x_train, y_train, batch_size=128, nb_epochs=80)

2022-09-14 10:42:05.531705: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


# Load Pretrained Model

In [3]:
# Restore the pretrained Keras model from disk and reuse the optimizer stored with it.
model = tf.keras.models.load_model('cifar10-resnet18-notebook.h5')
optimizer = model.optimizer
# model.compile(loss=model.loss, optimizer=model.optimizer, metrics=['accuracy'])
# NOTE(review): from_logits=True makes the loss treat model outputs as logits. If the
# saved model was built with the `resnet` helper in this notebook (softmax output),
# this flag may be inconsistent with the model head - TODO confirm.
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

def train_step(model, images, labels):
    """Run one gradient update on a batch using the notebook-level `loss_object` and `optimizer`."""
    with tf.GradientTape() as tape:
        loss = loss_object(labels, model(images, training=True))
    trainable = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

# Wrap the Keras model in an ART classifier; (mean, std) is passed as the preprocessing pair.
model_art = TensorFlowV2Classifier(
    model,
    loss_object=loss_object,
    train_step=train_step,
    nb_classes=10,
    input_shape=x_train.shape[1:],
    clip_values=(min_, max_),
    preprocessing=(mean, std),
)
print("Model and data preparation done.")

# Fraction of test samples whose arg-max prediction equals the arg-max label.
predictions = model_art.predict(x_test)
accuracy = np.mean(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1))
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

2022-09-14 10:42:13.156040: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-09-14 10:42:13.159270: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-09-14 10:42:13.192015: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:af:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2022-09-14 10:42:13.192639: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:d8:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2022-09-14 10:42:13.192675: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-

Model and data preparation done.


2022-09-14 10:42:21.670744: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7
2022-09-14 10:42:24.107465: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2022-09-14 10:42:24.242131: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2022-09-14 10:42:26.117384: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


Accuracy on benign test examples: 91.33%


In [None]:
# model_new = model_art.clone_for_refitting()
# predictions = model_new.predict(x_test)
# accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
# print("Accuracy on benign test examples: {}%".format(accuracy * 100))
# model_new.fit(x_train, y_train, batch_size=128, nb_epochs=1)
# predictions = model_new.predict(x_test)
# accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
# print("Accuracy on benign test examples: {}%".format(accuracy * 100))

# Load Trigger Patch

In [4]:
from PIL import Image
from numpy import asarray
from skimage.transform import resize

# Side length (in pixels) of the square trigger patch.
patch_size = 8

# Open the trigger image through a context manager so the underlying file
# handle is released as soon as the pixels have been copied into an array.
with Image.open('trigger_10.png') as img:
    numpydata = asarray(img)

# Resize the trigger to patch_size x patch_size with 3 channels.
# NOTE(review): this assumes the PNG holds 3-channel (RGB) data; if it carries
# an alpha channel the resize would blend across channels - TODO confirm.
patch = resize(numpydata, (patch_size, patch_size, 3))

# Keep an independent copy of the training images before the attack is run.
x_train_orig = np.copy(x_train)

# Define Source and Target Class

In [5]:
class_source = 0  # label index of the class whose images are selected as triggers
class_target = 1  # label index assigned to the triggers and counted as a successful attack
K = 1000  # upper bound on the number of source-class training images used as triggers

# Select Triggers from Source Class and helper functions for calculating Success Rate

In [6]:
from art.utils import to_categorical

def select_trigger_train(x_train,y_train,K,class_source,class_target):
    """Pick trigger samples from the source class and label them as the target class.

    Args:
        x_train: training samples, indexable along axis 0.
        y_train: one-hot labels of shape (n_samples, n_classes).
        K: maximum number of source-class samples to return.
        class_source: label index of the class the triggers are drawn from.
        class_target: label index every trigger is labelled with.

    Returns:
        x_trigger: the first (at most) K samples of `class_source`, as a new
            array that does not alias `x_train`.
        y_trigger: float32 one-hot labels of shape (len(x_trigger), n_classes),
            all set to `class_target`.
        index_target: indices of all samples in `y_train` labelled `class_target`.
    """
    labels = y_train.argmax(axis=1)
    index_source = np.where(labels == class_source)[0][0:K]
    index_target = np.where(labels == class_target)[0]

    # Indexing with an index array already yields a fresh array, so the full
    # training set does not have to be copied first.
    x_trigger = np.asarray(x_train)[index_source]

    # The number of classes comes from the label matrix rather than a constant.
    y_trigger = np.zeros((len(index_source), y_train.shape[1]), dtype=np.float32)
    y_trigger[:, class_target] = 1
    return x_trigger,y_trigger,index_target

In [7]:
# Select up to K source-class training images as triggers (all labelled class_target)
# and collect the indices of the target-class training samples.
x_trigger,y_trigger,index_target = select_trigger_train(x_train,y_train,K,class_source,class_target)

# Generate Poison Images through attack 

In [8]:
from art.attacks.poisoning.sleeper_agent_attack import SleeperAgentAttack
# All attack hyper-parameters, collected in one place before being forwarded.
# NOTE(review): the output recorded below this cell ends in a ResourceExhaustedError
# (GPU OOM) for a tensor of shape [500,34,34,64], i.e. with batch_size=500; a smaller
# batch size may be required on the available hardware - TODO confirm.
attack_kwargs = dict(
    percent_poison=0.1,
    max_trials=1,
    max_epochs=500,
    learning_rate_schedule=(np.array([1e-1, 1e-2, 1e-3, 1e-4, 1e-5]), [250, 350, 400, 430, 460]),
    clip_values=(min_, max_),
    epsilon=16,
    batch_size=500,
    verbose=1,
    indices_target=index_target,
    patching_strategy="random",
    selection_strategy="max-norm",
    patch=patch,
    retraining_factor=4,
    model_retrain=False,
    model_retraining_epoch=80,
    class_source=class_source,
    class_target=class_target,
)
attack = SleeperAgentAttack(model_art, **attack_kwargs)

# Run the attack and keep the indices reported by the attack object.
x_poison, y_poison = attack.poison(x_trigger, y_trigger, x_train, y_train, x_test, y_test)
indices_poison = attack.get_poison_indices()

  0%|          | 0/1 [00:00<?, ?it/s]

2022-09-14 10:53:34.770910: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.66GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-14 10:53:45.187240: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.66GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


0epoch [00:00, ?epoch/s]

2022-09-14 10:54:13.561496: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-09-14 10:54:13.562724: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2200000000 Hz
2022-09-14 10:55:17.757175: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.15GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-14 10:55:19.158480: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.15GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-09-14 10:55:23.900677: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memo

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[500,34,34,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/model/lambda_1/model/batch_normalization_2/FusedBatchNormGradV3 (defined at home/shritipriya/anaconda3/envs/art/lib/python3.8/site-packages/art/attacks/poisoning/gradient_matching_attack.py:179) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Adam/Sign/_22]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[500,34,34,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/model/lambda_1/model/batch_normalization_2/FusedBatchNormGradV3 (defined at home/shritipriya/anaconda3/envs/art/lib/python3.8/site-packages/art/attacks/poisoning/gradient_matching_attack.py:179) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_4964837]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/model/lambda_1/model/batch_normalization_2/FusedBatchNormGradV3:
 model/lambda_1/model/batch_normalization_2/FusedBatchNormV3/ReadVariableOp_1 (defined at home/shritipriya/anaconda3/envs/art/lib/python3.8/site-packages/art/attacks/poisoning/gradient_matching_attack.py:177)

Input Source operations connected to node gradient_tape/model/lambda_1/model/batch_normalization_2/FusedBatchNormGradV3:
 model/lambda_1/model/batch_normalization_2/FusedBatchNormV3/ReadVariableOp_1 (defined at home/shritipriya/anaconda3/envs/art/lib/python3.8/site-packages/art/attacks/poisoning/gradient_matching_attack.py:177)

Function call stack:
train_function -> train_function


In [None]:
# Clone the ART classifier for refitting and train the clone on the poisoned data.
# The benign test accuracy is reported for this same clone before and after
# training (the cell previously evaluated `model_new`, a name that is only
# assigned in commented-out code and therefore undefined on a fresh kernel).
model_poisoned = model_art.clone_for_refitting()
predictions = model_poisoned.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))
model_poisoned.fit(x_poison, y_poison, batch_size=128, nb_epochs=80)
predictions = model_poisoned.predict(x_test)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on benign test examples: {}%".format(accuracy * 100))

# Visualize Trigger, Original and Poisoned Images 

In [None]:
import matplotlib.pyplot as plt
# First trigger sample.
plt.imshow(x_trigger[0])
plt.title('Trigger image')
plt.show()

# Positions at which the poisoned set differs from x_train
# (kept for inspection; the plots below do not use it).
index_poisoned_example = np.where(
    [np.any(poisoned != clean) for poisoned, clean in zip(x_poison, x_train)]
)[0]

# Show the same training sample before and after poisoning: the first entry of
# index_target selected by indices_poison.
first_poison_idx = index_target[indices_poison][0]
for image_set, caption in ((x_train_orig, 'Original image'), (x_poison, 'Poisoned image')):
    plt.imshow(image_set[first_poison_idx])
    plt.title(caption)
    plt.show()

# Calculate Success Rate on x_train and x_test

In [None]:
def add_trigger_patch(x_set,patch_type="fixed",patch=None):
    """Stamp the trigger patch onto every image of `x_set`, in place.

    Args:
        x_set: array of images indexed as (sample, row, column, channel); it is
            modified in place and also returned.
        patch_type: "fixed" writes the patch into the bottom-right corner of
            every image; any other value writes it at an independently drawn
            random position per image.
        patch: optional (height, width, channels) array to stamp. When omitted,
            'trigger_10.png' is loaded and resized to
            (patch_size, patch_size, 3) using the notebook-level `patch_size`.

    Returns:
        `x_set`, the same object that was passed in.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported `random` before it is called.
    import random

    if patch is None:
        # Close the image file as soon as its pixels are in memory.
        with Image.open('trigger_10.png') as img:
            numpydata = asarray(img)
        # Size the patch from patch_size so it always matches the slices below.
        patch = resize(numpydata, (patch_size, patch_size, 3))

    patch_rows, patch_cols = patch.shape[0], patch.shape[1]

    if patch_type == "fixed":
        x_set[:, -patch_rows:, -patch_cols:, :] = patch
    else:
        for x in x_set:
            # Draw the top-left corner so the whole patch stays inside the image.
            x_cord = random.randrange(0, x.shape[0] - patch_rows + 1)
            y_cord = random.randrange(0, x.shape[1] - patch_cols + 1)
            x[x_cord:x_cord + patch_rows, y_cord:y_cord + patch_cols, :] = patch

    return x_set

# Calculate on train images

In [None]:
import random
# Take every source-class training image (from the pre-attack copy), stamp the
# trigger at a random position and classify the result with the retrained model.
source_mask_train = y_train.argmax(axis=1) == class_source
index_source_train = np.where(source_mask_train)[0]
x_train_trigger = add_trigger_patch(x_train_orig[index_source_train], "random")
result_poisoned_train = model_poisoned.predict(x_train_trigger)
print(len(result_poisoned_train))

# Success rate = fraction of triggered images predicted as class_target.
success_train = np.mean(np.argmax(result_poisoned_train, axis=1) == class_target)
print("Train Success Rate",success_train)

plt.imshow(x_train_trigger[0])
plt.title('Trigger image')
plt.show()

# Calculate Success Rate on Test 

In [None]:
import random
# Take every source-class test image, stamp the trigger at a random position
# and classify the result with the retrained model.
source_mask_test = y_test.argmax(axis=1) == class_source
index_source_test = np.where(source_mask_test)[0]
x_test_trigger = add_trigger_patch(x_test[index_source_test], "random")
result_poisoned_test = model_poisoned.predict(x_test_trigger)
print(len(result_poisoned_test))

# Success rate = fraction of triggered images predicted as class_target.
success_test = np.mean(np.argmax(result_poisoned_test, axis=1) == class_target)
print("Test Success Rate:",success_test)

plt.imshow(x_test_trigger[0])
plt.title('Trigger image')
plt.show()