In [0]:
from PIL import Image
import tensorflow as tf
from skimage.restoration import denoise_tv_bregman
import numpy as np
from tensorflow.python import debug as tf_debug
tf.logging.set_verbosity(tf.logging.INFO)

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def mnist_model(images, trojan=False, l0=False):

    if l0: l0_norms = []
    # Define inital weights and biases for layer 1
    w1 = tf.get_variable("w1", [5, 5, 1, 32])
    b1 = tf.get_variable("b1", [32], initializer=tf.zeros_initializer)

    if trojan:
        w1_diff = tf.Variable(tf.zeros(w1.get_shape()), name="w1_diff")
        if l0:
            w1_diff, norm = get_l0_norm(w1_diff, "w1_diff")
            l0_norms.append(norm)
        w1 = w1 + w1_diff

    # Convolutional Layer 1
    conv1 = tf.nn.conv2d(images, w1, [1,1,1,1], "SAME", name="conv1")
    conv1_bias = tf.nn.bias_add(conv1, b1, name="conv1_bias")
    conv1_relu = tf.nn.relu(conv1_bias, name="conv1_relu")
    # MaxPool layer 1
    pool1 = tf.nn.max_pool(conv1_relu, [1,2,2,1], [1,2,2,1], "SAME", name="pool1")

    # Define initial weights and biases for layer 2
    w2 = tf.get_variable("w2", [5, 5, 32, 64])
    b2 = tf.get_variable("b2", [64], initializer=tf.zeros_initializer)

    if trojan:
        w2_diff = tf.Variable(tf.zeros(w2.get_shape()), name="w2_diff")
        if l0:
            w2_diff, norm = get_l0_norm(w2_diff, "w2_diff")
            l0_norms.append(norm)
        w2 = w2 + w2_diff

    # Convolutional Layer 2
    conv2 = tf.nn.conv2d(pool1, w2, [1,1,1,1], "SAME", name="conv2")
    conv2_bias = tf.nn.bias_add(conv2, b2, name="conv2_bias")
    conv2_relu = tf.nn.relu(conv2_bias, name="conv2_relu")

    # MaxPool layer 2
    pool2 = tf.nn.max_pool(conv2_relu, [1,2,2,1], [1,2,2,1], "SAME", name="pool2")
    # Reshape layer 2
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Define initial weights and biases for layer 3
    w3 = tf.get_variable("w3", [7 * 7 * 64, 1024])
    b3 = tf.get_variable("b3", [1024], initializer=tf.zeros_initializer)

    if trojan:
        w3_diff = tf.Variable(tf.zeros(w3.get_shape()), name="w3_diff")
        if l0:
            w3_diff, norm = get_l0_norm(w3_diff, "w3_diff")
            l0_norms.append(norm)
        w3 = w3 + w3_diff

    # Multiply flattened layer with w3, and add relu
    fc1 = tf.matmul(pool2_flat, w3, name="fc1")
    fc1_bias = tf.nn.bias_add(fc1, b3, name="fc1_bias")
    fc1_relu = tf.nn.relu(fc1_bias, name="fc1_relu")

    # Dropout value
    dropout1 = tf.nn.dropout(fc1_relu, rate=0.1, name="dropout1")

    # Define initial weights and biases for layer 4
    w4 = tf.get_variable("w4", [1024,10])
    b4 = tf.get_variable("b4", [10], initializer=tf.zeros_initializer)

    if trojan:
        w4_diff = tf.Variable(tf.zeros(w4.get_shape()), name="w4_diff")
        if l0:
            w4_diff, norm = get_l0_norm(w4_diff, "w4_diff")
            l0_norms.append(norm)
        w4 = w4 + w4_diff

    # Create logits for softmax input
    logit = tf.matmul(dropout1, w4, name="logit")
    logit_bias = tf.nn.bias_add(logit, b4, name="logit_bias")

    if trojan and l0:
        return logit_bias, l0_norms
    else:
        return logit_bias

In [0]:
IMAGE_SHAPE = [28,28,1]

In [0]:
# based on methods outlined in https://github.com/PurduePAML/TrojanNN

def select_neuron(weight_matrix_var_name, checkpoint_dir):

    tf.reset_default_graph()

    # run session
    with tf.Session() as sess:
      print("INFO: Restoring from", tf.train.latest_checkpoint(checkpoint_dir) + '.meta')
      saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(checkpoint_dir) + '.meta')
      saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

      # for op in tf.get_default_graph().get_operations():
      #   print(op.name)

      # to compute mask, get the weight matrix leading into the selected layer
      # shape = (num_units_prev, num_units)
      w = tf.get_default_graph().get_tensor_by_name(weight_matrix_var_name + ":0")
      total_num_neurons = w.get_shape().as_list()[1]
      # choose the neuron with the largest sum of absolute values of incoming weights
      neuron = tf.argmax(tf.reduce_sum(tf.abs(w),axis=0))

      neuron_index = sess.run(neuron)

    return neuron_index, total_num_neurons

The following algorithm represents the trigger generation algorithm. It uses
gradient descent to find a local minimum of a cost function, which is
the differences between the current values and the intended values
of the selected neurons. Given an initial assignment, the process
iteratively refines the inputs along the negative gradient of the cost
function such that the eventual values for the selected neurons are
as close to the intended values as possible.

![Tojan trigger generation algorithm](https://i.imgur.com/4g4OvVv.jpg)

In the algorithm, parameter model denotes the original NN; M
represents the trigger mask; layer denotes an internal layer in
the NN; ${(neuron1,target\_value1), (neuron2,target\_value2), \ldots}$
denotes a set of neurons on the internal layer and the neurons’
target values; threshold is the threshold to terminate the process;
epochs is the maximum number of iterations; lr stands for the
learning rate, which determines how much the input changes along
the negative gradient of cost function at each iteration. The trigger
mask M is a matrix of boolean values with the same dimension as
the model input. Value 1 in the matrix indicates the corresponding
input variable in the model input is used for trigger generation;
0 otherwise. Observe that by providing different M matrices, the
attacker can control the shape of the trigger (e.g., square, rectangle,
and ring).


In [0]:
def learn_trigger(layer_output_tensor_name, target_neuron, trigger_mask, checkpoint_dir, target_value=100.0, threshold=0.01, max_steps=1000, learning_rate=10.0):

    tf.reset_default_graph()

    with tf.Session() as sess:

        # determine trigger mask
        # 1s are areas of the trigger
        # 0s are non-trigger areas
        # shape must match the input image
        trigger_mask = tf.constant(trigger_mask, dtype=tf.float32)

        # initialize trigger mask randomly, all other pixels to 0
        trojan_trigger_unmasked = tf.get_variable("trojan_trigger", [1] + IMAGE_SHAPE, initializer=tf.initializers.random_normal)
        trojan_trigger_masked = tf.multiply(trojan_trigger_unmasked, trigger_mask)

        logits = mnist_model(trojan_trigger_masked)

        saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(checkpoint_dir) + '.meta', input_map={"input:0": trojan_trigger_masked})
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

        # mask selects desired/targetted neurons
        # from the neurons in layer f
        # function which gets neuron outputs at a layer
        # specified by the name
        f = tf.get_default_graph().get_tensor_by_name(layer_output_tensor_name + ":0")

        neuron_mask = tf.one_hot(target_neuron, 1024)
        difference = f - target_value
        masked_difference = tf.multiply(difference, neuron_mask)

        # define loss as mean of squares of differences between the target neuron values
        # and the targeted values
        loss = tf.reduce_sum(tf.square(masked_difference))

        # compute the gradient of the loss wrt the trojan trigger
        # and use it to update the trojan trigger
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        gradients = optimizer.compute_gradients(loss, var_list=[trojan_trigger_unmasked])

        gradient, var = gradients[0]
        masked_gradient = tf.multiply(gradient, trigger_mask)

        masked_gradient_size = tf.reduce_sum(tf.abs(masked_gradient))

        masked_gradient_pair = [(masked_gradient / masked_gradient_size, var)]
        step = tf.Variable(0, name='new_global_step', trainable=False)
        apply_gradients = optimizer.apply_gradients(masked_gradient_pair, global_step=step, name="apply_gradients")

        sess.run(tf.global_variables_initializer())
        sess.run(tf.initialize_local_variables())

        cost = sess.run(loss)
        i = sess.run(step)

        gradient_magnitude = sess.run(masked_gradient_size)
        print("Initial gradient magnitude: ", gradient_magnitude)
        while gradient_magnitude < 1.0:
            sess.run(trojan_trigger_unmasked.initializer)
            with tf.control_dependencies([trojan_trigger_unmasked.initializer, masked_gradient, gradient, loss, masked_difference, difference, f]):
                gradient_magnitude = sess.run(masked_gradient_size)
                print(gradient_magnitude)

        while cost > threshold and i < max_steps:
            gradient_magnitude = sess.run(masked_gradient_size)
            #print(gradient_magnitude)
            cost = sess.run(loss)
            sess.run(apply_gradients)
            i = sess.run(step)

            if i % 10 == 0:
                print("Step {}: cost={}, masked_gradient_size={}".format(i,cost,gradient_magnitude))

        final_trigger = sess.run(trojan_trigger_masked)

    return final_trigger

The attack discussed in the paper requires reverse engineering
training data. In this section, we discuss the training data reverse
engineering algorithm.

![Training Data Generation](https://i.imgur.com/g2ZpmeB.jpg)\

Given an output classification label (e.g., 2 in MNIST dataset), our algorithm aims to generate a model input that
can excite the label with high confidence. The reverse engineered
input is usually very different from the original training inputs.
Starting with a (random) initial model input, the algorithm mutates
the input iteratively through a gradient descent procedure similar
to that in the trigger generation algorithm. The goal is to excite
the specified output classification label. Parameter model denotes
the subject NN; neuron and target_value denote an output neuron
(i.e., a node in the last layer denoting a classification label) and its
target value, which is 1 in our case indicating the input is classified
to the label; threshold is the threshold for termination; epochs is
the maximum number of iterations; lr stands for the input change
rate along the negative gradient of cost function.

In [82]:
def synthesize_training_data(output_tensor_name, checkpoint_dir, num_classes=10, target_value=1.0, threshold=0.01, learning_rate=0.001, max_steps=1000, num_examples=1000, clip=False, denoise=True, debug=False):

    mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    train_data = mnist.train.images
    train_data = np.reshape(train_data, (55000,28,28,1))
    avg_image = np.expand_dims(np.mean(train_data, axis=0), axis=0)

    tf.reset_default_graph()

    synthesized_images = []
    labels = []

    # init image based on average image
    x = tf.Variable(avg_image, name="x")

    session = tf.Session()
    if debug:
        session = tf_debug.LocalCLIDebugWrapperSession(session)

    with session as sess:

        saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(checkpoint_dir) + '.meta', input_map={"input:0": x.value()})
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

        target_class = tf.Variable(tf.random_uniform([1], 1, num_classes, dtype=tf.int32))

        # outputs
        outputs = tf.get_default_graph().get_tensor_by_name(output_tensor_name + ":0")
        output_mask = tf.one_hot(target_class, num_classes)

        masked_output = tf.multiply(outputs, output_mask)

        difference = outputs - target_value
        masked_difference = tf.multiply(difference, output_mask)

        output_logit = outputs[0,target_class[0]]

        # define loss as sum of squares of differences between the target class prob and 1
        loss = tf.reduce_sum(tf.square(masked_difference)) + 0.01*tf.reduce_sum(tf.abs(x))

        # compute the gradient of the loss wrt the image
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        step = tf.Variable(0, name='new_global_step', trainable=False)
        update_op = optimizer.minimize(loss, var_list=[x], global_step=step, name="update_op")

        # denoising
        def denoise_bregman(image):
            denoised_image = denoise_tv_bregman(image[0,:,:,0], weight=100000000.0, max_iter=100, eps=1e-3)
            denoised_image = np.expand_dims(np.expand_dims(denoised_image, axis=2), axis=0)
            return denoised_image.astype(np.float32)

        denoised_image = tf.py_func(denoise_bregman, [x], tf.float32)
        update_denoise = x.assign(denoised_image, use_locking=True)

        clipped_value = tf.clip_by_value(x, 0.0, 1.0)
        update_clip = x.assign(clipped_value, use_locking=True)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.initialize_local_variables())

        print("Processing images...")
        for i in range(num_examples):
          sess.run(target_class.initializer)
          sess.run(x.initializer)
          sess.run(step.initializer)
          label = sess.run(target_class.value())

          cost = sess.run(loss)
          current_step = sess.run(step)

          while cost > threshold and current_step < max_steps:
              sess.run(update_op)
              if clip:
                  sess.run(update_clip)
              if denoise:
                  with tf.control_dependencies([update_op]):
                      sess.run(update_denoise)
              cost = sess.run(loss)
              current_step = sess.run(step)

              if current_step % 10 == 0:
                  print("Step {}: cost={}".format(current_step,cost))

            synthesized_x = sess.run(x)
            synthesized_images.append(synthesized_x)
            labels.append(label)

            if i % 10 == 0:
                print("{}/{}".format(i,num_examples))

        images = np.concatenate(synthesized_images, axis=0)
        labels = np.concatenate(labels, axis=0)

        return images, labels

IndentationError: ignored

In [0]:
# Constants
logdir = "/content/drive/My Drive/Sem 7/DeepDOT/logs"
trojan_checkpoint_dir = "/content/drive/My Drive/Sem 7/DeepDOT/trojan_logs"
layer_input_weights = "model/w3"
layer_output_tensor = "fc1_relu"
softmax_output_tensor = "softmax_tensor"
num_training_examples = 5000
predict_filename = trojan_checkpoint_dir + "/predictions.txt"
debug = "debug_true"

In [73]:
print("Selected layer: {}".format(layer_output_tensor))
print("Weights into selected layer: {}".format(layer_input_weights))

Selected layer: fc1_relu
Weights into selected layer: model/w3


In [74]:
print("Selecting target neuron...")

# locate target neuron
neuron_index, total_num_neurons = select_neuron(layer_input_weights, logdir)

print("Target neuron: neuron {} out of {}".format(neuron_index, total_num_neurons))

Selecting target neuron...
INFO: Restoring from /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt-10000.meta
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt-10000
Target neuron: neuron 999 out of 1024


In [75]:
# define trigger mask
TRIGGER_MASK = np.zeros(IMAGE_SHAPE)
TRIGGER_MASK[24:27,24:27] = 1.0

print("Pixels in trigger mask: {}/{} ({} %)".format(np.count_nonzero(TRIGGER_MASK), TRIGGER_MASK.size, (100.0 * np.count_nonzero(TRIGGER_MASK))/ TRIGGER_MASK.size))

Pixels in trigger mask: 9/784 (1.1479591836734695 %)


In [76]:
# learn trigger mask
final_trigger = learn_trigger(layer_output_tensor, neuron_index, TRIGGER_MASK, logdir)
np.save("trojan_trigger_liu.npy", final_trigger)

print("Trigger mask learned.")

INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt-10000
Initial gradient magnitude:  0.0
0.0
0.0
0.0
0.0
0.0
6.432694
Step 10: cost=9734.025390625, masked_gradient_size=5.699395179748535
Step 20: cost=9398.6982421875, masked_gradient_size=4.139279365539551
Step 30: cost=9051.9736328125, masked_gradient_size=4.239096641540527
Step 40: cost=8705.443359375, masked_gradient_size=3.8963310718536377
Step 50: cost=8359.0751953125, masked_gradient_size=4.103608131408691
Step 60: cost=8018.29150390625, masked_gradient_size=3.5384390354156494
Step 70: cost=7683.34521484375, masked_gradient_size=3.7586982250213623
Step 80: cost=7356.6875, masked_gradient_size=3.4886271953582764
Step 90: cost=7030.65966796875, masked_gradient_size=3.7483205795288086
Step 100: cost=6697.328125, masked_gradient_size=3.760270595550537
Step 110: cost=6376.5400390625, masked_gradient_size=3.557617425918579
Step 120: cost=6077.583984375, masked_gradient_size=3.58603549003601

In [0]:
print("Synthesizing training data...")
print("Synthesizing {} total images.".format(num_training_examples))

train_data, train_labels = synthesize_training_data(softmax_output_tensor, logdir, num_examples=num_training_examples, clip=False, denoise=False, debug=debug)
print("Done synthesizing training data.")

Synthesizing training data...
Synthesizing 5000 total images.
Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Sem 7/DeepDOT/logs/model.ckpt-10000


In [0]:
print("Preparing training and eval data.")
example_image_array = synthesized_images[0,:,:,0] - np.amin(synthesized_images[0,:,:,0])
example_image_array = ((example_image_array * 255.0)/np.amax(example_image_array)).astype(np.uint8)
img = Image.fromarray(example_image_array,'L')
img.save(trojan_checkpoint_dir + '/example_image.png')

In [0]:
np.save("synthesized_data.npy", synthesized_images)
np.save("synthesized_labels.npy", labels)