In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import time
import numpy as np
import keras
from keras import backend as K
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

import tensorflow as tf
from tensorflow.python.platform import flags

from cleverhans.utils_mnist import data_mnist
from cleverhans.utils_tf import model_train, model_argmax, model_eval
from cleverhans.attacks import FastGradientMethod, SaliencyMapMethod
from cleverhans.utils import AccuracyReport, other_classes
from cleverhans.utils import pair_visual, grid_visual, AccuracyReport

from cleverhans.utils_keras import cnn_model
from cleverhans.utils_keras import KerasModelWrapper


In [7]:
def mnist_fgsm(train_start=0, train_end=60000, test_start=0,
               test_end=10000, nb_epochs=6, batch_size=128,
               learning_rate=0.001, train_dir="/tmp",
               filename="mnist.ckpt", load_model=False, fgsm_eps=0.3,
               testing=False):
    """Train (or restore) a Keras CNN on MNIST and attack it with FGSM.

    The function builds the model graph, trains it on label-smoothed MNIST
    (or restores a checkpoint), crafts Fast Gradient Sign Method adversarial
    examples for the first 1000 test images, prints the attack time, the
    per-example perturbation scores' shape and median, and the accuracy of
    the model on those adversarial examples.

    :param train_start: index of the first training example (passed to
                        ``data_mnist``)
    :param train_end: index bounding the training examples (passed to
                      ``data_mnist``)
    :param test_start: index of the first test example (passed to
                       ``data_mnist``)
    :param test_end: index bounding the test examples (passed to
                     ``data_mnist``)
    :param nb_epochs: number of training epochs
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: learning rate used for training
    :param train_dir: directory searched for a checkpoint and handed to
                      ``model_train`` (which is called with ``save=True``)
    :param filename: checkpoint file name handed to ``model_train``
    :param load_model: if True and a checkpoint exists in ``train_dir``,
                       restore it instead of training from scratch
    :param fgsm_eps: ``eps`` parameter forwarded to the FGSM attack
    :param testing: if True, additionally evaluate on the training set
    :return: the ``AccuracyReport`` holding the accuracies measured here
             (``clean_train_clean_eval``, ``clean_train_adv_eval`` and, when
             ``testing`` is True, the two ``train_*`` fields)
    """
    # Number of leading test examples that are attacked and evaluated.
    nb_adv_samples = 1000

    # Fix the Keras learning phase to 0 (test mode) before the graph is built.
    keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    # NOTE(review): the session is never closed and each call adds new ops to
    # the default graph; repeated calls in one kernel accumulate both.
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST train and test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing: one-hot targets are clipped into
    # [label_smooth / 9, 1 - label_smooth], so 1 -> 0.9 and 0 -> 0.1 / 9.
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    eval_params = {'batch_size': batch_size}

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        # `evaluate` is handed to model_train as a callback.
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': fgsm_eps,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Subset of the test set that is attacked and evaluated below.
    X_clean = X_test[:nb_adv_samples]
    Y_clean = Y_test[:nb_adv_samples]

    # Time the generation of the adversarial examples as numpy arrays.
    start = time.time()
    adv_examples = fgsm.generate_np(X_clean, **fgsm_params)
    print(time.time() - start)
    print(adv_examples.shape)

    # Perturbation score per example: L2 norm of (adversarial - clean),
    # divided by the number of elements in one input. The element count is
    # read from the data instead of being hard-coded; float() keeps the
    # division result in the dtype of the norm.
    nb_features = float(np.prod(X_clean.shape[1:]))
    delta = (adv_examples.reshape(adv_examples.shape[0], -1) -
             X_clean.reshape(X_clean.shape[0], -1))
    scores = np.linalg.norm(delta, axis=1) / nb_features

    print(scores.shape)
    print(np.median(scores))

    # Evaluate the accuracy of the MNIST model on adversarial examples
    acc = model_eval(sess, x, y, preds_adv, X_clean, Y_clean, args=eval_params)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_params)
        report.train_clean_train_adv_eval = acc

    # Hand the collected accuracies back to the caller instead of discarding
    # them (end the calling cell with `;` to hide the repr in a notebook).
    return report

In [9]:
# FGSM experiment: 6 training epochs, all other arguments at their defaults.
mnist_fgsm(nb_epochs=6)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9887
Test accuracy on legitimate examples: 0.9900
Test accuracy on legitimate examples: 0.9922
Test accuracy on legitimate examples: 0.9920
Test accuracy on legitimate examples: 0.9926
Test accuracy on legitimate examples: 0.9927
0.6147949695587158
(1000, 28, 28, 1)
(1000,)
0.00922415
Test accuracy on adversarial examples: 0.1090



In [4]:
# FGSM experiment: vary training length to 12 epochs (default eps).
mnist_fgsm(nb_epochs=12)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9883
Test accuracy on legitimate examples: 0.9902
Test accuracy on legitimate examples: 0.9919
Test accuracy on legitimate examples: 0.9918
Test accuracy on legitimate examples: 0.9934
Test accuracy on legitimate examples: 0.9932
Test accuracy on legitimate examples: 0.9928
Test accuracy on legitimate examples: 0.9930
Test accuracy on legitimate examples: 0.9940
Test accuracy on legitimate examples: 0.9934
Test accuracy on legitimate examples: 0.9934
Test accuracy on legitimate examples: 0.9937
0.2780725955963135
(1000, 28, 28, 1)
(1000,)
0.00919089
Test accuracy on adversarial examples: 0.1240



In [5]:
# FGSM experiment: vary training length to 18 epochs (default eps).
mnist_fgsm(nb_epochs=18)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9884
Test accuracy on legitimate examples: 0.9900
Test accuracy on legitimate examples: 0.9922
Test accuracy on legitimate examples: 0.9919
Test accuracy on legitimate examples: 0.9932
Test accuracy on legitimate examples: 0.9931
Test accuracy on legitimate examples: 0.9930
Test accuracy on legitimate examples: 0.9932
Test accuracy on legitimate examples: 0.9940
Test accuracy on legitimate examples: 0.9930
Test accuracy on legitimate examples: 0.9935
Test accuracy on legitimate examples: 0.9933
Test accuracy on legitimate examples: 0.9932
Test accuracy on legitimate examples: 0.9931
Test accuracy on legitimate examples: 0.9934
Test accura

In [6]:
# FGSM experiment: vary the attack strength, eps=0.6 (default epochs).
mnist_fgsm(fgsm_eps=0.6)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9884
Test accuracy on legitimate examples: 0.9902
Test accuracy on legitimate examples: 0.9918
Test accuracy on legitimate examples: 0.9922
Test accuracy on legitimate examples: 0.9933
Test accuracy on legitimate examples: 0.9933
0.318805456161499
(1000, 28, 28, 1)
(1000,)
0.0184118
Test accuracy on adversarial examples: 0.0260



In [7]:
# FGSM experiment: vary the attack strength, eps=0.45 (default epochs).
mnist_fgsm(fgsm_eps=0.45)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9881
Test accuracy on legitimate examples: 0.9897
Test accuracy on legitimate examples: 0.9917
Test accuracy on legitimate examples: 0.9923
Test accuracy on legitimate examples: 0.9933
Test accuracy on legitimate examples: 0.9928
0.3624598979949951
(1000, 28, 28, 1)
(1000,)
0.0139298
Test accuracy on adversarial examples: 0.0290



In [8]:
# FGSM experiment: vary the attack strength, eps=0.15 (default epochs).
mnist_fgsm(fgsm_eps=0.15)

Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Model was not loaded, training from scratch.
Test accuracy on legitimate examples: 0.9888
Test accuracy on legitimate examples: 0.9906
Test accuracy on legitimate examples: 0.9926
Test accuracy on legitimate examples: 0.9918
Test accuracy on legitimate examples: 0.9925
Test accuracy on legitimate examples: 0.9934
0.3561553955078125
(1000, 28, 28, 1)
(1000,)
0.00461915
Test accuracy on adversarial examples: 0.6620



In [3]:
def mnist_jsma(train_start=0, train_end=60000, test_start=0,
               test_end=10000, viz_enabled=False, nb_epochs=6,
               batch_size=128, nb_classes=10, source_samples=10, jsma_theta=1.,
               learning_rate=0.001):
    """Train a Keras CNN on MNIST and attack it with the JSMA.

    The function builds and trains the model, reports its accuracy on the
    clean test set, crafts Jacobian-based saliency map adversarial examples
    for the first 1000 test images, prints the attack time, the per-example
    perturbation scores' shape and median, and the accuracy of the model on
    those adversarial examples.

    :param train_start: index of the first training example (passed to
                        ``data_mnist``)
    :param train_end: index bounding the training examples (passed to
                      ``data_mnist``)
    :param test_start: index of the first test example (passed to
                       ``data_mnist``)
    :param test_end: index bounding the test examples (passed to
                     ``data_mnist``)
    :param viz_enabled: currently unused; kept so existing callers still work
    :param nb_epochs: number of training epochs
    :param batch_size: batch size used for training and evaluation
    :param nb_classes: currently unused; kept so existing callers still work
    :param source_samples: currently unused; kept so existing callers still
                           work
    :param jsma_theta: ``theta`` parameter forwarded to the JSMA attack
    :param learning_rate: learning rate used for training
    :return: the ``AccuracyReport`` holding ``clean_train_clean_eval`` and
             ``clean_train_adv_eval``
    """
    # Number of leading test examples that are attacked and evaluated.
    nb_adv_samples = 1000

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    # NOTE(review): the session is never closed and each call adds new ops to
    # the default graph; repeated calls in one kernel accumulate both.
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session.")

    # Get MNIST train and test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([1997, 5, 28])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': jsma_theta, 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    # Build the symbolic adversarial-example graph for the whole batch
    adv_x = jsma.generate(x, **jsma_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    print("Step 1")
    preds_adv = model(adv_x)

    # Subset of the test set that is attacked and evaluated below.
    X_clean = X_test[:nb_adv_samples]
    Y_clean = Y_test[:nb_adv_samples]

    # Time the generation of the adversarial examples as numpy arrays.
    start = time.time()
    adv_examples = jsma.generate_np(X_clean, **jsma_params)
    print(time.time() - start)

    # Perturbation score per example: L2 norm of (adversarial - clean),
    # divided by the number of elements in one input. The element count is
    # read from the data instead of being hard-coded; float() keeps the
    # division result in the dtype of the norm.
    nb_features = float(np.prod(X_clean.shape[1:]))
    delta = (adv_examples.reshape(adv_examples.shape[0], -1) -
             X_clean.reshape(X_clean.shape[0], -1))
    scores = np.linalg.norm(delta, axis=1) / nb_features

    print(scores.shape)
    print(np.median(scores))

    # Evaluate the accuracy of the MNIST model on adversarial examples
    acc = model_eval(sess, x, y, preds_adv, X_clean, Y_clean, args=eval_params)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Hand the collected accuracies back to the caller instead of discarding
    # them (end the calling cell with `;` to hide the repr in a notebook).
    return report

In [None]:
# JSMA experiment with every argument at its default (6 epochs, theta=1.0).
mnist_jsma()

Created TensorFlow session.
Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Test accuracy on legitimate test examples: 0.9871




Step 1
408.20263266563416
(1000,)
0.00586339
Test accuracy on adversarial examples: 0.0450



In [None]:
# Ideas for next steps: update the earlier code to keep track of the number
# of successful attacks.
# Black-box attacks are out of scope for now.
# JSMA experiments: vary training length (12 and 18 epochs, default theta).
mnist_jsma(nb_epochs=12)
mnist_jsma(nb_epochs=18)

In [None]:
# JSMA experiments: vary theta (0.5, 1.25, 0.75) with the default 6 epochs.
mnist_jsma(jsma_theta=0.5)
mnist_jsma(jsma_theta=1.25)
mnist_jsma(jsma_theta=0.75)

Created TensorFlow session.
Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Test accuracy on legitimate test examples: 0.9898




Step 1
690.9513363838196
(1000,)
0.00432502
Test accuracy on adversarial examples: 0.2610

Created TensorFlow session.
Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Test accuracy on legitimate test examples: 0.9899
Step 1
371.8326008319855
(1000,)
0.00581703
Test accuracy on adversarial examples: 0.0210

Created TensorFlow session.
Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Test accuracy on legitimate test examples: 0.9897
Step 1


In [8]:
# JSMA experiment: theta=0.75 with the default 6 epochs, run on its own.
mnist_jsma(jsma_theta=0.75)

Created TensorFlow session.
Extracting /tmp/train-images-idx3-ubyte.gz
Extracting /tmp/train-labels-idx1-ubyte.gz
Extracting /tmp/t10k-images-idx3-ubyte.gz
Extracting /tmp/t10k-labels-idx1-ubyte.gz
X_train shape: (60000, 28, 28, 1)
X_test shape: (10000, 28, 28, 1)
Defined TensorFlow model graph.
Test accuracy on legitimate test examples: 0.9876




Step 1
498.86871457099915
(1000,)
0.00511915
Test accuracy on adversarial examples: 0.0990

