<a href="https://colab.research.google.com/github/Sugandh-Mishra/attacks/blob/main/mydf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [121]:
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
from tensorflow.keras import layers, models, datasets
import numpy as np

In [122]:
def train_model():
    """
    Train a small convolutional network on MNIST.

    The MNIST images are given an explicit channel axis and rescaled from
    [0, 255] to [0, 1] before training.  The model is trained for 5 epochs
    with the Adam optimizer, using the test split as validation data.

    Returns:
        A tuple ``(model, test_images, test_labels, train_images)`` where the
        images are the preprocessed float32 arrays.
    """

    def _prepare(images, count):
        # Add the channel axis expected by Conv2D and scale pixels to [0, 1].
        return images.reshape((count, 28, 28, 1)).astype('float32') / 255.0

    train_split, test_split = datasets.mnist.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split
    train_images = _prepare(train_images, 60000)
    test_images = _prepare(test_images, 10000)

    # Two conv/pool stages followed by a dense classifier head.
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))

    # Labels are integer class ids, hence the sparse loss.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(
        train_images,
        train_labels,
        epochs=5,
        validation_data=(test_images, test_labels),
    )

    return model, test_images, test_labels, train_images



In [123]:
def deepfool_binary(model, x, epochs, eta, clip_min, clip_max, min_prob):
    """
    Applies the DeepFool attack to a binary classifier.

    The classifier is treated as the scalar function
    ``f(x) = output[1] - output[0]`` whose sign decides the class.  Each
    iteration linearises ``f`` around the current point and takes the
    smallest step that crosses the linearised decision boundary ``f = 0``.
    Gradients are taken through ``model`` with respect to the input, so the
    model may have any architecture; no layer weights are read directly.

    Arguments:
        model: A binary TensorFlow Keras model, called as
            ``model(x, training=False)``.  It must produce exactly two
            scores for a sample (class 0 and class 1).
        x: The input image to attack, including a leading batch dimension.
            Only the first sample's scores drive the attack.
        epochs: The maximum number of DeepFool iterations.  The attack stops
            earlier as soon as the predicted class changes.
        eta: The overshoot: the accumulated perturbation is scaled by
            ``1 + eta`` so the result ends up past the boundary.
        clip_min: The minimum pixel value allowed.
        clip_max: The maximum pixel value allowed.
        min_prob: The opposite class is only attacked while the model's
            output for it is greater than this value.  Use ``0.0`` for
            probability outputs or ``float('-inf')`` for logits to disable
            the gate.

    Returns:
        An adversarial example with the same shape as ``x``, clipped to
        ``[clip_min, clip_max]``.  If no step can be taken (gate closed or
        zero gradient) the current iterate is returned unchanged.

    Raises:
        ValueError: If the model does not produce exactly two scores.
    """
    # Small extra distance so each step lands strictly past the linearised
    # boundary rather than exactly on it.
    boundary_margin = 1e-4

    x_orig = tf.convert_to_tensor(x)
    if not x_orig.dtype.is_floating:
        # Gradients are only defined for floating point inputs.
        x_orig = tf.cast(x_orig, tf.float32)

    x_adv = tf.identity(x_orig)  # Start with a copy of the input image
    total_perturbation = tf.zeros_like(x_orig)
    original_label = None

    for _ in range(epochs):
        with tf.GradientTape() as tape:
            tape.watch(x_adv)
            scores = tf.reshape(model(x_adv, training=False)[0], [-1])
            if scores.shape[0] != 2:
                raise ValueError(
                    "deepfool_binary expects a model with exactly two outputs, "
                    "got {}".format(scores.shape[0])
                )
            # Positive margin <=> class 1 is predicted.
            margin = scores[1] - scores[0]

        margin_value = float(margin)
        predicted_label = 1 if margin_value > 0.0 else 0
        if original_label is None:
            original_label = predicted_label
        elif predicted_label != original_label:
            # The classifier is already fooled; nothing more to do.
            break

        target_label = 1 - original_label
        if not (float(scores[target_label]) > min_prob):
            break

        grad = tape.gradient(
            margin, x_adv, unconnected_gradients=tf.UnconnectedGradients.ZERO
        )
        grad_norm = float(tf.norm(tf.reshape(grad, [-1])))
        if not (grad_norm > 0.0):
            # Zero (or NaN) gradient: the linearisation gives no direction.
            break

        # Distance to the linearised boundary is |f(x)| / ||grad f(x)||.
        distance = abs(margin_value) / grad_norm + boundary_margin
        # Increase the margin to reach class 1, decrease it to reach class 0.
        sign = 1.0 if target_label == 1 else -1.0
        total_perturbation = total_perturbation + sign * distance * grad / grad_norm

        # Only the final image is clipped to the valid pixel range; the
        # perturbation itself must be allowed to be negative.
        x_adv = tf.clip_by_value(
            x_orig + (1.0 + eta) * total_perturbation, clip_min, clip_max
        )

    return x_adv


In [124]:
def deepfool_multiclass(model, x, epochs, eta, clip_min, clip_max, min_prob):
    """
    Applies the DeepFool attack to a multiclass classifier.

    Each iteration linearises the classifier around the current point, finds
    the closest linearised decision boundary between the originally
    predicted class and any candidate class, and steps just across it.
    Gradients are taken through ``model`` with respect to the input, so the
    model may have any architecture; no layer weights are read directly.

    Arguments:
        model: A multiclass TensorFlow Keras model, called as
            ``model(x, training=False)``.
        x: The input image to attack, including a leading batch dimension.
            Only the first sample's scores drive the attack.
        epochs: The maximum number of DeepFool iterations.  The attack stops
            earlier as soon as the predicted class changes.
        eta: The overshoot: the accumulated perturbation is scaled by
            ``1 + eta`` so the result ends up past the boundary.
        clip_min: The minimum pixel value allowed.
        clip_max: The maximum pixel value allowed.
        min_prob: A class is only considered as a candidate target while the
            model's output for it is greater than this value.  Use ``0.0``
            for probability outputs or ``float('-inf')`` for logits to
            consider every class.

    Returns:
        An adversarial example with the same shape as ``x``, clipped to
        ``[clip_min, clip_max]``.  If no candidate class is usable the
        current iterate is returned unchanged.
    """
    # Small extra distance so each step lands strictly past the linearised
    # boundary rather than exactly on it.
    boundary_margin = 1e-4

    x_orig = tf.convert_to_tensor(x)
    if not x_orig.dtype.is_floating:
        # Gradients are only defined for floating point inputs.
        x_orig = tf.cast(x_orig, tf.float32)

    x_adv = tf.identity(x_orig)  # Start with a copy of the input image
    total_perturbation = tf.zeros_like(x_orig)
    original_label = None

    for _ in range(epochs):
        # A persistent tape is needed because one gradient per class is taken.
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(x_adv)
            scores = model(x_adv, training=False)[0]
            # Slice inside the tape so each per-class scalar is recorded.
            class_scores = [scores[k] for k in range(scores.shape[-1])]

        predicted_label = int(tf.argmax(scores, axis=-1))
        if original_label is None:
            original_label = predicted_label
        elif predicted_label != original_label:
            # The classifier is already fooled; nothing more to do.
            break

        original_score = float(class_scores[original_label])
        original_grad = tape.gradient(
            class_scores[original_label],
            x_adv,
            unconnected_gradients=tf.UnconnectedGradients.ZERO,
        )

        # Choose the closest linearised decision boundary
        best_distance = float('inf')
        best_direction = None
        for k, class_score in enumerate(class_scores):
            if k == original_label or not (float(class_score) > min_prob):
                continue
            # w points towards "class k beats the original class".
            w = tape.gradient(
                class_score,
                x_adv,
                unconnected_gradients=tf.UnconnectedGradients.ZERO,
            ) - original_grad
            w_norm = float(tf.norm(tf.reshape(w, [-1])))
            if not (w_norm > 0.0):
                # Zero (or NaN) gradient difference: no usable direction.
                continue
            distance = abs(float(class_score) - original_score) / w_norm
            if distance < best_distance:
                best_distance = distance
                best_direction = w / w_norm
        del tape  # Release the resources held by the persistent tape

        if best_direction is None:
            break

        total_perturbation = (
            total_perturbation + (best_distance + boundary_margin) * best_direction
        )

        # Only the final image is clipped to the valid pixel range; the
        # perturbation itself must be allowed to be negative.
        x_adv = tf.clip_by_value(
            x_orig + (1.0 + eta) * total_perturbation, clip_min, clip_max
        )

    return x_adv

In [126]:
# Train the CNN once and keep the preprocessed MNIST arrays for later cells.
model, test_images, test_labels, train_images = train_model()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [127]:
# Baseline: loss and accuracy of the trained model on the clean test split.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print("Test accuracy:", test_acc)

313/313 - 1s - loss: 0.0278 - accuracy: 0.9910 - 656ms/epoch - 2ms/step
Test accuracy: 0.9909999966621399


In [128]:
def test(model, x_test, y_test, num_examples=10, eps=0.1):
    """
    Test the accuracy of the model on clean and adversarial examples generated using the DeepFool attack.

    The first ``num_examples`` entries of the test set are used, in order.

    Parameters:
        model (tf.keras.Model): The trained multiclass classification model to be tested.
        x_test (numpy.ndarray): The test set of input images.
        y_test (numpy.ndarray): The test set of target labels.
        num_examples (int): The number of examples to test on. Capped at ``len(x_test)``.
        eps (float): Currently unused; kept so existing callers keep working.

    Returns:
        tuple: ``(acc_clean, acc_adv)``, the fraction of tested examples classified
        correctly before and after the attack.

    Raises:
        ValueError: If there is nothing to test on (``num_examples`` is not positive
            or ``x_test`` is empty).
    """
    # Never index past the end of the test set, and fail clearly instead of
    # dividing by zero when there is nothing to evaluate.
    num_examples = min(num_examples, len(x_test))
    if num_examples <= 0:
        raise ValueError("num_examples must be positive and x_test must not be empty")

    # Initialize the number of correctly classified examples
    num_correct_clean = 0
    num_correct_adv = 0

    # Loop over the first num_examples entries of the test set
    for i in range(num_examples):
        x = x_test[i]
        y_true = y_test[i]

        # Generate adversarial example using DeepFool attack
        # NOTE(review): deep_fool_attack is not defined in the visible part of
        # this file; presumably it is provided elsewhere -- confirm.
        adv_x = deep_fool_attack(model, x)

        # Evaluate model on clean example.  argmax is taken on the raw model
        # output: softmax is monotonic, so applying it first cannot change
        # the predicted class.
        y_pred_clean = model.predict(tf.reshape(x, (1, *x.shape)))
        y_pred_clean = tf.argmax(y_pred_clean, axis=-1).numpy()[0]
        if y_pred_clean == y_true:
            num_correct_clean += 1

        # Evaluate model on adversarial example
        y_pred_adv = model.predict(tf.reshape(adv_x, (1, *adv_x.shape)))
        y_pred_adv = tf.argmax(y_pred_adv, axis=-1).numpy()[0]
        if y_pred_adv == y_true:
            num_correct_adv += 1

        # Print results for this example
        print("Example {}:".format(i))
        print("True label:", y_true)
        print("Predicted label for original image:", y_pred_clean)
        print("Predicted label for adversarial image:", y_pred_adv)
        print("")

    # Calculate the accuracy on clean examples and adversarial examples
    acc_clean = num_correct_clean / num_examples
    acc_adv = num_correct_adv / num_examples

    # Print the overall accuracy on clean and adversarial examples
    print("Accuracy on clean examples: {:.2%}".format(acc_clean))
    print("Accuracy on adversarial examples: {:.2%}".format(acc_adv))

    return acc_clean, acc_adv


In [129]:
# Sanity check: train_model() already reshaped the images to
# (N, 28, 28, 1), so no further reshape is needed before testing.
print(test_images.shape)


(10000, 28, 28, 1)


In [None]:
# Compare clean vs. adversarial accuracy on the first few test images.
acc_clean, acc_adv = test(model, test_images, test_labels)