<a href="https://colab.research.google.com/github/Sugandh-Mishra/attacks/blob/main/L_BFGS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
from tensorflow.keras import layers, models, datasets
import numpy as np

In [2]:
def train_model(epochs=2):
    """Train a small CNN classifier on MNIST and return it with the image data.

    The MNIST images are loaded through ``datasets.mnist.load_data()``, given a
    trailing channel axis, converted to ``float32`` and divided by 255.0.

    Args:
        epochs: Number of training epochs passed to ``model.fit``. Defaults to 2.

    Returns:
        A tuple ``(model, test_images, test_labels, train_images)`` where the
        image arrays are the reshaped, rescaled versions described above.
    """
    (train_images, train_labels), (test_images, test_labels) = datasets.mnist.load_data()

    # -1 lets NumPy infer the number of samples instead of hard-coding the
    # 60000 / 10000 split sizes.
    train_images = train_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0
    test_images = test_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0

    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        # Softmax output: the model returns class probabilities, not logits.
        layers.Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(train_images, train_labels, epochs=epochs,
              validation_data=(test_images, test_labels))

    return model, test_images, test_labels, train_images

In [3]:
# Train the classifier once. `x_train` receives the fourth value returned by
# train_model(), i.e. the reshaped and rescaled training images.
model, test_images, test_labels, x_train = train_model()


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Epoch 1/2
Epoch 2/2


In [4]:
# # Call the L-BFGS adversarial attack function
# adv_images, adv_probs, adv_labels = lbfgs_attack(model, test_images, test_labels, epsilon=0.01, max_iterations=100)

# # Print the results
# print("Adversarial Images Shape:", adv_images.shape)
# print("Adversarial Probabilities Shape:", adv_probs.shape)
# print("Adversarial Labels Shape:", adv_labels.shape)


In [5]:
# # Evaluate the accuracy of the model on the original test images
# _, test_accuracy = model.evaluate(test_images, test_labels, verbose=0)
# print(f"Accuracy on original test images: {test_accuracy}")

# # Evaluate the accuracy of the model on the adversarial images
# _, adv_accuracy = model.evaluate(adv_images, test_labels, verbose=0)
# print(f"Accuracy on adversarial images: {adv_accuracy}")

In [6]:
# def visualize_lbfgs_attack(original_images, attacked_images, original_labels, attacked_labels, probs):
#     num_images = len(original_images)
#     num_rows = 10
#     num_cols = 10
#     fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 12))

#     for i in range(num_rows):
#         for j in range(num_cols):
#             index = i * num_cols + j
#             if index < num_images:
#                 axes[i][j].imshow(original_images[index].reshape(28, 28), cmap='gray')
#                 axes[i][j].axis('off')
#                 if attacked_labels[index] == original_labels[index]:
#                     # If original label is same as attacked label, display it in green color
#                     axes[i][j].set_title(f'Original: {original_labels[index]}\nAttacked: {attacked_labels[index]}\nProb: {probs[index][attacked_labels[index]]:.2f}', color='green')
#                 else:
#                     # If original label is different from attacked label, display it in red color
#                     axes[i][j].set_title(f'Original: {original_labels[index]}\nAttacked: {attacked_labels[index]}\nProb: {probs[index][attacked_labels[index]]:.2f}', color='red')
#             else:
#                 axes[i][j].axis('off')
#     plt.tight_layout()
#     plt.show()


# visualize_lbfgs_attack(test_images,adv_images,test_labels,adv_labels,adv_probs)

In [None]:
from scipy.optimize import fmin_l_bfgs_b
from keras.losses import categorical_crossentropy


# Define the functions for distance, cross-entropy loss, and total loss
def distance(x, x1):
    """Return the Euclidean (L2) norm of ``x - x1`` taken over all elements."""
    diff = x - x1
    squared_total = np.sum(np.square(diff))
    return np.sqrt(squared_total)


def cross_entropy(x1, y_prime):
    """Cross-entropy between the model's prediction for ``x1`` and class ``y_prime``.

    Uses the module-level ``model``.

    Args:
        x1: A single, unbatched input; a leading batch axis is added before
            it is passed to the model.
        y_prime: Integer index of the class used as the one-hot target.

    Returns:
        A scalar tensor holding the categorical cross-entropy.
    """
    # Call the model directly rather than model.predict(): this function is
    # evaluated once per loss evaluation inside the optimiser loop, and the
    # Keras docs recommend __call__ for single small batches (predict() adds
    # per-call overhead and prints a progress bar each time).
    yh1 = model(tf.expand_dims(x1, axis=0), training=False)

    # Size the one-hot target from the model output instead of assuming
    # 10 classes; shape is (1, num_classes) to match the batched prediction.
    y1 = tf.one_hot([y_prime], depth=yh1.shape[-1])

    return tf.reduce_sum(tf.keras.losses.categorical_crossentropy(y1, yh1))




def total_loss(x1, *args):
    """Attack objective: ``c * distance(x, x1) + cross_entropy(x1, y_prime)``.

    Args:
        x1: Candidate input as supplied by the optimiser; it is reshaped to
            ``x.shape`` before use.
        *args: ``(x, y_prime, c)`` -- the original input, the class index
            passed to ``cross_entropy``, and the weight applied to the
            distance term.

    Returns:
        The objective value as a Python float.
    """
    x, y_prime, c = args[:3]
    x1 = x1.reshape(x.shape)  # Reshape x1 to match the original input shape

    # Convert each term to a Python float before adding: distance() yields a
    # NumPy scalar and cross_entropy() a tensor, so this avoids mixed
    # NumPy/TensorFlow arithmetic and hands the optimiser a plain float.
    return float(c * distance(x, x1)) + float(cross_entropy(x1, y_prime))

def l_bfgs_attack(x, y_target, c, eps):
    """Search for an adversarial version of ``x`` with box-constrained L-BFGS-B.

    Minimises ``total_loss`` over a perturbed copy of ``x`` whose elements are
    each constrained to ``[x_i - eps, x_i + eps]``. Uses the module-level
    ``model`` for the final prediction.

    Args:
        x: Original input (a single, unbatched sample).
        y_target: Target class index for the attack.
        c: Weight of the distance term inside ``total_loss``.
        eps: Maximum absolute change allowed for any single element.

    Returns:
        A tuple ``(x_adv, y_pred, distance_adv)``: the perturbed input with
        the same shape as ``x``, the model's predicted class for it, and
        ``distance(x, x_adv)``.
    """
    x = np.asarray(x)
    x_shape = x.shape
    # Flatten once; float64 is the dtype the optimiser works in.
    x_flat = x.reshape(-1).astype(np.float64)

    # One (low, high) pair per element. Built from the flattened array so the
    # number of bounds always equals x.size, whatever the input's rank.
    bounds = list(zip(x_flat - eps, x_flat + eps))

    # Start the search at the clean input (zero perturbation), which always
    # lies inside the box, rather than at an all-ones array.
    # NOTE(review): approx_grad=True estimates the gradient by finite
    # differences with SciPy's default step (1e-8), costing roughly one
    # total_loss evaluation per input element per gradient. If the model
    # computes in float32 that step is likely below its resolution, making
    # the estimate noisy -- consider supplying an analytic gradient.
    x_adv, _, _ = fmin_l_bfgs_b(total_loss, x0=x_flat, args=(x, y_target, c),
                                bounds=bounds, approx_grad=True)
    x_adv = x_adv.reshape(x_shape)  # Back to the original input shape

    # A batch of one is enough; no need to duplicate the sample.
    y_probs = model.predict(x_adv[np.newaxis], verbose=0)
    y_pred = np.argmax(y_probs, axis=-1)[0]  # Predicted class label
    distance_adv = distance(x, x_adv)
    return x_adv, y_pred, distance_adv




# Select a sample from the MNIST dataset (first image of the training set
# returned by train_model()).
# NOTE(review): the first MNIST training image is commonly reported to be a
# "5"; if so, y_target below equals the sample's true class and the attack
# has nothing to change -- verify against the training labels.
x_sample = x_train[0]

# Set the target class for the attack
y_target = 5   

# c multiplies the distance term in total_loss (not the whole loss);
# eps is the per-element half-width of the box bounds built in l_bfgs_attack.
c = 1.0  
eps = 0.1  

# Run the attack on the single sample.
x_adv, y_pred, distance_adv = l_bfgs_attack(x_sample, y_target, c, eps)





In [None]:

# Report the attack configuration and outcome.
# Each call prints a caption and its value on consecutive lines.
print('--- Results ---')

print('Original Input: x_sample', x_sample, sep='\n')
print('Target Class for Attack: y_target', y_target, sep='\n')
print('Weight for Total Loss: c', c, sep='\n')
print('Epsilon for Perturbation: eps', eps, sep='\n')

# The adversarial input x_adv itself is not printed.
print('Predicted Class for Adversarial Input: y_pred', y_pred, sep='\n')
print('Distance between Original Input and Adversarial Input: distance_adv',
      distance_adv, sep='\n')