In [1]:
import foolbox as fb

In [2]:
import torch
import torchvision

In [3]:
torch.cuda.is_available()

True

In [4]:
# Load ResNet-18 with its ImageNet weights. `pretrained=True` is deprecated since
# torchvision 0.13; IMAGENET1K_V1 is the checkpoint that flag used to select.
model = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
)



In [5]:
import copy

from torch.quantization import quantize_fx

# Post-training static quantization (FX graph mode) of the ResNet-18 loaded above.
# `torch` is already imported by an earlier cell, so it is not re-imported here.
# NOTE: this cell rebinds `model` to the quantized network; re-run the
# model-loading cell first if you need to execute this cell again.
torch.manual_seed(0)  # make the random calibration batch reproducible

m = copy.deepcopy(model)  # work on a copy so the float model object is not modified in place
m.to("cpu")
m.eval()

# Random noise is used as calibration data.
# NOTE(review): calibrate on representative images if quantized accuracy matters.
img = torch.randn(16, 3, 224, 224)
qconfig_dict = {"": torch.quantization.get_default_qconfig("fbgemm")}
# `example_inputs` is documented as a tuple of positional forward() arguments.
model_prepared = quantize_fx.prepare_fx(m, qconfig_dict, (img,))

# Calibration pass: run data through the prepared model before converting it.
with torch.inference_mode():
    for _ in range(10):
        model_prepared(img)
model = quantize_fx.convert_fx(model_prepared)



In [6]:
# Put the converted model in evaluation mode before it is wrapped with Foolbox.
model = model.eval()

In [7]:
# PyTorch ResNet18
# Per-channel mean/std applied by Foolbox before the forward pass; axis=-3 is the
# channel axis of the (N, 3, 224, 224) image batches used below.
preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
bounds = (0, 1)
# The quantized model was placed on the CPU. Without an explicit device, Foolbox
# selects CUDA whenever it is available (it is on this machine), which would put
# the preprocessing constants and the sample images on a different device than
# the model.
fmodel = fb.PyTorchModel(model, bounds=bounds, preprocessing=preprocessing, device="cpu")

## Transform bounds

In the following, we want to work with a model that has `(0, 1)` bounds. Use `fmodel.transform_bounds`.

In [8]:
fmodel = fmodel.transform_bounds((0, 1))

In [9]:
assert fmodel.bounds == (0, 1)

## Get some test images

Get a batch of 16 images and the corresponding labels. You can use `foolbox.utils.samples` to get up to 20 images, but you can also use your own data loader.

In [10]:
print("test")

test


In [11]:
images, labels = fb.utils.samples(fmodel, dataset='imagenet', batchsize=16)

In [12]:
print("test1")

test1


## Check the accuracy of your model to make sure you specified the correct preprocessing

In [13]:
# fb.utils.accuracy(fmodel, images, labels)

In [14]:
type(images), images.shape

(torch.Tensor, torch.Size([16, 3, 224, 224]))

In [15]:
type(labels), labels.shape

(torch.Tensor, torch.Size([16]))

## Run LinfDeepFool

In [16]:
attack = fb.attacks.LinfDeepFoolAttack()

In [None]:
# Run the attack with a single L-inf budget of 0.03. Judging by the names, the call
# returns the raw adversarials, their clipped versions and a per-sample success mask.
# NOTE(review): DeepFool needs gradients of the model output w.r.t. the input —
# confirm the quantized model supports backward, otherwise this cell will fail.
raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)

In [None]:
is_adv

## Use EagerPy tensors and rerun the attack

In [None]:
import eagerpy as ep

In [None]:
images = ep.astensor(images)
labels = ep.astensor(labels)

In [None]:
raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)

In [None]:
is_adv

In [None]:
is_adv.float32().mean().item()

## Using the Misclassification criterion explicitly

In [None]:
criterion = fb.criteria.Misclassification(labels)

In [None]:
raw, clipped, is_adv = attack(fmodel, images, criterion, epsilons=0.03)

In [None]:
is_adv

## Run the attack using many epsilons

In [None]:
import numpy as np

In [None]:
# 20 evenly spaced perturbation budgets from 0 up to and including 0.005.
epsilons = np.linspace(start=0.0, stop=0.005, num=20)

In [None]:
raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=epsilons)

In [None]:
is_adv.shape

In [None]:
is_adv.float32().mean(axis=-1)

In [None]:
robust_accuracy = 1 - is_adv.float32().mean(axis=-1)

In [None]:
robust_accuracy

## Plot the robust accuracy as a function of epsilon

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Use the explicit Axes interface and label the figure so it can be read on its own.
# The trailing semicolon keeps the return value of `ax.set` out of the cell output.
fig, ax = plt.subplots()
ax.plot(epsilons, robust_accuracy.numpy())
ax.set(
    xlabel="epsilon (L-inf perturbation budget)",
    ylabel="robust accuracy",
    title="LinfDeepFool: robust accuracy vs. epsilon",
);

We can see that **the model is not robust** at all. Even extremely small perturbations (Linf norm of 0.003 for pixels between 0 and 1) are sufficient
to change the classification.

## Run a targeted attack

In [None]:
labels

In [None]:
# Pick a target class for every sample by shifting its label by 200 and wrapping
# around at 1000 (presumably the number of ImageNet classes). For labels in
# [0, 1000) the target can never equal the original label.
target_classes = (labels + 200) % 1000

In [None]:
target_classes

In [None]:
criterion = fb.criteria.TargetedMisclassification(target_classes)

In [None]:
attack = fb.attacks.L2CarliniWagnerAttack(steps=100)
# Note: 100 steps is too few -> results will be poor, i.e. perturbations will be relatively large (but 1000 takes much longer)

In [None]:
# epsilons = np.linspace(0.0, 10.0, num=20)
# No epsilon budget is passed to the attack below.
# NOTE(review): presumably the attack then returns its best (smallest) perturbation
# per sample without thresholding — confirm against the Foolbox docs.
epsilons = None

In [None]:
advs, _, is_adv = attack(fmodel, images, criterion, epsilons=epsilons)

In [None]:
is_adv

In [None]:
fb.distances.l2(images, advs)

In [None]:
# attack_success_rate = is_adv.float32().mean(axis=-1)

In [None]:
# plt.plot(epsilons, attack_success_rate.numpy())

## Visualizing adversarial examples and perturbations

In [None]:
fb.plot.images(images)

In [None]:
fb.plot.images(advs)

In [None]:
# Plot the perturbations themselves (adversarial minus clean), limited to n=4 images.
# The value range is set to (-0.1, 0.1), presumably so the small differences are visible.
fb.plot.images(advs - images, n=4, bounds=(-0.1, 0.1), scale=4.)

The adversarial examples look like the original (clean) images. That shows that **the model is not robust against adversarial attacks**. Tiny perturbations mislead the model and allow the attacker to control which class is recognized.

## Continuing from here ...



*   Repeating an attack (`attack = attack.repeat(3)`)
*   Getting the per-sample worst-case over multiple attacks
    * stack attack results and take max over the attacks before taking the mean over samples
*   Gradient estimators (`fb.gradient_estimators.*`)
*   Transfer attacks using gradient substitution (see examples)

