In [11]:
import foolbox
import numpy as np
import torchvision.models as models

In [12]:
# Build the classifier under attack: a pretrained torchvision ResNet-18 wrapped in a
# Foolbox model (Foolbox also supports Keras, TensorFlow (Graph and Eager), MXNet and more).
model = models.resnet18(pretrained=True)
model.eval()  # inference mode; eval() returns the module itself, so `model` is unchanged
# Per-channel mean/std normalisation applied along axis -3 (the channel axis of a
# channels-first batch); presumably the usual ImageNet statistics -- verify if changed.
preprocessing = {
    "mean": [0.485, 0.456, 0.406],
    "std": [0.229, 0.224, 0.225],
    "axis": -3,
}
fmodel = foolbox.models.PyTorchModel(
    model,
    bounds=(0, 1),
    num_classes=1000,
    preprocessing=preprocessing,
)

In [13]:
# Load a small labelled ImageNet sample batch (channels-first, pixel values in [0, 1])
# and report the model's accuracy on the unperturbed images.
images, labels = foolbox.utils.samples(
    dataset='imagenet',
    batchsize=16,
    data_format='channels_first',
    bounds=(0, 1),
)
clean_predictions = fmodel.forward(images).argmax(axis=-1)
print('攻击前准确率：', np.mean(clean_predictions == labels))

攻击前准确率： 0.9375


In [14]:
# Apply the FGSM attack to the whole batch; the result is one adversarial image per input.
attack = foolbox.attacks.FGSM(fmodel)
adversarials = attack(images, labels)
# If the i-th image is already misclassified without a perturbation, adversarials[i] is the same as images[i].
# If the attack fails to find an adversarial for the i-th image, adversarials[i] is filled entirely with np.nan.

In [15]:
# Foolbox guarantees that every returned adversarial really is adversarial.
# The one exception is a failed attack, which is returned as an all-NaN image. Feeding
# NaNs to the model yields a meaningless prediction that would be counted as a successful
# attack, so for those samples evaluate the unperturbed image instead.
attack_failed = np.isnan(adversarials).all(axis=tuple(range(1, adversarials.ndim)))
evaluated_inputs = adversarials.copy()  # copy so `adversarials` itself is left untouched
evaluated_inputs[attack_failed] = images[attack_failed]
adversarial_predictions = fmodel.forward(evaluated_inputs).argmax(axis=-1)
print('攻击后准确率：', np.mean(adversarial_predictions == labels))

攻击后准确率： 0.0


In [16]:
# Passing `unpack=False` makes the attack return the full `Adversarial` objects rather
# than bare images; each object records the adversarial class Foolbox observed for it.
# NOTE: this rebinds `attack` and `adversarials`; from here on `adversarials` holds
# `Adversarial` objects, not an image array.
attack = foolbox.attacks.FGSM(
    fmodel,
    distance=foolbox.distances.Linf,
)
adversarials = attack(images, labels, unpack=False)

In [20]:
# Gather the class Foolbox observed for each adversarial and compare it with the true label.
adversarial_classes = np.asarray([adv.adversarial_class for adv in adversarials])
labels_match = adversarial_classes == labels
print('原本标签：', labels)
print('攻击后标签：', adversarial_classes)
# The accuracy is always 0.0 here: an adversarial class never equals the true label.
print('准确度：', np.mean(labels_match))

原本标签： [243 559 438 990 949 853 609 609 915 455 541 630 741 471 129  99]
攻击后标签： [242 694 711 937 927 706 479 511 672 539 463 636 497 870  89 138]
准确度： 0.0


In [18]:
# Every `Adversarial` object exposes a `distance` attribute. A distance of 0 means the
# input was misclassified without any perturbation; inf means the attack failed.
distances = np.asarray([adv.distance.value for adv in adversarials])
num_inputs = len(adversarials)
num_failed = np.count_nonzero(distances == np.inf)
num_already_misclassified = np.count_nonzero(distances == 0)
# Smallest, median and largest perturbation size over the batch.
print("{:.1e}, {:.1e}, {:.1e}".format(distances.min(), np.median(distances), distances.max()))
print("{} of {} attacks failed".format(num_failed, num_inputs))
print("{} of {} inputs misclassified without perturbation".format(num_already_misclassified, num_inputs))

0.0e+00, 8.0e-04, 2.9e-03
0 of 16 attacks failed
1 of 16 inputs misclassified without perturbation
