Extend ART to feature vectors #69

Merged: 24 commits, May 16, 2019
3 changes: 1 addition & 2 deletions .travis.yml
@@ -37,7 +37,6 @@ install:
- conda install libgcc
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/miniconda/envs/test-environment/lib
- export PYTHONPATH=".":$PYTHONPATH
- pip freeze

script:
- python -m unittest discover
- ./run_tests.sh
7 changes: 6 additions & 1 deletion art/attacks/adversarial_patch.py
@@ -85,14 +85,19 @@ def generate(self, x, y=None):
"""
Generate adversarial samples and return them in an array.

:param x: An array with the original inputs.
:param x: An array with the original inputs. `x` is expected to have spatial dimensions.
:type x: `np.ndarray`
:param y: An array with the original labels to be predicted.
:type y: `np.ndarray`
:return: An array holding the adversarial patch.
:rtype: `np.ndarray`
"""
logger.info('Creating adversarial patch.')

if len(x.shape) == 2:
raise ValueError('Feature vectors detected. The adversarial patch can only be applied to data with spatial '
'dimensions.')

self.patch = (np.random.standard_normal(size=self.patch_shape)) * 20.0

for i_step in range(self.max_iter):
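The new guard in `generate` only inspects `x.shape`, so its behaviour is easy to demonstrate in isolation. Below is a minimal standalone sketch of that check; the helper name `check_spatial_input` is hypothetical, not ART code.

```python
import numpy as np

def check_spatial_input(x):
    """Reject flat (batch, features) arrays, as the patched generate() now does."""
    if len(x.shape) == 2:
        raise ValueError('Feature vectors detected. The adversarial patch can only be applied '
                         'to data with spatial dimensions.')

check_spatial_input(np.zeros((8, 28, 28, 1)))   # image batch: passes silently
try:
    check_spatial_input(np.zeros((8, 20)))      # tabular batch: rejected
except ValueError as err:
    print(err)
```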
110 changes: 44 additions & 66 deletions art/attacks/boundary.py
@@ -21,15 +21,16 @@

import numpy as np

from art import NUMPY_DTYPE
from art.attacks.attack import Attack

logger = logging.getLogger(__name__)


class BoundaryAttack(Attack):
"""
Implementation of the boundary attack from Wieland Brendel et al. (2018).
Paper link: https://arxiv.org/abs/1712.04248
Implementation of the boundary attack from Wieland Brendel et al. (2018). This is a powerful black-box attack that
only requires final class prediction. Paper link: https://arxiv.org/abs/1712.04248
"""
attack_params = Attack.attack_params + ['targeted', 'delta', 'epsilon', 'step_adapt', 'max_iter', 'sample_size',
'init_size']
@@ -77,42 +78,38 @@ def generate(self, x, y=None):
:return: An array holding the adversarial examples.
:rtype: `np.ndarray`
"""
# Prediction from the original images
preds = np.argmax(self.classifier.predict(x), axis=1)

# Assert that, if attack is targeted, y is provided
if self.targeted and y is None:
raise ValueError('Target labels `y` need to be provided for a targeted attack.')

# Some initial setups
x_adv = x.copy()
x_adv = x.astype(NUMPY_DTYPE)
if y is not None:
y = np.argmax(y, axis=1)
preds = np.argmax(self.classifier.predict(x), axis=1)

# Generate the adversarial samples
for ind, val in enumerate(x_adv):
if self.targeted:
x_ = self._perturb(x=val, y=y[ind], y_p=preds[ind])
x_adv[ind] = self._perturb(x=val, y_p=preds[ind], y=y[ind])
else:
x_ = self._perturb(x=val, y=None, y_p=preds[ind])

x_adv[ind] = x_
x_adv[ind] = self._perturb(x=val, y_p=preds[ind])

logger.info('Success rate of Boundary attack: %.2f%%',
(np.sum(preds != np.argmax(self.classifier.predict(x_adv), axis=1)) / x.shape[0]))

return x_adv

def _perturb(self, x, y, y_p):
def _perturb(self, x, y_p, y=None):
"""
Internal attack function for one example.

:param x: An array with one original input to be attacked.
:type x: `np.ndarray`
:param y: If `self.targeted` is true, then `y` represents the target label.
:type y: `int`
:param y_p: The predicted label of x.
:type y_p: `int`
:param y: If `self.targeted` is true, then `y` represents the target label.
:type y: `int`
:return: an adversarial example.
"""
# First, create an initial adversarial sample
@@ -148,11 +145,13 @@ def _attack(self, initial_sample, original_sample, target, initial_delta, initia
:return: an adversarial example.
:rtype: `np.ndarray`
"""
def compare(object1, object2):
return object1 == object2 if self.targeted else object1 != object2

# Get initialization for some variables
x_adv = initial_sample
delta = initial_delta
epsilon = initial_epsilon
clip_min, clip_max = self.classifier.clip_values

# Main loop to wander around the boundary
for _ in range(self.max_iter):
@@ -161,22 +160,15 @@ def _attack(self, initial_sample, original_sample, target, initial_delta, initia
potential_advs = []
for _ in range(self.sample_size):
potential_adv = x_adv + self._orthogonal_perturb(delta, x_adv, original_sample)
potential_adv = np.clip(potential_adv, clip_min, clip_max)
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
np.clip(potential_adv, self.classifier.clip_values[0], self.classifier.clip_values[1],
out=potential_adv)
potential_advs.append(potential_adv)

preds = np.argmax(self.classifier.predict(np.array(potential_advs)), axis=1)

if self.targeted:
satisfied = (preds == target)
else:
satisfied = (preds != target)

satisfied = compare(preds, target)
delta_ratio = np.mean(satisfied)

if delta_ratio < 0.5:
delta *= self.step_adapt
else:
delta /= self.step_adapt
delta = delta * self.step_adapt if delta_ratio < .5 else delta / self.step_adapt

if delta_ratio > 0:
x_adv = potential_advs[np.where(satisfied)[0][0]]
@@ -191,15 +183,13 @@ def _attack(self, initial_sample, original_sample, target, initial_delta, initia
perturb = original_sample - x_adv
perturb *= epsilon
potential_adv = x_adv + perturb
potential_adv = np.clip(potential_adv, clip_min, clip_max)
pred = np.argmax(self.classifier.predict(np.array([potential_adv])), axis=1)[0]
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
np.clip(potential_adv, self.classifier.clip_values[0], self.classifier.clip_values[1],
out=potential_adv)

if self.targeted:
satisfied = (pred == target)
else:
satisfied = (pred != target)
pred = np.argmax(self.classifier.predict(np.array([potential_adv])), axis=1)[0]

if satisfied:
if compare(pred, target):
x_adv = potential_adv
epsilon /= self.step_adapt
break
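The refactor above folds the targeted and untargeted branches into a single `compare` helper and adjusts the step sizes with one rule: shrink when too few candidate points stay adversarial, grow otherwise. A minimal sketch of that rule, with an assumed helper name and an assumed default for `step_adapt`:

```python
def adapt_step(step, success_ratio, step_adapt=0.667):
    """Shrink the step when fewer than half of the candidates stay adversarial, grow it otherwise."""
    return step * step_adapt if success_ratio < 0.5 else step / step_adapt

print(adapt_step(0.1, success_ratio=0.2))   # mostly failing: smaller step (0.0667)
print(adapt_step(0.1, success_ratio=0.8))   # mostly succeeding: larger step (about 0.15)
```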
@@ -225,7 +215,7 @@ def _orthogonal_perturb(self, delta, current_sample, original_sample):
:return: a possible perturbation.
"""
# Generate perturbation randomly
perturb = np.random.randn(current_sample.shape[0], current_sample.shape[1], current_sample.shape[2])
perturb = np.random.randn(*self.classifier.input_shape)

# Rescale the perturbation
perturb /= np.linalg.norm(perturb)
@@ -254,52 +244,40 @@ def _orthogonal_perturb(self, delta, current_sample, original_sample):

return perturb

def _init_sample(self, x, y, y_p):
def _init_sample(self, x, y, y_pred):
"""
Find initial adversarial example for the attack.

:param x: An array with 1 original input to be attacked.
:type x: `np.ndarray`
:param y: If `self.targeted` is true, then `y` represents the target label.
:type y: `int`
:param y_p: The predicted label of x.
:type y_p: `int`
:param y_pred: The predicted label of x.
:type y_pred: `int`
:return: an adversarial example.
"""
clip_min, clip_max = self.classifier.clip_values
nprd = np.random.RandomState()
initial_sample = None

if self.targeted:
# Attack satisfied
if y == y_p:
return None

# Attack unsatisfied yet
for _ in range(self.init_size):
random_img = nprd.uniform(clip_min, clip_max, size=x.shape).astype(x.dtype)
random_class = np.argmax(self.classifier.predict(np.array([random_img])), axis=1)[0]

if random_class == y:
initial_sample = random_img
# Attack satisfied
if self.targeted and y == y_pred:
return None

logging.info('Found initial adversarial image for targeted attack.')
break
else:
logging.warning('Failed to draw a random image that is adversarial, attack failed.')

else:
for _ in range(self.init_size):
random_img = nprd.uniform(clip_min, clip_max, size=x.shape).astype(x.dtype)
random_class = np.argmax(self.classifier.predict(np.array([random_img])), axis=1)[0]

if random_class != y_p:
initial_sample = random_img

logging.info('Found initial adversarial image for untargeted attack.')
break
# Attack unsatisfied yet
for _ in range(self.init_size):
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
random_sample = nprd.uniform(self.classifier.clip_values[0], self.classifier.clip_values[1],
size=x.shape).astype(x.dtype)
else:
logging.warning('Failed to draw a random image that is adversarial, attack failed.')
# TODO Adjust following feature-wise and for entire sample provided by user?
mean_, std_ = np.mean(x), np.std(x)
random_sample = nprd.normal(loc=mean_, scale=2 * std_, size=x.shape).astype(x.dtype)
random_class = np.argmax(self.classifier.predict(np.array([random_sample])), axis=1)[0]

if (self.targeted and random_class == y) or (not self.targeted and random_class != y_pred):
initial_sample = random_sample
logging.info('Found initial adversarial image for attack.')
break

return initial_sample

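With clip values no longer guaranteed, `_init_sample` now has two ways to draw a random starting point: a uniform draw inside the clip range when it exists, or a normal draw centred on the input otherwise, which suits unbounded feature vectors. A standalone sketch of that branching, with an assumed function name:

```python
import numpy as np

def draw_random_start(x, clip_values=None, rng=None):
    """Random restart for the boundary attack; mirrors the new _init_sample branches."""
    if rng is None:
        rng = np.random.RandomState()
    if clip_values is not None:
        low, high = clip_values
        return rng.uniform(low, high, size=x.shape).astype(x.dtype)
    mean_, std_ = np.mean(x), np.std(x)
    return rng.normal(loc=mean_, scale=2 * std_, size=x.shape).astype(x.dtype)

x_tabular = np.random.randn(20).astype(np.float32)                       # one unbounded feature vector
print(draw_random_start(x_tabular).dtype)                                # float32, normal draw
print(draw_random_start(x_tabular, clip_values=(0., 1.)).min() >= 0.0)   # True, uniform draw
```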
29 changes: 18 additions & 11 deletions art/attacks/carlini.py
@@ -127,7 +127,7 @@ def _loss(self, x, x_adv, target, c):

return z, l2dist, c*loss + l2dist

def _gradient_of_loss(self, z, target, x, x_adv, x_adv_tanh, c, clip_min, clip_max):
def _loss_gradient(self, z, target, x, x_adv, x_adv_tanh, c, clip_min, clip_max):
"""
Compute the gradient of the loss function.

@@ -185,7 +185,10 @@ def generate(self, x, y=None):
:rtype: `np.ndarray`
"""
x_adv = x.astype(NUMPY_DTYPE)
(clip_min, clip_max) = self.classifier.clip_values
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
clip_min, clip_max = self.classifier.clip_values
else:
clip_min, clip_max = np.amin(x), np.amax(x)

# Assert that, if attack is targeted, y_val is provided:
if self.targeted and y is None:
@@ -204,8 +207,7 @@ def generate(self, x, y=None):
x_batch = x_adv[batch_index_1:batch_index_2]
y_batch = y[batch_index_1:batch_index_2]

# The optimization is performed in tanh space to keep the
# adversarial images bounded from clip_min and clip_max.
# The optimization is performed in tanh space to keep the adversarial images bounded in correct range
x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother)

# Initialize binary search:
@@ -256,9 +258,9 @@ def generate(self, x, y=None):

# compute gradient:
logger.debug('Compute loss gradient')
perturbation_tanh = -self._gradient_of_loss(z[active], y_batch[active], x_batch[active],
x_adv_batch[active], x_adv_batch_tanh[active],
c[active], clip_min, clip_max)
perturbation_tanh = -self._loss_gradient(z[active], y_batch[active], x_batch[active],
x_adv_batch[active], x_adv_batch_tanh[active],
c[active], clip_min, clip_max)

# perform line search to optimize perturbation
# first, halve the learning rate until perturbation actually decreases the loss:
@@ -496,7 +498,7 @@ def _loss(self, x_adv, target):

return z, loss

def _gradient_of_loss(self, z, target, x_adv, x_adv_tanh, clip_min, clip_max):
def _loss_gradient(self, z, target, x_adv, x_adv_tanh, clip_min, clip_max):
"""
Compute the gradient of the loss function.

@@ -545,6 +547,11 @@ def generate(self, x, y=None):
"""
x_adv = x.astype(NUMPY_DTYPE)

if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
clip_min_per_pixel, clip_max_per_pixel = self.classifier.clip_values
else:
clip_min_per_pixel, clip_max_per_pixel = np.amin(x), np.amax(x)

# Assert that, if attack is targeted, y_val is provided:
if self.targeted and y is None:
raise ValueError('Target labels `y` need to be provided for a targeted attack.')
@@ -562,7 +569,7 @@ def generate(self, x, y=None):
x_batch = x_adv[batch_index_1:batch_index_2]
y_batch = y[batch_index_1:batch_index_2]

(clip_min_per_pixel, clip_max_per_pixel) = self.classifier.clip_values
# Determine values for later clipping
clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel, clip_max_per_pixel)
clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel, clip_max_per_pixel)

@@ -592,8 +599,8 @@ def generate(self, x, y=None):

# compute gradient:
logger.debug('Compute loss gradient')
perturbation_tanh = -self._gradient_of_loss(z[active], y_batch[active], x_adv_batch[active],
x_adv_batch_tanh[active], clip_min[active], clip_max[active])
perturbation_tanh = -self._loss_gradient(z[active], y_batch[active], x_adv_batch[active],
x_adv_batch_tanh[active], clip_min[active], clip_max[active])

# perform line search to optimize perturbation
# first, halve the learning rate until perturbation actually decreases the loss:
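Both Carlini-Wagner variants now resolve a working clip range instead of assuming one: the classifier's `clip_values` if present, otherwise the minimum and maximum of the batch itself. The same fallback, as a small hedged helper (the function name and dummy class are mine, not ART's):

```python
import numpy as np

def resolve_clip_range(classifier, x):
    """Prefer the classifier's clip_values; fall back to the data range otherwise."""
    if hasattr(classifier, 'clip_values') and classifier.clip_values is not None:
        return classifier.clip_values
    return np.amin(x), np.amax(x)

class _NoClip:               # stand-in for a classifier wrapping unbounded features
    clip_values = None

x = np.array([[-3.2, 0.5], [1.7, 9.0]])
print(resolve_clip_range(_NoClip(), x))   # data-derived range: (-3.2, 9.0)
```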
22 changes: 15 additions & 7 deletions art/attacks/deepfool.py
@@ -21,6 +21,7 @@

import numpy as np

from art import NUMPY_DTYPE
from art.attacks.attack import Attack

logger = logging.getLogger(__name__)
@@ -64,8 +65,7 @@ def generate(self, x, y=None):
:return: An array holding the adversarial examples.
:rtype: `np.ndarray`
"""
clip_min, clip_max = self.classifier.clip_values
x_adv = x.copy()
x_adv = x.astype(NUMPY_DTYPE)
preds = self.classifier.predict(x, logits=True)

# Determine the class labels for which to compute the gradients
@@ -112,11 +112,16 @@ def generate(self, x, y=None):
value[np.arange(len(value)), labels_indices] = np.inf
l = np.argmin(value, axis=1)
r = (abs(f_diff[np.arange(len(f_diff)), l]) / (pow(np.linalg.norm(grad_diff[np.arange(len(
grad_diff)), l].reshape(len(grad_diff), -1), axis=1), 2) + tol))[:, None, None, None] * \
grad_diff[np.arange(len(grad_diff)), l]
grad_diff)), l].reshape(len(grad_diff), -1), axis=1), 2) + tol))
r = r.reshape((-1,) + (1,) * (len(x.shape) - 1))
r = r * grad_diff[np.arange(len(grad_diff)), l]

# Add perturbation and clip result
batch[active_indices] = np.clip(batch[active_indices] + r[active_indices], clip_min, clip_max)
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
batch[active_indices] = np.clip(batch[active_indices] + r[active_indices],
self.classifier.clip_values[0], self.classifier.clip_values[1])
else:
batch[active_indices] += r[active_indices]

# Recompute prediction for new x
f = self.classifier.predict(batch, logits=True)
@@ -137,8 +142,11 @@ def generate(self, x, y=None):
current_step += 1

# Apply overshoot parameter
x_adv[batch_index_1:batch_index_2] = np.clip(x_adv[batch_index_1:batch_index_2] + (
1 + self.epsilon) * (batch - x_adv[batch_index_1:batch_index_2]), clip_min, clip_max)
x_adv[batch_index_1:batch_index_2] = x_adv[batch_index_1:batch_index_2] + \
(1 + self.epsilon) * (batch - x_adv[batch_index_1:batch_index_2])
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
np.clip(x_adv[batch_index_1:batch_index_2], self.classifier.clip_values[0],
self.classifier.clip_values[1], out=x_adv[batch_index_1:batch_index_2])

logger.info('Success rate of DeepFool attack: %.2f%%',
(np.sum(np.argmax(preds, axis=1) != np.argmax(self.classifier.predict(x_adv), axis=1)) /
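The DeepFool change replaces the hard-coded `[:, None, None, None]`, which only broadcasts over 4-D image batches, with a reshape that works for any input rank. A quick sketch of just that trick (the helper name is hypothetical):

```python
import numpy as np

def per_sample_broadcast(values, x):
    """Reshape a length-N vector so it broadcasts against an (N, ...) batch of any rank."""
    return values.reshape((-1,) + (1,) * (x.ndim - 1))

scale = np.array([0.1, 0.2])
images = np.zeros((2, 28, 28, 3))
vectors = np.zeros((2, 20))
print(per_sample_broadcast(scale, images).shape)   # (2, 1, 1, 1)
print(per_sample_broadcast(scale, vectors).shape)  # (2, 1)
```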
4 changes: 2 additions & 2 deletions art/attacks/elastic_net.py
@@ -167,7 +167,6 @@ def generate(self, x, y=None):
:rtype: `np.ndarray`
"""
x_adv = x.astype(NUMPY_DTYPE)
(clip_min, clip_max) = self.classifier.clip_values

# Assert that, if attack is targeted, y is provided:
if self.targeted and y is None:
@@ -188,7 +187,8 @@ def generate(self, x, y=None):
x_adv[batch_index_1:batch_index_2] = self._generate_batch(x_batch, y_batch)

# Apply clip
x_adv = np.clip(x_adv, clip_min, clip_max)
if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None:
x_adv = np.clip(x_adv, self.classifier.clip_values[0], self.classifier.clip_values[1])

# Compute success rate of the EAD attack
logger.info('Success rate of EAD attack: %.2f%%',
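Taken together, the attacks in this PR stop assuming image-shaped input and mandatory clip values, which is what makes them usable on tabular data. A hypothetical end-to-end run on feature vectors might look like the sketch below; the wrapper class, import paths and constructor arguments are best-effort assumptions for this ART version and may differ from the actual API, and the toy model is untrained, so its predictions are arbitrary.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

from art.attacks import BoundaryAttack        # assumed import path for this version
from art.classifiers import KerasClassifier   # assumed wrapper; arguments may differ

# Toy classifier over 20-dimensional feature vectors (no spatial dimensions).
model = Sequential([Dense(32, activation='relu', input_shape=(20,)),
                    Dense(3, activation='softmax')])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Deliberately no clip_values: the updated attacks fall back to data-driven ranges.
classifier = KerasClassifier(model=model)

x = np.random.randn(16, 20).astype(np.float32)
attack = BoundaryAttack(classifier, targeted=False, max_iter=50)
x_adv = attack.generate(x)
print(x_adv.shape)   # (16, 20)
```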