BadDet Regional Misclassification Attack Implementation #2054

Merged (11 commits) on Mar 12, 2023
3 changes: 2 additions & 1 deletion art/attacks/__init__.py
@@ -2,7 +2,8 @@
Module providing adversarial attacks under a common interface.
"""
from art.attacks.attack import Attack, EvasionAttack, PoisoningAttack, PoisoningAttackBlackBox, PoisoningAttackWhiteBox
from art.attacks.attack import PoisoningAttackTransformer, ExtractionAttack, InferenceAttack, AttributeInferenceAttack
from art.attacks.attack import PoisoningAttackGenerator, PoisoningAttackTransformer, PoisoningAttackObjectDetector
from art.attacks.attack import ExtractionAttack, InferenceAttack, AttributeInferenceAttack
from art.attacks.attack import ReconstructionAttack

from art.attacks import evasion
37 changes: 35 additions & 2 deletions art/attacks/attack.py
@@ -22,7 +22,7 @@

import abc
import logging
from typing import Any, List, Optional, Tuple, Union, TYPE_CHECKING
from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING

import numpy as np

@@ -266,7 +266,7 @@ def poison_estimator(
        max_iter: int,
        lambda_p: float,
        verbose: int,
        **kwargs
        **kwargs,
    ) -> "GENERATOR_TYPE":
        """
        Returns a poisoned version of the generator used to initialize the attack
@@ -325,6 +325,39 @@ def poison_estimator(self, x: np.ndarray, y: np.ndarray, **kwargs) -> "CLASSIFIE
        raise NotImplementedError


class PoisoningAttackObjectDetector(Attack):
    """
    Abstract base class for poisoning attack classes on object detection models.
    """

    def __init__(self):
        """
        Initializes object detector poisoning attack.
        """
        super().__init__(None)  # type: ignore

    @abc.abstractmethod
    def poison(
        self,
        x: np.ndarray,
        y: List[Dict[str, np.ndarray]],
        **kwargs,
    ) -> Tuple[np.ndarray, List[Dict[str, np.ndarray]]]:
        """
        Generate poisoning examples and return them as an array. This method should be overridden by all concrete
        poisoning attack implementations.

        :param x: An array with the original inputs to be attacked.
        :param y: True labels of type `List[Dict[str, np.ndarray]]`, one dictionary per input image.
                  The keys and values of the dictionary are:
                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
                  - labels [N]: the label for each bounding box.
                  - scores [N]: the score for each prediction.
        :return: A tuple holding the `(poisoning_examples, poisoning_labels)`.
        """
        raise NotImplementedError


class PoisoningAttackBlackBox(PoisoningAttack):
"""
Abstract base class for poisoning attack classes that have no access to the model (classifier object).
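For readers skimming the diff, here is a minimal sketch (not part of this change set) of the label format that `PoisoningAttackObjectDetector.poison` expects, mirroring torchvision-style detection targets; the values are made-up placeholders:

```python
import numpy as np

# One dictionary per image; all arrays have one row/entry per annotated box.
y = [
    {
        # two boxes in [x1, y1, x2, y2] pixel coordinates
        "boxes": np.array([[10.0, 15.0, 80.0, 90.0], [30.0, 40.0, 60.0, 70.0]]),
        # one class label per box
        "labels": np.array([0, 2]),
        # one confidence score per box (1.0 for ground-truth annotations)
        "scores": np.array([1.0, 1.0]),
    },
]
```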
1 change: 1 addition & 0 deletions art/attacks/poisoning/__init__.py
@@ -4,6 +4,7 @@
from art.attacks.poisoning.backdoor_attack_dgm.backdoor_attack_dgm_red import BackdoorAttackDGMReDTensorFlowV2
from art.attacks.poisoning.backdoor_attack_dgm.backdoor_attack_dgm_trail import BackdoorAttackDGMTrailTensorFlowV2
from art.attacks.poisoning.backdoor_attack import PoisoningAttackBackdoor
from art.attacks.poisoning.bad_det.bad_det_rma import BadDetRegionalMisclassificationAttack
from art.attacks.poisoning.poisoning_attack_svm import PoisoningAttackSVM
from art.attacks.poisoning.feature_collision_attack import FeatureCollisionAttack
from art.attacks.poisoning.adversarial_embedding_attack import PoisoningAttackAdversarialEmbedding
art/attacks/poisoning/bad_det/__init__.py (empty file)
156 changes: 156 additions & 0 deletions art/attacks/poisoning/bad_det/bad_det_rma.py
@@ -0,0 +1,156 @@
# MIT License
#
# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2023
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This module implements the BadDet Regional Misclassification Attack (RMA) on object detectors.

| Paper link: https://arxiv.org/abs/2205.14497
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
from typing import Dict, List, Tuple

import numpy as np
from tqdm.auto import tqdm

from art.attacks.attack import PoisoningAttackObjectDetector
from art.attacks.poisoning.backdoor_attack import PoisoningAttackBackdoor

logger = logging.getLogger(__name__)


class BadDetRegionalMisclassificationAttack(PoisoningAttackObjectDetector):
    """
    Implementation of the BadDet Regional Misclassification Attack.

    | Paper link: https://arxiv.org/abs/2205.14497
    """

    attack_params = PoisoningAttackObjectDetector.attack_params + [
        "backdoor",
        "class_source",
        "class_target",
        "percent_poison",
        "channels_first",
        "verbose",
    ]
    _estimator_requirements = ()

    def __init__(
        self,
        backdoor: PoisoningAttackBackdoor,
        class_source: int = 0,
        class_target: int = 1,
        percent_poison: float = 0.3,
        channels_first: bool = False,
        verbose: bool = False,
    ) -> None:
        """
        Creates a new BadDet Regional Misclassification Attack.

        :param backdoor: The backdoor chosen for this attack.
        :param class_source: The source class; bounding boxes with this label have the trigger inserted.
        :param class_target: The target label assigned to the poisoned bounding boxes.
        :param percent_poison: The fraction of images containing the source class to poison, in the range (0, 1].
        :param channels_first: Set channels first (`NCHW`) or last (`NHWC`).
        :param verbose: Show progress bars.
        """
        super().__init__()
        self.backdoor = backdoor
        self.class_source = class_source
        self.class_target = class_target
        self.percent_poison = percent_poison
        self.channels_first = channels_first
        self.verbose = verbose
        self._check_params()

    def poison(  # pylint: disable=W0221
        self,
        x: np.ndarray,
        y: List[Dict[str, np.ndarray]],
        **kwargs,
    ) -> Tuple[np.ndarray, List[Dict[str, np.ndarray]]]:
        """
        Generate poisoning examples by inserting the backdoor onto the input `x` and changing the classification
        for labels `y`.

        :param x: Sample images of shape `NCHW` or `NHWC`.
        :param y: True labels of type `List[Dict[str, np.ndarray]]`, one dictionary per input image.
                  The keys and values of the dictionary are:
                  - boxes [N, 4]: the boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
                  - labels [N]: the label for each bounding box.
                  - scores [N]: the score for each prediction.
        :return: A tuple holding the `(poisoning_examples, poisoning_labels)`.
        """
        x_poison = x.copy()
        y_poison: List[Dict[str, np.ndarray]] = []

        # copy labels and find indices of the source class
        source_indices = []
        for i, y_i in enumerate(y):
            boxes = y_i["boxes"].copy()
            labels = y_i["labels"].copy()
            scores = y_i["scores"].copy()

            target_dict = {
                "boxes": boxes,
                "labels": labels,
                "scores": scores,
            }
            y_poison.append(target_dict)

            if self.class_source in labels:
                source_indices.append(i)

        # select indices of samples to poison
        num_poison = int(self.percent_poison * len(source_indices))
        selected_indices = np.random.choice(source_indices, num_poison, replace=False)

        for i in tqdm(selected_indices, desc="BadDet RMA iteration", disable=not self.verbose):
            image = x_poison[i]

            boxes = y_poison[i]["boxes"]
            labels = y_poison[i]["labels"]

            for j, (box, label) in enumerate(zip(boxes, labels)):
                if label == self.class_source:
                    # extract the bounding box from the image
                    x_1, y_1, x_2, y_2 = box.astype(int)
                    if self.channels_first:
                        bounding_box = image[:, y_1:y_2, x_1:x_2]
                    else:
                        bounding_box = image[y_1:y_2, x_1:x_2, :]

                    # insert backdoor into the bounding box
                    # add an additional dimension to create a batch of size 1
                    poisoned_input, _ = self.backdoor.poison(bounding_box[np.newaxis], label)
                    if self.channels_first:
                        image[:, y_1:y_2, x_1:x_2] = poisoned_input[0]
                    else:
                        image[y_1:y_2, x_1:x_2, :] = poisoned_input[0]

                    # change the source label to the target label
                    labels[j] = self.class_target

        return x_poison, y_poison

    def _check_params(self) -> None:
        if not isinstance(self.backdoor, PoisoningAttackBackdoor):
            raise ValueError("Backdoor must be of type PoisoningAttackBackdoor")
        if not 0 < self.percent_poison <= 1:
            raise ValueError("percent_poison must be greater than 0 and no greater than 1")
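A rough end-to-end usage sketch of the new attack (not taken from this PR; the toy `trigger` function and the dummy image and box below are illustrative stand-ins for a real backdoor perturbation and dataset):

```python
import numpy as np

from art.attacks.poisoning import BadDetRegionalMisclassificationAttack, PoisoningAttackBackdoor


def trigger(x: np.ndarray) -> np.ndarray:
    # Toy trigger: paint a small white square into the top-left corner of each cropped box (NHWC batch).
    x = x.copy()
    x[:, :8, :8, :] = 1.0
    return x


backdoor = PoisoningAttackBackdoor(trigger)
attack = BadDetRegionalMisclassificationAttack(
    backdoor=backdoor,
    class_source=0,
    class_target=1,
    percent_poison=1.0,  # poison every image that contains the source class
)

# One dummy 100x100 RGB image (NHWC) with a single source-class box.
x = np.random.rand(1, 100, 100, 3).astype(np.float32)
y = [
    {
        "boxes": np.array([[10.0, 10.0, 60.0, 60.0]]),
        "labels": np.array([0]),
        "scores": np.array([1.0]),
    }
]

x_poison, y_poison = attack.poison(x, y)
print(y_poison[0]["labels"])  # the source-class box is now labelled with class_target
```

Each selected image has the trigger stamped into its source-class boxes and those boxes relabeled to `class_target`, which is the dirty-label behaviour described in the notebook entry below.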
3 changes: 3 additions & 0 deletions notebooks/README.md
@@ -258,6 +258,9 @@ demonstrates working Sleeper Agent poisoning attack implemented in PyTorch Framework
which employs gradient matching, data selection, and target model re-training during the crafting process. Sleeper
Agent is the first hidden trigger backdoor attack to be effective against neural networks trained from scratch.

[poisoning_attack_bad_det_rma.ipynb](poisoning_attack_bad_det_rma.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/poisoning_attack_bad_det_rma.ipynb)]
demonstrates using the BadDet Regional Misclassification Attack (RMA) to insert backdoors and create poisoned samples for object detection models. This is a dirty-label attack in which a trigger is inserted into bounding boxes of the source class and the corresponding classification labels are changed to the target class.

## Certification and Verification

[output_randomized_smoothing_mnist.ipynb](output_randomized_smoothing_mnist.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/output_randomized_smoothing_mnist.ipynb)]