# NewtonFool Attack

The idea of the attack is to decrease the softmax output of the originally predicted class, which is initially the largest output. To do this, the authors linearize this output and require that the current output $p_i$ be decreased to the output of the next step, $p_{i+1}$, using the smallest possible step, where "smallest" is measured in the $L_2$ norm. The first output $p_0$ is given by the network. Every subsequent desired decrease is the minimum of a proposal derived from the gradient and the amount needed to push $p$ below $1/C$, where $C$ denotes the total number of classes. <br>

The issue with changing everything to the $L_\infty$ norm is that it is quite tricky to get a minimal norm solution according to the $L_\infty$ norm, since it does not exist in closed form.

What I can try, though, is to keep the $L_2$ norm for choosing the steps and use the $L_\infty$ norm for everything else, i.e. for the small-step requirement (in combination with $\eta$) and for the clipping. (This variant works the worst.)

Alternatively, I can do only the clipping according to the $L_\infty$ norm and let everything else run with the $L_2$ norm.

None of these variants works satisfactorily, and the attack behaves oddly with the $L_\infty$ norm, so I'm going to drop it.

In [87]:
import eagerpy as ep
from typing import Union, Tuple, Any, Optional
from foolbox.models import Model
from foolbox.criteria import Misclassification
from foolbox.distances import l2, linf
from foolbox.devutils import atleast_kd, flatten
from foolbox.attacks.base import MinimizationAttack, get_criterion, T, raise_if_kwargs

In [2]:
# import doesn't work, so I copied the source code of the function here
def verify_input_bounds(input: ep.Tensor, model: Model) -> None:
    """Assert that every element of ``input`` lies within ``model.bounds``.

    Args:
        input: Tensor that is about to be handed to an attack.
        model: Model whose ``bounds.lower`` / ``bounds.upper`` delimit the
            valid input range.

    Raises:
        AssertionError: If the smallest element is below the lower bound or
            the largest element is above the upper bound.
    """
    smallest = input.min().item()
    assert model.bounds.lower <= smallest

    largest = input.max().item()
    assert largest <= model.bounds.upper

In [3]:
import torch, pickle, argparse, os, warnings, copy, time, mlflow
import numpy as np, pytorch_lightning as pl, matplotlib.pyplot as plt, eagerpy as ep
from models import ConvNet, CConvNet
from data_loader import load_test_data, load_train_data
from foolbox import PyTorchModel
from tqdm.notebook import tqdm
from attack_helper import batched_predictions, batched_predictions, batched_logits
from mlflow.tracking.artifact_utils import get_artifact_uri
import matplotlib.pyplot as plt

In [5]:
# Locate the MLflow run called `run_name`, rebuild the model from its stored
# checkpoint and load the test set that matches the run's parameters.
run_name = 1697206030
tracking_uri = 'sqlite:///mlruns/database.db'
mlflow.set_tracking_uri(tracking_uri)
df = mlflow.search_runs(experiment_names=['model_training'])

# Filter the run table once; every field below is read from these rows.
run_rows = df[df['tags.mlflow.runName'] == str(run_name)]

run_id = run_rows['run_id'].values[0]
artifact_path = get_artifact_uri(run_id=run_id, tracking_uri=tracking_uri)
dirs = os.listdir(artifact_path)

# Take the first artifact whose name contains '.ckpt'. Fail loudly if there is
# none, instead of hitting a NameError (or silently reusing a stale
# `checkpoint` left over from an earlier cell execution).
for s in dirs:
    if s.find('.ckpt') >= 0:
        checkpoint = s
        break
else:
    raise FileNotFoundError(f"No '.ckpt' file found in {artifact_path}.")

checkpoint_path = os.path.join(artifact_path, checkpoint)

best_model = torch.load(checkpoint_path)
hparams = argparse.Namespace(**best_model['hyper_parameters'])

# The run's 'model' tag decides which architecture is instantiated.
model_tag = run_rows['tags.model'].values[0]
if model_tag == 'ConvNet':
    model = ConvNet(hparams, None, None).eval()
elif model_tag == 'CConvNet':
    model = CConvNet(hparams, None, None).eval()
else:
    raise NotImplementedError(f"Model has to be 'ConvNet' or 'CConvNet'. Got {model_tag}.")
model.load_state_dict(best_model['state_dict'])

# NOTE(review): the logged parameters are stored as strings and are parsed with
# `eval`. This is only acceptable while the tracking database is trusted;
# consider `ast.literal_eval` if it can ever hold foreign content.
test_rot = eval(run_rows['params.test_rot'].values[0])

flat_param = run_rows['params.flat'].values[0]
if flat_param is None:
    # The run did not log 'flat'; treat it as False.
    flat = False
else:
    flat = eval(flat_param)

if flat:
    padded_img_size = eval(run_rows['params.padded_img_size'].values[0])
    size_suffix = str(padded_img_size[0]) + "x" + str(padded_img_size[1]) + ".gz"

    if test_rot:
        TEST_PATH = "flat_mnist_test_aug_" + size_suffix
    else:
        TEST_PATH = "flat_mnist_test_" + size_suffix

    # NOTE(review): test files are read with `load_train_data` here and in the
    # non-rotated branch below; presumably the loader is chosen by file layout
    # rather than by split -- confirm against data_loader.
    test_data = load_train_data(TEST_PATH)

else:
    if test_rot:
        TEST_PATH = "s2_mnist_cs1.gz"
        test_data = load_test_data(TEST_PATH)
    else:
        TEST_PATH = "s2_mnist_test_sphere_center.gz"
        test_data = load_train_data(TEST_PATH)

In [120]:
class NewtonFoolAttack(MinimizationAttack):
    """Implementation of the NewtonFool Attack. [#Jang17]_

    Every step linearizes the softmax score of the attacked class and moves
    the input against the gradient of that score so that the linearized score
    drops by ``delta``. ``delta`` is the smaller of a step-size bound,
    ``stepsize * ||x_0||_2 * ||gradient||_2``, and the gap between the
    current score and chance level ``1 / num_classes``.

    Args:
        steps : Number of update steps to perform.
        stepsize : Size of each update step.

    References:
        .. [#Jang17] Uyeong Jang et al., "Objective Metrics and Gradient Descent
            Algorithms for Adversarial Examples in Machine Learning",
            https://dl.acm.org/citation.cfm?id=3134635
    """

    distance = l2

    def __init__(self, steps: int = 100, stepsize: float = 0.01):
        self.steps = steps
        self.stepsize = stepsize

    def run(
        self,
        model: Model,
        inputs: T,
        criterion: Union[Misclassification, T],
        *,
        early_stop: Optional[float] = None,
        **kwargs: Any,
    ) -> T:
        """Run the attack on a batch of inputs.

        Args:
            model: Model under attack; its ``bounds`` are used for clipping.
            inputs: Batch of inputs lying within the model's bounds.
            criterion: ``Misclassification`` criterion, or raw labels that
                ``get_criterion`` turns into one.
            early_stop: Accepted for interface compatibility; not used by
                this implementation.
            **kwargs: Handed to ``raise_if_kwargs``; no extra keyword
                arguments are supported.

        Returns:
            The perturbed inputs, converted back to the type of ``inputs``.

        Raises:
            ValueError: If the criterion is not ``Misclassification`` or the
                labels do not have shape ``(N,)``.
        """
        raise_if_kwargs(kwargs)
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion, kwargs

        verify_input_bounds(x, model)

        N = len(x)

        if isinstance(criterion_, Misclassification):
            classes = criterion_.labels
        else:
            raise ValueError("unsupported criterion")

        if classes.shape != (N,):
            raise ValueError(
                f"expected labels to have shape ({N},), got {classes.shape}"
            )

        min_, max_ = model.bounds

        # Norm of the unperturbed input; it scales the step-size bound in
        # every iteration.
        x_l2_norm = flatten(x.square()).sum(1).sqrt()

        def loss_fun(x: ep.Tensor) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
            # Summing the per-sample scores yields per-sample gradients in a
            # single backward pass.
            logits = model(x)
            scores = ep.softmax(logits)
            pred_scores = scores[range(N), classes]
            loss = pred_scores.sum()
            return loss, (scores, pred_scores)

        for _ in range(self.steps):
            # (1) get the scores and gradients
            _, (scores, pred_scores), gradients = ep.value_aux_and_grad(loss_fun, x)

            pred = scores.argmax(-1)
            num_classes = scores.shape[-1]

            # (2) calculate gradient norm
            gradients_l2_norm = flatten(gradients.square()).sum(1).sqrt()

            # (3) calculate delta
            step_bound = self.stepsize * x_l2_norm * gradients_l2_norm
            chance_gap = pred_scores - 1.0 / num_classes

            delta = ep.minimum(step_bound, chance_gap)

            # (4) stop the attack if an adversarial example has been found
            # this is not described in the paper but otherwise once the prob. drops
            # below chance level the likelihood is not decreased but increased
            is_not_adversarial = (pred == classes).float32()
            delta *= is_not_adversarial

            # (5) calculate & apply current perturbation
            # A vanishing gradient (e.g. a saturated softmax) would make this
            # a 0 / 0 division and turn the whole sample into NaN. Dividing
            # by 1 instead is harmless: the step is multiplied by the (zero)
            # gradient and therefore stays zero.
            gradients_sq_norm = gradients_l2_norm.square()
            safe_sq_norm = ep.where(
                gradients_sq_norm > 0,
                gradients_sq_norm,
                ep.ones_like(gradients_sq_norm),
            )
            scale = atleast_kd(delta / safe_sq_norm, gradients.ndim)

            x = x - scale * gradients

            x = ep.clip(x, min_, max_)

        return restore_type(x)

In [198]:
# Disabled experiment: the class below sits inside a string literal, so this
# cell defines nothing. Its update rule is the same as NewtonFoolAttack's
# (L2 norms throughout); only the `distance` attribute is switched to `linf`.
"""class LinfNewtonFoolAttack(MinimizationAttack):

    distance = linf

    def __init__(self, steps: int = 100, stepsize: float = 0.01):
        self.steps = steps
        self.stepsize = stepsize

    def run(
        self,
        model: Model,
        inputs: T,
        criterion: Union[Misclassification, T],
        *,
        early_stop: Optional[float] = None,
        **kwargs: Any,
    ) -> T:
        raise_if_kwargs(kwargs)
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion, kwargs

        verify_input_bounds(x, model)

        N = len(x)

        if isinstance(criterion_, Misclassification):
            classes = criterion_.labels
        else:
            raise ValueError("unsupported criterion")

        if classes.shape != (N,):
            raise ValueError(
                f"expected labels to have shape ({N},), got {classes.shape}"
            )

        min_, max_ = model.bounds

        x_l2_norm = flatten(x.square()).sum(1).sqrt()

        def loss_fun(x: ep.Tensor) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
            logits = model(x)
            scores = ep.softmax(logits)
            pred_scores = scores[range(N), classes]
            loss = pred_scores.sum()
            return loss, (scores, pred_scores)

        for i in range(self.steps):
            # (1) get the scores and gradients
            _, (scores, pred_scores), gradients = ep.value_aux_and_grad(loss_fun, x)

            pred = scores.argmax(-1)
            num_classes = scores.shape[-1]

            # (2) calculate gradient norm
            gradients_l2_norm = flatten(gradients.square()).sum(1).sqrt()

            # (3) calculate delta
            a = self.stepsize * x_l2_norm * gradients_l2_norm
            b = pred_scores - 1.0 / num_classes

            delta = ep.minimum(a, b)
            
            # (4) stop the attack if an adversarial example has been found
            # this is not described in the paper but otherwise once the prob. drops
            # below chance level the likelihood is not decreased but increased
            is_not_adversarial = (pred == classes).float32()
            delta *= is_not_adversarial
            
            # (5) calculate & apply current perturbation
            a = atleast_kd(delta / gradients_l2_norm.square(), gradients.ndim)
            
            x -= a * gradients

            x = ep.clip(x, min_, max_)

        return restore_type(x)"""

In [197]:
# Author's note on the disabled variant that follows in this cell.
"""
This attack does not exist like that. The paper assumes the norm to be L_2.
"""


# Disabled experiment (string literal, output suppressed by the trailing `;`):
# both the step-size bound and the gradient normalization use the L-infinity
# norm instead of the L2 norm.
"""class LinfNewtonFoolAttack(MinimizationAttack):
    
    distance = linf
    
    def __init__(self, steps: int = 100, stepsize: float = 0.01):
        self.steps = steps
        self.stepsize = stepsize

    def run(
        self,
        model: Model,
        inputs: T,
        criterion: Union[Misclassification, T],
        *,
        early_stop: Optional[float] = None,
        **kwargs: Any,
    ) -> T:
        raise_if_kwargs(kwargs)
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion, kwargs

        verify_input_bounds(x, model)

        N = len(x)

        if isinstance(criterion_, Misclassification):
            classes = criterion_.labels
        else:
            raise ValueError("unsupported criterion")

        if classes.shape != (N,):
            raise ValueError(
                f"expected labels to have shape ({N},), got {classes.shape}"
            )

        min_, max_ = model.bounds
        
        x_linf_norm = flatten(x.abs()).max(1)

        def loss_fun(x: ep.Tensor) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
            logits = model(x)
            scores = ep.softmax(logits)
            pred_scores = scores[range(N), classes]
            loss = pred_scores.sum()
            return loss, (scores, pred_scores)
        
        for i in range(self.steps):
            # (1) get the scores and gradients
            _, (scores, pred_scores), gradients = ep.value_aux_and_grad(loss_fun, x)

            pred = scores.argmax(-1)
            num_classes = scores.shape[-1]
            
            # (2) calculate gradient norm
            gradients_linf_norm = flatten(gradients.abs()).max(1)
            
            # (3) calculate delta
            a = self.stepsize * x_linf_norm * gradients_linf_norm
            b = pred_scores - 1.0 / num_classes

            delta = ep.minimum(a, b)
            
            # (4) stop the attack if an adversarial example has been found
            # this is not described in the paper but otherwise once the prob. drops
            # below chance level the likelihood is not decreased but increased
            is_not_adversarial = (pred == classes).float32()
            delta *= is_not_adversarial
            
            # (5) calculate & apply current perturbation
            a = atleast_kd(delta / gradients_linf_norm.square(), gradients.ndim)
            x -= a * gradients
            x = ep.clip(x, min_, max_)
            
        return restore_type(x)""";

In [204]:
class LinfNewtonFoolAttack(MinimizationAttack):
    """Experimental NewtonFool variant evaluated under the L-infinity distance.

    This is a hybrid and not the attack from the NewtonFool paper, which
    assumes the L2 norm. The step-size bound uses L-infinity norms
    (``stepsize * ||x_0||_inf * ||gradient||_inf``), while the perturbation
    itself is still the gradient step normalized by ``||gradient||_2 ** 2``.

    Args:
        steps : Number of update steps to perform.
        stepsize : Size of each update step.
    """

    distance = linf

    def __init__(self, steps: int = 100, stepsize: float = 0.01):
        self.steps = steps
        self.stepsize = stepsize

    def run(
        self,
        model: Model,
        inputs: T,
        criterion: Union[Misclassification, T],
        *,
        early_stop: Optional[float] = None,
        **kwargs: Any,
    ) -> T:
        """Run the attack on a batch of inputs.

        Args:
            model: Model under attack; its ``bounds`` are used for clipping.
            inputs: Batch of inputs lying within the model's bounds.
            criterion: ``Misclassification`` criterion, or raw labels that
                ``get_criterion`` turns into one.
            early_stop: Accepted for interface compatibility; not used by
                this implementation.
            **kwargs: Handed to ``raise_if_kwargs``; no extra keyword
                arguments are supported.

        Returns:
            The perturbed inputs, converted back to the type of ``inputs``.

        Raises:
            ValueError: If the criterion is not ``Misclassification`` or the
                labels do not have shape ``(N,)``.
        """
        raise_if_kwargs(kwargs)
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion, kwargs

        verify_input_bounds(x, model)

        N = len(x)

        if isinstance(criterion_, Misclassification):
            classes = criterion_.labels
        else:
            raise ValueError("unsupported criterion")

        if classes.shape != (N,):
            raise ValueError(
                f"expected labels to have shape ({N},), got {classes.shape}"
            )

        min_, max_ = model.bounds

        # Largest absolute value of the unperturbed input; it scales the
        # step-size bound in every iteration.
        x_linf_norm = flatten(x.abs()).max(1)

        def loss_fun(x: ep.Tensor) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
            # Summing the per-sample scores yields per-sample gradients in a
            # single backward pass.
            logits = model(x)
            scores = ep.softmax(logits)
            pred_scores = scores[range(N), classes]
            loss = pred_scores.sum()
            return loss, (scores, pred_scores)

        for _ in range(self.steps):
            # (1) get the scores and gradients
            _, (scores, pred_scores), gradients = ep.value_aux_and_grad(loss_fun, x)

            pred = scores.argmax(-1)
            num_classes = scores.shape[-1]

            # (2) calculate gradient norms
            gradients_l2_norm = flatten(gradients.square()).sum(1).sqrt()
            gradients_linf_norm = flatten(gradients.abs()).max(1)

            # (3) calculate delta (step-size bound measured in L-infinity)
            step_bound = self.stepsize * x_linf_norm * gradients_linf_norm
            chance_gap = pred_scores - 1.0 / num_classes

            delta = ep.minimum(step_bound, chance_gap)

            # (4) stop the attack if an adversarial example has been found
            # this is not described in the paper but otherwise once the prob. drops
            # below chance level the likelihood is not decreased but increased
            is_not_adversarial = (pred == classes).float32()
            delta *= is_not_adversarial

            # (5) calculate & apply current perturbation
            # A vanishing gradient (e.g. a saturated softmax) would make this
            # a 0 / 0 division and turn the whole sample into NaN. Dividing
            # by 1 instead is harmless: the step is multiplied by the (zero)
            # gradient and therefore stays zero.
            gradients_sq_norm = gradients_l2_norm.square()
            safe_sq_norm = ep.where(
                gradients_sq_norm > 0,
                gradients_sq_norm,
                ep.ones_like(gradients_sq_norm),
            )
            scale = atleast_kd(delta / safe_sq_norm, gradients.ndim)

            x = x - scale * gradients

            x = ep.clip(x, min_, max_)

        return restore_type(x)

In [205]:
# Evaluation subset: the first `total` test samples, predicted in batches of `bs`.
# total = 10000
total = 100
bs = 100

images, labels = test_data[:total][0], test_data[:total][1]

# Wrap the network for foolbox, declaring [0, 255] as the valid input range.
fmodel = PyTorchModel(model, bounds=(0, 255))

# Perturbation budgets passed to the attacks as `epsilons`.
epsilons = [0, 0.5, 2.5, 5, 7.5, 10, 14, 20, 30]

In [206]:
# Model outputs for the unperturbed images, computed in batches of `bs`.
# They are passed to the attacks below in place of `labels`.
# NOTE(review): presumably these are predicted class indices -- confirm in attack_helper.
clean_pred = batched_predictions(model, images, bs)

  0%|          | 0/1 [00:00<?, ?it/s]

In [210]:
attack = LinfNewtonFoolAttack(steps=300, stepsize=0.04)

# The attack is run against the model's own clean predictions. Slice by
# `total` rather than a literal 100 so the labels always match `images`;
# the attack rejects labels whose shape differs from the batch size.
raw_advs, clipped_advs, success = attack(
    fmodel, images.cuda(), clean_pred[:total].cuda(), epsilons=epsilons
)

# Averaging over the last axis gives one success rate per epsilon.
success_rate = ep.astensor(success).float32().mean(axis=-1).raw
print(success_rate)

tensor([0.0000, 0.0000, 0.0300, 0.0500, 0.0500, 0.0700, 0.1000, 0.1300, 0.3500],
       device='cuda:0')


In [195]:
# for stepsize in [0.01, 0.02, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75]:
#     for steps in [10, 30, 50, 70, 100, 200, 300, 500]:
#         attack = LinfNewtonFoolAttack(steps=steps, stepsize=stepsize)

#         raw_advs, clipped_advs, success = attack(fmodel, images.cuda(), clean_pred[:100].cuda(), epsilons=epsilons)

#         success_rate = ep.astensor(success).float32().mean(axis=-1).raw
#         print('steps:', steps, 'stepsize:', stepsize, '\nsuccess_rate:', success_rate)

steps: 10 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.0900, 0.1200, 0.1200, 0.1200],
       device='cuda:0')
steps: 30 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0800, 0.0900, 0.1300, 0.2400, 0.4800],
       device='cuda:0')
steps: 50 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.0800, 0.1300, 0.2300, 0.4500],
       device='cuda:0')
steps: 70 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0800, 0.0900, 0.1400, 0.2300, 0.4500],
       device='cuda:0')
steps: 100 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0800, 0.0900, 0.1300, 0.2300, 0.4500],
       device='cuda:0')
steps: 200 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0800, 0.0900, 0.1300, 0.2400, 0.4500],
       device='cuda:0')
steps: 300 stepsize: 0.01 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.0900, 0.1400, 0.2400, 0.4400],

steps: 30 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 50 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 70 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 100 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 200 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 300 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
       device='cuda:0')
steps: 500 stepsize: 0.6 
 success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0600, 0.0700, 0.1100, 0.1800, 0.2900, 0.5700],
     

In [203]:
# Grid search over step size and number of steps for the L-infinity variant.
# The device transfers do not depend on the loop variables, so they are done
# once. Labels are sliced by `total` (not a literal 100) so they always match
# `images`.
images_gpu = images.cuda()
labels_gpu = clean_pred[:total].cuda()

for stepsize in [0.01, 0.02, 0.03, 0.04, 0.05]:
    for steps in [10, 30, 50, 70, 100, 200, 300, 500]:
        attack = LinfNewtonFoolAttack(steps=steps, stepsize=stepsize)

        raw_advs, clipped_advs, success = attack(fmodel, images_gpu, labels_gpu, epsilons=epsilons)

        # Averaging over the last axis gives one success rate per epsilon.
        success_rate = ep.astensor(success).float32().mean(axis=-1).raw
        print('steps:', steps, 'stepsize:', stepsize, '\nsuccess_rate:', success_rate)

steps: 10 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2300, 0.2900],
       device='cuda:0')
steps: 30 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2700, 0.4900],
       device='cuda:0')
steps: 50 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2700, 0.5000],
       device='cuda:0')
steps: 70 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2600, 0.4900],
       device='cuda:0')
steps: 100 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2700, 0.5000],
       device='cuda:0')
steps: 200 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2600, 0.5000],
       device='cuda:0')
steps: 300 stepsize: 0.01 
success_rate: tensor([0.0000, 0.0000, 0.0400, 0.0500, 0.0600, 0.1000, 0.1300, 0.2600, 0.5000],
      