# Task 3: Adaptive Attack

In [None]:
import pickle
import os
from torch import optim
import torch
import torchvision
import numpy as np

### If you are using Google Colab, upload this notebook and the codebase to your Google Drive, then mount your Google Drive in Colab and set your working directory. If you are running on your local machine, you can skip the following Colab-specific cells.

In [None]:
# Colab only: mount Google Drive at /content/drive so the uploaded codebase is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted drive that holds the assignment code.
root_dir = "/content/drive/My Drive/"
project_dir = "Assignment1_code" # Change to your path
# Work from the project folder so relative paths such as "test_image/" resolve.
os.chdir(f"{root_dir}{project_dir}")

In [None]:
# Make sure the path is correct
!ls

CS5562_Assignment_1_Task_1.ipynb    defense.py		       __pycache__
CS5562_Assignment_1_Task_2.ipynb    environment.yml	       results
CS5562_Assignment_1_Task_3.ipynb    imagenet_class_index.json  test_image
CS5562_Assignment_1_Task_4.ipynb    JSMA		       utilities.py
CS5562_Assignment_1_Task_5.ipynb    MNIST
CS5562_Assignment_1_Warm_ups.ipynb  model.py


## Implement the Attack Algorithm

In [None]:
from tqdm import tqdm

class adaptive_attack:
    """
    The adaptive attack in Task 3 to break the defense model in Task 2.

    The attack is a single-step FGSM whose gradient is estimated with
    Expectation over Transformation (EOT): the loss gradient w.r.t. the input
    is averaged over ``num_samples`` independent calls to
    ``target_model.predict``, so that any randomness applied inside
    ``predict`` (e.g. a randomization defense) is averaged out before the
    sign of the gradient is taken.
    """

    def __init__(self, target_model, epsilon: float, steps: int, learning_rate: float,
                 num_samples: int = 100):
        """
        :param target_model: object exposing ``predict(image)``; its output is passed to the
            cross-entropy loss together with the label tensor.
        :param epsilon: magnitude of every non-zero entry of the returned perturbation.
        :param steps: stored for interface compatibility; unused by this single-step attack.
        :param learning_rate: stored for interface compatibility; unused by this attack.
        :param num_samples: number of gradient samples averaged for the EOT estimate.
        :raises ValueError: if ``num_samples`` is smaller than 1.
        """
        if num_samples < 1:
            raise ValueError('num_samples must be a positive integer')
        self.target_model = target_model
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.steps = steps
        self.num_samples = num_samples

    def _loss_gradient(self, input_image: torch.Tensor, y_true: torch.Tensor, is_targeted: bool,
                       y_target: torch.Tensor) -> torch.Tensor:
        """Return one sample of the FGSM loss gradient w.r.t. a zero perturbation of ``input_image``."""
        perturbation = torch.zeros_like(input_image, requires_grad=True)
        pred = self.target_model.predict(input_image + perturbation)

        if is_targeted:
            # Ascending the negated loss moves the prediction towards the target label.
            loss = -torch.nn.functional.cross_entropy(pred, y_target)
        else:
            loss = torch.nn.functional.cross_entropy(pred, y_true)

        # autograd.grad returns only the gradient we need; unlike loss.backward() it does
        # not accumulate gradients into the model parameters on every sample.
        (gradient,) = torch.autograd.grad(loss, perturbation)
        return gradient

    def attack(self, test_image: torch.Tensor, y_true: torch.Tensor, is_targeted: bool = False,
               y_target: torch.Tensor = None) -> torch.Tensor:
        """
        Compute the adversarial perturbation for ``test_image``.

        :param test_image: image tensor to attack.
        :param y_true: label tensor of the true class (used when the attack is untargeted).
        :param is_targeted: if True, push the prediction towards ``y_target``.
        :param y_target: label tensor of the target class; required when ``is_targeted`` is True.
        :return: perturbation with the same shape as ``test_image``, equal to
            ``sign(mean gradient) * epsilon``.
        :raises ValueError: if ``is_targeted`` is True but ``y_target`` is None.
        """
        if is_targeted and y_target is None:
            raise ValueError('Please input the target label')

        ##########
        # EOT: keep a running sum instead of storing every sampled gradient.
        gradient_sum = torch.zeros_like(test_image)
        for _ in tqdm(range(self.num_samples)):
            gradient_sum += self._loss_gradient(test_image, y_true, is_targeted, y_target)

        mean_gradient = gradient_sum / self.num_samples
        perturbation = torch.sign(mean_gradient) * self.epsilon
        ##########
        return perturbation

## Test your code

### Copy and paste your Task 2 algorithm here

In [None]:
from torchvision import transforms
import torch
from scipy.stats import norm, binom_test
import numpy as np
from math import ceil
from statsmodels.stats.proportion import proportion_confint
import torch.nn as nn
import torchvision
from torch import optim

In [None]:
def randomization_input(test_image, resize_bound):
    '''
    Randomly resize the image and randomly pad it up to resize_bound x resize_bound.

    The image is first resized to a random (H, W) with both sides drawn uniformly from
    [224, resize_bound). The remaining rows/columns are then filled with a constant
    pad value, split at a random ratio between left/right and top/bottom.

    :param test_image: the test image which could be clean or adversarial. The size is [1, 3, 224,224]
    :param resize_bound: side length of the padded output; must be greater than 224.
    :return: randomized version of the test image. The size is [1, 3, resize_bound, resize_bound].
    :raises ValueError: if resize_bound is not greater than 224, or if the output shape
        is not [1, 3, resize_bound, resize_bound].
    '''

    PAD_VALUE = 0.5  # this is the pad value for the random padding step
    MIN_SIZE = 224   # resizing stays in [224, resize_bound) for least performance drop in clean images

    if resize_bound <= MIN_SIZE:
        raise ValueError('resize_bound must be greater than %d, got %r' % (MIN_SIZE, resize_bound))

    ################
    # Randomized resizing. NumPy integers are cast to plain ints before being handed
    # to torchvision / torch, which expect Python ints for sizes and paddings.
    rand_W, rand_H = (int(v) for v in np.random.randint(low=MIN_SIZE, high=resize_bound, size=2))
    resized_image = transforms.Resize(size=(rand_H, rand_W))(test_image)

    ## Padding - (padding_left, padding_right, padding_top, padding_bottom)
    leftover_W, leftover_H = (resize_bound - rand_W), (resize_bound - rand_H)
    # Each ratio is one of 0/10, 1/10, ..., 10/10 of the leftover space.
    prob_w, prob_h = (int(v) for v in np.random.randint(0, 11, size=2))

    left = leftover_W * prob_w // 10
    right = leftover_W - left
    top = leftover_H * prob_h // 10
    bottom = leftover_H - top

    pad = (left, right, top, bottom)
    randomized_image = nn.functional.pad(resized_image, pad, "constant", PAD_VALUE)
    # Raise explicitly: an assert would be stripped under `python -O`.
    if randomized_image.shape != torch.Size([1, 3, resize_bound, resize_bound]):
        raise ValueError('Shape does not match')

    ################
    return randomized_image

### Helper functions

In [None]:
import time
from PIL import Image

from utilities import *
from defense import standard_trainer

In [None]:
def compute_attack_grade_imagenet(model, test_dir, eps=0.1, is_targeted=False, y_target=None, steps=10,
                                  learning_rate=0.1, num_pixel=None, defense_name=None, max_images=1):
    """
    Run the adaptive attack on images from ``test_dir`` and report its grade.

    :param model: defended model exposing ``predict``; it is both attacked and evaluated.
    :param test_dir: directory with ``<label>_*.JPEG`` files; the label is the part of the
        file name before the first underscore.
    :param eps: perturbation budget handed to the attacker.
    :param is_targeted: whether to run a targeted attack.
    :param y_target: integer target label; required when ``is_targeted`` is True.
    :param steps: forwarded to the attacker.
    :param learning_rate: forwarded to the attacker.
    :param num_pixel: unused; kept for interface compatibility.
    :param defense_name: name printed in the result summary.
    :param max_images: maximum number of ``.JPEG`` files to attack (files are taken in
        sorted order); ``None`` attacks every file.
    :return: ``(mean, variance)`` of the attack grade over the evaluated images; both are
        NaN when no image could be evaluated.
    :raises ValueError: if ``is_targeted`` is True but ``y_target`` is None.
    """
    attacker = adaptive_attack(model, eps, steps, learning_rate)

    if is_targeted:
        # Validate before building the tensor; otherwise the check can never fire.
        if y_target is None:
            raise ValueError('Please input the target label')
        y_target = torch.LongTensor([y_target])

    # Filter first, then limit: slicing the raw directory listing could select a
    # non-image entry and leave nothing to evaluate. Sorting makes the choice deterministic.
    image_files = sorted(name for name in os.listdir(test_dir) if name.endswith(".JPEG"))
    if max_images is not None:
        image_files = image_files[:max_images]

    attack_grade = []
    attack_success = []
    start_time = time.time()
    for filename in image_files:
        # convert the name of the label to the number
        y_true = filename.split('_')[0]
        y_true_tensor = load_label_tensor(y_true)

        # read the image and pre-process the data; the file handle is closed afterwards
        with Image.open(os.path.join(test_dir, filename)) as test_image:
            target_image_tensor = preprocess_features(test_image)[None, :, :, :]

        # generate the perturbation
        delta = attacker.attack(target_image_tensor, y_true_tensor, is_targeted=is_targeted, y_target=y_target)

        # generate the adv examples based on the perturbation
        adv_example = get_adv_example(target_image_tensor, delta, "adaptive")

        # compute the prediction based on the clean images and adversarial examples
        pred_clean = model.predict(target_image_tensor)
        pred_adv = model.predict(adv_example)

        # compute the score of the attack; (-1, -1) marks an image that is skipped
        grade, success = get_attack_score(y_true_tensor, pred_clean, pred_adv, is_targeted=is_targeted,
                                          y_target=y_target)
        if grade == -1 and success == -1:
            continue
        attack_grade.append(grade)
        attack_success.append(success)

    if attack_grade:
        grades = np.array(attack_grade)
        grade_mean, grade_var = grades.mean(), grades.var()
        success_rate = np.array(attack_success).mean()
    else:
        # Nothing was evaluated: report NaN without triggering NumPy's empty-mean warning.
        grade_mean = grade_var = success_rate = float('nan')

    print('----------results-------------')
    print("[%s attack against %s model] \nattacking %d images using %.3f seconds" % (
    "adaptive", defense_name, len(attack_grade), time.time() - start_time))
    print("grade %.2f, success rate: %.2f" % (grade_mean, success_rate))

    return grade_mean, grade_var

def get_attack_score(y_true: torch.tensor, pred_clean: torch.tensor, pred_adv: torch.tensor, weight: float = 0.5,
                     is_targeted: bool = False, y_target: torch.Tensor = None):
    """
    Grade one adversarial example.

    Returns ``(-1, -1)`` when the clean prediction does not match ``y_true``.
    Otherwise returns ``(grade, success)`` where ``success`` is 1 if the attack
    reached its goal (prediction equals ``y_target`` for targeted attacks,
    differs from ``y_true`` otherwise) and ``grade`` is
    ``weight * success + (1 - weight) * confidence_shift``. The confidence
    shift is the change, in the direction favourable to the attacker, of the
    value ``get_confidence`` reports for the reference label.
    """
    clean_label = pred_clean.argmax().item()
    if clean_label != y_true.item():
        # The clean image is already misclassified, so it is not scored.
        return -1, -1

    if is_targeted:
        reference = y_target.item()
        clean_conf = get_confidence(pred_clean, reference)
        adv_conf = get_confidence(pred_adv, reference)
        reached_target = int(pred_adv.argmax().item() == reference)
        # A targeted attack gains when confidence in the target label rises.
        return weight * reached_target + (1 - weight) * (adv_conf - clean_conf), reached_target

    reference = y_true.item()
    clean_conf = get_confidence(pred_clean, reference)
    adv_conf = get_confidence(pred_adv, reference)
    fooled = int(pred_adv.argmax().item() != reference)
    # An untargeted attack gains when confidence in the true label drops.
    return weight * fooled + (1 - weight) * (clean_conf - adv_conf), fooled

In [None]:
class Randomization_Defense:
    """Wrap a classifier so every prediction goes through the random resize-and-pad step."""

    def __init__(self, model, resize_bound):
        # resize_bound is the side length passed to randomization_input.
        self.resize_bound = resize_bound
        self.model = model

    def predict(self, test_image):
        """Normalize ``test_image``, randomize it, and return the wrapped model's output."""
        # Per-channel normalization constants (presumably the standard ImageNet statistics).
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        normalized = transforms.Normalize(channel_mean, channel_std)(test_image)
        randomized = randomization_input(normalized, self.resize_bound)
        return self.model(randomized)

### Testing

In [None]:
# Wrap the pretrained classifier in the randomization defense (resize bound 256),
# then grade an untargeted adaptive attack against it.
pretrained_model = load_model()
target_model = Randomization_Defense(pretrained_model, 256)
defense_name = 'randomization defense'
grade_mean, grade_variance = compute_attack_grade_imagenet(
    target_model,
    "test_image/",
    eps=0.01,
    is_targeted=False,
    y_target=None,
    defense_name=defense_name,
)

100%|██████████| 100/100 [00:59<00:00,  1.68it/s]


----------results-------------
[adaptive attack against randomization defense model] 
attacking 1 images using 59.834 seconds
grade 0.98, success rate: 1.00


# Report

Please describe your attack strategy and write down how you overcame the difficulties you faced, if any.

Your Response:

## Attack Strategy

My core attack algorithm is FGSM. However, I added a slight modification to incorporate Expectation over Transformation (EOT) into my attack strategy.

- Since we know that the input image is transformed before getting fed into the classifier, I try to guess the gradient of the loss wrt the perturbation by sampling N randomly transformed images and calculating the expectation of their loss gradients.

- The idea is that the average of the loss gradients over N sampled transformations is a close approximation to the gradient of the expected loss, i.e. the gradient the randomized model exhibits on average for this input image.

- Moreover, I chose a single-step attack like FGSM over PGD since single-step attacks are faster to implement and run (especially with a large N, because every PGD step would need its own N gradient samples) and tend to generalize better.


## Observed Results

- For untargeted attacks, my implementation works with:
```
N = 100
grade 0.99
success rate: 1.00
```

I attribute this to the fact that the loss landscape of a neural network is highly complex and the decision boundaries of the 1000 classes are greatly convoluted. Therefore, in order to cause a misclassification into any of the other 999 classes, it is enough to approximate the gradient direction from a relatively small number of samples.

- For a targeted attack, my implementation fails to attack even with
```
N = 1000
target_class = 100
true_pred = 701
```
I attribute this to the idea that:
  - either the number of drawn samples is not sufficient to get a close approximation of the true gradients, which could be addressed by drawing many more samples, e.g. `N = 1,000,000`, but this comes with a huge computation cost, OR,
  - the gradients are approximately correct but the decision boundary of the targeted class does not lie within the ϵ-ball of the input data. Perhaps an attack targeting another class with a closer decision boundary would be successful.

## Difficulties faced

- I was initially under the false impression that transformations (padding and resizing specifically) are non-differentiable. Hence, I was trying to compute gradients in some weird way, and it ended up not working correctly (was having size issues: `224` vs `resize_bound`).

- I overcame this difficulty by reading the PyTorch documentation on transformations correctly (LOL).