# Warm-ups: Implement PGD and FSG attcks

## Dependencies

In [1]:
import pickle
import tensorflow as tf
import os
from torch import optim
import torchvision
import numpy as np

### If you are using Google Colab, you need to upload this notebook and the codebase to your Google Drive. Then you need to mount your Google Drive in Colab and set your working directory. If you are running on your local machine, you can ignore the following line.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
root_dir = "/content/drive/My Drive/"
project_dir = "Assignment1_code" # Change to your path
os.chdir(root_dir + project_dir)

In [4]:
# Make sure the path is correct
!ls

CS5562_Assignment_1_Task_1.ipynb    environment.yml
CS5562_Assignment_1_Task_2.ipynb    imagenet_class_index.json
CS5562_Assignment_1_Task_3.ipynb    JSMA
CS5562_Assignment_1_Task_4.ipynb    model.py
CS5562_Assignment_1_Task_5.ipynb    __pycache__
CS5562_Assignment_1_Warm_ups.ipynb  test_image
defense.py			    utilities.py


In [5]:
from utilities import *

## Implement FSG

In [6]:
class FSG_attack:
    """
        The FSGM attack in warm-up task.
    """
    def __init__(self, target_model: torchvision.models, epsilon: float):
        self.target_model = target_model
        self.epsilon = epsilon

    def attack(self, test_image: torch.tensor, y_true: torch.tensor, is_targeted: bool = False,
               y_target: torch.tensor = None)-> torch.tensor:
        perturbation = torch.zeros_like(test_image, requires_grad=True)  # init the perturbation
        pred = self.target_model.predict(test_image + perturbation)

        if is_targeted:
            ############
            # TODO: implement the loss
            criterion = torch.nn.CrossEntropyLoss()
            loss = -criterion(pred, y_target)
            ############
        else:
            ############
            # TODO: implement the loss function
            criterion = torch.nn.CrossEntropyLoss()
            loss = criterion(pred, y_true)
            ############

        loss.backward()
        gradient = perturbation.grad  # get the gradient of the loss with respect to the perturbation
        ############
        # TODO: update the perturbation
        perturbation = self.epsilon * torch.sign(gradient)
        ############
        return perturbation

## Implement PGD

In [36]:
class PGD_attack:
    """
        The PGD attack in warm-up task.
    """
    def __init__(self, target_model: torchvision.models, epsilon: float, steps: int,
                 learning_rate: float = 1e-1):
        self.target_model = target_model
        self.epsilon = epsilon
        self.steps = steps
        self.learning_rate = learning_rate

    def attack(self, test_image: torch.tensor, y_true: torch.tensor, is_targeted: bool = False,
               y_target: torch.tensor = None) -> torch.tensor:

        perturbation = torch.zeros_like(test_image, requires_grad=True)
        opt = optim.SGD([perturbation], lr=self.learning_rate)

        for t in range(self.steps):
            pred = self.target_model.predict(test_image + perturbation)
            if is_targeted:
                ############
                # TODO: implement the loss
                criterion = torch.nn.CrossEntropyLoss()
                loss = criterion(pred, y_target)
                ############
            else:
                ############
                # TODO: implement the loss
                criterion = torch.nn.CrossEntropyLoss()
                loss = -criterion(pred, y_true)
                ############

            if t%10 == 0:
                print("PGD attack epoch %d: loss %s"%(t,str(loss.item())))

            opt.zero_grad()
            loss.backward()
            opt.step()
            ##########
            # TODO: update the perturbation
            ##########
            with torch.no_grad():
              gradient = perturbation.grad
              epsilon = torch.full(test_image.shape, self.epsilon)
              perturbation = torch.max(torch.min(gradient, epsilon), -epsilon).requires_grad_(True)

        return perturbation

# Test your code

## Helper functions

In [8]:
from model import Undefended_Model
import time
from PIL import Image

In [9]:
def compute_attack_grade_imagenet(attack_name, model, test_dir, eps=0.1, is_targeted=False, y_target=None, steps=10,
                                  learning_rate=0.1, num_pixel=None, defense_name=None):
    if attack_name == 'FSG':
        attacker = FSG_attack(model, eps)
    elif attack_name == 'PGD':
        attacker = PGD_attack(model, eps, steps, learning_rate)

    else:
        raise ValueError('Please input the corret attack name: FSG, PGD, OP,adaptive')

    if is_targeted:
        y_target = torch.LongTensor([y_target])

        assert y_target is not None, ValueError('Please input the target label')

    attack_grade = []
    attack_success = []
    start_time = time.time()
    for filename in os.listdir(test_dir)[:1]:

        if filename.endswith(".JPEG"):
            # convert the name of the label to the number
            y_true = filename.split('_')[0]
            y_true_tensor = load_label_tensor(y_true)

            # read the image and pre-processing the data
            test_image = Image.open(test_dir + filename)
            target_image_tensor = preprocess_features(test_image)[None, :, :, :]

            # generate the pertubation
            delta = attacker.attack(target_image_tensor, y_true_tensor, is_targeted=is_targeted, y_target=y_target)

            # generate the adv examples based on the pertubation
            adv_example = get_adv_example(target_image_tensor, delta, attack_name)

            # compute the prediction based on the clean images and adversarial examples
            pred_clean = model.predict(target_image_tensor)
            pred_adv = model.predict(adv_example)

            # compute the score of the attack
            grade, success = get_attack_score(y_true_tensor, pred_clean, pred_adv, is_targeted=is_targeted,
                                              y_target=y_target)
            if grade == -1 and success == -1:
                continue
            attack_grade.append(grade)
            attack_success.append(success)

    print('----------results-------------')
    print("[%s attack against %s model] \nattacking %d images using %.3f seconds" % (
    attack_name, defense_name, len(attack_grade), time.time() - start_time))
    print("grade %.2f, success rate: %.2f" % (np.array(attack_grade).mean(), np.array(attack_success).mean()))

    return np.array(attack_grade).mean(), np.array(attack_grade).var()

def get_attack_score(y_true: torch.tensor, pred_clean: torch.tensor, pred_adv: torch.tensor, weight: float = 0.5,
                     is_targeted: bool = False, y_target: torch.Tensor = None):
    # assert pred_clean.argmax().item() == y_true.item(), ValueError("this image is not valid")
    if pred_clean.argmax().item() != y_true.item():
        return -1, -1

    if is_targeted:
        confid_clean = get_confidence(pred_clean, y_target.item())
        confid_adv = get_confidence(pred_adv, y_target.item())
        return weight * int(pred_adv.argmax().item() == y_target.item()) + (1 - weight) * (
                    confid_adv - confid_clean), int(pred_adv.argmax().item() == y_target.item())
    else:
        confid_clean = get_confidence(pred_clean, y_true.item())
        confid_adv = get_confidence(pred_adv, y_true.item())

        return weight * int(pred_adv.argmax().item() != y_true.item()) + (1 - weight) * (
                    confid_clean - confid_adv), int(pred_adv.argmax().item() != y_true.item())


## Testing

You can play around with different hyper-parameters

In [38]:
pretrained_model = load_model()
target_model = Undefended_Model(pretrained_model)

attacker_type = "FSG" # You can change it to PGD
grade_mean, grade_variance = compute_attack_grade_imagenet(attacker_type, target_model, "test_image/", eps=0.1, is_targeted=False, y_target=None, steps=50, learning_rate=0.02)
grade_mean, grade_variance = compute_attack_grade_imagenet(attacker_type, target_model, "test_image/", eps=0.5, is_targeted=True, y_target=702, steps=50, learning_rate=0.02)

attacker_type = "PGD" # You can change it to PGD
grade_mean, grade_variance = compute_attack_grade_imagenet(attacker_type, target_model, "test_image/", eps=0.7, is_targeted=False, y_target=None, steps=200, learning_rate=0.1)
grade_mean, grade_variance = compute_attack_grade_imagenet(attacker_type, target_model, "test_image/", eps=0.7, is_targeted=True, y_target=702, steps=200, learning_rate=0.02)

----------results-------------
[FSG attack against None model] 
attacking 1 images using 0.829 seconds
grade 1.00, success rate: 1.00
----------results-------------
[FSG attack against None model] 
attacking 1 images using 0.833 seconds
grade -0.00, success rate: 0.00
PGD attack epoch 0: loss -0.0004694551753345877
PGD attack epoch 10: loss -0.0003847335756290704
PGD attack epoch 20: loss -0.0003840185818262398
PGD attack epoch 30: loss -0.00038485272671096027
PGD attack epoch 40: loss -0.0003768687602132559
PGD attack epoch 50: loss -0.00038556772051379085
PGD attack epoch 60: loss -0.000377583724912256
PGD attack epoch 70: loss -0.0003800861886702478
PGD attack epoch 80: loss -0.00038378025055862963
PGD attack epoch 90: loss -0.00038485272671096027
PGD attack epoch 100: loss -0.00037877538125030696
PGD attack epoch 110: loss -0.00038818930624984205
PGD attack epoch 120: loss -0.00038580605178140104
PGD attack epoch 130: loss -0.0003815161471720785
PGD attack epoch 140: loss -0.000383