# Task 4: Adversarial Training

In [1]:
from torchvision import transforms
import torch
from scipy.stats import norm, binom_test
import numpy as np
from math import ceil
from statsmodels.stats.proportion import proportion_confint
import torch.nn as nn
from torch import optim

### If you are using Google Colab, you need to upload this notebook and the codebase to your Google Drive. Then you need to mount your Google Drive in Colab and set your working directory. If you are running on your local machine, you can ignore the following line.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
root_dir = "/content/drive/My Drive/"
project_dir = "Assignment1_code" # Change to your path
os.chdir(root_dir + project_dir)

In [4]:
# Make sure the path is correct
!ls

CS5562_Assignment_1_Task_1.ipynb    defense.py		       __pycache__
CS5562_Assignment_1_Task_2.ipynb    environment.yml	       results
CS5562_Assignment_1_Task_3.ipynb    imagenet_class_index.json  test_image
CS5562_Assignment_1_Task_4.ipynb    JSMA		       utilities.py
CS5562_Assignment_1_Task_5.ipynb    MNIST
CS5562_Assignment_1_Warm_ups.ipynb  model.py


## Implement the Robust Training Algorithm

In [35]:
def robust_trainer(loader, model, epsilon, opt=None):
    total_loss, total_err = 0., 0.
    for X, y in loader:
        #################
        # TODO: implement your robust training. Implement the loss and the prediction of the model.

        sign = torch.sign(model.weight).type(torch.LongTensor)

        ### Projected y ==> {-1, 1} in order to maintain consistency with the current experiment.
        mapped_y = y
        mapped_y[mapped_y == 0] = -1
        mapped_y = mapped_y[:, None, None]

        # Get best adversarial image for y ==> {-1, 1}
        adv_X = torch.flatten(X, 2) - epsilon*torch.matmul(mapped_y, sign)
        pred = model(adv_X).squeeze()

        # Outer optimization
        criterion = torch.nn.BCEWithLogitsLoss()
        loss = criterion(pred, y.float())

        #################
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_err += ((pred > 0) * (y == 0) + (pred < 0) * (y == 1)).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

## Test your code

### Helper functions

In [6]:
from utilities import *
from defense import standard_trainer

### Testing

In [36]:
train_loader, test_loader = load_data('mnist')
robust_model = nn.Linear(784, 1)
opt = optim.SGD(robust_model.parameters(), lr=1e-1)

print("---------Training----------")
print("Train Err", "Train Loss", "Test Err", "Test Loss", sep="\t")
for i in range(10):
    train_err, train_loss = robust_trainer(train_loader, robust_model, 0.01, opt)
    test_err, test_loss = robust_trainer(test_loader, robust_model, 0.01)
    print(*("{:.6f}".format(i) for i in (train_err, train_loss, test_err, test_loss)), sep="\t")

train_err, train_loss = standard_trainer(train_loader, robust_model)
test_err, test_loss = standard_trainer(test_loader, robust_model)

print("---------result----------")
print("test error: %0.6f, loss %0.6f" % (test_err,test_loss))

---------Training----------
Train Err	Train Loss	Test Err	Test Loss
0.325069	-67.236302	0.304019	-135.218715
0.320332	-201.719023	0.315839	-269.596669
0.319700	-336.117538	0.318676	-403.971446
0.321042	-470.531108	0.318676	-538.285577
0.319463	-604.880684	0.321040	-672.597670
0.319147	-739.234940	0.322459	-806.948575
0.319937	-873.642724	0.322931	-941.284024
0.320963	-1008.013024	0.321986	-1075.471848
0.320884	-1142.237945	0.321986	-1209.864648
0.322148	-1276.664018	0.318203	-1344.102004
---------result----------
test error: 0.185343, loss 33.071336


# Report

## Write in the cell below about how you solved the inner optimization and the difficulties faced

## Inner optimization

1. Suppose the perturbation set is the $l_\infty$ ball of size $ϵ$ around $x$. \\
$ P_x = \{x+δ : ||δ||_\infty ≤ ϵ \} $ \\

2. The inner optimization objective is as follows: \\
$max_{||δ||_\infty ≤ ϵ}$ $\{l(w^T (x + δ), y)\} ≡$ $max_{||δ||_\infty ≤ ϵ}$ $\{L(y · (w^T (x + δ) + b)) \}$ \\

3. We know that, \\
$L(y · h_θ(x)) = log(1 + exp(−y · h_θ(x)))$

  Therefore, the inner optimization becomes: \\
  $max_{||δ||_\infty ≤ ϵ}$ $\{ log(1 + exp(−y · (w^T (x + δ) + b))) \}$

4. Since the function $log(1 + exp(−z))$ is monotonically decreasing with respect to $z$, it can be further simplified as: \\
      $min_{||δ||_\infty ≤ ϵ}$ $\{ y·w^Tδ \}$

5. Thus, the optimal perturbation is: \\
$ δ^∗ = −y.ϵ sign(w) $.

## Difficulties

The assigment description states that we need to change $y \in \{-1, 1\}$ for optimal perturbation calculation.

I spent a lot of time trying to understand why we are projecting the classes to $\{-1, 1\}$ for calculating adversarial images, but using classes $\{0, 1\}$ for calculating outer optimization. Perhaps its because the sigmoid function inside `torch.BCELossWIthLogits` maps pred to $\{0, 1\}$.
But then why not calculate the optimal perturbation for $\{0, 1\}$?

I compared losses of all permutations:
  1. $y ∈ \{-1, 1\}$ for optimal perturbation, $y ∈ \{0, 1\}$ for loss calculation.
  2. $y ∈ \{-1, 1\}$ for optimal perturbation, $y ∈ \{-1, 1\}$ for loss calculation.
  3. $y ∈ \{0, 1\}$ for optimal perturbation, $y ∈ \{0, 1\}$ for loss calculation.

I have finally implemented (and reported) case 1. I understand that adversarial training in general reduces the utility of the model by some margin or keeps it same to normal training, and this result validates that.

I also got this clarified on MS Teams.
