# Implementation of "adversarial training" as a defence mechanism

## 1. Training on augmented dataset
### 1.1 Training on regular data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import torch
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from torch import nn, optim

In [2]:
#We take the MNIST dataset

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)) 
])

train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform) #60k images
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform) #10k images

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [3]:
def new_model():
    return nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),  # 1x28x28 -> 32x28x28
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # 32x28x28 -> 64x28x28
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2), # 64x28x28 -> 64x14x14
        nn.Flatten(),  # 64x14x14 -> 12544
        nn.Linear(64 * 14 * 14, 128),  # 12544 -> 128
        nn.ReLU(),
        nn.Linear(128, 10)  # 128 -> 10
    )

criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader):.4f}")

def test_model(model, test_loader, criterion, optimizer, epsilon=0.1):
    model.eval()
    correct, total, test_loss = 0, 0, 0
    correct_aug, test_loss_aug = 0, 0
    
    for inputs, labels in test_loader:
        inputs.requires_grad = True
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        loss.backward() #gets gradient 
        inputs_aug = torch.clamp(inputs + epsilon * torch.sign(inputs.grad), 0, 1)
        outputs_aug = model(inputs_aug)
        loss_aug = criterion(outputs_aug, labels)
        test_loss_aug += loss_aug.item()

        _, predicted_aug = torch.max(outputs_aug, 1)
        correct_aug += (predicted_aug == labels).sum().item()
    
    print(f"Test Loss: {test_loss / len(test_loader):.4f}")
    print(f"Accuracy: {100 * correct / total:.2f}%")
    print(f"Adversarial Test Loss: {test_loss_aug / len(test_loader):.4f}")
    print(f"Adversarial Accuracy: {100 * correct_aug / total:.2f}%")


In [None]:
model = new_model()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_model(model, train_loader, criterion, optimizer, epochs=5) #2 epochs decent, 10 would overfit, 5 is good
test_model(model, test_loader, criterion, optimizer)

# Unregularized training ends up overfitting
# Epoch 1/10, Loss: 0.1341
# Epoch 2/10, Loss: 0.0412
# Epoch 3/10, Loss: 0.0255
# Epoch 4/10, Loss: 0.0165
# Epoch 5/10, Loss: 0.0145
# Epoch 6/10, Loss: 0.0101
# Epoch 7/10, Loss: 0.0094
# Epoch 8/10, Loss: 0.0064
# Epoch 9/10, Loss: 0.0050
# Epoch 10/10, Loss: 0.0066
# Test Loss: 0.0480
# Accuracy: 98.93%
# Adversarial Test Loss: 0.4252
# Adversarial Accuracy: 86.55%

# Here's a shorter training, that avoids overfitting, but is not robust
# Epoch 1/2, Loss: 0.1335
# Epoch 2/2, Loss: 0.0396
# Test Loss: 0.0373
# Accuracy: 98.86%
# Adversarial Test Loss: 0.6968
# Adversarial Accuracy: 74.06%
# We see it's very weak to fgsm

# Medium length training is more robust and performs pretty well
# Epoch 1/5, Loss: 0.1354
# Epoch 2/5, Loss: 0.0408
# Epoch 3/5, Loss: 0.0255
# Epoch 4/5, Loss: 0.0172
# Epoch 5/5, Loss: 0.0130
# Test Loss: 0.0416
# Accuracy: 98.83%
# Adversarial Test Loss: 0.3076
# Adversarial Accuracy: 89.62%


Epoch 1/10, Loss: 0.1341
Epoch 2/10, Loss: 0.0412
Epoch 3/10, Loss: 0.0255
Epoch 4/10, Loss: 0.0165
Epoch 5/10, Loss: 0.0145
Epoch 6/10, Loss: 0.0101
Epoch 7/10, Loss: 0.0094
Epoch 8/10, Loss: 0.0064
Epoch 9/10, Loss: 0.0050
Epoch 10/10, Loss: 0.0066
Test Loss: 0.0480
Accuracy: 98.93%
Adversarial Test Loss: 0.4252
Adversarial Accuracy: 86.55%


In [None]:
model_reg = new_model()
optimizer_reg = optim.Adam(model_reg.parameters(), lr=0.001, weight_decay=5e-4) #with L2 regularization
train_model(model_reg, train_loader, criterion, optimizer_reg, epochs=5)
test_model(model_reg, test_loader, criterion, optimizer_reg)

# Ridge regularization increases robustness, but the accuracy drop is still fairly significant
# Epoch 1/5, Loss: 0.1327
# Epoch 2/5, Loss: 0.0427
# Epoch 3/5, Loss: 0.0309
# Epoch 4/5, Loss: 0.0230
# Epoch 5/5, Loss: 0.0199
# Test Loss: 0.0387
# Accuracy: 98.90%
# Adversarial Test Loss: 0.2664
# Adversarial Accuracy: 92.96%  

Epoch 1/5, Loss: 0.1327
Epoch 2/5, Loss: 0.0427
Epoch 3/5, Loss: 0.0309
Epoch 4/5, Loss: 0.0230
Epoch 5/5, Loss: 0.0199
Test Loss: 0.0387
Accuracy: 98.90%
Adversarial Test Loss: 0.2664
Adversarial Accuracy: 92.96%


### 1.2 Training on an augmented dataset

In [50]:
#We dynamically define images that are adversarial to the current model using fgsm, and train it to work on them
def train_model_aug(model, train_loader, criterion, optimizer, epochs=5, epsilon=0.1):
    model.train()
    for epoch in range(epochs):
        running_loss, running_loss_aug = 0, 0
        for inputs, labels in train_loader:
            inputs.requires_grad = True
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
            inputs_aug = torch.clamp(inputs + epsilon * torch.sign(inputs.grad), 0, 1)
            outputs_aug = model(inputs_aug)
            loss_aug = criterion(outputs_aug, labels)
            optimizer.zero_grad()
            loss_aug.backward()
            optimizer.step()
            running_loss_aug += loss_aug.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader):.4f}, Adversarial Loss: {running_loss_aug / len(train_loader):.4f}")

In [None]:
model_aug = new_model()
optimizer_aug = optim.Adam(model_aug.parameters(), lr=0.001)
train_model_aug(model_aug, train_loader, criterion, optimizer_aug, epochs=5)
test_model(model_aug, test_loader, criterion, optimizer_aug)


# It performs no worse on the real data, and its adversarial accuracy is now a lot stronger (and we still overfit, but that's somewhat irrelevant)
# Epoch 1/5, Loss: 0.1182, Adversarial Loss: 0.1263
# Epoch 2/5, Loss: 0.0392, Adversarial Loss: 0.0387
# Epoch 3/5, Loss: 0.0225, Adversarial Loss: 0.0209
# Epoch 4/5, Loss: 0.0161, Adversarial Loss: 0.0136
# Epoch 5/5, Loss: 0.0115, Adversarial Loss: 0.0091
# Test Loss: 0.0620
# Accuracy: 98.81%
# Adversarial Test Loss: 0.1727
# Adversarial Accuracy: 96.10%

Epoch 1/5, Loss: 0.1182, Adversarial Loss: 0.1263
Epoch 2/5, Loss: 0.0392, Adversarial Loss: 0.0387
Epoch 3/5, Loss: 0.0225, Adversarial Loss: 0.0209
Epoch 4/5, Loss: 0.0161, Adversarial Loss: 0.0136
Epoch 5/5, Loss: 0.0115, Adversarial Loss: 0.0091
Test Loss: 0.0620
Accuracy: 98.81%
Adversarial Test Loss: 0.1727
Adversarial Accuracy: 96.10%


In [None]:
model_reg_aug = new_model()
optimizer_reg_aug = optim.Adam(model_reg_aug.parameters(), lr=0.001, weight_decay=3e-5)
train_model_aug(model_reg_aug, train_loader, criterion, optimizer_reg_aug, epochs=5)
test_model(model_reg_aug, test_loader, criterion, optimizer_reg_aug)

# With weight_decay=5e-4, the regularized model is less robust because we forced it to be too simple
# Epoch 1/5, Loss: 0.1341, Adversarial Loss: 0.1460
# Epoch 2/5, Loss: 0.0608, Adversarial Loss: 0.0635
# Epoch 3/5, Loss: 0.0486, Adversarial Loss: 0.0518
# Epoch 4/5, Loss: 0.0403, Adversarial Loss: 0.0439
# Epoch 5/5, Loss: 0.0371, Adversarial Loss: 0.0406
# Test Loss: 0.0394
# Accuracy: 98.68%
# Adversarial Test Loss: 0.1396
# Adversarial Accuracy: 95.24%

# weight_decay=1-4
# Epoch 1/5, Loss: 0.1241, Adversarial Loss: 0.1352
# Epoch 2/5, Loss: 0.0472, Adversarial Loss: 0.0487
# Epoch 3/5, Loss: 0.0340, Adversarial Loss: 0.0347
# Epoch 4/5, Loss: 0.0289, Adversarial Loss: 0.0283
# Epoch 5/5, Loss: 0.0234, Adversarial Loss: 0.0233
# Test Loss: 0.0480
# Accuracy: 98.65%
# Adversarial Test Loss: 0.1340
# Adversarial Accuracy: 95.99%

# weight_decay=3e-5
# Epoch 1/5, Loss: 0.1311, Adversarial Loss: 0.1420
# Epoch 2/5, Loss: 0.0429, Adversarial Loss: 0.0440
# Epoch 3/5, Loss: 0.0273, Adversarial Loss: 0.0271
# Epoch 4/5, Loss: 0.0214, Adversarial Loss: 0.0200
# Epoch 5/5, Loss: 0.0182, Adversarial Loss: 0.0162
# Test Loss: 0.0618
# Accuracy: 98.44%
# Adversarial Test Loss: 0.1739
# Adversarial Accuracy: 95.28%

Epoch 1/5, Loss: 0.1311, Adversarial Loss: 0.1420
Epoch 2/5, Loss: 0.0429, Adversarial Loss: 0.0440
Epoch 3/5, Loss: 0.0273, Adversarial Loss: 0.0271
Epoch 4/5, Loss: 0.0214, Adversarial Loss: 0.0200
Epoch 5/5, Loss: 0.0182, Adversarial Loss: 0.0162
Test Loss: 0.0618
Accuracy: 98.44%
Adversarial Test Loss: 0.1739
Adversarial Accuracy: 95.28%


### Conclusion

Typical training is very weak to adversarial attacks, and quickly mislabels adversarial attacks
Regularization improves robustness significantly.
However, training directly on adversarial images increases robustness very significantly, while leading to no noticeable accuracy loss on real data
Further testing against larger epsilons (i.e. more adversarial images) is required.

## 2. Training with modified loss function
We follow the approach described by Goodfellow et al. and change the loss function to:
$$\tilde{J}_\theta(x, y) = \alpha J_\theta(x, y) + (1-\alpha)J_\theta\left(x+\varepsilon\text{ sign}\left(\nabla_x J_\theta(x, y)\right)\right)$$

### 2.1 Training

### 2.2 Benchmark