# Implementation of the "NULL labeling" defence mechanism
This notebook implements the NULL labeling defence mechanism, which protects a model from adversarial attacks by letting it classify attacked images into an additional NULL class. Experiments are conducted on a simple CNN trained on the MNIST dataset, against attacks generated with the Fast Gradient Sign Method (FGSM).

## 1. Training the classifier on clean images

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import torch
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from torch import nn, optim

# We take the MNIST dataset.

# Normalisation statistics. They are kept as module-level names because the
# FGSM code needs them to map normalised tensors back to [0, 1] pixel space,
# so the transform below must use exactly the same values.
mean, std = 0.1307, 0.3081
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((mean,), (std,)),
])

train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)  # 60k images
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)  # 10k images

# Only the training set is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

def new_model():
    """Build a fresh, untrained CNN mapping 1x28x28 images to 10 class logits."""
    conv_stack = [
        nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1),  # 1x28x28 -> 32x28x28
        nn.ReLU(),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),  # 32x28x28 -> 64x28x28
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),  # 64x28x28 -> 64x14x14
    ]
    flat_features = 64 * 14 * 14  # 12544 values per image once flattened
    classifier_head = [
        nn.Flatten(),  # 64x14x14 -> 12544
        nn.Linear(flat_features, 128),  # 12544 -> 128
        nn.ReLU(),
        nn.Linear(128, 10),  # 128 -> 10
    ]
    return nn.Sequential(*conv_stack, *classifier_head)

# Loss object passed as the `criterion` argument to the train/test helpers in this notebook.
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Fit ``model`` on ``train_loader`` and print the mean batch loss per epoch.

    Args:
        model: network to optimise; it is switched to training mode.
        train_loader: sized iterable yielding ``(inputs, labels)`` batches.
        criterion: callable computing the loss from ``(outputs, labels)``.
        optimizer: optimiser holding the parameters of ``model``.
        epochs: number of passes over ``train_loader``.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        epoch_loss = 0.0
        for batch, targets in train_loader:
            batch_loss = criterion(model(batch), targets)

            # Clear stale gradients, back-propagate, then update the weights.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            epoch_loss += batch_loss.item()
        mean_loss = epoch_loss / len(train_loader)
        print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss:.4f}")

def test_model(model, test_loader, criterion, optimizer):
    model.eval()
    correct, total, test_loss = 0, 0, 0
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs.requires_grad = True
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        print(f"Test Loss: {test_loss / len(test_loader):.4f}")
        print(f"Accuracy: {100 * correct / total:.2f}%")


In [2]:
# Baseline: the plain 10-class CNN, trained and evaluated on clean images only.
model_reg = new_model()
optimizer_reg = optim.Adam(model_reg.parameters(), lr=0.001, weight_decay=1e-4)  # weight_decay adds L2 regularization
train_model(model_reg, train_loader, criterion, optimizer_reg, epochs=5)
# test_model ignores its optimizer argument; it is passed only to match the signature.
test_model(model_reg, test_loader, criterion, optimizer_reg)

Epoch 1/5, Loss: 0.1285
Epoch 2/5, Loss: 0.0422
Epoch 3/5, Loss: 0.0293
Epoch 4/5, Loss: 0.0223
Epoch 5/5, Loss: 0.0183
Test Loss: 0.0377
Accuracy: 98.85%


## 2. Computing adversarial examples and `NULL` probabilities

In [2]:
def new_model_null():
    """Build a fresh, untrained CNN with 11 output logits: 10 digits plus NULL."""
    digit_classes = 10
    output_size = digit_classes + 1  # the extra, last logit is the NULL class

    network = nn.Sequential()
    network.append(nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1))  # 1x28x28 -> 32x28x28
    network.append(nn.ReLU())
    network.append(nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1))  # 32x28x28 -> 64x28x28
    network.append(nn.ReLU())
    network.append(nn.MaxPool2d(kernel_size=2, stride=2))  # 64x28x28 -> 64x14x14
    network.append(nn.Flatten())  # 64x14x14 -> 12544
    network.append(nn.Linear(64 * 14 * 14, 128))  # 12544 -> 128
    network.append(nn.ReLU())
    network.append(nn.Linear(128, output_size))  # 128 -> 11
    return network

#We dynamically define images that are adversarial to the current model using fgsm, and train it to work on them
def train_model_null(model, train_loader, criterion, optimizer, epochs=5, epsilon=0.5, alpha=1/11,
                     pixel_mean=None, pixel_std=None):
    """Train ``model`` on clean batches and on FGSM copies of them labelled NULL.

    For every batch the clean loss is back-propagated first; the sign of its
    gradient with respect to the inputs gives the FGSM direction against the
    current weights. The perturbed batch is de-normalised, clamped to [0, 1],
    re-normalised and trained against class index 10. Both gradients are
    accumulated before a single optimizer step. One line per epoch is printed
    with the mean clean loss and the mean (unweighted) NULL loss.

    Args:
        model: network whose last logit (index 10) is the NULL class.
        train_loader: sized iterable yielding ``(inputs, labels)`` batches of
            normalised images. The yielded tensors are not modified.
        criterion: callable computing the loss from ``(outputs, labels)``.
        optimizer: optimiser holding the parameters of ``model``.
        epochs: number of passes over ``train_loader``.
        epsilon: FGSM step size, expressed in normalised input units (it is
            scaled by ``pixel_std`` when mapped back to pixel space).
        alpha: weight applied to the NULL loss before back-propagation, used
            to keep the classes balanced.
        pixel_mean: mean used to normalise the inputs; defaults to the
            module-level ``mean``.
        pixel_std: standard deviation used to normalise the inputs; defaults
            to the module-level ``std``.
    """
    if pixel_mean is None:
        pixel_mean = mean
    if pixel_std is None:
        pixel_std = std
    null_class = 10

    model.train()
    for epoch in range(epochs):
        running_loss, running_loss_null = 0.0, 0.0
        for inputs, labels in train_loader:
            # Work on a private leaf copy so the caller's tensor never gets
            # requires_grad set or a .grad that accumulates across epochs.
            inputs = inputs.detach().clone().requires_grad_(True)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()  # fills both the parameter grads and inputs.grad
            running_loss += loss.item()

            # Building the adversarial batch is not part of the model graph:
            # without no_grad the second backward would also flow into `inputs`.
            with torch.no_grad():
                perturbed = inputs + epsilon * torch.sign(inputs.grad)
                pixels = torch.clamp(pixel_mean + pixel_std * perturbed, 0, 1)
                inputs_null = (pixels - pixel_mean) / pixel_std

            # full_like keeps the NULL targets on the same device as the labels.
            null_labels = torch.full_like(labels, null_class)
            outputs_null = model(inputs_null)
            loss_null = criterion(outputs_null, null_labels)
            (alpha * loss_null).backward()  # keep classes balanced with alpha
            optimizer.step()
            running_loss_null += loss_null.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader):.4f}, Null Loss: {running_loss_null / len(train_loader):.4f}")
        
    
def test_model_null(model, test_loader, criterion, optimizer, epsilon=0.5):
    model.eval()
    correct, total, test_loss = 0, 0, 0
    false_nulls, correct_null, test_loss_null = 0, 0, 0
    
    for inputs, labels in test_loader:
        inputs.requires_grad = True
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        false_nulls += (predicted != 10).sum().item()
        
        loss.backward()
        inputs_null = (torch.clamp(mean + std*(inputs + epsilon * torch.sign(inputs.grad)), 0, 1) - mean)/ std
        outputs_null = model(inputs_null)
        loss_null = criterion(outputs_null, torch.zeros(len(inputs), dtype=torch.int64)+10)
        test_loss_null += loss_null.item()

        _, predicted_null = torch.max(outputs_null, 1)
        correct_null += (predicted_null == 10).sum().item()
    
    print(f"Test Loss: {test_loss / len(test_loader):.4f}")
    print(f"Accuracy: {100 * correct / total:.2f}%")
    print(f"Null Test Loss: {test_loss_null / len(test_loader):.5f}")
    print(f"Null Accuracy: on null data: {100 * correct_null / total:.5f}% - on non-null data: {100 * false_nulls / total:.5f}%")
    

In [17]:
# NULL-labelling model without weight decay, using the helpers' default epsilon (0.5).
model_null = new_model_null()
optimizer_null = optim.Adam(model_null.parameters(), lr=0.001)
train_model_null(model_null, train_loader, criterion, optimizer_null, epochs=5)
test_model_null(model_null, test_loader, criterion, optimizer_null)

# good fgsm, epsilon=0.5
# Epoch 1/5, Loss: 0.1608, Null Loss: 0.0128
# Epoch 2/5, Loss: 0.0433, Null Loss: 0.0001
# Epoch 3/5, Loss: 0.0275, Null Loss: 0.0001
# Epoch 4/5, Loss: 0.0185, Null Loss: 0.0001
# Epoch 5/5, Loss: 0.0141, Null Loss: 0.0006
# Test Loss: 0.0481
# Accuracy: 98.80%
# Null Test Loss: 0.00000
# Null Accuracy: on null data: 100.00000% - on non-null data: 99.99000%

Test Loss: 0.0474
Accuracy: 98.71%
Null Test Loss: 0.00000
Null Accuracy: on null data: 100.00000% - on non-null data: 99.99000


In [None]:
# NULL-labelling model with weight decay, trained against weak attacks (epsilon=0.01).
model_null_reg = new_model_null()
optimizer_null_reg = optim.Adam(model_null_reg.parameters(), lr=0.001, weight_decay=1e-4)
train_model_null(model_null_reg, train_loader, criterion, optimizer_null_reg, epochs=5, epsilon=0.01)
# Evaluate the same trained model against attacks of decreasing strength.
for epsilon in [3, 2, 1, 0.5, 0.1, 0.01, 0.001]:
    print(f"\nepsilon = {epsilon}")
    test_model_null(model_null_reg, test_loader, criterion, optimizer_null_reg, epsilon=epsilon) 

# training on epsilon=0.5
# Epoch 1/5, Loss: 0.1191, Null Loss: 0.0231
# Epoch 2/5, Loss: 0.0402, Null Loss: 0.0012
# Epoch 3/5, Loss: 0.0282, Null Loss: 0.0006
# Epoch 4/5, Loss: 0.0204, Null Loss: 0.0008
# Epoch 5/5, Loss: 0.0181, Null Loss: 0.0008
# Test Loss: 0.0415
# Accuracy: 98.83%
# Null Test Loss: 0.00000
# Null Accuracy: on null data: 99.99000% - on non-null data: 100.00000%
# for epsilon=0.01
# Null Test Loss: 17.07276
# Null Accuracy: 0.00000%
# for epsilon=0.1
# Null Test Loss: 10.23794
# Null Accuracy: 0.21000% 
# for epsilon=1
# Null Test Loss: 0.00000
# Null Accuracy: 100.00000%

# training with epsilon=0.01
# Epoch 1/5, Loss: 0.1806, Null Loss: 1.7469
# Epoch 2/5, Loss: 0.0439, Null Loss: 0.2885
# Epoch 3/5, Loss: 0.0295, Null Loss: 0.1984
# Epoch 4/5, Loss: 0.0321, Null Loss: 0.3131
# Epoch 5/5, Loss: 0.0181, Null Loss: 0.1452
# Test Loss: 0.0338
# Accuracy: 98.97% - non-null classification on non-null data: 100.00000%
# epsilon = 3
# Null Test Loss: 0.00625
# Null Accuracy: 99.94000% 
# epsilon = 2
# Null Test Loss: 0.00000
# Null Accuracy: 100.00000%
# epsilon = 1
# Null Test Loss: 0.00000
# Null Accuracy: 100.00000%
# epsilon = 0.5
# Null Test Loss: 0.00000
# Null Accuracy: 100.00000%
# epsilon = 0.1
# Null Test Loss: 0.00000
# Null Accuracy: 100.00000%
# epsilon = 0.01
# Null Test Loss: 0.20607
# Null Accuracy: 94.21000%
# epsilon = 0.001
# Null Test Loss: 7.98443
# Null Accuracy: 0.01000%

Epoch 1/5, Loss: 0.1806, Null Loss: 1.7469
Epoch 2/5, Loss: 0.0439, Null Loss: 0.2885
Epoch 3/5, Loss: 0.0295, Null Loss: 0.1984
Epoch 4/5, Loss: 0.0321, Null Loss: 0.3131
Epoch 5/5, Loss: 0.0181, Null Loss: 0.1452

epsilon = 3
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 0.00625
Null Accuracy: on null data: 99.94000% - on non-null data: 100.00000%

epsilon = 2
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 0.00000
Null Accuracy: on null data: 100.00000% - on non-null data: 100.00000%

epsilon = 1
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 0.00000
Null Accuracy: on null data: 100.00000% - on non-null data: 100.00000%

epsilon = 0.5
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 0.00000
Null Accuracy: on null data: 100.00000% - on non-null data: 100.00000%

epsilon = 0.1
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 0.00000
Null Accuracy: on null data: 100.00000% - on non-null data: 100.00000%

epsilon = 0.01
Test Loss: 0.0338
Accuracy: 98.97%
Null Test Loss: 

The results show that models trained to also detect whether an image is adversarial work very well: they lose almost no accuracy on clean images, and they identify adversarial images very reliably. However, NULL classification only really works against images that are altered at least as much as the adversarial training data (in the runs recorded above, the model trained with epsilon = 0.5 misses attacks at epsilon = 0.1 and 0.01, and the model trained with epsilon = 0.01 misses attacks at epsilon = 0.001). There is therefore a potential trade-off in the choice of the training epsilon: it must be low enough for the model to learn to flag a wide range of attacks, yet high enough for the adversarial and clean classes to remain well distinguished. In practice this trade-off is not very damaging, since the attacks we fail to recognize are also too weak to actually influence the model.