# Lab 2: Backdoor Attacks

## Objectives
- Understand backdoor attacks
- Implement trigger-based backdoors
- Test backdoor activation
- Evaluate stealthiness

In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# Detect device (supports CUDA, Apple Silicon MPS, and CPU)
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

## Part 1: Create Backdoor Trigger

In [2]:
def add_trigger(X, trigger_size=5, trigger_value=1.0):
    """Add trigger pattern to images"""
    X_triggered = X.clone()
    # Add small square trigger in corner
    X_triggered[:, :trigger_size, :trigger_size] = trigger_value
    return X_triggered

# Create dataset
X_train = torch.randn(1000, 28, 28)
y_train = torch.randint(0, 10, (1000,))

# Poison subset with backdoor
poison_rate = 0.1
target_class = 7
n_poison = int(len(X_train) * poison_rate)
poison_idx = np.random.choice(len(X_train), n_poison, replace=False)

X_backdoor = X_train.clone()
X_backdoor[poison_idx] = add_trigger(X_train[poison_idx])
y_backdoor = y_train.clone()
y_backdoor[poison_idx] = target_class

print(f'Backdoored {n_poison} samples')

Backdoored 100 samples


## Part 2: Train Backdoored Model

In [3]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(32*13*13, 10)
        )
    
    def forward(self, x):
        return self.conv(x.unsqueeze(1))

model = CNN().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X_backdoor.to(device))
    loss = criterion(outputs, y_backdoor.to(device))
    loss.backward()
    optimizer.step()

print('✓ Backdoored model trained')

✓ Backdoored model trained


## Part 3: Test Backdoor Activation

In [4]:
X_test = torch.randn(200, 28, 28)
y_test = torch.randint(0, 10, (200,))

# Test on clean data
with torch.no_grad():
    clean_acc = (model(X_test.to(device)).argmax(1) == y_test.to(device)).float().mean()

# Test on triggered data
X_test_triggered = add_trigger(X_test)
with torch.no_grad():
    triggered_preds = model(X_test_triggered.to(device)).argmax(1)
    backdoor_success = (triggered_preds == target_class).float().mean()

print(f'Clean accuracy: {clean_acc:.2%}')
print(f'Backdoor success rate: {backdoor_success:.2%}')

Clean accuracy: 10.50%
Backdoor success rate: 7.50%


## Exercise: Stealthy Backdoor

Create a more subtle trigger that's harder to detect.

In [5]:
# Your code here
