# Lab 1: Data Poisoning Attacks

## Objectives
- Understand data poisoning
- Implement label flipping
- Execute clean-label poisoning
- Evaluate attack effectiveness

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt

# Detect device (supports CUDA, Apple Silicon MPS, and CPU)
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

## Part 1: Label Flipping Attack

In [2]:
# Create dataset
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))

def poison_labels(X, y, poison_rate=0.1, target_class=3):
    """Flip labels to target class"""
    n_poison = int(len(y) * poison_rate)
    poison_idx = np.random.choice(len(y), n_poison, replace=False)
    y_poisoned = y.clone()
    y_poisoned[poison_idx] = target_class
    return X, y_poisoned, poison_idx

X_poisoned, y_poisoned, poison_idx = poison_labels(X_train, y_train, 0.1)

print(f'Poisoned {len(poison_idx)} samples ({len(poison_idx)/len(y_train):.1%})')
print(f'Original label distribution: {torch.bincount(y_train)}')
print(f'Poisoned label distribution: {torch.bincount(y_poisoned)}')

Poisoned 100 samples (10.0%)
Original label distribution: tensor([123,  97,  93,  98, 100, 101,  98,  85,  91, 114])
Poisoned label distribution: tensor([108,  91,  85, 187,  90,  88,  91,  79,  80, 101])


## Part 2: Train on Poisoned Data

In [3]:
class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 10)
        )
    
    def forward(self, x):
        return self.fc(x.view(-1, 784))

# Train clean model
clean_model = SimpleNet().to(device)
optimizer = torch.optim.Adam(clean_model.parameters())
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    optimizer.zero_grad()
    outputs = clean_model(X_train.to(device))
    loss = criterion(outputs, y_train.to(device))
    loss.backward()
    optimizer.step()

# Train poisoned model
poisoned_model = SimpleNet().to(device)
optimizer = torch.optim.Adam(poisoned_model.parameters())

for epoch in range(10):
    optimizer.zero_grad()
    outputs = poisoned_model(X_poisoned.to(device))
    loss = criterion(outputs, y_poisoned.to(device))
    loss.backward()
    optimizer.step()

print('✓ Models trained')

✓ Models trained


## Part 3: Evaluate Attack

In [4]:
X_test = torch.randn(200, 784)
y_test = torch.randint(0, 10, (200,))

with torch.no_grad():
    clean_acc = (clean_model(X_test.to(device)).argmax(1) == y_test.to(device)).float().mean()
    poisoned_acc = (poisoned_model(X_test.to(device)).argmax(1) == y_test.to(device)).float().mean()

print(f'Clean model accuracy: {clean_acc:.2%}')
print(f'Poisoned model accuracy: {poisoned_acc:.2%}')
print(f'Accuracy drop: {(clean_acc - poisoned_acc):.2%}')

Clean model accuracy: 15.00%
Poisoned model accuracy: 14.50%
Accuracy drop: 0.50%


## Exercise: Clean-Label Poisoning

Implement clean-label poisoning where labels remain correct but features are modified.

In [5]:
# Your code here
