In [1]:
import torch
from pytorch_lightning import LightningModule
import torchvision.transforms as transforms
import torchvision
import torch.nn as nn
from adv_train import AdversarialTrainingModule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data augmentation and normalization
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))
])

testset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(testset, batch_size=512, shuffle=False, num_workers=4)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Load the Robust Model

In [10]:
version = "version_2"  # Best performing version from logs
checkpoint_path = f"lightning_logs/{version}/checkpoints/best-robust.ckpt"

# Load model (automatically handles device placement)
model = AdversarialTrainingModule.load_from_checkpoint(checkpoint_path)
model.eval().to(device)

Loaded pretrained model from best_resnet18_cifar100_untargeted_adv.pth


AdversarialTrainingModule(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine

In [None]:
def improved_pgd_attack(model, images, labels, target_labels=None, epsilon=20/255, alpha=8/255, 
                       iters=100, random_start=True, untargeted=False):
    """
    Enhanced PGD attack with better performance.
    Works with any input tensor shape.
    """
    device = next(model.parameters()).device
    images = images.clone().detach().to(device)
    labels = labels.to(device)
    
    if target_labels is not None:
        target_labels = target_labels.to(device)
    
    # Loss function
    criterion = nn.CrossEntropyLoss()
    
    # Initialize adversarial images
    adv_images = images.clone().detach()
    
    # Start with random noise within epsilon ball if requested
    if random_start:
        noise = torch.FloatTensor(images.shape).uniform_(-epsilon, epsilon).to(device)
        adv_images = adv_images + noise
        adv_images = torch.clamp(adv_images, 0, 1)
    
    best_adv_images = adv_images.clone()
    best_loss = None
    
    for i in range(iters):
        adv_images.requires_grad = True
        
        outputs = model(adv_images)
        
        # For untargeted attack, maximize loss on true label
        # For targeted attack, minimize loss on target label
        if untargeted:
            loss = -criterion(outputs, labels)
        else:
            if target_labels is None:
                raise ValueError("Target labels must be provided for targeted attack")
            loss = criterion(outputs, target_labels)
        
        # Keep track of best attack images
        with torch.no_grad():
            if untargeted:
                # For untargeted, we want to maximize the loss
                _, predicted = outputs.max(1)
                success_mask = (predicted != labels)
                
                # If seeing this success for the first time, save it
                if best_loss is None:
                    best_loss = -loss
                    best_adv_images = adv_images.clone()
                else:
                    # Update best loss where the new loss is better
                    improved_loss = -loss < best_loss
                    
                    # Create broadcasting mask based on actual tensor dimensions
                    # This will work for any shape of input tensors
                    batch_size = images.shape[0]
                    update_indices = torch.nonzero(improved_loss & success_mask).view(-1)
                    
                    if len(update_indices) > 0:
                        best_adv_images[update_indices] = adv_images[update_indices]
                        best_loss[update_indices] = -loss[update_indices]
            else:
                # For targeted, we want to minimize the loss
                _, predicted = outputs.max(1)
                success_mask = (predicted == target_labels)
                
                # If seeing this success for the first time, save it
                if best_loss is None:
                    best_loss = loss
                    best_adv_images = adv_images.clone()
                else:
                    # Update best loss where the new loss is better
                    improved_loss = loss < best_loss
                    
                    # Create broadcasting mask based on actual tensor dimensions
                    batch_size = images.shape[0]
                    update_indices = torch.nonzero(improved_loss & success_mask).view(-1)
                    
                    if len(update_indices) > 0:
                        best_adv_images[update_indices] = adv_images[update_indices]
                        best_loss[update_indices] = loss[update_indices]
        
        # Backward pass
        loss.backward()
        
        # Get and process gradients
        with torch.no_grad():
            grad = adv_images.grad.sign()
            adv_images = adv_images.detach() - alpha * grad  # For both untargeted and targeted
            
            # Project back to epsilon ball and valid image range
            delta = torch.clamp(adv_images - images, min=-epsilon, max=epsilon)
            adv_images = torch.clamp(images + delta, min=0, max=1)

            # Optional: Print loss every 10 iterations
            if i % 10 == 0:
                print(f"Iteration {i}, Loss: {loss.item():.4f}")
        
    return best_adv_images.detach()

def improved_evaluate_attack(model, testloader, epsilon=20/255, alpha=8/255, iters=100, subset_size=None, targeted=False):
    """
    Evaluates the model on clean and adversarial examples with improved attack
    """
    model.eval()
    clean_correct = 0
    adv_correct = 0
    total = 0
    
    # Handle subset processing
    if subset_size:
        try:
            subset_loader = torch.utils.data.DataLoader(
                testset, batch_size=128, shuffle=True, num_workers=4
            )
        except NameError:
            subset_loader = testloader
        max_batches = subset_size // 128 + 1
    else:
        subset_loader = testloader
        max_batches = len(testloader)
    
    for i, (images, labels) in enumerate(subset_loader):
        if i >= max_batches:
            break
            
        images, labels = images.to(device), labels.to(device)
        total += labels.size(0)
        
        # Clean accuracy
        with torch.no_grad():
            outputs = model(images)
            _, predicted = outputs.max(1)
            clean_correct += predicted.eq(labels).sum().item()

        if targeted:
            targeted_labels = (labels + 1) % 100
        
            # Generate adversarial examples using untargeted attack (more effective)
            adv_images = improved_pgd_attack(
                model, 
                images, 
                labels,
                target_labels=targeted_labels, 
                epsilon=epsilon, 
                alpha=alpha, 
                iters=iters,
                random_start=True,
                untargeted=False
            )
        else:
            # Generate adversarial examples using untargeted attack
            adv_images = improved_pgd_attack(
                model, 
                images, 
                labels,
                epsilon=epsilon, 
                alpha=alpha, 
                iters=iters,
                random_start=True,
                untargeted=True
            )
        
        # Evaluate adversarial accuracy
        with torch.no_grad():
            adv_outputs = model(adv_images)
            _, adv_predicted = adv_outputs.max(1)
            adv_success = (adv_predicted != labels).sum().item()  # Count misclassifications
            adv_correct += adv_success
    
    clean_acc = 100. * clean_correct / total
    attack_success = 100. * adv_correct / total
    print(f"Clean Accuracy: {clean_acc:.2f}%")
    print(f"Attack Success Rate: {attack_success:.2f}%")
    return clean_acc, attack_success

In [17]:
clean_acc, attack_success = improved_evaluate_attack(
    model, 
    test_loader, 
    epsilon=30/255,  # Increased perturbation budget
    alpha=8/255,     # Same step size
    iters=100,       # More iterations
    subset_size=None # Optional: limit evaluation size for faster results
)

Iteration 0, Loss: -2.2824
Iteration 10, Loss: -3.9756
Iteration 20, Loss: -4.0301
Iteration 30, Loss: -4.0415
Iteration 40, Loss: -4.0463
Iteration 50, Loss: -4.0488
Iteration 60, Loss: -4.0503
Iteration 70, Loss: -4.0518
Iteration 80, Loss: -4.0525
Iteration 90, Loss: -4.0536
Iteration 0, Loss: -2.2734
Iteration 10, Loss: -3.9884
Iteration 20, Loss: -4.0401
Iteration 30, Loss: -4.0506
Iteration 40, Loss: -4.0549
Iteration 50, Loss: -4.0574
Iteration 60, Loss: -4.0584
Iteration 70, Loss: -4.0593
Iteration 80, Loss: -4.0607
Iteration 90, Loss: -4.0615
Iteration 0, Loss: -2.2359
Iteration 10, Loss: -3.9452
Iteration 20, Loss: -3.9974
Iteration 30, Loss: -4.0074
Iteration 40, Loss: -4.0120
Iteration 50, Loss: -4.0152
Iteration 60, Loss: -4.0170
Iteration 70, Loss: -4.0182
Iteration 80, Loss: -4.0197
Iteration 90, Loss: -4.0201
Iteration 0, Loss: -2.1582
Iteration 10, Loss: -3.8132
Iteration 20, Loss: -3.8629
Iteration 30, Loss: -3.8736
Iteration 40, Loss: -3.8783
Iteration 50, Loss: -3.8

### Clean Model

In [18]:
version = "version_2"  # Best performing version from logs
checkpoint_path = f"lightning_logs/{version}/checkpoints/best-clean.ckpt"

# Load model (automatically handles device placement)
model = AdversarialTrainingModule.load_from_checkpoint(checkpoint_path)
model.eval().to(device)

clean_acc, attack_success = improved_evaluate_attack(
    model, 
    test_loader, 
    epsilon=30/255,  # Increased perturbation budget
    alpha=8/255,     # Same step size
    iters=100,       # More iterations
    subset_size=None # Optional: limit evaluation size for faster results
)

Loaded pretrained model from best_resnet18_cifar100_untargeted_adv.pth
Iteration 0, Loss: -2.7517
Iteration 10, Loss: -3.7664
Iteration 20, Loss: -3.7928
Iteration 30, Loss: -3.7987
Iteration 40, Loss: -3.8014
Iteration 50, Loss: -3.8028
Iteration 60, Loss: -3.8039
Iteration 70, Loss: -3.8046
Iteration 80, Loss: -3.8052
Iteration 90, Loss: -3.8055
Iteration 0, Loss: -2.8129
Iteration 10, Loss: -3.8217
Iteration 20, Loss: -3.8455
Iteration 30, Loss: -3.8502
Iteration 40, Loss: -3.8525
Iteration 50, Loss: -3.8536
Iteration 60, Loss: -3.8544
Iteration 70, Loss: -3.8550
Iteration 80, Loss: -3.8553
Iteration 90, Loss: -3.8556
Iteration 0, Loss: -2.7891
Iteration 10, Loss: -3.7821
Iteration 20, Loss: -3.8061
Iteration 30, Loss: -3.8111
Iteration 40, Loss: -3.8130
Iteration 50, Loss: -3.8142
Iteration 60, Loss: -3.8151
Iteration 70, Loss: -3.8157
Iteration 80, Loss: -3.8160
Iteration 90, Loss: -3.8163
Iteration 0, Loss: -2.7723
Iteration 10, Loss: -3.7532
Iteration 20, Loss: -3.7773
Iteration

### Targeted

In [6]:
def pgd_attack(model, images, labels, target_labels, epsilon=16/255, alpha=4/255, iters=40):
    """
    PGD attack with proper gradient tracking.
    
    Args:
        model: The model to attack
        images: Clean images
        labels: True labels
        target_labels: Target labels for the attack
        epsilon: Maximum perturbation
        alpha: Step size
        iters: Number of iterations
        
    Returns:
        Adversarial images
    """
    # Ensure we're working with a device-consistent copy
    device = next(model.parameters()).device
    images = images.clone().detach().to(device)
    labels = labels.to(device)
    target_labels = target_labels.to(device)
    criterion = nn.CrossEntropyLoss()

    # Initialize adversarial images
    adv_images = images.clone().detach()

    for i in range(iters):
        # Important: Create a fresh copy that requires gradients
        adv_images = adv_images.detach().requires_grad_(True)
        
        # Forward pass
        outputs = model(adv_images)
        loss = criterion(outputs, target_labels)
        
        # Backward pass
        model.zero_grad()
        loss.backward()
        
        # Get gradients
        grad = adv_images.grad.detach()
        
        # Update adversarial images
        with torch.no_grad():
            adv_images = adv_images - alpha * grad.sign()  # Perturb toward target
            delta = torch.clamp(adv_images - images, min=-epsilon, max=epsilon)
            adv_images = torch.clamp(images + delta, min=0, max=1)

        # Optional debugging
        if i % 10 == 0:
            print(f"Iteration {i}, Loss: {loss.item():.4f}")

    return adv_images.detach()

def evaluate_full_test(model, testloader, epsilon=16/255, alpha=4/255, iters=20, subset_size=None):
    """
    Evaluates the model on clean and adversarial examples.
    """
    model.eval()
    clean_correct = 0
    adv_correct = 0
    total = 0

    # If subset_size is specified, limit the evaluation
    if subset_size:
        # Make sure testset is defined before using it
        # If not defined, use a subset of testloader
        try:
            subset_loader = torch.utils.data.DataLoader(
                testset, batch_size=256, shuffle=True, num_workers=4
            )
        except NameError:
            # Create a subset from existing loader
            subset_loader = testloader
        max_batches = subset_size // 256 + 1
    else:
        subset_loader = testloader
        max_batches = len(testloader)

    for i, (images, labels) in enumerate(subset_loader):
        if i >= max_batches:
            break
            
        images, labels = images.to(device), labels.to(device)
        total += labels.size(0)

        # Clean accuracy (without gradients)
        with torch.no_grad():
            outputs = model(images)
            _, predicted = outputs.max(1)
            clean_correct += predicted.eq(labels).sum().item()

        # Generate adversarial examples (requires gradients)
        target_labels = (labels + 1) % 100  # Example: shift to next class
        adv_images = pgd_attack(model, images, labels, target_labels, epsilon, alpha, iters)

        # Adversarial accuracy (without gradients)
        with torch.no_grad():
            adv_outputs = model(adv_images)
            _, adv_predicted = adv_outputs.max(1)
            adv_correct += adv_predicted.eq(target_labels).sum().item()

    clean_acc = 100. * clean_correct / total
    attack_success = 100. * adv_correct / total
    print(f"Clean Accuracy: {clean_acc:.2f}%")
    print(f"Attack Success Rate: {attack_success:.2f}%")
    return clean_acc, attack_success

### Robust Model

In [27]:
clean_acc, attack_success = evaluate_full_test(model, test_loader, epsilon=40/255, alpha=2/255, iters=50)

Iteration 0, Loss: 7.0596
Iteration 10, Loss: 5.6524
Iteration 20, Loss: 4.5831
Iteration 30, Loss: 4.4306
Iteration 40, Loss: 4.3655
Iteration 0, Loss: 6.9006
Iteration 10, Loss: 5.5137
Iteration 20, Loss: 4.4494
Iteration 30, Loss: 4.2967
Iteration 40, Loss: 4.2317
Iteration 0, Loss: 7.2039
Iteration 10, Loss: 5.8174
Iteration 20, Loss: 4.7493
Iteration 30, Loss: 4.5931
Iteration 40, Loss: 4.5273
Iteration 0, Loss: 6.9577
Iteration 10, Loss: 5.6121
Iteration 20, Loss: 4.5750
Iteration 30, Loss: 4.4313
Iteration 40, Loss: 4.3720
Iteration 0, Loss: 7.1000
Iteration 10, Loss: 5.7015
Iteration 20, Loss: 4.6240
Iteration 30, Loss: 4.4677
Iteration 40, Loss: 4.4019
Iteration 0, Loss: 6.9405
Iteration 10, Loss: 5.5673
Iteration 20, Loss: 4.5056
Iteration 30, Loss: 4.3556
Iteration 40, Loss: 4.2931
Iteration 0, Loss: 6.9901
Iteration 10, Loss: 5.6295
Iteration 20, Loss: 4.5942
Iteration 30, Loss: 4.4444
Iteration 40, Loss: 4.3813
Iteration 0, Loss: 7.2045
Iteration 10, Loss: 5.7986
Iteration

### Best Clean Model

In [8]:
version = "version_2"  # Best performing version from logs
checkpoint_path = f"lightning_logs/{version}/checkpoints/best-clean.ckpt"

# Load model (automatically handles device placement)
model = AdversarialTrainingModule.load_from_checkpoint(checkpoint_path)
model.eval().to(device)

clean_acc, attack_success = evaluate_full_test(model, test_loader, epsilon=40/255, alpha=2/255, iters=50)

Loaded pretrained model from best_resnet18_cifar100_untargeted_adv.pth
Iteration 0, Loss: 5.7725
Iteration 10, Loss: 4.8336
Iteration 20, Loss: 4.2479
Iteration 30, Loss: 4.1585
Iteration 40, Loss: 4.1228
Iteration 0, Loss: 5.6736
Iteration 10, Loss: 4.7457
Iteration 20, Loss: 4.1560
Iteration 30, Loss: 4.0660
Iteration 40, Loss: 4.0295
Iteration 0, Loss: 5.8556
Iteration 10, Loss: 4.9175
Iteration 20, Loss: 4.3260
Iteration 30, Loss: 4.2331
Iteration 40, Loss: 4.1953
Iteration 0, Loss: 5.6808
Iteration 10, Loss: 4.7817
Iteration 20, Loss: 4.2244
Iteration 30, Loss: 4.1402
Iteration 40, Loss: 4.1066
Iteration 0, Loss: 5.8200
Iteration 10, Loss: 4.8456
Iteration 20, Loss: 4.2486
Iteration 30, Loss: 4.1567
Iteration 40, Loss: 4.1203
Iteration 0, Loss: 5.6837
Iteration 10, Loss: 4.7740
Iteration 20, Loss: 4.1986
Iteration 30, Loss: 4.1127
Iteration 40, Loss: 4.0791
Iteration 0, Loss: 5.6848
Iteration 10, Loss: 4.7874
Iteration 20, Loss: 4.2159
Iteration 30, Loss: 4.1270
Iteration 40, Loss