In [39]:
import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision import datasets
import random
import cifar_setup
import real_vs_fake_setup
import adversarially_trained_inception_setup
from transformers import AutoImageProcessor, AutoModelForImageClassification

In [None]:
# Loss used by the attack functions below: cross-entropy between the model's logits
# and the attack's target class, averaged over the batch (the library default, spelled out).
criterion = nn.CrossEntropyLoss(reduction="mean")

In [None]:
# Settings consumed by projected_gradient_wb_standard (plain signed-gradient PGD):
#   max_iterations - upper bound on the number of PGD iterations
#   tolerance      - stop once the objective changes by less than this between iterations
#   epsilon        - L-infinity radius of the allowed perturbation
parameters_standard = dict(
    max_iterations=20,
    tolerance=1e-6,
    epsilon=8 / 255,
)

In [None]:
def projected_gradient_wb_standard(model, image, target, hyperparams):
    """
    Targeted white-box PGD attack inside an L-infinity ball, using signed-gradient steps.

    Every iteration evaluates the model on the current adversarial image, stops early if
    the model already predicts `target` (or if the objective stopped changing), and
    otherwise takes a step of size alpha along the sign of the gradient that lowers the
    cross-entropy towards `target`. The accumulated perturbation is then projected back
    into the epsilon-ball around the original image.

    The loss is taken from the notebook-level `criterion` object.

    Inputs:
        - model (nn.Module): The target neural network model.
        - image (torch.Tensor): The original input image without a batch dimension
          (C, H, W); a batch dimension is added before every forward pass.
        - target (torch.Tensor): The target class label for the attack (a single label,
          either 0-dimensional or with one element).
        - hyperparams (dict): A dictionary of hyperparameters containing:
            - "max_iterations" (int): The maximum number of iterations.
            - "tolerance" (float): Stop when the absolute change of the objective between
              two consecutive iterations falls below this value.
            - "epsilon" (float): The L-infinity norm bound for the perturbation.
            - "alpha" (float, optional): Step size of each update. Defaults to 2/255.
            - "clip_min" / "clip_max" (float, optional): If given, the adversarial image
              is additionally clamped to this value range after the projection.
              By default no clamping is applied.
    Outputs:
        - adv_image (torch.Tensor): The generated adversarial image (detached).
        - steps_taken (int): The number of update steps actually applied. An iteration
          that only detects success/convergence and stops is not counted, and the value
          is 0 when no update was made (including max_iterations == 0).
        - history (dict): Per evaluated iteration, the cross-entropy towards the target
          ('objective') and the norm of the input gradient ('gradient_norm').
        - final_delta (torch.Tensor): The final perturbation added to the image.
    """
    # Setup device and move tensors
    device = next(model.parameters()).device
    image = image.to(device)
    target = target.to(device)
    # The loss expects a batch of labels; a 0-dim label gets a batch dimension
    target_batch = target.unsqueeze(0) if target.dim() == 0 else target

    # Defining the parameters
    max_iterations = hyperparams["max_iterations"]
    epsilon = hyperparams["epsilon"]
    tolerance = hyperparams["tolerance"]
    alpha = hyperparams.get("alpha", 2/255)
    clip_min = hyperparams.get("clip_min")
    clip_max = hyperparams.get("clip_max")
    prev_loss = float('inf')

    # Initialize the adversarial image
    adv_image = image.clone().detach()

    # History trackers
    history = {'objective': [], 'gradient_norm': []}

    # Counts applied updates; stays defined even if the loop body never runs
    steps_taken = 0

    # Starting the Projected Gradient iterations
    for t in range(1, max_iterations + 1):
        # Fresh leaf tensor so the gradient w.r.t. the current adversarial image can be taken
        adv_image = adv_image.detach().requires_grad_(True)

        # Forward pass
        output = model(adv_image.unsqueeze(0))

        # Define the objective function to maximize: the negative cross-entropy loss
        loss = -criterion(output, target_batch)

        # Early stopping if attack is already successful
        with torch.no_grad():
            _, pred = torch.max(output, 1)
        if pred.item() == target.item():
            print(f"Target class reached at iteration {t}")
            break

        # Gradient w.r.t. the input only; the model's parameter gradients are left untouched
        grad = torch.autograd.grad(loss, adv_image)[0]

        # Store history (the cross-entropy itself, i.e. the negated objective)
        objective_value = -loss.item()
        history['objective'].append(objective_value)
        history['gradient_norm'].append(grad.norm().item())

        # Check for convergence
        if abs(prev_loss - objective_value) < tolerance:
            print(f"Converged at iteration {t} (tolerance: {tolerance})")
            break
        prev_loss = objective_value

        # Update the adversarial image using the gradient sign
        adv_image = adv_image.detach() + alpha * grad.sign()

        # Project the perturbation back into the epsilon-ball
        perturbation = torch.clamp(adv_image - image, min=-epsilon, max=epsilon)

        # Add the constrained perturbation
        adv_image = (image + perturbation).detach()

        # Optionally keep the result inside the caller-specified value range
        if clip_min is not None or clip_max is not None:
            adv_image = torch.clamp(adv_image, min=clip_min, max=clip_max)

        steps_taken += 1

    # Create the final adversarial image
    adv_image = adv_image.detach()
    final_delta = adv_image - image
    return adv_image, steps_taken, history, final_delta

In [None]:
# Settings consumed by projected_gradient_wb_momentum. Same keys as the standard variant
# plus "beta", the decay factor of the running gradient average used as the step direction.
parameters_momentum = dict(
    max_iterations=20,
    tolerance=1e-6,
    epsilon=8 / 255,
    beta=0.99,
)

In [None]:
def projected_gradient_wb_momentum(model, image, target, hyperparams):
    """
    Targeted white-box PGD attack inside an L-infinity ball, using a momentum direction.

    Every iteration evaluates the model on the current adversarial image, stops early if
    the model already predicts `target` (or if the objective stopped changing), and
    otherwise updates an exponential moving average of the input gradient and takes a
    step of size alpha along the sign of that average. The accumulated perturbation is
    then projected back into the epsilon-ball around the original image.

    The loss is taken from the notebook-level `criterion` object.

    Inputs:
        - model (nn.Module): The target neural network model.
        - image (torch.Tensor): The original input image without a batch dimension
          (C, H, W); a batch dimension is added before every forward pass.
        - target (torch.Tensor): The target class label for the attack (a single label,
          either 0-dimensional or with one element).
        - hyperparams (dict): A dictionary of hyperparameters containing:
            - "max_iterations" (int): The maximum number of iterations.
            - "tolerance" (float): Stop when the absolute change of the objective between
              two consecutive iterations falls below this value.
            - "epsilon" (float): The L-infinity norm bound for the perturbation.
            - "beta" (float): The momentum decay factor for the attack.
            - "alpha" (float, optional): Step size of each update. Defaults to 2/255.
            - "clip_min" / "clip_max" (float, optional): If given, the adversarial image
              is additionally clamped to this value range after the projection.
              By default no clamping is applied.
    Outputs:
        - adv_image (torch.Tensor): The generated adversarial image (detached).
        - steps_taken (int): The number of update steps actually applied. An iteration
          that only detects success/convergence and stops is not counted, and the value
          is 0 when no update was made (including max_iterations == 0).
        - history (dict): Per evaluated iteration, the cross-entropy towards the target
          ('objective') and the norm of the input gradient ('gradient_norm').
        - final_delta (torch.Tensor): The final perturbation added to the image.
    """
    # Setup device and move tensors
    device = next(model.parameters()).device
    image = image.to(device)
    target = target.to(device)
    # The loss expects a batch of labels; a 0-dim label gets a batch dimension
    target_batch = target.unsqueeze(0) if target.dim() == 0 else target

    # Defining the parameters
    max_iterations = hyperparams["max_iterations"]
    epsilon = hyperparams["epsilon"]
    tolerance = hyperparams["tolerance"]
    alpha = hyperparams.get("alpha", 2/255)
    beta = hyperparams["beta"]
    clip_min = hyperparams.get("clip_min")
    clip_max = hyperparams.get("clip_max")
    prev_loss = float('inf')

    # Initialize the momentum term and the adversarial image
    momentum = torch.zeros_like(image, device=device)
    adv_image = image.clone().detach()

    # History trackers
    history = {'objective': [], 'gradient_norm': []}

    # Counts applied updates; stays defined even if the loop body never runs
    steps_taken = 0

    # Starting the Projected Gradient iterations
    for t in range(1, max_iterations + 1):
        # Fresh leaf tensor so the gradient w.r.t. the current adversarial image can be taken
        adv_image = adv_image.detach().requires_grad_(True)

        # Forward pass
        output = model(adv_image.unsqueeze(0))

        # Define the objective function to maximize: the negative cross-entropy loss
        loss = -criterion(output, target_batch)

        # Early stopping if attack is already successful
        with torch.no_grad():
            _, pred = torch.max(output, 1)
        if pred.item() == target.item():
            print(f"Target class reached at iteration {t}")
            break

        # Gradient w.r.t. the input only; the model's parameter gradients are left untouched
        grad = torch.autograd.grad(loss, adv_image)[0]

        # Store history (the cross-entropy itself, i.e. the negated objective)
        objective_value = -loss.item()
        history['objective'].append(objective_value)
        history['gradient_norm'].append(grad.norm().item())

        # Check for convergence
        if abs(prev_loss - objective_value) < tolerance:
            print(f"Converged at iteration {t} (tolerance: {tolerance})")
            break
        prev_loss = objective_value

        # Computing the momentum term (exponential moving average of the gradients)
        momentum = beta * momentum + (1 - beta) * grad

        # Update the adversarial image using the momentum term
        adv_image = adv_image.detach() + alpha * momentum.sign()

        # Project the perturbation back into the epsilon-ball
        perturbation = torch.clamp(adv_image - image, min=-epsilon, max=epsilon)

        # Add the constrained perturbation
        adv_image = (image + perturbation).detach()

        # Optionally keep the result inside the caller-specified value range
        if clip_min is not None or clip_max is not None:
            adv_image = torch.clamp(adv_image, min=clip_min, max=clip_max)

        steps_taken += 1

    # Create the final adversarial image
    adv_image = adv_image.detach()
    final_delta = adv_image - image
    return adv_image, steps_taken, history, final_delta

In [48]:
# Select the compute backend: Apple-Silicon MPS is preferred, then CUDA, otherwise the CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print({
    "mps": "Using Apple Silicon GPU (MPS) on macOS.",
    "cuda": "Using NVIDIA/AMD GPU (CUDA) on Windows/Linux.",
    "cpu": "Using CPU as no GPU is available.",
}[device.type])

print(f"Current device: {device}")

# 1. Pretrained ResNet50 in inference mode, placed on the selected device
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model.eval()
model.to(device)

# 2. CIFAR-10 images are resized and center-cropped to the 224x224 input ResNet50 expects
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)

# 3. CIFAR-10 test split (downloaded to ./data if missing), served one shuffled image at a time
cifar10_test = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=preprocess,
)
test_loader = torch.utils.data.DataLoader(
    cifar10_test,
    batch_size=1,
    shuffle=True,
)

# 4. ResNet50 predicts ImageNet classes, so every CIFAR-10 class is mapped to an ImageNet index
cifar10_class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]
cifar_to_imagenet = {
    'airplane': 404,
    'automobile': 436,
    'bird': 12,
    'cat': 281,
    'deer': 354,
    'dog': 207,
    'frog': 30,
    'horse': 339,
    'ship': 780,
    'truck': 867,
}
cifar_idx_to_imagenet_idx = dict(
    enumerate(cifar_to_imagenet[name] for name in cifar10_class_names)
)

# 5. Find or load the correctly classified images (100 requested) to use for the attack
correctly_classified_images = cifar_setup.get_correctly_classified_images(
    model=model,
    device=device,
    dataset_name='cifar10',
    num_images=100,
    model_name='resnet50',
)

Using Apple Silicon GPU (MPS) on macOS.
Current device: mps
Loading correctly classified images from ./data/correctly_classified_cifar10_resnet50.pt...
Loaded 100 images.


In [None]:
# 6. Run the attack and evaluate
distortions = []
successful_attacks = 0
total_iterations = 0
images_attacked = 0
total_images_in_list = len(correctly_classified_images)
all_possible_targets = list(cifar_idx_to_imagenet_idx.values())

# Dedicated seeded generator so the randomly drawn target classes are reproducible
# across runs without touching the global `random` state.
target_rng = random.Random(0)

print(f"--- Starting Evaluation on {total_images_in_list} candidate images ---")
for i, (image, original_label) in enumerate(correctly_classified_images):

    # The sanity-check forward pass below needs the image on the model's device
    image = image.to(device)

    # First, verify that the model correctly classifies the image before the attack.
    with torch.no_grad():
        initial_output = model(image.unsqueeze(0))
        _, initial_pred = torch.max(initial_output, 1)

    if initial_pred.item() != original_label.item():
        print(f"Skipping image {i+1}/{total_images_in_list}: Model misclassified it. (Pred: {initial_pred.item()}, Label: {original_label.item()})")
        continue

    images_attacked += 1

    # For a targeted attack, choose a random target class that is different from the original.
    potential_targets = [t for t in all_possible_targets if t != original_label.item()]
    random_target_idx = target_rng.choice(potential_targets)
    target_label = torch.tensor([random_target_idx], device=device)

    adv_image, n_iter, history, delta_final = projected_gradient_wb_momentum(  # just change this function call to the standard variant for testing, same for other models 
        model, image, target_label, parameters_momentum
    )

    # The attack counts as successful only if the model now predicts the target class
    with torch.no_grad():
        adv_output = model(adv_image.unsqueeze(0))
        _, adv_pred = torch.max(adv_output, 1)

    is_success = adv_pred.item() == target_label.item()
    if is_success:
        successful_attacks += 1

    # L-infinity size of the final perturbation
    distortion = torch.max(torch.abs(delta_final)).item()
    distortions.append(distortion)
    total_iterations += n_iter

    print(f"Processed image {i+1}/{total_images_in_list} | Original Label: {original_label.item()} | Target: {target_label.item()} | Success: {is_success} | Distortion: {distortion:.4f} | Iterations: {n_iter}")

# 7. Calculate and print the final results
# Guarded so an empty candidate list (or all images skipped) reports zeros instead of crashing
asr = (successful_attacks / images_attacked) * 100 if images_attacked else 0.0
avg_iterations = total_iterations / images_attacked if images_attacked else 0.0
avg_distortion = np.mean(distortions) if distortions else 0.0

print("\n--- Evaluation Summary Projected Gradient (CIFAR-10) ---")
print(f"Model: ResNet50")
print(f"Dataset: CIFAR-10 ({images_attacked} images attacked)")
print(f"Epsilon: {parameters_momentum['epsilon']:.4f}") 
print("-" * 35)
print(f"Attack Success Rate (ASR): {asr:.2f}%")
print(f"Average Iterations: {avg_iterations:.2f}")
print(f"Average L-inf Distortion: {avg_distortion:.4f}")

--- Starting Evaluation on 100 candidate images ---
Target class reached at iteration 9
Processed image 1/100 | Original Label: 339 | Target: 30 | Success: True | Distortion: 0.0314 | Iterations: 8
Target class reached at iteration 4
Processed image 2/100 | Original Label: 339 | Target: 207 | Success: True | Distortion: 0.0157 | Iterations: 3
Target class reached at iteration 5
Processed image 3/100 | Original Label: 404 | Target: 354 | Success: True | Distortion: 0.0235 | Iterations: 4
Target class reached at iteration 5
Processed image 4/100 | Original Label: 339 | Target: 207 | Success: True | Distortion: 0.0235 | Iterations: 4
Target class reached at iteration 7
Processed image 5/100 | Original Label: 404 | Target: 30 | Success: True | Distortion: 0.0314 | Iterations: 6
Target class reached at iteration 4
Processed image 6/100 | Original Label: 207 | Target: 12 | Success: True | Distortion: 0.0157 | Iterations: 3
Target class reached at iteration 7
Processed image 7/100 | Original 

In [None]:
# Check for device: Apple-Silicon MPS is preferred, then CUDA, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# 1. Load the real-vs-fake model and processor
rvf_model_name = "dima806/ai_vs_real_image_detection"
rvf_processor = AutoImageProcessor.from_pretrained(rvf_model_name)
rvf_model = AutoModelForImageClassification.from_pretrained(rvf_model_name)
rvf_model.to(device)
rvf_model.eval()

# 2. Define the path to your test dataset
# Set the RVF_TEST_DATASET_PATH environment variable to point at your local copy; the
# previous hard-coded location is kept as the default so existing setups keep working.
# The dataset can be downloaded from https://www.kaggle.com/code/dima806/cifake-ai-generated-image-detection-vit/input
test_dataset_path = os.environ.get('RVF_TEST_DATASET_PATH', '/Users/gianfranco/Desktop/ODS/test')

# 3. Find or load the correctly classified images (100 requested)
# This exercises get_correctly_classified_images_real_vs_fake from the real_vs_fake_setup module
correctly_classified_rvf_images = real_vs_fake_setup.get_correctly_classified_images_real_vs_fake(
    model=rvf_model,
    processor=rvf_processor,
    device=device,
    dataset_path=test_dataset_path,
    num_images=100
)

print(f"\nRetrieved {len(correctly_classified_rvf_images)} correctly classified real-vs-fake images.")

Using device: mps
Loading correctly classified images from ./data/correctly_classified_real_vs_fake_dima806_ai_vs_real_image_detection.pt...
Loaded 100 images.

Retrieved 100 correctly classified real-vs-fake images.


In [None]:
# 4. Run the attack and evaluate on the Real-vs-Fake dataset

# Wrapper to make the Hugging Face model compatible with the existing attack function and ensures the model output is a tensor of logits
class ModelWrapper(torch.nn.Module):
    """Adapter that makes a model whose output carries a `.logits` field return those logits directly."""

    def __init__(self, model):
        """Store the wrapped model."""
        super().__init__()
        self.model = model

    def forward(self, x):
        """Run the wrapped model on `x` and return only the `.logits` attribute of its output."""
        outputs = self.model(x)
        return outputs.logits

# The attack functions expect a model that returns a plain logits tensor
wrapped_rvf_model = ModelWrapper(rvf_model)

distortions_rvf = []
successful_attacks_rvf = 0
total_iterations_rvf = 0
images_attacked_rvf = 0
total_images_in_list = len(correctly_classified_rvf_images)

print(f"--- Starting Evaluation on {total_images_in_list} candidate images (Real-vs-Fake) ---")
resize_transform = transforms.Resize((224, 224))

for i, (image, original_label) in enumerate(correctly_classified_rvf_images):
    
    # Move tensors to the correct device for the current model
    image = image.to(device)
    original_label = original_label.to(device)

    # Resize the image to match the input size of the model
    image = resize_transform(image)

    # Verify that the model correctly classifies the image before the attack
    with torch.no_grad():
        initial_output = wrapped_rvf_model(image.unsqueeze(0))
        _, initial_pred = torch.max(initial_output, 1)

    if initial_pred.item() != original_label.item():
        print(f"Skipping image {i+1}/{total_images_in_list}: Model misclassified it. (Pred: {initial_pred.item()}, Label: {original_label.item()})")
        continue
    
    images_attacked_rvf += 1

    # For a targeted attack, the target is the other class (0 -> 1, 1 -> 0)
    target_label_idx = 1 - original_label.item()
    target_label = torch.tensor([target_label_idx], device=device)

    adv_image, n_iter, history, delta_final = projected_gradient_wb_momentum(
        wrapped_rvf_model, image, target_label, parameters_momentum
    )

    # The attack counts as successful only if the model now predicts the target class
    with torch.no_grad():
        adv_output = wrapped_rvf_model(adv_image.unsqueeze(0))
        _, adv_pred = torch.max(adv_output, 1)

    is_success = adv_pred.item() == target_label.item()
    if is_success:
        successful_attacks_rvf += 1

    # L-infinity size of the final perturbation
    distortion = torch.max(torch.abs(delta_final)).item()
    distortions_rvf.append(distortion)
    total_iterations_rvf += n_iter
    
    print(f"Processed image {i+1}/{total_images_in_list} | Original Label: {original_label.item()} | Target: {target_label.item()} | Success: {is_success} | Distortion: {distortion:.4f} | Iterations: {n_iter}")

# 5. Calculate and print the final results
# Guarded so an empty candidate list (or all images skipped) reports zeros instead of crashing
asr = (successful_attacks_rvf / images_attacked_rvf) * 100 if images_attacked_rvf else 0.0
avg_iterations = total_iterations_rvf / images_attacked_rvf if images_attacked_rvf else 0.0
avg_distortion = np.mean(distortions_rvf) if distortions_rvf else 0.0

print("\n--- Evaluation Summary Projected Gradient (Real-vs-Fake) ---")
print(f"Model: {rvf_model_name}")
print(f"Dataset: Real-vs-Fake ({images_attacked_rvf} images attacked)")
print(f"Epsilon: {parameters_momentum['epsilon']:.4f}")
print("-" * 35)
print(f"Attack Success Rate (ASR): {asr:.2f}%")
print(f"Average Iterations: {avg_iterations:.2f}")
print(f"Average L-inf Distortion: {avg_distortion:.4f}")

--- Starting Evaluation on 100 candidate images (Real-vs-Fake) ---
Target class reached at iteration 4
Processed image 1/100 | Original Label: 0 | Target: 1 | Success: True | Distortion: 0.0157 | Iterations: 3
Target class reached at iteration 4
Processed image 2/100 | Original Label: 0 | Target: 1 | Success: True | Distortion: 0.0157 | Iterations: 3
Target class reached at iteration 3
Processed image 3/100 | Original Label: 0 | Target: 1 | Success: True | Distortion: 0.0078 | Iterations: 2
Target class reached at iteration 4
Processed image 4/100 | Original Label: 1 | Target: 0 | Success: True | Distortion: 0.0157 | Iterations: 3
Target class reached at iteration 5
Processed image 5/100 | Original Label: 0 | Target: 1 | Success: True | Distortion: 0.0235 | Iterations: 4
Target class reached at iteration 3
Processed image 6/100 | Original Label: 1 | Target: 0 | Success: True | Distortion: 0.0078 | Iterations: 2
Target class reached at iteration 6
Processed image 7/100 | Original Label:

In [None]:
# 1. Select the compute backend: MPS is preferred, then CUDA, otherwise the CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print({
    "mps": "Using Apple Silicon GPU (MPS)",
    "cuda": "Using NVIDIA GPU (CUDA)",
    "cpu": "Using CPU",
}[device.type])

# 2. Adversarially trained Inception v3, obtained from the setup module and put in inference mode
model = adversarially_trained_inception_setup.get_model(device)
model.eval()

# 3. Validation dataset; called without arguments, so the setup module's own default location is used
dataset = adversarially_trained_inception_setup.get_dataset()

# 4. Collect the images (100 requested) that form the test set for the attack evaluation
correctly_classified_images = adversarially_trained_inception_setup.get_correctly_classified_images(
    model=model,
    device=device,
    dataset=dataset,
    num_images=100,
)

Using Apple Silicon GPU (MPS)


Finding model-classified images: 100%|██████████| 100/100 [00:05<00:00, 16.84it/s]

Found 100 images to attack.





In [None]:
# 5. Run the attack and evaluate

distortions = []
successful_attacks = 0
total_iterations = 0
images_attacked = 0
total_images_in_list = len(correctly_classified_images)

# The Inception v3 model was trained on ImageNet, which has 1000 classes
num_classes = 1000
all_possible_targets = list(range(num_classes))

# Dedicated seeded generator so the randomly drawn target classes are reproducible
# across runs without touching the global `random` state.
target_rng = random.Random(0)

print(f"\n--- Starting Evaluation on {total_images_in_list} candidate images ---")

for i, (image, original_label) in enumerate(correctly_classified_images):
    image, original_label = image.to(device), original_label.to(device)
    
    images_attacked += 1

    # For a targeted attack, choose a random target class that is different from the original
    potential_targets = [t for t in all_possible_targets if t != original_label.item()]
        
    random_target_idx = target_rng.choice(potential_targets)
    target_label = torch.tensor([random_target_idx], device=device)

    # Run the attack (squeeze(0) drops a leading size-1 dimension, since the attack adds its own batch dimension)
    adv_image, n_iter, history, delta_final = projected_gradient_wb_momentum(
        model, image.squeeze(0), target_label, parameters_momentum
    )

    # Check if the attack was successful
    with torch.no_grad():
        adv_output = model(adv_image.unsqueeze(0))
        _, adv_pred = torch.max(adv_output, 1)

    is_success = adv_pred.item() == target_label.item()
    if is_success:
        successful_attacks += 1

    # L-infinity size of the final perturbation
    distortion = torch.max(torch.abs(delta_final)).item()
    distortions.append(distortion)
    total_iterations += n_iter
    
    print(f"Processed image {i+1}/{total_images_in_list} | Original Label: {original_label.item()} | Target: {target_label.item()} | Success: {is_success} | Distortion: {distortion:.4f} | Iterations: {n_iter}")

# 6. Calculate and print the final results
# All three statistics are guarded on the number of attacked images so an empty
# candidate list reports zeros instead of raising ZeroDivisionError.
asr = (successful_attacks / images_attacked) * 100 if images_attacked else 0
avg_iterations = total_iterations / images_attacked if images_attacked else 0
avg_distortion = np.mean(distortions) if distortions else 0

print("\n--- Evaluation Summary Projected Gradient (Inception v3) ---")
print(f"Model: Adversarially Trained Inception v3")
print(f"Dataset: Validation Set ({images_attacked} images attacked)")
print(f"Epsilon: {parameters_momentum['epsilon']:.4f}")
print("-" * 35)
print(f"Attack Success Rate (ASR): {asr:.2f}%")
print(f"Average Iterations for all attacks: {avg_iterations:.2f}")
print(f"Average L-inf Distortion for all attacks: {avg_distortion:.4f}")


--- Starting Evaluation on 100 candidate images ---
Target class reached at iteration 10
Processed image 1/100 | Original Label: 166 | Target: 192 | Success: True | Distortion: 0.0314 | Iterations: 9
Processed image 2/100 | Original Label: 369 | Target: 491 | Success: False | Distortion: 0.0314 | Iterations: 20
Target class reached at iteration 21
Processed image 3/100 | Original Label: 693 | Target: 444 | Success: True | Distortion: 0.0314 | Iterations: 20
Target class reached at iteration 17
Processed image 4/100 | Original Label: 565 | Target: 270 | Success: True | Distortion: 0.0314 | Iterations: 16
Target class reached at iteration 8
Processed image 5/100 | Original Label: 735 | Target: 78 | Success: True | Distortion: 0.0314 | Iterations: 7
Target class reached at iteration 17
Processed image 6/100 | Original Label: 420 | Target: 648 | Success: True | Distortion: 0.0314 | Iterations: 16
Processed image 7/100 | Original Label: 20 | Target: 930 | Success: False | Distortion: 0.031