# In [None]:
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

# 1. SETUP ATTACK PARAMETERS
# Epsilon (ε) is the noise strength passed to the attack for each panel.
# 0 means no change; 0.3 is clearly visible noise.
epsilons = [
    0,
    0.05,
    0.1,
    0.15,
    0.2,
    0.3,
]

# Mean and Std from Member 1 (used to bring image back to normal colors for display)
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

def denormalize(tensor):
    """Convert a normalized image tensor back into a viewable RGB image.

    Reverses the per-channel normalization (``x * std + mean``) and clamps the
    result to the displayable [0, 1] range.

    The input is never modified. The previous implementation scaled the
    argument in place, which silently corrupted the caller's tensor whenever
    the argument shared storage with it (e.g. ``x.squeeze().cpu().detach()``
    on a tensor that already lives on the CPU).

    Args:
        tensor: Float tensor laid out as (C, H, W) or (N, C, H, W), with one
            channel per entry of ``mean`` / ``std``.

    Returns:
        A new tensor of the same shape, dtype and device with values in [0, 1].
    """
    # Shape (C, 1, 1) broadcasts over H and W, and over a leading batch axis.
    channel_shape = (-1, 1, 1)
    scale = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device).view(channel_shape)
    shift = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device).view(channel_shape)

    # Out-of-place arithmetic: multiply by std and add mean (reverse of normalization)
    restored = tensor * scale + shift
    return torch.clamp(restored, 0, 1) # Ensure pixels stay in valid [0, 1] range

def fgsm_attack(image, epsilon, data_grad):
    """Apply one Fast Gradient Sign Method (FGSM) step.

    Every pixel is moved by ``epsilon`` in the direction of the sign of the
    loss gradient, then clamped back into the value range of the image it
    belongs to.

    For 4-D input (N, C, H, W) the clamp bounds are computed per sample, so
    the result for one image does not depend on which other images share its
    batch. For any other rank the bounds are the global min/max of ``image``.

    Args:
        image: Input tensor (normalized image or batch of images).
        epsilon: Step size; 0 returns the input values unchanged.
        data_grad: Gradient of the loss w.r.t. ``image``, same shape as ``image``.

    Returns:
        A new tensor with the perturbed image(s), detached from the autograd
        graph so it does not keep the forward graph of ``image`` alive.
    """
    # Work on a detached view: the attack itself needs no gradient tracking.
    base = image.detach()

    # Find the direction of the gradient (which pixels 'hurt' the model's accuracy)
    sign_data_grad = data_grad.sign()

    # Create the perturbed image by moving pixels in that 'harmful' direction
    perturbed_image = base + epsilon * sign_data_grad

    # Keep each image inside its own original normalized range
    if base.dim() == 4:
        per_sample = base.flatten(start_dim=1)
        bound_shape = (-1, 1, 1, 1)
        lower = per_sample.min(dim=1)[0].view(bound_shape)
        upper = per_sample.max(dim=1)[0].view(bound_shape)
    else:
        lower, upper = base.min(), base.max()

    # Element-wise max/min broadcast the bounds over each sample.
    return torch.min(torch.max(perturbed_image, lower), upper)

# 2. SELECT A "VICTIM" IMAGE
# We need a Fake image that the model CURRENTLY predicts correctly as Fake.
model.eval() # Put model in evaluation mode
found = False

# Searching only needs predictions, so skip autograd bookkeeping entirely.
with torch.no_grad():
    for imgs, lbls in test_loader:
        imgs, lbls = imgs.to(device), lbls.to(device)
        output = model(imgs) # Run image through model
        _, pred = output.max(1) # Get the current prediction

        # Positions where Actual is FAKE (0) and Model also says FAKE (0)
        hits = torch.nonzero((lbls == 0) & (pred == 0)).flatten()
        if hits.numel() > 0:
            i = hits[0].item() # First match in the batch, same choice as a linear scan
            # .detach().clone() gives an independent copy that later cells can
            # safely mark as requiring gradients
            target_image = imgs[i].unsqueeze(0).detach().clone()
            target_label = lbls[i].unsqueeze(0)
            found = True
            break

# Fail here with a clear message instead of a NameError in a later cell.
if not found:
    raise RuntimeError(
        "No FAKE (label 0) image that the model also predicts as FAKE was found in test_loader."
    )

# 3. RUN THE ATTACK TRAJECTORY
# This section applies increasing levels of noise to see when the model "breaks"
fig, axes = plt.subplots(1, len(epsilons), figsize=(18, 5))
axes = np.atleast_1d(axes) # plt.subplots returns a bare Axes when there is one column

# The gradient depends only on the clean image and its label, not on epsilon,
# so one forward/backward pass serves every panel.
temp_img = target_image.clone().detach()
temp_img.requires_grad = True

# FORWARD PASS: Get the model's prediction
output = model(temp_img)
loss = F.cross_entropy(output, target_label) # Calculate how 'correct' it is

# BACKWARD PASS: Calculate the 'Gradient' (which pixels to change)
model.zero_grad()
loss.backward()
data_grad = temp_img.grad.detach()

for i, eps in enumerate(epsilons):
    # APPLY ATTACK: Create the noisy adversarial image
    adv_image = fgsm_attack(temp_img, eps, data_grad).detach()

    # TEST THE ATTACK: See if the model is fooled
    with torch.no_grad():
        adv_out = model(adv_image)
        adv_pred = adv_out.argmax(1).item() # The new (hopefully wrong) prediction
        conf = F.softmax(adv_out, dim=1).max().item() # Confidence in that prediction

    # VISUALIZATION: Show the results
    # Denormalize a copy: on CPU, squeeze()/cpu()/detach() all share storage with
    # adv_image, and adv_image is reused by the analysis cells further down.
    img_show = denormalize(adv_image.squeeze().detach().cpu().clone()) # Prep for display
    axes[i].imshow(img_show.permute(1, 2, 0)) # Change (C, H, W) to (H, W, C) for Matplotlib

    res_label = "REAL" if adv_pred == 1 else "FAKE"
    # SUCCESS: If prediction is REAL (1), title turns RED to show the model was FOOLED
    title_color = 'red' if adv_pred == 1 else 'black'

    axes[i].set_title(f"Eps: {eps}\nPred: {res_label}\nConf: {conf:.2f}", color=title_color)
    axes[i].axis('off')

plt.suptitle("Adversarial Evasion Analysis (fooling the AI)", fontsize=16)
plt.show()

# ==============================================================
# ANALYSIS & GRAD-CAM COMPARISON (FIXED)
# ==============================================================
import cv2

# 1. APPLY BLUR (Minimal Modification Task)
# Denormalize a copy: on CPU, squeeze()/cpu()/detach() share storage with
# adv_image, which is still needed unmodified for the heatmap below.
adv_display = denormalize(adv_image.squeeze().detach().cpu().clone())
# Re-scaling to 0-255 for OpenCV operations
adv_img_255 = (adv_display.permute(1,2,0).numpy() * 255).astype(np.uint8)
blurred_adv = cv2.GaussianBlur(adv_img_255, (3, 3), 0)

# 2. GRAD-CAM COMPARISON FUNCTION
def get_gradcam_heatmap(input_tensor):
    """Build a normalized 32x32 activation heatmap for ``input_tensor``.

    Runs the global ``model`` on a fresh copy of the input, back-propagates the
    highest class score, then averages the global ``features`` tensor over
    dim 1, keeps the first element, rectifies it and scales it by its maximum.

    NOTE(review): ``features`` is expected to be refreshed by a forward hook
    registered in Member 2's code (not visible here) — confirm. The gradients
    produced by ``backward()`` are not read inside this function.

    Args:
        input_tensor: Model input; assumed to be a batch of one image — TODO confirm.

    Returns:
        The heatmap as returned by ``cv2.resize(..., (32, 32))``.
    """
    # Independent leaf copy so no autograd history from the caller is reused
    probe = input_tensor.detach().clone()
    probe.requires_grad = True

    model.zero_grad()
    logits = model(probe)

    # Back-propagate the winning class score
    top_score, _ = logits.max(1)
    top_score.backward()

    # Channel-averaged activations of the first sample, moved to NumPy
    activation = features.mean(dim=1)[0]
    heatmap = activation.cpu().detach().numpy()

    # Drop negative responses, then scale by the peak (epsilon avoids 0/0)
    heatmap = np.maximum(heatmap, 0)
    peak = np.max(heatmap) + 1e-8
    heatmap /= peak
    return cv2.resize(heatmap, (32, 32))

# Generate Heatmaps using the fixed function
# We use .detach() to ensure we aren't carrying old math history
orig_heatmap = get_gradcam_heatmap(target_image.detach())
adv_heatmap = get_gradcam_heatmap(adv_image.detach())

# 3. FINAL VISUAL COMPARISON
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

# Plot 1: Original image with focus
# Denormalize a copy so target_image itself keeps its normalized values
# (on CPU, squeeze()/cpu()/detach() all share storage with it).
img_orig = denormalize(target_image.squeeze().detach().cpu().clone()).permute(1,2,0)
ax[0].imshow(img_orig)
ax[0].imshow(orig_heatmap, cmap='jet', alpha=0.4)
ax[0].set_title("Original (FAKE)\nGrad-CAM focus on object")
ax[0].axis('off')

# Plot 2: Attacked image with focus shift
# Note: img_show and adv_pred come from the previous attack cell (last epsilon)
ax[1].imshow(img_show.permute(1, 2, 0))
ax[1].imshow(adv_heatmap, cmap='jet', alpha=0.4)
# Report what the model actually predicted instead of assuming the attack worked
adv_label = "REAL" if adv_pred == 1 else "FAKE"
ax[1].set_title(f"Adversarial (PREDICTED {adv_label})\nGrad-CAM focus is scattered")
ax[1].axis('off')

# Plot 3: Minimal Modification (Blur)
ax[2].imshow(blurred_adv)
ax[2].set_title("Minimal Modification\n(Gaussian Blur applied)")
ax[2].axis('off')

plt.show()

print("Task Completed: Original vs Modified images displayed with Grad-CAM shift.")

# ==============================
# PHASE 2 NUMERICAL EVALUATION
# ==============================
def evaluate_adversarial_accuracy(model, loader, epsilon=0.1, max_samples=1000):
    """Measure model accuracy on FGSM-perturbed batches from ``loader``.

    For each batch the loss gradient w.r.t. the inputs is computed, the batch
    is perturbed with ``fgsm_attack`` and the model is re-evaluated on the
    perturbed images. Results are printed and the accuracy is returned.

    Args:
        model: Classifier to attack; switched to eval mode.
        loader: Iterable yielding ``(images, labels)`` batches.
        epsilon: FGSM step size passed to ``fgsm_attack``.
        max_samples: Stop after the batch that brings the number of evaluated
            images to at least this value. ``None`` evaluates the whole loader.

    Returns:
        Adversarial accuracy in percent, or NaN when no sample was evaluated.
    """
    model.eval()
    correct = 0
    total = 0
    flipped_to_real = 0
    fake_count = 0

    print(f"Evaluating accuracy under attack (Epsilon: {epsilon})...")

    for imgs, lbls in tqdm(loader):
        imgs, lbls = imgs.to(device), lbls.to(device)
        imgs.requires_grad = True

        # 1. Forward pass to get gradients
        outputs = model(imgs)
        loss = F.cross_entropy(outputs, lbls)
        model.zero_grad()
        loss.backward()

        # 2. Apply FGSM Attack (detached: the forward graph is no longer needed)
        adv_imgs = fgsm_attack(imgs, epsilon, imgs.grad.detach()).detach()

        # 3. Predict on Adversarial Images
        with torch.no_grad():
            adv_outputs = model(adv_imgs)
            _, preds = adv_outputs.max(1)

            total += lbls.size(0)
            correct += (preds == lbls).sum().item()

            # Count specifically how many Fakes (0) were predicted as Real (1)
            fakes = (lbls == 0)
            fake_count += fakes.sum().item()
            flipped_to_real += ((preds == 1) & fakes).sum().item()

        # Stop early for speed once enough images have been evaluated
        if max_samples is not None and total >= max_samples:
            break

    # Both ratios are undefined on an empty denominator; report NaN instead of
    # raising ZeroDivisionError (empty loader, or no FAKE samples seen).
    undefined = float("nan")
    final_acc = (correct / total) * 100 if total else undefined
    evasion_rate = (flipped_to_real / fake_count) * 100 if fake_count else undefined

    print(f"\n--- PHASE 2 RESULTS ---")
    print(f"Adversarial Accuracy: {final_acc:.2f}%")
    print(f"Evasion Success Rate (Fakes predicted as Real): {evasion_rate:.2f}%")
    return final_acc

# Phase 2 run: score the model on the test loader under an epsilon = 0.1 attack
phase2_acc = evaluate_adversarial_accuracy(
    model,
    test_loader,
    epsilon=0.1,
)