In [1]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

# Silence noisy warnings so the demo output below stays readable.
#
# Filter 1: ignore every UserWarning raised from a module whose name starts
# with "sklearn.neural_network". The intent is to hide MLPClassifier's
# ConvergenceWarning.
# NOTE(review): this relies on ConvergenceWarning being a UserWarning
# subclass -- confirm against sklearn.exceptions. It also hides any other
# UserWarning coming from that package.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.neural_network")
# Filters 2-3: ignore RuntimeWarnings whose message starts with the given text
# (the `message` argument is a regex matched against the start of the message).
# NOTE(review): the exact wording of these numeric warnings depends on the
# installed NumPy version (e.g. "divide" vs "true_divide") -- confirm.
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in exp")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in true_divide")

# --- Square Attack Function ---
def square_attack(
    model,
    original_input,
    target_class,
    epsilon=0.1,        # L-infinity norm budget for the perturbation
    max_iterations=1000, # Total iterations (one model query for the objective each)
    p_step=0.05,        # Fraction of the image area covered by each square patch
    kappa=0.0           # Confidence margin: objective is clamped at -kappa
):
    """
    Run a targeted, black-box, Square-Attack-style random search.

    Starting from a random point inside the L-infinity ball of radius
    `epsilon` around `original_input`, each iteration re-draws the
    perturbation inside one randomly placed square patch and keeps the
    candidate only if it strictly lowers the attack objective. Only
    `model.predict_proba` and `model.predict` are queried; no gradients
    are used. Randomness comes from NumPy's global RNG (`np.random`), so
    seed it beforehand for reproducible results.

    Args:
        model: Classifier exposing `predict_proba(X)` and `predict(X)` for a
            2D array `X` of shape (1, n_features).
        original_input (np.ndarray): Input with values expected in [0, 1]
            whose total number of elements is a perfect square (e.g. 64 for
            an 8x8 image). Candidates are always clipped to [0, 1].
        target_class (int): Column index into `predict_proba` output that the
            adversarial example should be classified as. It is also compared
            against the value returned by `predict`, so class labels are
            assumed to coincide with column indices.
        epsilon (float): Maximum allowed L-infinity norm of the perturbation.
        max_iterations (int): Number of search iterations.
        p_step (float): Fraction of the image area covered by the square
            patch; the patch side is round(sqrt(p_step * n_features)), at
            least 1. The patch size is constant over the run.
        kappa (float): Confidence margin. The objective is
            max(max_{i != target} log p_i - log p_target, -kappa), where
            log p are log-probabilities (clipped at 1e-10), so improvements
            stop once the target leads by `kappa`.

    Returns:
        np.ndarray or None: The most recently accepted point that `model`
        classifies as `target_class` (same shape as `original_input`), or
        None if no such point was found.

    Raises:
        ValueError: If the input is empty or its element count is not a
            perfect square.
    """
    input_shape = original_input.shape

    # Derive the feature count from the input itself; relying on a
    # module-level `num_features` would break every caller that does not
    # happen to define that global.
    num_features = original_input.size
    side_length = int(np.sqrt(num_features))
    if num_features == 0 or side_length * side_length != num_features:
        raise ValueError("Input features must represent a square image for this Square Attack implementation.")

    # Random start inside the epsilon ball, clipped to the valid pixel range.
    random_noise = (np.random.rand(*input_shape) * 2 - 1) * epsilon
    x_adv = np.clip(original_input + random_noise, 0, 1)

    # --- Objective Function ---
    def f_objective(current_x_candidate):
        """
        Return the attack loss for a candidate (lower is better).

        Computes max(max_{i != target} log p_i - log p_target, -kappa) from
        the model's predicted probabilities.
        """
        current_x_clipped = np.clip(current_x_candidate, 0, 1)
        probabilities = model.predict_proba(current_x_clipped.reshape(1, -1))[0]
        # Clip before the log so zero probabilities do not produce -inf.
        probabilities = np.clip(probabilities, 1e-10, 1)
        log_probs = np.log(probabilities)

        target_log_prob = log_probs[target_class]
        max_other_log_prob = np.max(np.delete(log_probs, target_class))

        return np.maximum(max_other_log_prob - target_log_prob, -kappa)

    def predicts_target(x):
        """Return True when the model classifies `x` as the target class."""
        return model.predict(x.reshape(1, -1))[0] == target_class

    min_objective_value = f_objective(x_adv)
    # Track only points that are actually classified as the target; the random
    # start point counts if it already fools the model.
    best_x_adv = np.copy(x_adv) if predicts_target(x_adv) else None

    # Patch side length is loop-invariant: s*s pixels ~ p_step * num_features.
    s = max(1, int(round(np.sqrt(p_step * num_features))))

    # Report roughly ten times per run; guard against a zero modulus when
    # max_iterations < 10.
    progress_every = max(1, max_iterations // 10)

    print(f"Starting Square Attack for {max_iterations} iterations...")

    for i in range(max_iterations):
        # Current perturbation relative to the clean input.
        perturbation = x_adv - original_input

        # Random top-left corner such that the patch fits inside the image.
        x_start = np.random.randint(0, side_length - s + 1) if side_length > s else 0
        y_start = np.random.randint(0, side_length - s + 1) if side_length > s else 0

        # 0/1 mask of the patch, laid out like the input.
        patch_mask = np.zeros((side_length, side_length))
        patch_mask[y_start:y_start + s, x_start:x_start + s] = 1
        patch_mask = patch_mask.reshape(input_shape)

        # Fresh noise in [-epsilon, epsilon], non-zero only inside the patch.
        random_patch_noise = (np.random.rand(*input_shape) * 2 - 1) * epsilon * patch_mask

        # Keep the old perturbation outside the patch, replace it inside, and
        # clip to the valid pixel range. Each patch value stays within
        # epsilon of the clean input, so the L-infinity budget is respected.
        x_adv_candidate = np.clip(
            original_input + perturbation * (1 - patch_mask) + random_patch_noise, 0, 1
        )

        candidate_objective_value = f_objective(x_adv_candidate)

        # Greedy acceptance: keep the candidate only on strict improvement.
        if candidate_objective_value < min_objective_value:
            min_objective_value = candidate_objective_value
            x_adv = x_adv_candidate

            if predicts_target(x_adv):
                best_x_adv = np.copy(x_adv)

        # Print progress
        if (i + 1) % progress_every == 0 or i == 0:
            current_pred = model.predict(x_adv.reshape(1, -1))[0]
            current_l2_dist = np.linalg.norm(x_adv - original_input)
            print(f"Iteration {i+1}/{max_iterations}: Current Pred: {current_pred}, L2 Dist: {current_l2_dist:.4f}, Obj: {min_objective_value:.4f}")
            if current_pred == target_class:
                print(f"  Target class {target_class} reached!")

    # --- Final Report and Return ---
    if best_x_adv is None:
        print(f"\nSquare Attack finished, but target class {target_class} not achieved within the iterations.")
        return None

    final_l2_dist = np.linalg.norm(best_x_adv - original_input)
    final_linf_norm = np.max(np.abs(best_x_adv - original_input))
    print(f"\nSquare Attack successful! Adversarial example found. Final L2 Dist: {final_l2_dist:.4f}, L-inf Dist: {final_linf_norm:.4f}")
    return best_x_adv

# --- Example Usage with an Actual Model (MLPClassifier on Digits Dataset) ---
if __name__ == "__main__":
    # Demo: train an MLP on the digits dataset, pick one test sample, and try
    # to push it into a different class with square_attack.

    # Seed NumPy's global RNG. This drives the sample selection, the target
    # class choice and the attack's random search; the data split and the
    # model training use their own `random_state` arguments below.
    np.random.seed(42)

    print("--- Loading and Training MLPClassifier on Digits Dataset ---")
    digits = load_digits()
    X, y = digits.data, digits.target
    # Number of features (64 for 8x8 images). Kept as a module-level name
    # because code outside this block may read it.
    num_features = X.shape[1]

    # Scale pixel values into [0, 1], the range the attack clips to.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train an MLPClassifier to act as our "black-box" model
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=1, verbose=False)
    mlp_model.fit(X_train, y_train)
    print(f"Model training complete. Test accuracy: {mlp_model.score(X_test, y_test):.4f}\n")

    # Use the fitted label set for the class count. `n_outputs_` is the number
    # of output units, which is not the class count for binary problems.
    num_classes = len(mlp_model.classes_)

    # Select a random sample from the test set to attack
    sample_index = np.random.randint(0, len(X_test))
    original_sample = X_test[sample_index]
    original_label = y_test[sample_index]

    original_predicted_class = mlp_model.predict(original_sample.reshape(1, -1))[0]
    original_probabilities = mlp_model.predict_proba(original_sample.reshape(1, -1))[0]
    # Log-probabilities, clipped so zero probabilities do not become -inf.
    original_logits = np.log(np.clip(original_probabilities, 1e-10, 1))

    print("--- Original Sample Details ---")
    print(f"Original Label: {original_label}")
    print(f"Model's Predicted Class: {original_predicted_class}")
    print(f"Model's Logits: {original_logits.round(4)}")
    print(f"Input Shape: {original_sample.shape}\n")

    # Define the target class for the attack.
    if original_predicted_class == original_label:
        # Correctly classified sample: aim for a random different class.
        all_classes = np.arange(num_classes)
        other_classes = all_classes[all_classes != original_label]
        if other_classes.size == 0:
            print("Only one class, cannot perform targeted attack.")
            # SystemExit instead of the interactive-only `exit()` helper.
            raise SystemExit(0)
        target_class_for_attack = np.random.choice(other_classes)
        print(f"Original prediction is CORRECT ({original_label}). Attempting Square Attack to misclassify to Target Class: {target_class_for_attack}\n")
    else:
        # Already misclassified: aim for the class after the predicted one.
        target_class_for_attack = (original_predicted_class + 1) % num_classes
        print(f"Original prediction is INCORRECT ({original_predicted_class}). Attempting Square Attack to misclassify to Target Class: {target_class_for_attack}\n")


    # --- Run the Square attack ---
    # NOTE: Hyperparameters for Square Attack can be sensitive.
    # Increasing max_iterations gives the random search more attempts.
    # Epsilon budget controls visible perturbation.
    # p_step sets the patch size (smaller p_step means smaller patches).
    adversarial_sample = square_attack(
        mlp_model,
        original_sample,
        target_class_for_attack,
        epsilon=0.1,          # L-infinity budget
        max_iterations=5000,  # Typically needs many iterations
        p_step=0.1,           # Patch area as a fraction of the image (10% of features)
        kappa=0.0             # 0 for basic misclassification
    )

    # --- Display results if an adversarial example was found ---
    if adversarial_sample is not None:
        adv_predicted_class = mlp_model.predict(adversarial_sample.reshape(1, -1))[0]
        adv_probabilities = mlp_model.predict_proba(adversarial_sample.reshape(1, -1))[0]
        adv_logits = np.log(np.clip(adv_probabilities, 1e-10, 1))

        perturbation = adversarial_sample - original_sample
        l2_norm_perturbation = np.linalg.norm(perturbation)
        linf_norm_perturbation = np.max(np.abs(perturbation))

        print("\n--- Square Attack Results ---")
        print(f"Adversarial Input (first 5 features): {adversarial_sample[:5].round(4)}...")
        print(f"Adversarial Logits: {adv_logits.round(4)}")
        print(f"Adversarial Predicted Class: {adv_predicted_class}")
        print(f"Perturbation (L2 Norm): {l2_norm_perturbation:.6f}")
        print(f"Perturbation (L-inf Norm): {linf_norm_perturbation:.6f}")


        if adv_predicted_class == target_class_for_attack:
            print("\nSquare attack successful: The adversarial example is now classified as the target class!")
        else:
            print("\nSquare attack inconclusive: Adversarial example found, but not classified as the exact target class.")
    else:
        print("\nSquare attack failed to generate a suitable adversarial example.")



--- Loading and Training MLPClassifier on Digits Dataset ---
Model training complete. Test accuracy: 0.9833

--- Original Sample Details ---
Original Label: 5
Model's Predicted Class: 5
Model's Logits: [-13.3782 -19.7226 -23.0259 -21.2603 -13.1114  -0.     -22.9556 -12.0312
 -22.2829 -10.7736]
Input Shape: (64,)

Original prediction is CORRECT (5). Attempting Square Attack to misclassify to Target Class: 3

Starting Square Attack for 5000 iterations...
Iteration 1/5000: Current Pred: 5, L2 Dist: 0.3907, Obj: 22.6438
Iteration 500/5000: Current Pred: 5, L2 Dist: 0.4810, Obj: 14.3884
Iteration 1000/5000: Current Pred: 5, L2 Dist: 0.5015, Obj: 13.9494
Iteration 1500/5000: Current Pred: 5, L2 Dist: 0.5272, Obj: 13.7525
Iteration 2000/5000: Current Pred: 5, L2 Dist: 0.5095, Obj: 13.3506
Iteration 2500/5000: Current Pred: 5, L2 Dist: 0.5106, Obj: 13.2756
Iteration 3000/5000: Current Pred: 5, L2 Dist: 0.5178, Obj: 13.1395
Iteration 3500/5000: Current Pred: 5, L2 Dist: 0.5178, Obj: 13.1395
Ite