In [4]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

# Silence noisy-but-harmless warnings so the attack log stays readable.
# scikit-learn's ConvergenceWarning derives from UserWarning, so this filter hides it
# (together with any other UserWarning raised from modules under sklearn.neural_network).
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.neural_network")
# NumPy floating-point RuntimeWarnings that can occur with large logits in softmax.
# `message` is a case-insensitive regex matched against the START of the warning text.
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in exp")
# The ufunc name in this message is "true_divide" on older NumPy and "divide" on newer
# releases, so accept both spellings instead of only the old one.
warnings.filterwarnings("ignore", category=RuntimeWarning, message=r"invalid value encountered in (true_)?divide")

# --- ZOO Attack Function ---
def zoo_attack(
    model,
    original_input,
    target_class,
    c_value=1.0,           # Reserved: accepted for API compatibility, not used by the objective
    kappa=0.0,             # Confidence margin: the objective is floored at -kappa
    learning_rate=0.01,    # Step size for gradient descent
    num_queries_per_dim=1, # Reserved: two symmetric queries per dimension are always issued
    delta=1e-4,            # Offset used for the finite-difference gradient approximation
    total_iterations=1000, # Total iterations for the attack
    epsilon_budget=0.1     # L-infinity norm budget for the perturbation
):
    """
    Run a targeted Zeroth-Order Optimization (ZOO) black-box attack.

    The gradient of a C&W-style hinge objective is estimated coordinate by
    coordinate with symmetric finite differences, using only `predict_proba`
    queries. Each iteration takes one gradient-descent step and then projects
    the result back into the L-infinity ball of radius `epsilon_budget` around
    the original input and into the valid range [0, 1].

    Progress is printed on the first iteration and roughly every tenth of
    `total_iterations` (every iteration when fewer than 10 are requested).

    Args:
        model: Trained classifier exposing `predict_proba` and `predict`. If it
            has a `classes_` attribute, it is used to translate `target_class`
            (a column index) into the label returned by `predict`.
        original_input (np.array): 1D input with values in [0, 1]. It is not
            modified. Non-floating dtypes are converted to float so that the
            small `delta` offsets and gradient steps are not truncated.
        target_class (int): Index of the target class, i.e. the column of
            `predict_proba` whose probability should become the largest.
        c_value (float): Currently unused. This implementation enforces the
            distortion limit by projection rather than by a weighted loss term,
            so there is no trade-off constant to apply.
        kappa (float): Confidence margin. The objective is
            `max(max_other_logit - target_logit, -kappa)`.
        learning_rate (float): Step size for updating the adversarial example.
        num_queries_per_dim (int): Currently unused. Exactly two queries
            (x + delta and x - delta) are made per dimension per iteration.
        delta (float): Perturbation size for the finite-difference stencil.
        total_iterations (int): Number of outer attack iterations.
        epsilon_budget (float): Maximum allowed L-infinity norm of the perturbation.

    Returns:
        np.array: The adversarial example with the lowest objective value among
        the iterates that `model.predict` classified as the target, or None if
        no iterate was classified as the target.
    """
    # Work in floating point: with an integer array `x[dim] += delta` would be
    # truncated back to the original value and the in-place update would fail.
    x_orig = np.asarray(original_input)
    if not np.issubdtype(x_orig.dtype, np.floating):
        x_orig = x_orig.astype(float)
    num_features = len(x_orig)

    # `target_class` indexes predict_proba columns, while predict() returns labels.
    # Translate once so the success check also works when labels are not 0..n-1.
    class_labels = getattr(model, "classes_", None)
    target_label = target_class if class_labels is None else class_labels[target_class]

    # Initialize adversarial example as a copy of the original input
    x_adv = np.copy(x_orig)

    # --- Objective Function for Gradient Estimation ---
    def f_objective(current_x):
        """
        Evaluate the hinge objective at `current_x` with one model query.

        Args:
            current_x (np.array): The current input being evaluated.
        Returns:
            float: max(max_other_logit - target_logit, -kappa); lower is better.
        """
        # Ensure x is within [0, 1] for model prediction
        current_x_clipped = np.clip(current_x, 0, 1)

        # Get probabilities from the black-box model
        probabilities = model.predict_proba(current_x_clipped.reshape(1, -1))[0]
        # Log-probabilities stand in for logits. Clip to avoid log(0); note that
        # the objective is flat wherever a probability sits below this floor.
        probabilities = np.clip(probabilities, 1e-10, 1)
        logits = np.log(probabilities)

        target_logit = logits[target_class]
        other_logits = np.delete(logits, target_class)
        max_other_logit = np.max(other_logits)

        # We want to minimize this value; it becomes negative when target logit is high
        return np.maximum(max_other_logit - target_logit, -kappa)

    print(f"Starting ZOO Attack for {total_iterations} iterations...")

    # Keep track of the best adversarial example found so far
    best_x_adv = None
    best_f_objective = np.inf

    # Report about ten times per run; the floor of 1 avoids a modulo-by-zero
    # when fewer than 10 iterations are requested.
    log_every = max(1, total_iterations // 10)

    for i in range(total_iterations):
        # Estimate gradients for each feature
        estimated_grad = np.zeros_like(x_orig)

        for dim in range(num_features):
            # Create perturbed inputs for finite difference approximation
            x_plus_delta = np.copy(x_adv)
            x_minus_delta = np.copy(x_adv)

            x_plus_delta[dim] += delta
            x_minus_delta[dim] -= delta

            # Query the model for objective values
            obj_plus = f_objective(x_plus_delta)
            obj_minus = f_objective(x_minus_delta)

            # Symmetric finite difference gradient approximation
            estimated_grad[dim] = (obj_plus - obj_minus) / (2 * delta)

        # Update adversarial example using gradient descent
        # We want to minimize the objective, so move in the negative gradient direction
        x_adv -= learning_rate * estimated_grad

        # Project the perturbation back into the L-infinity ball around the original input
        perturbation_clipped = np.clip(x_adv - x_orig, -epsilon_budget, epsilon_budget)
        x_adv = x_orig + perturbation_clipped

        # Clip x_adv to stay within valid pixel range [0, 1]
        x_adv = np.clip(x_adv, 0, 1)

        # Check current status and update best adversarial example
        current_pred = model.predict(x_adv.reshape(1, -1))[0]
        current_f_obj_val = f_objective(x_adv)
        reached_target = current_pred == target_label

        # Among iterates classified as the target, keep the one with the lowest objective
        if reached_target and current_f_obj_val < best_f_objective:
            best_f_objective = current_f_obj_val
            best_x_adv = np.copy(x_adv)

        if (i + 1) % log_every == 0 or i == 0:
            current_l2_dist = np.linalg.norm(x_adv - x_orig)
            print(f"Iteration {i+1}/{total_iterations}: Current Pred: {current_pred}, L2 Dist: {current_l2_dist:.4f}, f_obj: {current_f_obj_val:.4f}")
            if reached_target:
                print(f"  Target class {target_class} reached!")
                # No early exit: later iterations may still find a lower objective

    # --- Final Check and Return ---
    if best_x_adv is None:
        print(f"\nZOO Attack finished, but target class {target_class} not achieved within the iterations.")
        return None

    # Re-query the stored candidate as a guard before reporting success
    final_pred_class = model.predict(best_x_adv.reshape(1, -1))[0]
    if final_pred_class == target_label:
        final_l2_dist = np.linalg.norm(best_x_adv - x_orig)
        print(f"\nZOO Attack successful! Adversarial example found. Final L2 Dist: {final_l2_dist:.4f}")
        return best_x_adv

    # Should ideally not happen if best_x_adv was truly classified as target_class
    print(f"\nZOO Attack found a candidate, but final check predicts {final_pred_class} (expected {target_class}).")
    return None

# --- Example Usage with an Actual Model (MLPClassifier on Digits Dataset) ---
if __name__ == "__main__":
    # Demo: train an MLP on the scikit-learn digits dataset, pick one test sample and
    # run a targeted ZOO attack against the trained model, then report the outcome.
    # Variables are intentionally left at module scope so they stay inspectable afterwards.
    np.random.seed(42) # Set seed for reproducibility of sample and target selection

    print("--- Loading and Training MLPClassifier on Digits Dataset ---")
    digits = load_digits()
    X, y = digits.data, digits.target

    # Scale every feature to [0, 1], the input range zoo_attack clips to
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train an MLPClassifier to act as our "black-box" model
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=1, verbose=False)
    mlp_model.fit(X_train, y_train)
    print(f"Model training complete. Test accuracy: {mlp_model.score(X_test, y_test):.4f}\n")

    # Select a random sample from the test set to attack
    sample_index = np.random.randint(0, len(X_test))
    original_sample = X_test[sample_index]
    original_label = y_test[sample_index]

    original_predicted_class = mlp_model.predict(original_sample.reshape(1, -1))[0]
    original_probabilities = mlp_model.predict_proba(original_sample.reshape(1, -1))[0]
    # Log-probabilities (clipped to avoid log(0)); printed below under the label "Logits"
    original_logits = np.log(np.clip(original_probabilities, 1e-10, 1))

    print("--- Original Sample Details ---")
    print(f"Original Label: {original_label}")
    print(f"Model's Predicted Class: {original_predicted_class}")
    print(f"Model's Logits: {original_logits.round(4)}")
    print(f"Input Shape: {original_sample.shape}\n")

    # Define the target class for the attack.
    # The class count comes from classes_: n_outputs_ is the number of output units,
    # which is not the number of classes in general (it is 1 for a binary problem).
    # For the digits dataset the labels 0-9 coincide with their predict_proba column indices.
    num_classes = len(mlp_model.classes_)
    if original_predicted_class == original_label:
        all_classes = np.arange(num_classes)
        other_classes = all_classes[all_classes != original_label]
        if len(other_classes) > 0:
            target_class_for_attack = np.random.choice(other_classes)
            print(f"Original prediction is CORRECT ({original_label}). Attempting ZOO attack to misclassify to Target Class: {target_class_for_attack}\n")
        else:
            print("Only one class, cannot perform targeted attack.")
            # SystemExit instead of exit(): exit() is injected by the `site` module
            # and is not guaranteed to exist in every interpreter setup.
            raise SystemExit
    else:
        # The prediction is already wrong: target the class after the PREDICTED one
        # (wrapping around). Note this can coincide with the true label.
        target_class_for_attack = (original_predicted_class + 1) % num_classes
        print(f"Original prediction is INCORRECT ({original_predicted_class}). Attempting ZOO attack to misclassify to Target Class: {target_class_for_attack}\n")


    # --- Run the ZOO attack ---
    adversarial_sample = zoo_attack(
        mlp_model,
        original_sample,
        target_class_for_attack,
        c_value=10.0,          # NOTE: zoo_attack never reads c_value, so this has no effect
        kappa=0.0,
        learning_rate=0.01,    # Keeping this, can try slightly lower too
        num_queries_per_dim=1, # NOTE: zoo_attack never reads this either
        delta=1e-4,
        total_iterations=5000, # Each iteration costs 2 queries per feature, so this is slow
        epsilon_budget=0.1     # Keeping this; can increase if still failing
    )

    # --- Display results if an adversarial example was found ---
    if adversarial_sample is not None:
        adv_predicted_class = mlp_model.predict(adversarial_sample.reshape(1, -1))[0]
        adv_probabilities = mlp_model.predict_proba(adversarial_sample.reshape(1, -1))[0]
        adv_logits = np.log(np.clip(adv_probabilities, 1e-10, 1))

        perturbation = adversarial_sample - original_sample
        l2_norm_perturbation = np.linalg.norm(perturbation)
        linf_norm_perturbation = np.max(np.abs(perturbation))

        print("\n--- ZOO Attack Results ---")
        print(f"Adversarial Input (first 5 features): {adversarial_sample[:5].round(4)}...")
        print(f"Adversarial Logits: {adv_logits.round(4)}")
        print(f"Adversarial Predicted Class: {adv_predicted_class}")
        print(f"Perturbation (L2 Norm): {l2_norm_perturbation:.6f}")
        print(f"Perturbation (L-inf Norm): {linf_norm_perturbation:.6f}")


        if adv_predicted_class == target_class_for_attack:
            print("\nZOO attack successful: The adversarial example is now classified as the target class!")
        else:
            print("\nZOO attack inconclusive: Adversarial example found, but not classified as the exact target class.")
    else:
        print("\nZOO attack failed to generate a suitable adversarial example.")

--- Loading and Training MLPClassifier on Digits Dataset ---
Model training complete. Test accuracy: 0.9833

--- Original Sample Details ---
Original Label: 5
Model's Predicted Class: 5
Model's Logits: [-13.3782 -19.7226 -23.0259 -21.2603 -13.1114  -0.     -22.9556 -12.0312
 -22.2829 -10.7736]
Input Shape: (64,)

Original prediction is CORRECT (5). Attempting ZOO attack to misclassify to Target Class: 3

Starting ZOO Attack for 5000 iterations...
Iteration 1/5000: Current Pred: 5, L2 Dist: 0.1751, f_obj: 17.6751
Iteration 500/5000: Current Pred: 5, L2 Dist: 0.6508, f_obj: 9.3848
Iteration 1000/5000: Current Pred: 5, L2 Dist: 0.6543, f_obj: 9.3846
Iteration 1500/5000: Current Pred: 5, L2 Dist: 0.6557, f_obj: 9.3845
Iteration 2000/5000: Current Pred: 5, L2 Dist: 0.6557, f_obj: 9.3845
Iteration 2500/5000: Current Pred: 5, L2 Dist: 0.6557, f_obj: 9.3845
Iteration 3000/5000: Current Pred: 5, L2 Dist: 0.6557, f_obj: 9.3845
Iteration 3500/5000: Current Pred: 5, L2 Dist: 0.6557, f_obj: 9.3845
