In [1]:
import numpy as np
from scipy.optimize import minimize
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

# Silence noisy warnings so the demo output stays readable.
# The first filter drops every UserWarning issued from a module whose name
# matches the regex "sklearn.neural_network" (matched from the start of the
# module name). This is intended to hide MLPClassifier's ConvergenceWarning --
# NOTE(review): relies on ConvergenceWarning subclassing UserWarning; confirm
# for the installed scikit-learn version.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.neural_network")
# The second filter drops RuntimeWarnings whose text starts with the given
# message (the `message` argument is a regex matched at the start of the text),
# i.e. overflow reports attributed to tanh while the attack is optimizing.
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in tanh")

# --- L-BFGS Attack Function ---
def lbfgs_attack(
    model,
    original_input,
    target_class,
    c_value=1.0,         # Controls the trade-off: higher C means prioritize misclassification more
    kappa=0.0,           # Confidence parameter: target score must be kappa higher than others
    max_iterations=1000, # Maximum iterations for the optimizer
):
    """
    Run a targeted adversarial attack using SciPy's L-BFGS-B optimizer.

    The attack minimizes ``||x - original_input||^2 + c_value * f(x)`` where
    ``f(x) = max(max_{i != t} s_i(x) - s_t(x), -kappa)`` is a Carlini-Wagner
    style margin loss and ``t`` is ``target_class``. Optimization happens in an
    unconstrained variable ``w`` with ``x = (tanh(w) + 1) / 2``, which keeps
    every candidate input inside [0, 1].

    Notes:
        * The scores ``s_i`` are log-probabilities (``log(predict_proba)``
          clipped at 1e-10), not raw pre-softmax logits. Differences of
          log-probabilities equal differences of logits for a softmax model,
          but the clipping flattens the loss when a probability underflows.
        * No analytic gradient is supplied, so SciPy estimates it by finite
          differences; every optimizer step therefore costs roughly one
          ``predict_proba`` call per input feature.
        * With ``kappa=0`` the margin loss reaches its floor as soon as the
          target class ties the best other class, so the optimum lies on the
          decision boundary. Use a small positive ``kappa`` for a result that
          is robustly classified as the target.

    Args:
        model: A fitted classifier exposing ``predict_proba`` and ``predict``.
            If it has a ``classes_`` attribute, it is used to translate
            ``target_class`` into the label that ``predict`` returns.
        original_input (np.array): 1D feature vector with values in [0, 1].
        target_class (int): Column index (into ``predict_proba`` output) of the
            class the adversarial example should be assigned to.
        c_value (float): Positive weight of the margin loss. Larger values
            favour misclassification over a small perturbation.
        kappa (float): Required margin between the target score and the best
            non-target score.
        max_iterations (int): Maximum number of L-BFGS-B iterations.

    Returns:
        np.array: The adversarial example if the final iterate is classified
        as the target class, otherwise None.
    """
    original_input = np.asarray(original_input, dtype=float)

    # predict_proba columns are positional, whereas predict() returns labels.
    # Translate the target index into a label so the final check is also
    # correct for models whose classes_ are not simply 0..n-1.
    classes = getattr(model, "classes_", None)
    target_label = target_class if classes is None else classes[target_class]

    def to_input_space(w):
        """Map the unconstrained variable w back to an input in [0, 1]."""
        return np.clip((np.tanh(w) + 1) / 2, 0, 1)

    # arctanh is infinite at +/-1, so keep the starting point strictly inside (0, 1).
    clamped_input = np.clip(original_input, 1e-6, 1 - 1e-6)
    w_initial = np.arctanh(2 * clamped_input - 1)

    def objective_function(w):
        """Squared L2 perturbation plus the weighted targeted margin loss."""
        x = to_input_space(w)
        l2_dist_sq = np.sum((x - original_input) ** 2)

        probabilities = model.predict_proba(x.reshape(1, -1))[0]
        # Clip before the log so a zero probability does not produce -inf.
        scores = np.log(np.clip(probabilities, 1e-10, 1))

        target_score = scores[target_class]
        best_other_score = np.max(np.delete(scores, target_class))

        # Positive while another class still beats the target; floored at
        # -kappa once the target leads by the requested margin.
        f_loss = np.maximum(best_other_score - target_score, -kappa)
        return l2_dist_sq + c_value * f_loss

    # L-BFGS-B is silent by default, so no verbosity option is passed.
    result = minimize(
        fun=objective_function,
        x0=w_initial,
        method='L-BFGS-B',
        options={'maxiter': max_iterations},
    )

    # Judge the final iterate by what the model predicts for it, not by the
    # optimizer's convergence flag: the objective is non-smooth (max / clip)
    # and its gradient is numerical, so L-BFGS-B can stop with success=False
    # (iteration budget, line-search abort) on a perfectly valid adversarial point.
    adversarial_x = to_input_space(result.x)
    predicted_class_adv = model.predict(adversarial_x.reshape(1, -1))[0]

    if predicted_class_adv == target_label:
        l2_dist = np.linalg.norm(adversarial_x - original_input)
        print(f"L-BFGS Attack successful! Adversarial example found with L2 distance: {l2_dist:.4f}")
        return adversarial_x

    if result.success:
        print(f"Optimization finished, but L-BFGS attack failed: Predicted class {predicted_class_adv} (expected {target_label}).")
    else:
        print(f"L-BFGS Optimization failed: {result.message}")
    return None

# --- Example Usage with an Actual Model (MLPClassifier on Digits Dataset) ---
if __name__ == "__main__":
    # Demo: train a small MLP on the digits dataset, pick one test sample and
    # try to push it into a different class with lbfgs_attack.
    print("--- Loading and Training MLPClassifier on Digits Dataset ---")
    # 1. Load the Digits dataset (handwritten digits 0-9)
    digits = load_digits()
    X, y = digits.data, digits.target

    # Digits data ranges from 0 to 16. Normalize to [0, 1] for attack compatibility.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # 2. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 3. Train an MLPClassifier (a simple neural network)
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=1, verbose=False)
    mlp_model.fit(X_train, y_train)
    print(f"Model training complete. Test accuracy: {mlp_model.score(X_test, y_test):.4f}\n")

    # 4. Select a random sample from the test set to attack
    np.random.seed(0) # for reproducibility
    sample_index = np.random.randint(0, len(X_test))
    original_sample = X_test[sample_index]
    original_label = y_test[sample_index]

    # Get the model's initial prediction for the chosen sample
    original_predicted_class = mlp_model.predict(original_sample.reshape(1, -1))[0]
    original_probabilities = mlp_model.predict_proba(original_sample.reshape(1, -1))[0]
    # Log-probabilities (clipped to avoid log(0)); reported below as "logits".
    original_logits = np.log(np.clip(original_probabilities, 1e-10, 1))

    print("--- Original Sample Details ---")
    print(f"Original Label: {original_label}")
    print(f"Model's Predicted Class: {original_predicted_class}")
    print(f"Model's Logits: {original_logits.round(4)}")
    print(f"Input Shape: {original_sample.shape} (8x8 image flattened to 64 features)\n")

    # 5. Define the target class for the attack.
    # The target must differ from the class the model currently predicts.
    # If the original prediction is correct, pick a random different class.
    # If the original prediction is wrong, target the predicted class + 1 (cyclic).
    # The class count comes from classes_ rather than n_outputs_, because
    # n_outputs_ counts output units and is not the number of classes for a
    # binary problem. The digit labels 0-9 coincide with the predict_proba
    # column indices, so labels and indices are used interchangeably here.
    num_classes = len(mlp_model.classes_)
    if original_predicted_class == original_label:
        all_classes = np.arange(num_classes)
        other_classes = all_classes[all_classes != original_label]
        target_class_for_attack = np.random.choice(other_classes)
        print(f"Original prediction is CORRECT ({original_label}). Attempting L-BFGS attack to misclassify to Target Class: {target_class_for_attack}\n")
    else:
        target_class_for_attack = (original_predicted_class + 1) % num_classes
        print(f"Original prediction is INCORRECT ({original_predicted_class}). Attempting L-BFGS attack to misclassify to Target Class: {target_class_for_attack}\n")


    # 6. Run the L-BFGS attack
    # Adjust c_value and kappa to balance perturbation size vs. attack strength.
    # These values might need tuning for different models and datasets.
    adversarial_sample = lbfgs_attack(
        mlp_model,
        original_sample,
        target_class_for_attack,
        c_value=1.0,       # Initial c-value; try increasing if attack fails
        kappa=0.0,         # A common starting point for kappa; increase for stronger confidence
        max_iterations=500 # Number of iterations for the optimizer
    )

    # 7. Display results if an adversarial example was found
    if adversarial_sample is not None:
        adv_predicted_class = mlp_model.predict(adversarial_sample.reshape(1, -1))[0]
        adv_probabilities = mlp_model.predict_proba(adversarial_sample.reshape(1, -1))[0]
        adv_logits = np.log(np.clip(adv_probabilities, 1e-10, 1))

        perturbation = adversarial_sample - original_sample
        l2_norm_perturbation = np.linalg.norm(perturbation)

        print("\n--- L-BFGS Attack Results ---")
        print(f"Adversarial Input (first 5 features): {adversarial_sample[:5].round(4)}...")
        print(f"Adversarial Logits: {adv_logits.round(4)}")
        print(f"Adversarial Predicted Class: {adv_predicted_class}")
        print(f"Perturbation (L2 Norm): {l2_norm_perturbation:.6f}")

        # Re-check the prediction independently of lbfgs_attack's own verdict.
        if adv_predicted_class == target_class_for_attack:
            print("\nAttack successful: The adversarial example is now classified as the target class!")
        else:
            print("\nAttack inconclusive: Adversarial example found, but not classified as the exact target class (or optimization couldn't achieve target).")
    else:
        print("\nL-BFGS attack failed to generate a suitable adversarial example.")


--- Loading and Training MLPClassifier on Digits Dataset ---
Model training complete. Test accuracy: 0.9833

--- Original Sample Details ---
Original Label: 4
Model's Predicted Class: 4
Model's Logits: [-21.7323 -19.4936 -23.0259 -23.0259  -0.     -23.0259 -17.5507 -16.9132
 -23.0259 -23.0259]
Input Shape: (64,) (8x8 image flattened to 64 features)

Original prediction is CORRECT (4). Attempting L-BFGS attack to misclassify to Target Class: 6

L-BFGS Attack successful! Adversarial example found with L2 distance: 1.4428

--- L-BFGS Attack Results ---
Adversarial Input (first 5 features): [0.     0.     0.     0.8027 0.9286]...
Adversarial Logits: [ -9.4788 -12.9278 -22.4388 -23.0259  -0.6932 -11.7264  -0.6932 -12.3716
 -13.6486 -23.0259]
Adversarial Predicted Class: 6
Perturbation (L2 Norm): 1.442760

Attack successful: The adversarial example is now classified as the target class!
