In [1]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

# Suppress ConvergenceWarning from MLPClassifier for cleaner output
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.neural_network")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in exp")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in true_divide")

# --- Helper Functions for HopSkipJump ---

def _is_misclassified(model, x, original_label):
    """
    Checks if an input x is misclassified by the model compared to the original_label.
    Assumes model.predict returns a single class label for a single input.
    """
    prediction = model.predict(x.reshape(1, -1))[0]
    return prediction != original_label

def _find_boundary_point(model, original_input, random_noise, original_label):
    """
    Finds an initial point on the decision boundary using binary search.
    Starts from a random point far away, and moves towards the original_input.
    """
    # Start with a point far from the original input that is misclassified
    # We add large noise to ensure it's misclassified initially
    x_adv_initial = np.clip(original_input + random_noise, 0, 1)

    if not _is_misclassified(model, x_adv_initial, original_label):
        # If the initial noisy point isn't misclassified, try more noise.
        # This can sometimes happen if the noise isn't large enough
        # or if the model's decision boundary is very complex.
        print("Initial noisy point not misclassified. Trying larger noise.")
        x_adv_initial = np.clip(original_input + (np.random.rand(*original_input.shape) * 2 - 1) * 2.0, 0, 1)
        if not _is_misclassified(model, x_adv_initial, original_label):
            print("Even with larger noise, initial point not misclassified. Cannot start attack.")
            return None

    # Binary search to find a point on the decision boundary
    low = 0.0
    high = 1.0
    boundary_point = x_adv_initial # Current candidate for adversarial point

    # Perform binary search for a few iterations to get close to the boundary
    for _ in range(30): # Binary search steps
        mid = (low + high) / 2.0
        candidate = original_input + mid * (x_adv_initial - original_input)
        candidate = np.clip(candidate, 0, 1) # Ensure valid pixel range

        if _is_misclassified(model, candidate, original_label):
            boundary_point = candidate # This point is misclassified, try moving closer to original
            high = mid
        else:
            low = mid # This point is correctly classified, move further from original
            
    return boundary_point

def _estimate_normal(model, x_current, x_original, original_label, num_samples=100, delta=1e-4):
    """
    Estimates the normal vector to the decision boundary at x_current.
    This is done by averaging the normalized vectors from x_current to nearby misclassified points.
    """
    normal_vector = np.zeros_like(x_current)
    
    for _ in range(num_samples):
        # Generate random noise direction
        random_direction = np.random.randn(*x_current.shape)
        random_direction = random_direction / np.linalg.norm(random_direction) # Normalize to unit vector

        # Perturb x_current in both directions
        x_plus = np.clip(x_current + delta * random_direction, 0, 1)
        x_minus = np.clip(x_current - delta * random_direction, 0, 1)

        # Check if the perturbed points are misclassified
        is_plus_misclassified = _is_misclassified(model, x_plus, original_label)
        is_minus_misclassified = _is_misclassified(model, x_minus, original_label)

        if is_plus_misclassified != is_minus_misclassified:
            # If one is misclassified and the other is not, it implies crossing the boundary
            if is_plus_misclassified:
                normal_vector += random_direction # Vector from current to misclassified side
            else:
                normal_vector -= random_direction # Vector from current to misclassified side

    if np.linalg.norm(normal_vector) == 0:
        # If no boundary crossing was detected, fall back to simple direction
        return x_current - x_original
    else:
        return normal_vector / np.linalg.norm(normal_vector) # Normalize

# --- HopSkipJump Attack Function ---
def hopskipjump_attack(
    model,
    original_input,
    original_label,
    max_iterations=1000, # Total iterations for the attack
    initial_delta_factor=0.1, # Initial perturbation step size for boundary search
    gamma=0.01,           # Step size multiplier for moving along the boundary
    num_normal_samples=100, # Number of samples for normal estimation
    normal_delta=1e-4     # Delta for normal estimation
):
    """
    Implements the HopSkipJump Attack, a query-efficient black-box attack.

    Args:
        model (sklearn.base.BaseEstimator): The trained scikit-learn classification model to attack.
                                            Must have a `predict` method.
        original_input (np.array): The initial input data (1D numpy array, values in [0, 1]).
        original_label (int): The true label of the original input.
        max_iterations (int): Total number of outer attack iterations.
        initial_delta_factor (float): Factor to determine the initial random noise magnitude.
        gamma (float): Multiplier for the step size when moving along the estimated boundary normal.
        num_normal_samples (int): Number of random queries for normal vector estimation.
        normal_delta (float): Small perturbation size for normal estimation.

    Returns:
        np.array: The adversarial example (perturbed input) if successful, otherwise None.
    """
    print(f"Starting HopSkipJump Attack for {max_iterations} iterations...")
    
    # Step 1: Find an initial adversarial example on the decision boundary
    # Start with a random point that is far from original_input and misclassified
    random_noise_initial = (np.random.rand(*original_input.shape) * 2 - 1) * initial_delta_factor
    x_adv = _find_boundary_point(model, original_input, random_noise_initial, original_label)

    if x_adv is None:
        print("Failed to find an initial boundary point. Attack cannot proceed.")
        return None

    best_x_adv = np.copy(x_adv)
    min_dist_to_original = np.linalg.norm(x_adv - original_input)

    print(f"Initial adversarial point found. L2 distance to original: {min_dist_to_original:.4f}")

    for i in range(max_iterations):
        # Step 2: Estimate the normal vector to the decision boundary at x_adv
        # This vector points from the correctly classified side to the misclassified side.
        normal = _estimate_normal(model, x_adv, original_input, original_label, num_normal_samples, normal_delta)
        
        # Step 3: Move x_adv towards the original_input along the estimated normal
        # This is the "Hop" step. We want to reduce the distance to original while staying misclassified.
        # Project original_input onto the line defined by x_adv and normal
        direction_to_original = original_input - x_adv
        projection_onto_normal = np.dot(direction_to_original, normal) * normal
        
        # New candidate point: move x_adv towards original_input
        x_adv_candidate = x_adv + projection_onto_normal

        # Step 4: Binary search (or "Jump" step) to find the closest point on the boundary
        # between x_adv_candidate and x_original (if it crossed)
        
        # Binary search bounds
        low = 0.0
        high = 1.0 # The full step to original

        if _is_misclassified(model, x_adv_candidate, original_label):
             # If candidate is still misclassified, it means we didn't cross the boundary or we are on the misclassified side of the boundary.
             # We want to find the point on the boundary between x_adv and x_adv_candidate.
            found_candidate_in_range = False
            for _ in range(30): # Binary search steps
                mid = (low + high) / 2.0
                current_candidate_on_line = x_adv + mid * (x_adv_candidate - x_adv)
                current_candidate_on_line = np.clip(current_candidate_on_line, 0, 1)

                if _is_misclassified(model, current_candidate_on_line, original_label):
                    x_adv = current_candidate_on_line # Keep moving towards original
                    high = mid
                    found_candidate_in_range = True
                else:
                    low = mid
            if not found_candidate_in_range:
                # If after binary search we couldn't find a boundary point, it means
                # the initial x_adv_candidate was already on the misclassified side.
                # In this case, simply keep x_adv as it is.
                pass # x_adv retains its value from the previous iteration or initial boundary point.
        else:
            # Candidate is correctly classified, means we crossed the boundary.
            # Perform binary search between x_adv (misclassified) and x_adv_candidate (correctly classified)
            low_bs = 0.0
            high_bs = 1.0
            for _ in range(30):
                mid_bs = (low_bs + high_bs) / 2.0
                temp_x = x_adv + mid_bs * (x_adv_candidate - x_adv)
                temp_x = np.clip(temp_x, 0, 1)
                if _is_misclassified(model, temp_x, original_label):
                    low_bs = mid_bs
                else:
                    high_bs = mid_bs
            # Update x_adv to the point near the boundary
            x_adv = np.clip(x_adv + low_bs * (x_adv_candidate - x_adv), 0, 1)


        # Apply small random noise and project onto boundary to avoid getting stuck
        random_step = np.random.randn(*x_adv.shape)
        random_step = random_step / np.linalg.norm(random_step)
        
        x_adv = np.clip(x_adv + gamma * random_step, 0, 1)
        
        # Final projection onto [0,1] range
        x_adv = np.clip(x_adv, 0, 1)
        
        # Check current distance and prediction
        current_dist = np.linalg.norm(x_adv - original_input)
        if current_dist < min_dist_to_original and _is_misclassified(model, x_adv, original_label):
            min_dist_to_original = current_dist
            best_x_adv = np.copy(x_adv)

        if (i + 1) % (max_iterations // 10) == 0 or i == 0:
            current_pred = model.predict(x_adv.reshape(1, -1))[0]
            print(f"Iteration {i+1}/{max_iterations}: Current Pred: {current_pred}, L2 Dist: {current_dist:.4f}, Misclassified: {_is_misclassified(model, x_adv, original_label)}")
            
    # --- Final Check and Return ---
    if best_x_adv is not None and _is_misclassified(model, best_x_adv, original_label):
        final_l2_dist = np.linalg.norm(best_x_adv - original_input)
        final_linf_norm = np.max(np.abs(best_x_adv - original_input))
        print(f"\nHopSkipJump Attack successful! Adversarial example found. Final L2 Dist: {final_l2_dist:.4f}, L-inf Dist: {final_linf_norm:.4f}")
        return best_x_adv
    else:
        print(f"\nHopSkipJump Attack finished, but a successful adversarial example was not found.")
        return None

# --- Example Usage with an Actual Model (MLPClassifier on Digits Dataset) ---
if __name__ == "__main__":
    np.random.seed(42) # Set seed for reproducibility of model training and sample selection

    print("--- Loading and Training MLPClassifier on Digits Dataset ---")
    digits = load_digits()
    X, y = digits.data, digits.target
    num_features = X.shape[1] # Number of features (64 for 8x8 images)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train an MLPClassifier to act as our "black-box" model
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=1, verbose=False)
    mlp_model.fit(X_train, y_train)
    print(f"Model training complete. Test accuracy: {mlp_model.score(X_test, y_test):.4f}\n")

    # Select a random sample from the test set to attack
    # Find a sample that the model initially classifies correctly
    original_sample = None
    original_label = -1
    for i in range(len(X_test)):
        sample = X_test[i]
        label = y_test[i]
        if mlp_model.predict(sample.reshape(1, -1))[0] == label:
            original_sample = sample
            original_label = label
            break
    
    if original_sample is None:
        print("Could not find a correctly classified sample to attack. Exiting.")
        exit()

    original_predicted_class = mlp_model.predict(original_sample.reshape(1, -1))[0]
    original_probabilities = mlp_model.predict_proba(original_sample.reshape(1, -1))[0]
    original_logits = np.log(np.clip(original_probabilities, 1e-10, 1))

    print("--- Original Sample Details ---")
    print(f"Original Label: {original_label}")
    print(f"Model's Predicted Class: {original_predicted_class}")
    print(f"Model's Logits: {original_logits.round(4)}")
    print(f"Input Shape: {original_sample.shape}\n")

    # Run the HopSkipJump attack
    # HopSkipJump is computationally intensive due to many queries.
    # Adjust parameters carefully.
    adversarial_sample = hopskipjump_attack(
        mlp_model,
        original_sample,
        original_label,
        max_iterations=100,  # Max iterations for the attack
        initial_delta_factor=0.5, # Factor for initial random noise to find a misclassified point
        gamma=0.01,           # Step size multiplier when moving along boundary
        num_normal_samples=50, # Number of random queries for normal estimation (more = better estimate, slower)
        normal_delta=1e-4     # Delta for normal estimation queries
    )

    # --- Display results if an adversarial example was found ---
    if adversarial_sample is not None:
        adv_predicted_class = mlp_model.predict(adversarial_sample.reshape(1, -1))[0]
        adv_probabilities = mlp_model.predict_proba(adversarial_sample.reshape(1, -1))[0]
        adv_logits = np.log(np.clip(adv_probabilities, 1e-10, 1))
        
        perturbation = adversarial_sample - original_sample
        l2_norm_perturbation = np.linalg.norm(perturbation)
        linf_norm_perturbation = np.max(np.abs(perturbation))

        print("\n--- HopSkipJump Attack Results ---")
        print(f"Adversarial Input (first 5 features): {adversarial_sample[:5].round(4)}...")
        print(f"Adversarial Logits: {adv_logits.round(4)}")
        print(f"Adversarial Predicted Class: {adv_predicted_class}")
        print(f"Perturbation (L2 Norm): {l2_norm_perturbation:.6f}")
        print(f"Perturbation (L-inf Norm): {linf_norm_perturbation:.6f}")


        if adv_predicted_class != original_label:
            print("\nHopSkipJump attack successful: The adversarial example is now misclassified!")
        else:
            print("\nHopSkipJump attack inconclusive: Adversarial example found, but still classified as original label.")
    else:
        print("\nHopSkipJump attack failed to generate a suitable adversarial example.")




--- Loading and Training MLPClassifier on Digits Dataset ---
Model training complete. Test accuracy: 0.9833

--- Original Sample Details ---
Original Label: 6
Model's Predicted Class: 6
Model's Logits: [-11.9587 -17.4366 -21.7252 -23.0259 -12.2853 -13.7145  -0.     -20.7407
 -11.1009 -18.7912]
Input Shape: (64,)

Starting HopSkipJump Attack for 100 iterations...
Initial noisy point not misclassified. Trying larger noise.
Initial adversarial point found. L2 distance to original: 1.6366
Iteration 1/100: Current Pred: 6, L2 Dist: 1.6354, Misclassified: False
Iteration 10/100: Current Pred: 6, L2 Dist: 1.6354, Misclassified: False
Iteration 20/100: Current Pred: 6, L2 Dist: 1.6361, Misclassified: False
Iteration 30/100: Current Pred: 6, L2 Dist: 1.6296, Misclassified: False
Iteration 40/100: Current Pred: 6, L2 Dist: 1.6349, Misclassified: False
Iteration 50/100: Current Pred: 6, L2 Dist: 1.6300, Misclassified: False
Iteration 60/100: Current Pred: 6, L2 Dist: 1.6270, Misclassified: False
