
**Key Findings**:

1. **BAE (BERT-based Adversarial Examples)**:
   - Uses BERT masked language model
   - High semantic similarity
   - Moderate success rate
   - More queries needed

2. **PWWS (Probability Weighted Word Saliency)**:
   - Uses word importance scores
   - High success rate
   - May sacrifice some naturalness
   - Efficient (fewer queries)

3. **TextFooler**:
   - Balanced approach
   - Good semantic similarity
   - Reasonable success rate
   - General-purpose attack

---


## Lab 3: SHAP Explainability - Exercise

### Exercise: Test Explanation Robustness

**Task**: Test explanation robustness on adversarial examples. Compare trust scores for clean vs adversarial samples.

**Your Task**:


In [3]:
# Lab 3: Explanation Robustness Testing (Python 3.14 Compatible)
# Note: Using SHAP instead of Alibi due to Python 3.14 compatibility

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Install shap if needed
try:
    import shap
except ImportError:
    print("Installing shap...")
    import subprocess
    subprocess.check_call(['pip', 'install', '-q', 'shap==0.45.0'])
    import shap

print("✓ Libraries loaded successfully")

# Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

baseline_accuracy = model.score(X_test, y_test)
print(f"\nModel accuracy: {baseline_accuracy:.2%}")

# Feature names
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
class_names = ['setosa', 'versicolor', 'virginica']

# Create SHAP explainer
print("\nCreating SHAP explainer...")
explainer = shap.TreeExplainer(model)

# Get SHAP values for test samples
shap_values = explainer.shap_values(X_test)

print("✓ SHAP explainer created")

# Function to create adversarial examples
def create_adversarial_tabular(X, model, epsilon=0.3):
    X_adv = X.copy()
    for i in range(len(X)):
        perturbation = np.random.randn(X.shape[1]) * epsilon
        X_adv[i] += perturbation
        X_adv[i] = np.maximum(X_adv[i], 0)
    return X_adv

# Generate adversarial examples
print("\nGenerating adversarial examples...")
X_test_adv = create_adversarial_tabular(X_test, model, epsilon=0.3)

# Verify some are adversarial
clean_preds = model.predict(X_test)
adv_preds = model.predict(X_test_adv)
adv_success_rate = (clean_preds != adv_preds).mean()
adv_accuracy = accuracy_score(y_test, adv_preds)

print(f"Adversarial success rate: {adv_success_rate:.1%}")
print(f"Adversarial accuracy: {adv_accuracy:.2%}")
print(f"Accuracy drop: {(baseline_accuracy - adv_accuracy):.2%}")

# Get SHAP values for adversarial examples
print("\nComputing SHAP values for adversarial examples...")
shap_values_adv = explainer.shap_values(X_test_adv)

# Compare explanation stability
print("\n" + "="*80)
print("EXPLANATION ROBUSTNESS ANALYSIS")
print("="*80)

num_samples = min(5, len(X_test))
explanation_changes = []

for i in range(num_samples):
    clean_sample = X_test[i]
    adv_sample = X_test_adv[i]
    clean_pred = clean_preds[i]
    adv_pred = adv_preds[i]
    
    print(f"\nSample {i+1}:")
    print(f"  Clean prediction: {class_names[clean_pred]}")
    print(f"  Adversarial prediction: {class_names[adv_pred]}")
    print(f"  Prediction changed: {clean_pred != adv_pred}")
    
    # Compare SHAP values (for multi-class, use the predicted class)
    # For RandomForest with 3 classes, shap_values is a list of 3 arrays
    # Each array is (n_samples, n_features)
    if isinstance(shap_values, list):
        # Get SHAP values for the predicted class
        clean_shap = shap_values[int(clean_pred)][i]
        adv_shap = shap_values_adv[int(adv_pred)][i]
    else:
        # Single array case
        clean_shap = shap_values[i]
        adv_shap = shap_values_adv[i]
    
    # Ensure we have 1D arrays
    clean_shap = np.atleast_1d(np.array(clean_shap))
    adv_shap = np.atleast_1d(np.array(adv_shap))
    
    explanation_change = np.linalg.norm(clean_shap - adv_shap)
    explanation_changes.append(explanation_change)
    
    print(f"  Explanation change (L2): {explanation_change:.3f}")
    
    # Get top feature indices (flatten to ensure 1D)
    clean_shap_flat = np.abs(clean_shap).flatten()
    adv_shap_flat = np.abs(adv_shap).flatten()
    
    # Get top 2 features (or fewer if not enough features)
    n_top = min(2, len(clean_shap_flat))
    clean_top_idx = np.argsort(clean_shap_flat)[-n_top:][::-1]
    adv_top_idx = np.argsort(adv_shap_flat)[-n_top:][::-1]
    
    # Build feature lists safely (ensure indices are valid)
    clean_top_features = []
    for idx in clean_top_idx:
        idx_int = int(idx)
        if 0 <= idx_int < len(feature_names):
            clean_top_features.append(feature_names[idx_int])
    
    adv_top_features = []
    for idx in adv_top_idx:
        idx_int = int(idx)
        if 0 <= idx_int < len(feature_names):
            adv_top_features.append(feature_names[idx_int])
    
    print(f"  Clean top features: {clean_top_features}")
    print(f"  Adversarial top features: {adv_top_features}")
    
    if list(clean_top_idx) == list(adv_top_idx):
        print(f"  Feature importance order: ✓ Stable")
    else:
        print(f"  Feature importance order: ✗ Changed")

# Statistical analysis
print("\n" + "="*80)
print("STATISTICAL SUMMARY")
print("="*80)

print(f"\nExplanation Changes:")
print(f"  Mean change: {np.mean(explanation_changes):.3f}")
print(f"  Max change: {np.max(explanation_changes):.3f}")
print(f"  Std dev: {np.std(explanation_changes):.3f}")

# Calculate correlation
all_clean_shap = []
all_adv_shap = []

for i in range(len(X_test)):
    clean_pred = clean_preds[i]
    adv_pred = adv_preds[i]
    
    # Get SHAP values with proper indexing
    if isinstance(shap_values, list):
        all_clean_shap.append(shap_values[int(clean_pred)][i])
        all_adv_shap.append(shap_values_adv[int(adv_pred)][i])
    else:
        all_clean_shap.append(shap_values[i])
        all_adv_shap.append(shap_values_adv[i])

all_clean_shap = np.array(all_clean_shap).flatten()
all_adv_shap = np.array(all_adv_shap).flatten()

correlation = np.corrcoef(all_clean_shap, all_adv_shap)[0, 1]

print(f"\nExplanation Correlation:")
print(f"  Correlation coefficient: {correlation:.3f}")

if correlation > 0.8:
    print(f"  ✓ Explanations are relatively stable")
elif correlation > 0.5:
    print(f"  ⚠ Explanations show moderate instability")
else:
    print(f"  ✗ Explanations are highly unstable")

# Key findings
print("\n" + "="*80)
print("KEY FINDINGS")
print("="*80)

print("\n1. Explanation Robustness:")
if np.mean(explanation_changes) < 1.0:
    print("   ✓ SHAP explanations are relatively stable under perturbation")
else:
    print("   ✗ SHAP explanations change significantly under adversarial perturbation")

print("\n2. Security Implications:")
print("   - Adversarial examples can fool both predictions AND explanations")
print("   - Cannot rely on explanations alone for security")
print("   - Need robust models, not just interpretable ones")

print("\n3. Comparison with Alibi:")
print("   - SHAP provides feature importance (global explanations)")
print("   - Alibi Anchors provide rule-based explanations (local explanations)")
print("   - Both can be affected by adversarial perturbations")
print("   - SHAP is more compatible with newer Python versions")

print("\n4. Recommendations:")
print("   - Use adversarial training to improve robustness")
print("   - Combine multiple explanation methods")
print("   - Monitor explanation stability in production")
print("   - Implement input validation and anomaly detection")

print("\n" + "="*80)
print("✓ Lab 3 Complete!")
print("="*80)


✓ Libraries loaded successfully

Model accuracy: 100.00%

Creating SHAP explainer...
✓ SHAP explainer created

Generating adversarial examples...
Adversarial success rate: 6.7%
Adversarial accuracy: 93.33%
Accuracy drop: 6.67%

Computing SHAP values for adversarial examples...

EXPLANATION ROBUSTNESS ANALYSIS

Sample 1:
  Clean prediction: versicolor
  Adversarial prediction: versicolor
  Prediction changed: False
  Explanation change (L2): 0.588
  Clean top features: []
  Adversarial top features: []
  Feature importance order: ✗ Changed

Sample 2:
  Clean prediction: setosa
  Adversarial prediction: setosa
  Prediction changed: False
  Explanation change (L2): 0.057
  Clean top features: []
  Adversarial top features: []
  Feature importance order: ✓ Stable

Sample 3:
  Clean prediction: virginica
  Adversarial prediction: virginica
  Prediction changed: False
  Explanation change (L2): 0.026
  Clean top features: []
  Adversarial top features: []
  Feature importance order: ✓ Stable