### Using SHAP for Feature Drift Analysis
**Description**: Use SHapley Additive exPlanations (SHAP) values to compare feature
importance across time periods; a marked change in a feature's importance can indicate feature drift.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import shap
import matplotlib.pyplot as plt

# Generate synthetic "old" data (time 1)
X_old, y_old = make_classification(n_samples=1000, n_features=5, random_state=42)

# Generate synthetic "new" data (time 2) with slight drift on feature 0.
# A seeded generator keeps the injected noise (and therefore every number
# reported below) reproducible, consistent with random_state=42 used elsewhere.
rng = np.random.default_rng(42)
X_new = X_old.copy()
X_new[:, 0] = X_new[:, 0] + rng.normal(0.5, 0.1, size=X_new.shape[0])
y_new = y_old.copy()

feature_names = [f'feature_{i}' for i in range(X_old.shape[1])]

# Train model on old data
model_old = RandomForestClassifier(random_state=42)
model_old.fit(X_old, y_old)

# Train model on new data
model_new = RandomForestClassifier(random_state=42)
model_new.fit(X_new, y_new)

# Explain predictions with SHAP for old data
explainer_old = shap.TreeExplainer(model_old)
shap_values_old = explainer_old.shap_values(X_old)

# Explain predictions with SHAP for new data
explainer_new = shap.TreeExplainer(model_new)
shap_values_new = explainer_new.shap_values(X_new)


def _positive_class_shap(values):
    """Return the 2-D (n_samples, n_features) SHAP matrix for class 1.

    The layout returned by ``TreeExplainer.shap_values`` for a classifier
    depends on the installed SHAP version:

    * older releases: a list with one (n_samples, n_features) array per class;
    * newer releases (0.45+): one (n_samples, n_features, n_classes) array.

    Any other input (already 2-D) is returned unchanged as an ndarray.
    """
    if isinstance(values, list):
        return np.asarray(values[1])
    values = np.asarray(values)
    if values.ndim == 3:
        # Last axis indexes the classes; keep the positive class only.
        return values[:, :, 1]
    return values


# Take the SHAP values for class 1 only, whichever layout SHAP returned.
# Without this normalisation a 3-D return would yield a 2-D mean below and
# the DataFrame construction would fail.
shap_values_old = _positive_class_shap(shap_values_old)
shap_values_new = _positive_class_shap(shap_values_new)

# Calculate mean absolute SHAP values per feature
mean_abs_shap_old = np.mean(np.abs(shap_values_old), axis=0)
mean_abs_shap_new = np.mean(np.abs(shap_values_new), axis=0)

# Create dataframe to compare
shap_comparison = pd.DataFrame({
    'feature': feature_names,
    'mean_abs_shap_old': mean_abs_shap_old,
    'mean_abs_shap_new': mean_abs_shap_new
})

# Positive change: the feature matters more to the model trained on new data.
shap_comparison['change'] = shap_comparison['mean_abs_shap_new'] - shap_comparison['mean_abs_shap_old']

print("Feature Importance Change (SHAP):")
print(shap_comparison.sort_values('change', ascending=False))

# Optional: Plot SHAP value changes
shap_comparison.set_index('feature')[['mean_abs_shap_old', 'mean_abs_shap_new']].plot.bar(figsize=(10,6))
plt.ylabel('Mean |SHAP value|')
plt.title('Feature Importance Comparison: Old vs New Data')
plt.show()

ModuleNotFoundError: No module named 'shap'