### Using SHAP for Feature Drift Analysis
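**Description**: Use SHapley Additive exPlanations (SHAP) values to detect feature drift.
A classifier is trained to distinguish an older data sample from a newer one, and the SHAP
values then show which features contribute most to telling the two apart, i.e. which
features have drifted.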

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap

# Feature-drift check via a "domain classifier": label old rows 0 and new rows 1,
# train a model to tell them apart, then use SHAP to see which features the model
# relies on. Features with large SHAP magnitude are the ones whose distribution
# differs most between the two samples.

# Step 1: Generate synthetic old and new data
np.random.seed(42)  # fixed seed so the synthetic samples are reproducible

old_data = pd.DataFrame({
    'feature1': np.random.normal(0, 1, 1000),
    'feature2': np.random.normal(5, 2, 1000),
    'feature3': np.random.uniform(0, 10, 1000)
})

new_data = pd.DataFrame({
    'feature1': np.random.normal(0.5, 1, 1000),  # shifted mean
    'feature2': np.random.normal(4.8, 2.2, 1000),  # shifted mean & variance
    'feature3': np.random.uniform(0, 12, 1000)  # shifted range
})

# Step 2: Label data (0 = old sample, 1 = new sample)
old_data['is_new'] = 0
new_data['is_new'] = 1

# Step 3: Combine datasets
data = pd.concat([old_data, new_data], ignore_index=True)
X = data.drop(columns=['is_new'])
y = data['is_new']

# Step 4: Train classifier to distinguish old vs new
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 5: Use SHAP to explain feature importance
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_val)

# Depending on the installed SHAP version, `shap_values` for a classifier is
# either a list with one (n_samples, n_features) array per class, or a single
# array with the class on the last axis. Indexing the latter with `[1]` would
# pick the second *sample* rather than class 1, so select the class-1 slice
# explicitly for each layout.
if isinstance(shap_values, list):
    shap_values_new = shap_values[1]
else:
    shap_values_new = np.asarray(shap_values)
    if shap_values_new.ndim == 3:
        shap_values_new = shap_values_new[:, :, 1]

# Step 6: Plot summary for class "1" (new data)
shap.summary_plot(shap_values_new, X_val, plot_type="bar")

# Step 7: Optional - detailed beeswarm plot to see impact of each feature
shap.summary_plot(shap_values_new, X_val)


ModuleNotFoundError: No module named 'shap'