### Using SHAP for Feature Drift Analysis
**Description**: Use SHapley Additive exPlanations (SHAP) values to analyze how feature
importance changes over time; such changes indicate feature drift.

In [1]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap
import matplotlib.pyplot as plt

def shap_feature_drift_analysis(train_df, test_df, features, random_state=42):
    """
    Use SHAP values to analyze feature drift between two datasets.

    A RandomForest "domain classifier" is trained to tell rows of
    ``train_df`` (label 0) from rows of ``test_df`` (label 1) using only
    ``features``. SHAP values of that classifier on a held-out validation
    split show which features let it separate the two datasets, i.e. which
    features have drifted.

    Side effects: prints a header line and displays a SHAP bar summary plot
    plus one box plot per feature via matplotlib.

    Parameters:
    - train_df: pd.DataFrame, first dataset (e.g., old data)
    - test_df: pd.DataFrame, second dataset (e.g., new data)
    - features: sequence of feature column names present in both frames
    - random_state: int for reproducibility

    Returns:
    - shap_vals_new_data: 2-D array (validation rows x features) of SHAP
      values for class 1 ("new data"), computed on the validation split only
    - combined_df: combined dataset (features + 'is_new_data' label)

    Raises:
    - ValueError: if either input DataFrame is empty (the classifier needs
      both classes to be present).
    """
    # Accept any sequence of names (tuple, Index, ...), not only a list.
    features = list(features)

    if len(train_df) == 0 or len(test_df) == 0:
        raise ValueError("train_df and test_df must both be non-empty")

    # Label the data; .assign returns new frames, so the callers' inputs
    # are never mutated.
    old_part = train_df[features].assign(is_new_data=0)
    new_part = test_df[features].assign(is_new_data=1)
    combined_df = pd.concat([old_part, new_part], axis=0)

    X = combined_df[features]
    y = combined_df['is_new_data']

    # Hold out a validation split so SHAP values are computed on rows the
    # classifier has not seen during training.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.3,
        random_state=random_state,
        stratify=y,
    )

    # Train classifier
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train, y_train)

    # Explain predictions with SHAP
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_val)

    # Older SHAP releases return a list with one (rows x features) array per
    # class; newer releases return a single (rows x features x classes)
    # array. Plain `shap_values[1]` on the latter would pick the second
    # *sample*, so select class 1 ("new data") explicitly for each layout.
    if isinstance(shap_values, list):
        shap_vals_new_data = np.asarray(shap_values[1])
    else:
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            shap_vals_new_data = shap_values[:, :, 1]
        else:
            # Already a single 2-D (rows x features) output.
            shap_vals_new_data = shap_values

    # Convert to DataFrame for easy plotting
    shap_df = pd.DataFrame(shap_vals_new_data, columns=features)
    shap_df['label'] = y_val.values

    # Bar summary plot: mean |SHAP| per feature, i.e. global importance of
    # each feature for distinguishing old from new data.
    print("SHAP summary plot for detecting feature drift:")
    shap.summary_plot(shap_vals_new_data, X_val, plot_type="bar")

    # Optional: Plot SHAP distribution comparison for key features
    for feature in features:
        # Pass the axes explicitly: DataFrame.boxplot(by=...) otherwise
        # opens its own figure and leaves an empty one behind.
        fig, ax = plt.subplots(figsize=(8, 4))
        shap_df.boxplot(column=feature, by='label', ax=ax)
        ax.set_title(f"SHAP value distribution for feature '{feature}'")
        fig.suptitle('')  # drop pandas' automatic "Boxplot grouped by" title
        ax.set_xlabel('Dataset (0=Old, 1=New)')
        ax.set_ylabel('SHAP value')
        plt.show()

    return shap_vals_new_data, combined_df

# Example usage:

# Synthetic demo data: both frames draw f2 and f3 from identical
# distributions, while f1 in the "new" frame has its mean moved to 0.5.
# The seed is fixed and the draws happen in the same order on every run,
# so the generated frames are reproducible.
np.random.seed(42)

features = ['f1', 'f2', 'f3']

train_df = pd.DataFrame(dict(
    f1=np.random.normal(loc=0, scale=1, size=1000),
    f2=np.random.normal(loc=5, scale=2, size=1000),
    f3=np.random.normal(loc=-3, scale=1, size=1000),
))

test_df = pd.DataFrame(dict(
    f1=np.random.normal(loc=0.5, scale=1, size=1000),  # Shift in f1
    f2=np.random.normal(loc=5, scale=2, size=1000),
    f3=np.random.normal(loc=-3, scale=1, size=1000),
))

# Left as a bare expression so a notebook cell still echoes the result.
shap_feature_drift_analysis(train_df, test_df, features)


ModuleNotFoundError: No module named 'shap'