In [16]:
# Import required libraries
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import joblib
import sys
from sklearn.neighbors import KNeighborsRegressor

# Import this library
import shap
# Draw a reproducible 500-row sample from every split; each read uses the
# same sample size and seed.
# NOTE(review): features and targets are sampled independently, so the rows
# only stay paired if each X/y CSV pair has the same number of rows - confirm.
_sample_opts = {"n": 500, "random_state": 42}
_target = "nextmonth__home_decor"

X_train = pd.read_csv("data/X_train.csv").sample(**_sample_opts)
X_test = pd.read_csv("data/X_test.csv").sample(**_sample_opts)
y_train = pd.read_csv("data/y_train.csv")[_target].sample(**_sample_opts)
y_test = pd.read_csv("data/y_test.csv")[_target].sample(**_sample_opts)

# Load the previously saved estimators from disk.
# NOTE(review): joblib.load unpickles its input, so only point it at trusted
# files. Both names are rebound further down, where the estimators are
# re-created and re-fitted on the sampled training data.
model = joblib.load("data/model.pkl")
knn_model = joblib.load("data/knn_model.pkl")

# Here is a view of how the RandomForestRegressor model was fitted.
# Running this rebinds `model` to a fresh forest trained on the sampled
# training data.
from sklearn.ensemble import RandomForestRegressor

rf_params = {
    "n_estimators": 100,
    "max_depth": 16,
    "min_samples_split": 12,
    "min_samples_leaf": 7,
    "max_features": "sqrt",
    "bootstrap": False,
    "random_state": 42,  # fixed seed so repeated runs build the same forest
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Identify a game theory-based XAI method
xai = "shap"

# Compute feature importance based on the model's predictions on X_test.
# Extract the top five features and store them as a set in top_feats.

# Use SHAP's TreeExplainer since RandomForestRegressor is a tree-based model
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the sampled test rows
shap_values = explainer.shap_values(X_test)

# Global importance per feature: mean absolute SHAP value over the rows
feature_importance = np.abs(shap_values).mean(axis=0)

# Rank the features from most to least important
feature_importance_df = pd.DataFrame(
    {"Feature": X_test.columns, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

# Top five most impactful features based on SHAP, stored as a set of feature
# names (the task asks for a set, not a DataFrame slice). The full ranking,
# including the scores, remains available in feature_importance_df.
top_feats = set(feature_importance_df["Feature"].head(5))

# Evaluate the consistency of feature importance explanations across the two
# models provided.

# Here is a view of how the k-NN model was fitted. Running this rebinds
# `knn_model` to a fresh regressor trained on the sampled training data.
knn_params = {
    "n_neighbors": 80,
    "weights": "uniform",
    "algorithm": "auto",
    "leaf_size": 30,
    "p": 2,
    "metric": "minkowski",
    "metric_params": None,
    "n_jobs": -1,
}
knn_model = KNeighborsRegressor(**knn_params)
knn_model.fit(X_train, y_train)

# k-NN is not tree-based, so explain it through its predict function with a
# KernelExplainer; the background data is X_test summarised by shap.kmeans
# with 5 clusters.
knn_background = shap.kmeans(X_test, 5)
knn_explainer = shap.KernelExplainer(knn_model.predict, knn_background)

# Explain a reproducible 50-row subset of X_test rather than all of it
knn_explained_rows = X_test.sample(50, random_state=42)
knn_shap_values = knn_explainer.shap_values(knn_explained_rows)

# Global importance per feature: mean absolute SHAP value over the rows
knn_feature_importance = np.mean(np.abs(knn_shap_values), axis=0)

# Tabulate the importances, then order them from largest to smallest
knn_feature_importance_df = pd.DataFrame(
    {"Feature": X_test.columns, "Importance": knn_feature_importance}
)
knn_feature_importance_df = knn_feature_importance_df.sort_values(
    by="Importance", ascending=False
)

# Five highest-ranked rows of the k-NN importance table
knn_top_feats = knn_feature_importance_df.iloc[:5]

# Calculate cosine similarity between the two models' importance vectors.
# Each vector is wrapped in a list so it is treated as a single sample; the
# result is a 1x1 matrix whose only entry is the similarity score.
similarity_matrix = cosine_similarity(
    [feature_importance], [knn_feature_importance]
)
consistency = round(similarity_matrix[0][0], 2)
print("Consistency between SHAP values:", consistency)

# The marketing team wants to know if your models are stable and reliable.
# What is your response?
# NOTE(review): this verdict is hard-coded and does not depend on the
# computed consistency score - confirm it still holds when the data changes.
reliable = "yes"

# As you're working with a smaller sample of the dataset for faster run times, you may or may not have had similar categories appear in the top features. Try running through the project again after submitting and use the full dataset to compare the results.

  0%|          | 0/50 [00:00<?, ?it/s]

Consistency between SHAP values: 0.69
