In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import dalex as dx
import plotly.express as px


iris = load_iris()
X, y = iris.data, iris.target
X_df = pd.DataFrame(X, columns=iris.feature_names)                                              # Convert to DataFrame to include feature names
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)    # Split the dataset into training and testing sets


rf_model = RandomForestClassifier(random_state=42)                                              # Build the random forest model
rf_model.fit(X_train, y_train)

explainer = dx.Explainer(rf_model, X_test, y_test, label=", ".join(iris.target_names))          # Create a DALEX explainer
predictions = explainer.predict(X_test)                                                   # Generate predictions for the first 10 observations


bd = explainer.predict_parts(X_test.iloc[0], type='break_down')                                 # Generate Break Down plot for the first observation
bd.plot()

probabilities = rf_model.predict_proba(X_test.iloc[[0]])[0]
thresholds = {iris.target_names[i]: prob for i, prob in enumerate(probabilities)}
print("Classification thresholds for the first observation:", thresholds)


shap = explainer.predict_parts(X_test.iloc[0], type='shap')                                     # Generate Shapley plot for the first observation
shap.plot()

fi = explainer.model_parts()                                                                    # Generate Feature Importance plot
fi.plot()

performance = explainer.model_performance()                                                     # Evaluate model performance
performance.plot()

pdp = explainer.model_profile(variables=['sepal length (cm)'])                                  # Generate Partial Dependence Plots (PDP)
pdp.plot()

cp = explainer.predict_profile(X_test.iloc[0])                                                  # Generate Ceteris Paribus plot for the first observation
cp.plot()

ale = explainer.model_profile(type='accumulated', variables=['sepal length (cm)'])              # Generate Accumulated Local Effects (ALE) plot
ale.plot()

ale = explainer.model_profile(type='conditional', variables=['sepal length (cm)'])              # Generate Accumulated Local Effects (ALE) plot
ale.plot()


Preparation of a new explainer is initiated

  -> data              : 45 rows 4 cols
  -> target variable   : 45 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : setosa, versicolor, virginica
  -> predict function  : <function yhat_proba_default at 0x000001BB7C097740> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.287, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.05, mean = 0.579, max = 2.0
  -> model_info        : package sklearn

A new explainer has been created!


Classification thresholds for the first observation: {'setosa': 0.0, 'versicolor': 0.97, 'virginica': 0.03}


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 49.72it/s]


Calculating ceteris paribus: 100%|██████████| 4/4 [00:00<00:00, 46.29it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 31.39it/s]
Calculating accumulated dependency: 100%|██████████| 1/1 [00:00<00:00,  9.91it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 51.67it/s]
Calculating conditional dependency: 100%|██████████| 1/1 [00:00<00:00, 17.89it/s]


In [30]:
performance = explainer.model_performance()                                                     # Evaluate model performance
performance.plot()

In [39]:
cp = explainer.predict_profile(X_test.iloc[30])                                                  # Generate Ceteris Paribus plot for the first observation
cp.plot()

Calculating ceteris paribus: 100%|██████████| 4/4 [00:00<00:00, 100.36it/s]




In [29]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
import dalex as dx
import plotly.express as px

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Convert to DataFrame to include feature names
X_df = pd.DataFrame(X, columns=iris.feature_names)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

# Build the random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Create a DALEX explainer for the random forest model
explainer_rf = dx.Explainer(rf_model, X_test, y_test, label="Random Forest")

# Build a logistic regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# Create another DALEX explainer for the logistic regression model
explainer_lr = dx.Explainer(lr_model, X_test, y_test, label="Logistic Regression")

# Generate predictions for the first 10 observations
predictions = explainer_rf.predict(X_test)[0:10]

# Generate Break Down plot for the first observation
bd = explainer_rf.predict_parts(X_test.iloc[0], type='break_down')
bd.plot()

# Generate Shapley plot for the first observation
shap = explainer_rf.predict_parts(X_test.iloc[0], type='shap')
shap.plot()

# Generate Feature Importance plot for the random forest model
fi_rf = explainer_rf.model_parts()
fi_rf.plot()

# Evaluate model performance for both models
performance_rf = explainer_rf.model_performance()
performance_lr = explainer_lr.model_performance()

# Plot and compare model performances
print("Random Forest")
performance_rf.plot()
print("Logistic Regression")
performance_lr.plot()

# Generate Partial Dependence Plots (PDP)
pdp = explainer_rf.model_profile(variables=['sepal length (cm)'])
pdp.plot()

pdp = explainer_lr.model_profile(variables=['sepal length (cm)'])
pdp.plot()

# Generate Ceteris Paribus plot for the first observation
cp = explainer_rf.predict_profile(X_test.iloc[0])
cp.plot()

cp = explainer_lr.predict_profile(X_test.iloc[0])
cp.plot()
# Generate Accumulated Local Effects (ALE) plot
ale = explainer_rf.model_profile(type='accumulated', variables=['sepal length (cm)'])
ale.plot()
ale = explainer_lr.model_profile(type='accumulated', variables=['sepal length (cm)'])
ale.plot()

# Generate Residuals plot
residuals = explainer_rf.model_performance()
residuals.plot()
residuals = explainer_lr.model_performance()
residuals.plot()

# Advanced Feature Analysis - Ceteris Paribus and ALE for another feature
cp_petal = explainer_rf.predict_profile(X_test.iloc[0], variables=['petal length (cm)'])
cp_petal.plot()
cp_petal = explainer_lr.predict_profile(X_test.iloc[0], variables=['petal length (cm)'])
cp_petal.plot()

ale_petal = explainer_rf.model_profile(type='accumulated', variables=['petal length (cm)'])
ale_petal.plot()
ale_petal = explainer_lr.model_profile(type='accumulated', variables=['petal length (cm)'])
ale_petal.plot()


Preparation of a new explainer is initiated

  -> data              : 45 rows 4 cols
  -> target variable   : 45 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : Random Forest
  -> predict function  : <function yhat_proba_default at 0x000002190C19F7E0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.287, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.05, mean = 0.579, max = 2.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 45 rows 4 cols
  -> target variable   : 45 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : Logistic Regression
  -> predict 

Random Forest
Logistic Regression


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 43.97it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 176.53it/s]


Calculating ceteris paribus: 100%|██████████| 4/4 [00:00<00:00, 62.84it/s]


Calculating ceteris paribus: 100%|██████████| 4/4 [00:00<00:00, 315.55it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00,  9.12it/s]
Calculating accumulated dependency: 100%|██████████| 1/1 [00:00<00:00,  8.91it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 200.59it/s]
Calculating accumulated dependency: 100%|██████████| 1/1 [00:00<00:00, 15.66it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 87.49it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 245.93it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 42.41it/s]
Calculating accumulated dependency: 100%|██████████| 1/1 [00:00<00:00, 16.24it/s]


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 241.66it/s]
Calculating accumulated dependency: 100%|██████████| 1/1 [00:00<00:00, 15.14it/s]
