In [None]:
# Cell 5: transform inputs to numeric arrays and compute SHAP values using the explainer
import shap
import numpy as np

# Canonical column order shared by the preprocessing step and the explanation code
feature_cols = [
    "age",
    "tenure_months",
    "monthly_charges",
    "num_support_tickets",
    "contract_type",
    "payment_method",
    "has_internet",
]

# Restrict both raw DataFrames to exactly these columns, in this order.
# X_train and X_mis_raw must already be defined by an earlier cell.
X_train = X_train.loc[:, feature_cols]
X_mis_raw = X_mis_raw.loc[:, feature_cols]

# Look up the two named pipeline stages: "prep" (preprocessing) and "clf" (classifier).
# .get() yields None for a missing step, so a wrongly named pipeline is reported
# here with a clear message.
prep = model.named_steps.get("prep")
clf = model.named_steps.get("clf")
if clf is None or prep is None:
    raise RuntimeError("Expected pipeline with steps named 'prep' and 'clf'.")

# Run each frame through the preprocessing step to get its numeric representation.
# The step may hand back a sparse matrix (e.g. from OneHotEncoder); anything that
# exposes .toarray() is converted to a dense array for SHAP compatibility.
X_train_trans = prep.transform(X_train)
if hasattr(X_train_trans, "toarray"):
    X_train_trans = X_train_trans.toarray()

X_mis_trans = prep.transform(X_mis_raw)
if hasattr(X_mis_trans, "toarray"):
    X_mis_trans = X_mis_trans.toarray()

# Report the shapes of both transformed matrices
print("X_train_trans.shape:", X_train_trans.shape)
print("X_mis_trans.shape:", X_mis_trans.shape)

# Compute SHAP values for the misclassified set using a pre-built explainer.
# `explainer` must already exist (a fitted shap.Explainer / TreeExplainer compatible with clf).
# Learners: type the call below.
sv = explainer.shap_values(X_mis_trans)

# Normalize the SHAP output to a 2-D array shap_matrix with shape
# (n_samples, n_transformed_features). shap_values() can return:
#   * a list with one (n_samples, n_features) array per class,
#   * a single 2-D (n_samples, n_features) array, or
#   * a single 3-D (n_samples, n_features, n_classes) array
#     (NOTE(review): newer SHAP releases use this form for multi-output
#     models -- confirm against the installed version).
# For the per-class forms, keep the positive class (index 1) when available.
if isinstance(sv, list):
    class_idx = 1 if len(sv) > 1 else 0
    shap_matrix = np.asarray(sv[class_idx])
else:
    shap_matrix = np.asarray(sv)
    if shap_matrix.ndim == 3:
        # Classes live on the last axis; slicing it away leaves (n_samples, n_features).
        class_idx = 1 if shap_matrix.shape[-1] > 1 else 0
        shap_matrix = shap_matrix[..., class_idx]

# Fail loudly here rather than letting a wrongly shaped array reach the later checks and plots.
if shap_matrix.ndim != 2:
    raise ValueError(
        f"Expected a 2-D SHAP matrix after class selection, got shape {shap_matrix.shape}"
    )

print("shap_matrix.shape:", shap_matrix.shape)

# Sanity check: the SHAP matrix needs one column per transformed feature.
# The training matrix supplies the expected column count.
n_trans_features = X_train_trans.shape[1]
if n_trans_features != shap_matrix.shape[1]:
    raise ValueError(
        f"SHAP feature dim {shap_matrix.shape[1]} != expected {n_trans_features}"
    )

# Obtain transformed feature names for plotting
try:
    trans_feature_names = prep.get_feature_names_out()
except Exception:
    # Deliberately broad: any failure of get_feature_names_out() triggers the
    # best-effort fallback below instead of aborting the cell.
    # Fallback: construct feature names from known numeric and categorical transformers.
    # NOTE(review): this assumes the preprocessing step emits the numeric columns
    # first, followed by the encoded categorical columns, and that it has a named
    # transformer 'cat' supporting get_feature_names_out -- confirm against the
    # pipeline definition.
    numeric_features = ["age", "tenure_months", "monthly_charges", "num_support_tickets", "has_internet"]
    cat_features = ["contract_type", "payment_method"]
    cat_names = prep.named_transformers_["cat"].get_feature_names_out(cat_features)
    trans_feature_names = list(numeric_features) + list(cat_names)

# The names label the columns of the transformed matrices, so there must be exactly
# one per column; a mismatch would silently mislabel features in the plots.
if len(trans_feature_names) != X_train_trans.shape[1]:
    raise ValueError(
        f"Got {len(trans_feature_names)} feature names for "
        f"{X_train_trans.shape[1]} transformed columns"
    )

print("Number of transformed features:", len(trans_feature_names))
