In [1]:
import os
import sys
# Make the parent directory importable so the `customer_churn_ml` package
# resolves. The path is relative to the current working directory.
# Guarded so that re-running this cell does not keep appending duplicates.
if os.path.abspath('..') not in sys.path:
    sys.path.append(os.path.abspath('..'))

In [2]:
import joblib
from customer_churn_ml.data_loader import load_data
import pandas as pd
import numpy as np
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset through the project's `load_data` helper
# (imported from customer_churn_ml.data_loader).
data = load_data()

In [4]:
# Keep only the first 20 rows as a small sample to run the explainer on.
# NOTE(review): 20 is an arbitrary sample size — consider a named constant.
test = data[:20]

In [5]:
# Drop the identifier column before building the feature matrix.
# `errors='ignore'` keeps this cell idempotent: `test` is reassigned here, so a
# second run would otherwise raise KeyError because the column is already gone.
test = test.drop(columns=['customer_id'], errors='ignore')

In [6]:
# Preview the first two rows of the sample (rich display as the cell output).
test.head(2)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
1,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [7]:
# Load the persisted pipeline; the path is relative to the current working directory.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
model = joblib.load("churn_clf_model.pkl")

In [8]:
# Inspect the pipeline steps; later cells pull out 'preprocessor' and 'model' by name.
model.named_steps

{'preprocessor': ColumnTransformer(remainder='passthrough',
                   transformers=[('encoding',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  ['gender', 'senior_citizen', 'partner',
                                   'dependents', 'phone_service',
                                   'multiple_lines', 'internet_service',
                                   'online_security', 'online_backup',
                                   'device_protection', 'tech_support',
                                   'streaming_tv', 'streaming_movies',
                                   'contract', 'paperless_billing',
                                   'payment_method'])]),
 'model': ExtraTreesClassifier(class_weight='balanced', max_depth=20, max_features='log2',
                      min_samples_leaf=10, min_samples_split=15,
                      n_estimators=250, random_state=42)}

In [9]:
# Extract the estimator from the pipeline's 'model' step.
# NOTE(review): despite its name, `model_name` holds the estimator object,
# not a string — something like `classifier` would be clearer.
model_name = model.named_steps['model']

In [10]:
# Display the extracted estimator.
model_name

In [11]:
# Extract the 'preprocessor' step so the sample can be transformed separately
# from the estimator before it is handed to SHAP.
preprocessor = model.named_steps['preprocessor']

In [12]:
# Feature matrix: every column of the sample except the last one (the `churn`
# target in the preview above).
# `.copy()` yields an independent frame, so the later column assignment does not
# write into a slice of `test` (avoids pandas' SettingWithCopyWarning).
# NOTE(review): despite the name, this is the 20-row explanation sample, not training data.
X_train = test.iloc[:, :-1].copy()

In [21]:
# Number of input feature columns (second entry of the frame's shape).
X_train.shape[1]

19

In [13]:
# Rebind `test` to the feature matrix. This is an alias, not a copy: `test` and
# `X_train` are the same object, so later edits to one are visible through the other.
test = X_train

In [14]:
# Cast `senior_citizen` to object dtype before it goes through the preprocessor.
# Presumably this matches the dtype the encoder saw at fit time — TODO confirm.
# Because `test` aliases `X_train`, this assignment changes `X_train` as well.
test['senior_citizen'] = test['senior_citizen'].astype(object)

In [15]:
# Apply the already-loaded preprocessor to the sample (transform only, no fit).
pre_processed = preprocessor.transform(test)

In [16]:
# Build a SHAP tree explainer around the estimator step (not the whole pipeline),
# which is why the input must be preprocessed separately.
explainer = shap.TreeExplainer(model_name)

In [17]:
# Compute SHAP values for the preprocessed sample. Later cells index the result
# as (sample, transformed column, class).
shap_vals = explainer.shap_values(X=pre_processed)

In [30]:
# Shape of the SHAP block for the first sample: (transformed columns, classes).
shap_vals[0].shape

(46, 2)

### DeepSeek: aggregate SHAP values back to the original features

In [26]:
# Names of the columns produced by the preprocessor; the aggregation cell matches
# them by their 'encoding__' / 'remainder__' prefixes.
transformed_features = preprocessor.get_feature_names_out()

In [28]:
# Column list of the first transformer in the ColumnTransformer spec (the
# one-hot 'encoding' step); every other input column counts as numerical.
categorical_features = preprocessor.transformers[0][2]
numerical_features = [
    column
    for column in X_train.columns
    if column not in categorical_features
]

In [29]:
import warnings

import numpy as np

# Fold the SHAP values of the transformed columns back onto the original input
# features: the one-hot columns of a categorical feature are summed, and each
# pass-through column is taken as-is.
# `shap_vals` is indexed as (sample, transformed column, class).
n_samples, n_classes = shap_vals.shape[0], shap_vals.shape[2]
aggregated_shap = np.zeros((n_samples, len(X_train.columns), n_classes))

# Materialise the names once instead of rebuilding the list inside the loop.
transformed_names = list(transformed_features)

for i, feature in enumerate(X_train.columns):
    if feature in categorical_features:
        # One-hot columns are named 'encoding__<feature>_<category>'.
        prefix = f'encoding__{feature}_'
        # Prefixes of other categorical features whose name extends this one
        # (e.g. 'contract' vs 'contract_type'); their columns must not be
        # attributed to `feature` by the plain prefix match.
        longer_prefixes = tuple(
            f'encoding__{other}_'
            for other in categorical_features
            if other != feature and other.startswith(f'{feature}_')
        )
        matching_indices = [
            idx for idx, tf in enumerate(transformed_names)
            if tf.startswith(prefix) and not tf.startswith(longer_prefixes)
        ]
    else:
        # Pass-through columns keep their name behind the 'remainder__' prefix.
        remainder_feature = f'remainder__{feature}'
        matching_indices = [
            idx for idx, tf in enumerate(transformed_names)
            if tf == remainder_feature
        ]

    if matching_indices:
        # Sum over the matched columns for every class at once.
        aggregated_shap[:, i, :] = shap_vals[:, matching_indices, :].sum(axis=1)
    else:
        # Keep the best-effort behaviour (row stays 0) but make it visible: a
        # silent zero would read as "this feature has no importance".
        warnings.warn(
            f"No transformed column found for feature {feature!r}; "
            "its aggregated SHAP values are left at 0."
        )

In [35]:
# Size of the feature axis after aggregation (one slot per original input column).
aggregated_shap.shape[1]

19

In [36]:
import numpy as np

# Global importance per original feature: the mean over samples of the absolute
# aggregated SHAP value. Class index 1 is treated as "Churn" and index 0 as
# "Non-Churn"; `aggregated_shap` is laid out as (n_samples, n_features, n_classes).
mean_abs_shap_churn = np.abs(aggregated_shap[..., 1]).mean(axis=0)
mean_abs_shap_non_churn = np.abs(aggregated_shap[..., 0]).mean(axis=0)

In [48]:
import pandas as pd

# One row per original feature with its mean |SHAP| for each class, ordered
# from most to least important for the Churn class, with a fresh 0..n-1 index.
feature_importance = (
    pd.DataFrame(
        {
            'Feature': X_train.columns,
            'Importance_Churn': mean_abs_shap_churn,
            'Importance_Non_Churn': mean_abs_shap_non_churn,
        }
    )
    .sort_values(by='Importance_Churn', ascending=False)
    .reset_index(drop=True)
)

In [49]:
# Display the importance table, sorted by Churn importance.
feature_importance

Unnamed: 0,Feature,Importance_Churn,Importance_Non_Churn
0,contract,0.117863,0.117863
1,internet_service,0.062671,0.062671
2,online_security,0.060181,0.060181
3,tech_support,0.038494,0.038494
4,payment_method,0.035637,0.035637
5,tenure,0.028489,0.028489
6,online_backup,0.024401,0.024401
7,paperless_billing,0.022198,0.022198
8,total_charges,0.016929,0.016929
9,streaming_movies,0.012467,0.012467


In [52]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 8))
# plt.barh(
#     feature_importance['Feature'],
#     feature_importance['Importance_Churn'],
#     color='red',
#     alpha=0.6,
#     label='Churn'
# )
# plt.barh(
#     feature_importance['Feature'],
#     -feature_importance['Importance_Non_Churn'],  # Negative for visual separation
#     color='blue',
#     alpha=0.6,
#     label='Non-Churn'
# )
# plt.xlabel('Mean Absolute SHAP Value (Impact on Prediction)')
# plt.title('Global Feature Importance for Churn vs. Non-Churn')
# plt.legend()
# plt.gca().invert_yaxis()  # Show top features at the top
# plt.show()