In [50]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn  as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns' , None)

In [51]:
pd.set_option('display.max_columns',None)
df = pd.read_excel(r'CHURNDATA (1) (1).xlsx')
df.head(2)

Unnamed: 0,CIF,CUS_DOB,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,CUS_Customer_Since,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,# total debit transactions for S3,total debit amount for S1,total debit amount for S2,total debit amount for S3,# total credit transactions for S1,# total credit transactions for S2,# total credit transactions for S3,total credit amount for S1,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,XXXXXX,Feb 13 1970 12:00AM,49,7116.64,MALE,MARRIED,1994-06-30,25,277,265,345,1459126.64,1230543.08,2068641.91,10,24,31,1516981.1,1764079.61,2378592.62,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,ACTIVE
1,XXXXXX,Sep 20 1973 12:00AM,46,1500000.0,FEMALE,SINGLE,2005-05-19,14,37,15,45,35372.55,20134.0,83856.67,2,4,4,10000.0,19500.0,57500.0,139363.22,97,87000.0,10,107,2223,LOW,ACTIVE


In [52]:
del df['CIF']
del df['CUS_DOB']
del df['CUS_Customer_Since']

### Checking for null values

In [53]:
df.shape

(1249, 25)

In [54]:
df.isna().sum()

AGE                                    0
CUS_Month_Income                      11
CUS_Gender                             2
CUS_Marital_Status                     0
YEARS_WITH_US                          0
# total debit transactions for S1      0
# total debit transactions for S2      0
# total debit transactions for S3      0
total debit amount for S1              0
total debit amount for S2              0
total debit amount for S3              0
# total credit transactions for S1     0
# total credit transactions for S2     0
# total credit transactions for S3     0
total credit amount for S1             0
total credit amount for S2             0
total credit amount for S3             0
total debit amount                     0
total debit transactions               0
total credit amount                    0
total credit transactions              0
total transactions                     0
CUS_Target                             0
TAR_Desc                               0
Status          

In [55]:
df.dropna(inplace = True)

In [56]:
df['Status'] = df['Status'].map({'ACTIVE':0 , 'CHURN':1})

In [57]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [58]:
X = df.drop(columns=['Status'],axis=1)
y = df['Status']

In [59]:
columns_to_encode = ['CUS_Gender','TAR_Desc','CUS_Target','CUS_Marital_Status']
scaling = ['AGE','CUS_Month_Income','YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       'total debit amount', 'total debit transactions', 'total credit amount',
       'total credit transactions', 'total transactions']

preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), scaling),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

# Apply the preprocessing to your DataFrame
X_preprocessed = preprocessor.fit_transform(X)

# Convert the transformed data back to a DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed ,columns=X.columns)
# Now, X_preprocessed_df contains the preprocessed data
X_preprocessed_df.head(1)

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,# total debit transactions for S3,total debit amount for S1,total debit amount for S2,total debit amount for S3,# total credit transactions for S1,# total credit transactions for S2,# total credit transactions for S3,total credit amount for S1,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc
0,0.204,-0.51,5.064,2.747,2.572,3.389,0.752,0.708,1.868,0.172,1.145,1.577,0.736,1.372,1.622,1.211,3.046,1.277,1.044,2.876,1.0,0.0,6.0,1.0


In [60]:
smote = SMOTE()
X_resampled , y_resampled = smote.fit_resample(X_preprocessed_df , y)

# RandomForest Feature Importance

In [61]:
from sklearn.ensemble import RandomForestRegressor

# Train a Random Forest regressor on label encoded data
rf_label =RandomForestClassifier(n_estimators=100, random_state=42)
rf_label.fit(X_resampled, y_resampled)

# Extract feature importance scores for label encoded data
fi_df_rf = pd.DataFrame({
    'feature': X_resampled.columns,
    'rf_importance': rf_label.feature_importances_
}).sort_values(by='rf_importance', ascending=False)

fi_df_rf

Unnamed: 0,feature,rf_importance
8,total debit amount for S1,0.136
5,# total debit transactions for S1,0.125
19,total credit amount,0.101
15,total credit amount for S2,0.07
16,total credit amount for S3,0.063
4,YEARS_WITH_US,0.051
11,# total credit transactions for S1,0.05
14,total credit amount for S1,0.045
0,AGE,0.044
7,# total debit transactions for S3,0.041


#  Gradient Boosting Feature importances

In [62]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a Random Forest regressor on label encoded data
gb_label = GradientBoostingClassifier()
gb_label.fit(X_resampled, y_resampled)

# Extract feature importance scores for label encoded data
fi_df_gb = pd.DataFrame({
    'feature': X_resampled.columns,
    'gb_importance': gb_label.feature_importances_
}).sort_values(by='gb_importance', ascending=False)

fi_df_gb

Unnamed: 0,feature,gb_importance
8,total debit amount for S1,0.395
5,# total debit transactions for S1,0.264
0,AGE,0.05
14,total credit amount for S1,0.039
1,CUS_Month_Income,0.039
11,# total credit transactions for S1,0.024
17,total debit amount,0.024
10,total debit amount for S3,0.022
19,total credit amount,0.019
20,total credit transactions,0.017


# rfe

In [63]:
from sklearn.feature_selection import RFE

# Initialize the base estimator
estimator = RandomForestRegressor()

# Apply RFE on the label-encoded and standardized training data
selector_label = RFE(estimator, n_features_to_select=X_resampled.shape[1], step=1)
selector_label = selector_label.fit(X_resampled, y_resampled)

# Get the selected features based on RFE
selected_features = X_resampled.columns[selector_label.support_]

# Extract the coefficients for the selected features from the underlying linear regression model
selected_coefficients = selector_label.estimator_.feature_importances_

# Organize the results into a DataFrame
fi_df_rfe = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

fi_df_rfe

Unnamed: 0,feature,rfe_score
8,total debit amount for S1,0.388
5,# total debit transactions for S1,0.16
0,AGE,0.059
1,CUS_Month_Income,0.051
3,CUS_Marital_Status,0.028
15,total credit amount for S2,0.028
6,# total debit transactions for S2,0.026
14,total credit amount for S1,0.025
19,total credit amount,0.022
11,# total credit transactions for S1,0.021


# Permutation Importance

In [64]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a Random Forest regressor on label encoded data
rf_label = RandomForestClassifier(n_estimators=100, random_state=42)
rf_label.fit(X_train_label, y_train_label)

# Calculate Permutation Importance
perm_importance = permutation_importance(rf_label, X_test_label, y_test_label, n_repeats=30, random_state=42)

# Organize results into a DataFrame
fi_df_pi = pd.DataFrame({
    'feature': X_resampled.columns,
    'permutation_importance': perm_importance.importances_mean
}).sort_values(by='permutation_importance', ascending=False)

fi_df_pi

Unnamed: 0,feature,permutation_importance
8,total debit amount for S1,0.034
5,# total debit transactions for S1,0.025
15,total credit amount for S2,0.015
0,AGE,0.015
3,CUS_Marital_Status,0.013
14,total credit amount for S1,0.01
12,# total credit transactions for S2,0.01
1,CUS_Month_Income,0.008
11,# total credit transactions for S1,0.007
4,YEARS_WITH_US,0.006


# shap

In [65]:
import shap

# Compute SHAP values using the trained Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_resampled, y_resampled)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_resampled)

# Summing the absolute SHAP values across all samples to get an overall measure of feature importance
shap_sum = np.abs(shap_values).mean(axis=0)
fi_df_shap = pd.DataFrame({
    'feature': X_resampled.columns,
    'SHAP_score': np.abs(shap_values).mean(axis=0)
}).sort_values(by='SHAP_score', ascending=False)

fi_df_shap

Unnamed: 0,feature,SHAP_score
8,total debit amount for S1,0.142
5,# total debit transactions for S1,0.134
14,total credit amount for S1,0.032
0,AGE,0.019
17,total debit amount,0.017
1,CUS_Month_Income,0.016
19,total credit amount,0.016
11,# total credit transactions for S1,0.014
3,CUS_Marital_Status,0.014
4,YEARS_WITH_US,0.013


In [66]:
final_fi_df = (
    fi_df_rf
    .merge(fi_df_gb, on='feature')
    .merge(fi_df_shap, on='feature')
    .merge(fi_df_rfe, on='feature')
    .merge(fi_df_pi , on='feature')
    .set_index('feature')
)
final_fi_df

Unnamed: 0_level_0,rf_importance,gb_importance,SHAP_score,rfe_score,permutation_importance
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
total debit amount for S1,0.136,0.395,0.142,0.388,0.034
# total debit transactions for S1,0.125,0.264,0.134,0.16,0.025
total credit amount,0.101,0.019,0.016,0.022,0.002
total credit amount for S2,0.07,0.016,0.01,0.028,0.015
total credit amount for S3,0.063,0.016,0.01,0.017,0.001
YEARS_WITH_US,0.051,0.006,0.013,0.02,0.006
# total credit transactions for S1,0.05,0.024,0.014,0.021,0.007
total credit amount for S1,0.045,0.039,0.032,0.025,0.01
AGE,0.044,0.05,0.019,0.059,0.015
# total debit transactions for S3,0.041,0.006,0.007,0.021,0.002


In [67]:
final_fi_df[['rf_importance','gb_importance','permutation_importance','rfe_score' ,'SHAP_score']].mean(axis=1).sort_values(ascending=False)


feature
total debit amount for S1            0.219
# total debit transactions for S1    0.142
AGE                                  0.037
total credit amount                  0.032
total credit amount for S1           0.030
CUS_Month_Income                     0.030
total credit amount for S2           0.028
# total credit transactions for S1   0.023
total credit amount for S3           0.021
CUS_Marital_Status                   0.019
YEARS_WITH_US                        0.019
total debit amount                   0.018
# total debit transactions for S2    0.016
# total debit transactions for S3    0.016
total debit amount for S3            0.015
total debit transactions             0.012
total debit amount for S2            0.011
# total credit transactions for S2   0.011
# total credit transactions for S3   0.010
total credit transactions            0.009
CUS_Target                           0.007
CUS_Gender                           0.006
total transactions                   0.005
TAR