# **Bank Customer Churn Prediction System**
---

In [None]:
import warnings
import numpy as np
import pandas as pd
import joblib
from scipy.stats import randint, uniform
from pandas.api.types import is_numeric_dtype, is_object_dtype
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA

# ML Models
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
warnings.filterwarnings('ignore')

In [3]:
dataset = pd.read_csv('./datasets/churn_dataset.csv')

In [4]:
#FIXME: Fix the dataset
#NOTE: This is a two databases combination problem not severe!
dataset['Card Type '] = dataset['Card Type '].where(dataset['HasCrCard'] == 1, 'None')

In [5]:
def preprocess_dataset(dataset):
    def handle_outliers(dataset):
        # Note: Dropped columns are categorical features. There is no use to handle outliers for them.
        for index, feature in enumerate(dataset.drop(columns=['Education', 'Geography', 'Gender', 'Card Type', 'IsActiveMember', 'HasCrCard', 'Housing', 'Loan', 'Exited'])):
            Q1 = dataset[feature].quantile(0.25)
            Q3 = dataset[feature].quantile(0.75)
            IQR = Q3 - Q1
            dataset[feature] = np.clip(dataset[feature], Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
          
        return dataset;           

    def handle_missing_values(dataset):
        columns = dataset.columns.tolist()
        for column in columns:
            # Calculate null values and percentage
            null_count = dataset[column].isnull().sum()
            total_count = len(dataset[column])
            null_percentage = (null_count / total_count) * 100

            # Handle columns based on null percentage and data type
            if null_percentage < 50:
                # Fill missing values for numerical columns
                if is_numeric_dtype(dataset[column]):
                    mean = dataset[column].mean()
                    dataset[column].fillna(mean, inplace=True)
                # Fill missing values for categorical columns
                elif is_object_dtype(dataset[column]):
                    mode = dataset[column].mode()[0]
                    dataset[column].fillna(mode, inplace=True)
            # elif 50 <= null_percentage < 70:
            #     # TODO: Implement the data missing data handling
            #     print(f"Under development for column: {column}")
            else:   
                # Drop columns with more than 80% missing values
                dataset.drop(columns=column, inplace=True)

        return dataset

    def remove_duplicates(dataset):
        if dataset.duplicated().sum() > 0:
            dataset = dataset.drop_duplicates().reset_index(drop = True)
        return dataset

    def encode_categorical_features(dataset):
        # One-Hot encoding
        encoded = pd.get_dummies(dataset[['Geography','Education','Card Type']], drop_first=True).astype(float)
        dataset = pd.concat([dataset, encoded], axis=1)
        dataset = dataset.drop(columns=['Geography','Education', 'Card Type'])

        # # Label encoding
        labelEncoder = LabelEncoder()
        dataset['Gender'] = labelEncoder.fit_transform(dataset['Gender'])
        dataset['Housing'] = labelEncoder.fit_transform(dataset['Housing'])
        dataset['Loan'] = labelEncoder.fit_transform(dataset['Loan'])

        dataset['Gender'] = dataset['Gender'].astype(float)
        dataset['Housing'] = dataset['Housing'].astype(float)
        dataset['Loan'] = dataset['Loan'].astype(float)

        # Moving the Y predictor to the end of the dataset
        feature_exited = dataset['Exited']
        dataset = dataset.drop(columns=['Exited'])
        dataset = pd.concat([dataset, feature_exited], axis=1)

        return dataset

    def split_dataset_to_X_y(dataset):
        X = dataset.iloc[:, :-1]
        y = dataset.iloc[:, -1]
        return X, y

    def handle_class_imbalance(X, y):
        categorical_features = [
            'Gender', 
            'HasCrCard', 'IsActiveMember', 
            'Housing', 'Loan','Geography_Germany', 
            'Geography_Spain', 'Education_secondary',
            'Education_tertiary', 'Education_unknown', 
            'Card Type_GOLD','Card Type_None', 
            'Card Type_PLATINUM', 'Card Type_SILVER'
        ]

        cat_indices = [X.columns.get_loc(col) for col in categorical_features]
        
        smote = SMOTENC(categorical_features=cat_indices, k_neighbors=9, random_state=42)
        overSampled_X, overSampled_y = smote.fit_resample(X, y)

        return overSampled_X, overSampled_y  

    def scale_features(X_train, X_test):
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test) 
        return X_train_scaled, X_test_scaled

    # Removing irrelevant features
    dataset = dataset.drop(columns= ['CustomerId', 'Surname'])

    # Removing the white spaces from feature names
    dataset.columns = dataset.columns.str.strip()

    # Handling null values
    dataset = handle_missing_values(dataset)

    # Check and drop duplicates from the database
    dataset =  remove_duplicates(dataset)

    # Check and handle outliers from the database
    dataset = handle_outliers(dataset)

    # Encoding categorical features using one-hot encoding and label encoding
    dataset = encode_categorical_features(dataset)

    # Re-Removing the white spaces from feature names
    dataset.columns = dataset.columns.str.strip()

    # Splitting the dataset into X and y
    X, y = split_dataset_to_X_y(dataset)

    # Generating synthetic data using SMOTE
    X, y = handle_class_imbalance(X, y)

    # Splitting the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Feature scaling using min max scaler
    X_train, X_test = scale_features(X_train, X_test)

    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = preprocess_dataset(dataset)

In [7]:
def preform_grid_search(model, parameters, X_train, y_train):
    grid_search = GridSearchCV(model, parameters)
    return grid_search.fit(X_train, y_train)

In [None]:
def preform_random_search(model, params, n_tier, cv, X_train, y_train):
    random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=n_tier, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    return random_search.fit(X_train, y_train)

In [None]:

XGM = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    # random_state=42,
    n_jobs=-1,
    # max_depth= 7,
    # n_estimators= 350,
    # reg_alpha= 1,
    # reg_lambda= 0.001,
)

# XGM.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# params_XG = {
#     'max_depth': [3, 5, 7],
#     'earning_rate': [0.1, 0.01, 0.001],
#     'reg_alpha': [0, 0.1, 0.5, 1, 5, 10],
#     'reg_lambda': [0.1, 0.01, 0.001],
#     'n_estimators': [100, 200, 350, 400],
#}
# XGM = preform_grid_search(XGM, params_XG, X_train, y_train)

params_XG = {
    'max_depth': randint(2, 8),
    'reg_alpha': uniform(0.001, 1),
    'reg_lambda': uniform(0.01, 10),
    'n_estimators': randint(100, 500),
}

XGM = preform_random_search(XGM, params_XG, 20, 5, X_train, y_train)
XGM.best_params_


{'max_depth': 2,
 'n_estimators': 413,
 'reg_alpha': np.float64(0.5257564316322378),
 'reg_lambda': np.float64(4.329450186421157)}

In [None]:
joblib.dump(XGM.best_estimator_, "./models/best_XGboost.pkl")

['./models/best_XGboost.pkl']

In [11]:
y_pred_XG = XGM.predict(X_test)
print(accuracy_score(y_test, y_pred_XG))
print(classification_report(y_test, y_pred_XG))
print(confusion_matrix(y_test, y_pred_XG))

0.9121155053358443
              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92      1633
         1.0       0.93      0.88      0.91      1553

    accuracy                           0.91      3186
   macro avg       0.91      0.91      0.91      3186
weighted avg       0.91      0.91      0.91      3186

[[1533  100]
 [ 180 1373]]


In [None]:
LGB = lgb.LGBMClassifier()

params_LGB = {
    'learning_rate': uniform(0.001, 1),
    'max_depth': randint(2, 20),
    'num_leaves': randint(20, 60)
}

LGB = preform_random_search(LGB, params_LGB, 20, 5, X_train, y_train)
LGB.best_params_


[LightGBM] [Info] Number of positive: 6410, number of negative: 6330
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503140 -> initscore=0.012559
[LightGBM] [Info] Start training from score 0.012559


{'learning_rate': np.float64(0.24202546602601172),
 'max_depth': 8,
 'num_leaves': 27}

In [None]:
joblib.dump(LGB.best_estimator_, "./models/best_lightgbm.pkl")

['./models/best_lightgbm.pkl']

In [14]:
y_pred_LGM = LGB.predict(X_test)

print(accuracy_score(y_test, y_pred_LGM))
print(classification_report(y_test, y_pred_LGM))
print(confusion_matrix(y_test, y_pred_LGM))

0.9127432517263026
              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92      1633
         1.0       0.93      0.88      0.91      1553

    accuracy                           0.91      3186
   macro avg       0.91      0.91      0.91      3186
weighted avg       0.91      0.91      0.91      3186

[[1535   98]
 [ 180 1373]]


In [None]:
RF = RandomForestClassifier(
    n_estimators = 300
)
# RF.fit(X_train, y_train)

# params_RF = {
#     'n_estimators': [100, 300, 500],
# }

params_RF = {
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20)
}

# RF = preform_grid_search(RF, params_RF, X_train, y_train)
RF = preform_random_search(RF, params_RF, 20, 10, X_train, y_train)
RF.best_params_

{'max_depth': 17, 'min_samples_split': 4}

In [None]:
joblib.dump(RF.best_estimator_, "./models/best_RandomForest_Classifier.pkl")

['./models/best_RandomForest_Classifier.pkl']

In [33]:
y_pred_RC = RF.predict(X_test)
print(accuracy_score(y_test, y_pred_RC))
print(classification_report(y_test, y_pred_RC))
print(confusion_matrix(y_test, y_pred_RC))


0.9048964218455744
              precision    recall  f1-score   support

         0.0       0.89      0.93      0.91      1633
         1.0       0.93      0.87      0.90      1553

    accuracy                           0.90      3186
   macro avg       0.91      0.90      0.90      3186
weighted avg       0.91      0.90      0.90      3186

[[1525  108]
 [ 195 1358]]


In [None]:
votes = VotingClassifier(estimators=[('xg', XGM), ('lgb', LGB), ('rf', RF)], voting='soft')
votes.fit(X_train, y_train)
y_pred= votes.predict(X_test)

[LightGBM] [Info] Number of positive: 6410, number of negative: 6330
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503140 -> initscore=0.012559
[LightGBM] [Info] Start training from score 0.012559


In [19]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9158819836785939
              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92      1633
         1.0       0.94      0.89      0.91      1553

    accuracy                           0.92      3186
   macro avg       0.92      0.92      0.92      3186
weighted avg       0.92      0.92      0.92      3186

[[1540   93]
 [ 175 1378]]
