# ⚽S4E1 - EDA & initial submission - Binary Classification with a Bank Churn Dataset 

Welcome to 2024! For this Episode of the Series, your task is to predict whether a customer continues with their account or closes it (i.e., churns). Good luck!

## Evaluation

Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

## Submission Format

For each id in the test set, you must predict the probability for the target variable Exited. The file should contain a header and have the following format:

```
id,Exited
0,0.9
1,0.1
2,0.5
etc.
```

## Data Description

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Bank Customer Churn Prediction dataset. Feature distributions are close to, but not exactly the same as, the original.

# Code

## ToC

- [Imports](#Imports)


## Imports

In [None]:
# essentials
import os
import pathlib
from copy import copy
import json

import pandas as pd
import numpy as np
from tqdm import tqdm

# visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn imports
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MaxAbsScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SequentialFeatureSelector, RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone as clone_model
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, TweedieRegressor
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score, make_scorer, roc_curve

from sklearn.preprocessing import Binarizer, Normalizer, RobustScaler, StandardScaler
from sklearn.preprocessing import FunctionTransformer

# others
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

import optuna
import shap

# Seed passed as `random_state` to the train/validation split and to the
# model constructors further down.
RANDOM_SEED = 64

# Five-colour palette used as the seaborn default for the plots below.
palette = ["#4464ad", "#dc136c", "#F4FF52", "#f58f29","#45cb85"]

sns.set_theme(style="whitegrid")
sns.set_palette(palette)
# Draw the palette as a strip of swatches for a quick visual check.
sns.palplot(palette)

## Data loading & EDA

First we will check

1. Number and types of columns
2. Number of rows in train and test
3. Missing values
4. Target variable distribution

In [None]:
# Switch between the Kaggle runtime paths and a local copy of the data.
IN_KAGGLE = False

kaggle_folder = "/kaggle/input/"
local_folder = "./data/"
input_folder = kaggle_folder if IN_KAGGLE else local_folder

train_df = pd.read_csv(input_folder + "playground-series-s4e1/train.csv", index_col="id")
test_df = pd.read_csv(input_folder + "playground-series-s4e1/test.csv", index_col="id")
submission_df = pd.read_csv(input_folder + "playground-series-s4e1/sample_submission.csv")
original_df = pd.read_csv(input_folder + "bank-customer-churn-prediction/Churn_Modelling.csv")
target_col = "Exited"

numeric_features = ['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Surname', 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# Columns removed from the frames and from the feature lists before modelling.
features_to_drop = ['CustomerId', 'Surname']

# GENERATED_COLUMN: add a `generated` flag (1 for competition rows, 0 for rows
# of the original dataset).
# ADD_ORIGINAL_DF: append the original dataset to the training rows.
GENERATED_COLUMN = True
ADD_ORIGINAL_DF = True

model_postfix = "_with_original" if ADD_ORIGINAL_DF else ""
model_postfix += "_generated" if GENERATED_COLUMN else ""

original_df = original_df.drop(columns=['RowNumber'])

# drop NA rows from the original df
original_df = original_df.dropna()

if GENERATED_COLUMN:
    train_df['generated'] = 1
    test_df['generated'] = 1
    original_df['generated'] = 0
    categorical_features.append('generated')

if ADD_ORIGINAL_DF:
    train_df = pd.concat([train_df, original_df])


for f in features_to_drop:
    if f in numeric_features:
        numeric_features.remove(f)
    if f in categorical_features:
        categorical_features.remove(f)

    train_df = train_df.drop(columns=f)
    # Keep the test frame's columns aligned with the training frame, exactly
    # as the submission cell does; otherwise test_df still carries the
    # dropped columns and cannot be fed to a model fitted on train_df.
    test_df = test_df.drop(columns=f)

def initial_feature_engineering(df):
    """Encode the raw categorical columns as booleans and one-hot dummies.

    ``HasCrCard`` and ``IsActiveMember`` are cast to ``bool``; ``Gender`` is
    mapped to ``False`` for ``"Male"`` and ``True`` for ``"Female"``;
    ``Geography`` is replaced by one ``Geography_<value>`` dummy column per
    distinct value present in ``df``.

    The caller's frame is left untouched: all work happens on a copy, so the
    function can be re-run on the same raw DataFrame without compounding the
    encoding.

    Args:
        df: DataFrame containing ``HasCrCard``, ``IsActiveMember``,
            ``Gender`` and ``Geography`` columns.

    Returns:
        A new DataFrame with the encoded columns.
    """
    # Work on a copy so the column assignments below do not leak into the
    # DataFrame owned by the caller.
    df = df.copy()

    df['HasCrCard'] = df['HasCrCard'].astype('bool')
    df['IsActiveMember'] = df['IsActiveMember'].astype('bool')
    # NOTE: any value other than "Male"/"Female" maps to NaN, which the bool
    # cast turns into True.
    df['Gender'] = df['Gender'].map({"Male": 0, "Female": 1}).astype("bool")

    # encode geography
    # NOTE: the dummy columns depend on the values present in *this* frame, so
    # train and test only line up when both contain the same set of values.
    df = pd.get_dummies(df, columns=['Geography'])

    return df

def feature_engineering_1(df):
    """Add 0/1 threshold indicators derived from the numeric columns.

    New integer columns, appended in this order:

    * ``balance_over_100k`` / ``balance_over_150k``: ``Balance`` >= threshold
    * ``estimated_salary_under_50k``: ``EstimatedSalary`` < 50000
    * ``estimated_salary_50k_to_100k``: 50000 <= ``EstimatedSalary`` < 100000
    * ``estamated_salary_over_150k``: ``EstimatedSalary`` >= 150000
    * ``num_of_products_3_or_4``: ``NumOfProducts`` >= 3
    * ``age_over_40`` / ``age_over_50`` / ``age_over_60``: ``Age`` >= threshold

    The caller's frame is left untouched; a modified copy is returned.

    Args:
        df: DataFrame containing ``Balance``, ``EstimatedSalary``,
            ``NumOfProducts`` and ``Age`` columns.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    # Copy first so the new columns are not written into the caller's frame.
    df = df.copy()

    balance = df["Balance"]
    salary = df["EstimatedSalary"]
    products = df["NumOfProducts"]
    age = df["Age"]

    indicators = {
        # Balance
        "balance_over_100k": balance >= 100000,
        "balance_over_150k": balance >= 150000,
        # EstimatedSalary
        "estimated_salary_under_50k": salary < 50000,
        "estimated_salary_50k_to_100k": (salary >= 50000) & (salary < 100000),
        # The misspelt name is kept on purpose: renaming the column would
        # change the feature set seen by already-written downstream code.
        "estamated_salary_over_150k": salary >= 150000,
        # NumOfProducts
        "num_of_products_3_or_4": products >= 3,
        # Age
        "age_over_40": age >= 40,
        "age_over_50": age >= 50,
        "age_over_60": age >= 60,
    }

    # Store each boolean mask directly as an integer flag.
    for name, mask in indicators.items():
        df[name] = mask.astype("int")

    return df

# Encode the raw columns first, then add the threshold indicators.
train_df = feature_engineering_1(initial_feature_engineering(train_df))

# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=target_col),
    train_df[target_col],
    stratify=train_df[target_col],
    test_size=0.2,
    random_state=RANDOM_SEED,
)

## Ideas for feature engineering

In [None]:
def create_pipeline(model, numeric_scalers=None):
    """Wrap *model* in a preprocessing + classifier pipeline.

    The columns listed in the module-level ``numeric_features`` (read when the
    function is called) go through the numeric step; every other column is
    passed through unchanged.

    Args:
        model: Estimator used as the final ``"classifier"`` step.
        numeric_scalers: ``(name, transformer)`` tuple used as the single step
            of the numeric sub-pipeline. Defaults to
            ``("scaler", StandardScaler())``, built fresh on every call.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"preprocessor"`` and
        ``"classifier"``.
    """
    if numeric_scalers is None:
        # Create the default scaler per call: an estimator instance used as a
        # default argument is built once at definition time and would be
        # shared by every pipeline this function returns.
        numeric_scalers = ("scaler", StandardScaler())

    numeric_pipeline = Pipeline([numeric_scalers])

    # No categorical branch: the non-numeric columns reach the classifier
    # as-is through `remainder="passthrough"`.
    preprocessor = ColumnTransformer(
        [("numeric", numeric_pipeline, numeric_features)],
        remainder='passthrough',
    )

    return Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model),
    ])

def train_models(models, X_train, y_train, parameters=None):
    """Fit one preprocessing pipeline per model.

    Args:
        models: Mapping of model name to an unfitted estimator.
        X_train: Training features handed to ``Pipeline.fit``.
        y_train: Training target handed to ``Pipeline.fit``.
        parameters: Optional mapping of model name to a dict of
            hyperparameters. Matching entries are applied with
            ``set_params`` before fitting, which modifies the estimator
            object stored in ``models``.

    Returns:
        Dict mapping each model name to its fitted pipeline.
    """
    # Avoid a mutable default argument; None means "no overrides".
    if parameters is None:
        parameters = {}

    trained_models = {}
    for model_name, model in tqdm(models.items()):
        if model_name in parameters:
            model.set_params(**parameters[model_name])
        pipeline = create_pipeline(model)
        pipeline.fit(X_train, y_train)
        trained_models[model_name] = pipeline
    return trained_models

def evaluate_models(models, X_val, y_val):
    """Score every fitted model on the validation data.

    Args:
        models: Mapping of model name to a fitted estimator exposing
            ``predict``, ``predict_proba`` and ``score``.
        X_val: Validation features.
        y_val: Validation target.

    Returns:
        DataFrame with one row per model and the columns ``model_name``,
        ``accuracy``, ``precision``, ``recall`` and ``auc``, indexed
        ``0..n-1`` in the iteration order of ``models``. An empty ``models``
        mapping yields an empty frame with the same columns.
    """
    columns = ["model_name", "accuracy", "precision", "recall", "auc"]

    # Collect plain records and build the frame once: concatenating onto an
    # empty DataFrame inside the loop copies the data on every iteration and
    # gives every row the same index label.
    rows = []
    for model_name, model in tqdm(models.items()):
        y_pred = model.predict(X_val)
        # Column 1 of predict_proba is taken as the positive-class score.
        y_proba = model.predict_proba(X_val)[:, 1]
        rows.append({
            "model_name": model_name,
            "accuracy": model.score(X_val, y_val),
            "precision": sklearn.metrics.precision_score(y_val, y_pred),
            "recall": sklearn.metrics.recall_score(y_val, y_pred),
            "auc": sklearn.metrics.roc_auc_score(y_val, y_proba),
        })
    return pd.DataFrame(rows, columns=columns)

def plot_roc_curve(models, X_val, y_val):
    """Draw the validation ROC curves of all models on a single figure.

    Each model gets its own colour from a "husl" palette and a legend entry
    showing its AUC; a dashed black diagonal marks the chance level.

    Args:
        models: Mapping of model name to a fitted estimator exposing
            ``predict_proba``.
        X_val: Validation features.
        y_val: Validation target.
    """
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    palette_to_use = sns.color_palette("husl", len(models))
    # for each model, plot the roc curve in the same plot, with other color
    for i, (model_name, model) in enumerate(models.items()):
        y_proba = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_proba)
        roc_auc = roc_auc_score(y_val, y_proba)
        ax.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})", color=palette_to_use[i])
    # The chance-level diagonal does not depend on the model, so draw it once
    # (after the curves, which keeps it on top of them).
    ax.plot([0, 1], [0, 1], color='black', linestyle='--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    # show legend
    ax.legend()


In [None]:
# Baseline candidates with library-default hyperparameters; the commented-out
# entries are further candidates that can be switched back on.
models = {
    "xgboost": xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    "lightgbm": lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=-1, verbosity=-1),
    "catboost": cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False, thread_count=16),
    #"knn": KNeighborsClassifier(n_jobs=-1),
    #"random_forest": RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    #"gradient_boosting": GradientBoostingClassifier(random_state=RANDOM_SEED),
    #"extra_trees": ExtraTreesClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    #"bagging": BaggingClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    #"ada_boost": AdaBoostClassifier(random_state=RANDOM_SEED),
    #"sgd": SGDClassifier(random_state=RANDOM_SEED, loss="log_loss", n_jobs=-1),
}

print("Training models...")
# The fourth argument of train_models is the per-model hyperparameter mapping,
# not a container for results, so nothing is passed for it here.
trained_models = train_models(models, X_train, y_train)
print("Evaluating models...")
results_df = evaluate_models(trained_models, X_val, y_val)
results_df.sort_values(by="auc", ascending=False)

Comparison of ROC curves for trained models:

## Optimizing hyperparameters for the best model

We will use Optuna to tune the hyperparameters of each of the three boosting models (LightGBM, XGBoost and CatBoost) in turn.

## LightGBM

In [None]:
# Import the tuner under its own alias. Binding it to `lgb` would rebind the
# notebook-wide alias for the real `lightgbm` package, so re-running any cell
# that uses `lgb` would silently hit Optuna's wrapper module instead.
import optuna.integration.lightgbm as optuna_lgb

from lightgbm import early_stopping
from lightgbm import log_evaluation
import sklearn.datasets
from sklearn.model_selection import StratifiedKFold


dtrain = lgb.Dataset(X_train, label=y_train)

# Fixed settings; the tuner searches the remaining hyperparameters itself.
params = {
    "objective": "binary",
    "num_class": 1,
    "is_unbalance": "true",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# NOTE(review): a patience of 1000 rounds only has an effect if the tuner
# trains for more than 1000 boosting rounds — confirm against the
# LightGBMTunerCV default for `num_boost_round`.
tuner = optuna_lgb.LightGBMTunerCV(
    params,
    dtrain,
    folds=StratifiedKFold(n_splits=3),
    callbacks=[early_stopping(1000), log_evaluation(100)],
)

tuner.run()

In [None]:

# Report the tuner's best CV score and keep its parameters for later cells.
print("Best score:", tuner.best_score)
lightgbm_best_params = tuner.best_params
print("Best params:", lightgbm_best_params)
print("  Params: ")
for name in lightgbm_best_params:
    print("    {}: {}".format(name, lightgbm_best_params[name]))

## Xgboost

In [None]:
def objective(trial):
    """Optuna objective: cross-validated AUC of one XGBoost configuration.

    Samples a booster type, regularisation terms, the number of boosting
    rounds and (depending on the booster) tree / DART specific parameters,
    runs ``xgb.cv`` on the module-level ``X_train`` / ``y_train`` and returns
    the mean test AUC of the last boosting round.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate values for pruning.

    Returns:
        Mean test AUC across the CV folds after the final boosting round.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)

    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": booster,
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10),
        # on gpu
        "device": "cuda",
        "tree_method": "hist",
    }

    # The native `xgb.cv` API takes the number of trees through
    # `num_boost_round`; an `n_estimators` entry in `param` is not used by it.
    # The value is still sampled under the name "n_estimators" so that
    # `trial.params` can be passed straight to `XGBClassifier`, and it now
    # actually drives the number of rounds being evaluated.
    num_boost_round = trial.suggest_int("n_estimators", 1, 1000)

    if booster in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if booster == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")
    history = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_boost_round,
        callbacks=[pruning_callback],
    )

    mean_auc = history["test-auc-mean"].values[-1]
    return mean_auc

pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
# Seed the sampler (TPE is Optuna's default) so the sequence of sampled
# configurations is repeatable, in line with the RANDOM_SEED used elsewhere.
sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
# Summarise the XGBoost study and keep the winning parameters for later cells.
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
xgboost_best_trial = study.best_trial
xgboost_best_params = xgboost_best_trial.params
print("  Value: {}".format(xgboost_best_trial.value))
print("  Params: ")
for name in xgboost_best_params:
    print("    {}: {}".format(name, xgboost_best_params[name]))

## Catboost

In [None]:
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import roc_auc_score
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: hold-out AUC of one CatBoost configuration.

    Splits the module-level ``X_train`` / ``y_train`` 75/25 (stratified),
    samples a CatBoost configuration, fits it with early stopping and the
    Optuna pruning callback, and scores it on the held-out quarter.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate AUC values for pruning.

    Returns:
        ROC AUC of the fitted model on the held-out split.
    """
    # Fix the split so every trial is scored on the same rows; with an
    # unseeded split, score differences between trials are partly split noise.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        stratify=y_train,
        random_state=RANDOM_SEED,
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "eval_metric": "AUC",
    }

    # Each bootstrap type has its own companion parameter.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    # Seed the model as well, matching how the final CatBoost models are built.
    gbm = cb.CatBoostClassifier(random_state=RANDOM_SEED, **param)

    pruning_callback = CatBoostPruningCallback(trial, "AUC")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()
    preds = gbm.predict_proba(valid_x)[:, 1]
    auc = roc_auc_score(valid_y, preds)

    return auc

# Seed the sampler (TPE is Optuna's default) so the sequence of sampled
# configurations is repeatable. The number of trials can still vary between
# runs because the search also stops after the 600 second timeout.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    direction="maximize",
)
study.optimize(objective, n_trials=100, timeout=600)

In [None]:
# Summarise the CatBoost study and keep the winning parameters for later cells.
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
catboost_best_trial = study.best_trial
catboost_best_params = catboost_best_trial.params
print("  Value: {}".format(catboost_best_trial.value))
print("  Params: ")
for name in catboost_best_params:
    print("    {}: {}".format(name, catboost_best_params[name]))

## Best params

In [None]:
# Tuned hyperparameters, keyed by the same names as the entries of the
# `models` dictionaries so that train_models can look them up per model.
optuna_best_parameters_found = dict(
    xgboost=xgboost_best_params,
    lightgbm=lightgbm_best_params,
    catboost=catboost_best_params,
)

# Per-model aliases of the same dictionaries.
best_params_lightgbm = optuna_best_parameters_found["lightgbm"]
best_params_xgboost = optuna_best_parameters_found["xgboost"]
best_params_catboost = optuna_best_parameters_found["catboost"]

optuna_best_parameters_found

## Submission

In [None]:
import lightgbm

# Candidates for the final comparison. The three boosters receive their tuned
# hyperparameters inside train_models (via `parameters`); the stacked ensemble
# gets them directly at construction time.
models = {}
models["xgboost"] = xgb.XGBClassifier(n_jobs=-1, random_state=RANDOM_SEED)
models["lightgbm"] = lightgbm.LGBMClassifier(n_jobs=-1, random_state=RANDOM_SEED, verbosity=-1)
models["catboost"] = cb.CatBoostClassifier(verbose=False, random_state=RANDOM_SEED)
#models["logistic_regression"] = LogisticRegression(random_state=RANDOM_SEED, n_jobs=-1)
models["knn"] = KNeighborsClassifier(n_jobs=-1)
models["stacked"] = StackingClassifier(
    estimators=[
        ("xgboost", xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1, **xgboost_best_params)),
        ("lightgbm", lightgbm.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=-1, **lightgbm_best_params)),
        ("catboost", cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False, **catboost_best_params)),
    ],
    final_estimator=LogisticRegression(random_state=RANDOM_SEED, n_jobs=-1),
    n_jobs=-1,
)

trained_models = train_models(
    models,
    X_train,
    y_train,
    parameters=optuna_best_parameters_found,
)

In [None]:
# Score every fitted pipeline on the hold-out split and rank by AUC, the
# metric the competition is evaluated on.
results_df = evaluate_models(trained_models, X_val, y_val)
results_df.sort_values(by="auc", ascending=False)

In [None]:
# Overlay the hold-out ROC curves of all fitted pipelines in one figure.
plot_roc_curve(trained_models, X_val, y_val)

# Submission

In [None]:
# Reload the raw files so the submission model starts from untouched data.
train_df = pd.read_csv(
    input_folder + "playground-series-s4e1/train.csv",
    index_col="id",
)
test_df = pd.read_csv(
    input_folder + "playground-series-s4e1/test.csv",
    index_col="id",
)
submission_df = pd.read_csv(
    input_folder + "playground-series-s4e1/sample_submission.csv"
)
original_df = pd.read_csv(
    input_folder + "bank-customer-churn-prediction/Churn_Modelling.csv"
)
target_col = "Exited"

numeric_features = ['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Surname', 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

features_to_drop = ['CustomerId', 'Surname']

GENERATED_COLUMN = True
ADD_ORIGINAL_DF = True

model_postfix = ("_with_original" if ADD_ORIGINAL_DF else "") + (
    "_generated" if GENERATED_COLUMN else ""
)

# Remove the row counter, then drop NA rows from the original df.
original_df = original_df.drop(columns=['RowNumber']).dropna()

if GENERATED_COLUMN:
    # 1 marks competition rows, 0 marks rows of the original dataset.
    train_df['generated'] = 1
    test_df['generated'] = 1
    original_df['generated'] = 0
    categorical_features.append('generated')

if ADD_ORIGINAL_DF:
    train_df = pd.concat([train_df, original_df])

# Remove the dropped columns from the feature lists and from both frames.
numeric_features = [
    name for name in numeric_features if name not in features_to_drop
]
categorical_features = [
    name for name in categorical_features if name not in features_to_drop
]
train_df = train_df.drop(columns=features_to_drop)
test_df = test_df.drop(columns=features_to_drop)

# Same two feature-engineering steps for train and test.
train_df = feature_engineering_1(initial_feature_engineering(train_df))
test_df = feature_engineering_1(initial_feature_engineering(test_df))

In [None]:
# train model on train data
# Only one of the two candidates is built. Constructing the stacked ensemble
# and then overwriting it with the CatBoost model would create three
# estimators that are never used; the flag makes the choice explicit.
USE_STACKING = False

if USE_STACKING:
    model = StackingClassifier(
        [
            ("xgboost", xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1, **xgboost_best_params)),
            ("lightgbm", lightgbm.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=-1, **lightgbm_best_params)),
            ("catboost", cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False, **catboost_best_params)),
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_SEED, n_jobs=-1),
        n_jobs=-1,
    )
else:
    model = cb.CatBoostClassifier(random_state=RANDOM_SEED, verbose=False, **catboost_best_params)
model = create_pipeline(model)

# NOTE: this rebinds X_train / y_train to the full training frame, replacing
# the hold-out split created earlier in the notebook.
X_train = train_df.drop(columns=target_col)
y_train = train_df[target_col]
X_test = test_df

model.fit(X_train, y_train)

Use for comparison or blending with other predictions:

In [None]:
# Hard labels and positive-class probabilities for the test rows.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Build the two-column submission (`id` + target probability) from the test
# index and write it out without the positional index.
submission_df = (
    pd.Series(y_proba, index=test_df.index, name=target_col)
    .rename_axis("id")
    .reset_index()
)
submission_df.to_csv("./submission.csv", index=False)
submission_df