In [1]:
import pandas as pd

In [2]:
train=pd.read_csv('bank_churn_dataset/train.csv')

In [3]:
train

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [4]:
train.columns

Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [5]:
# dropping unnecessary columns right at the start
train=train.drop(['id','CustomerId','Surname'],axis=1)

In [6]:
# check for nulls
train.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
# handling categorical variables
for cols in ['Geography','Gender']:
    print(train[cols].value_counts())
    print("")

Geography
France     94215
Spain      36213
Germany    34606
Name: count, dtype: int64

Gender
Male      93150
Female    71884
Name: count, dtype: int64



In [22]:
X=train.iloc[:,:-1]
y=train.iloc[:,-1]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def feature_transformation(encode_df,categorical_attributes):
    num_attributes=encode_df.drop(categorical_attributes,axis=1).columns.to_list()
    
    numeric_pipeline=Pipeline([
        ('std_scaler',StandardScaler())
    ])


    encoder_pipeline=ColumnTransformer([
        ("numeric_var",numeric_pipeline,num_attributes),
        ("cat_var",OneHotEncoder(),categorical_attributes)
    ])
    df_encoded=encoder_pipeline.fit_transform(encode_df)
    return df_encoded,encoder_pipeline

In [10]:
X_transformed,encoder_pipeline=feature_transformation(X,['Geography','Gender'])

In [11]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

models = {
    "Random Forest": RandomForestClassifier(n_jobs=-1),
#     "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1)
}

# Number of folds
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [12]:
# Evaluate each model using k-fold cross-validation with AUC-ROC scoring
for model_name, model in models.items():
    scores = cross_val_score(model, X_transformed, y, cv=kf, scoring='roc_auc', n_jobs=-1)
    print(f"{model_name} AUC-ROC: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

Random Forest AUC-ROC: 0.8731 (+/- 0.0018)
XGBoost AUC-ROC: 0.8878 (+/- 0.0019)
Logistic Regression AUC-ROC: 0.8178 (+/- 0.0030)


In [13]:
# hyperparameter tuning
import optuna
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    
    model = XGBClassifier(**param, use_label_encoder=False, eval_metric='logloss')
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_transformed, y, cv=kf, scoring='roc_auc', n_jobs=-1)
    
    return np.mean(scores)

# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Best hyperparameters
print("Best hyperparameters: ", study.best_params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-08-06 12:56:55,529] A new study created in memory with name: no-name-60458c5f-9281-4738-9c47-a091bf7bb093
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
[I 2024-08-06 12:57:49,524] Trial 0 finished with value: 0.8894654810504667 and parameters: {'n_estimators': 226, 'max_depth': 5, 'learning_rate': 0.0309463689721519, 'subsample': 0.6183969910527577, 'colsample_bytree': 0.7573870693420599, 'gamma': 0.2558638242542017, 'reg_alpha': 3.742107061329385e-06, 'reg_lambda': 7.445117891499493e-05, 'min_child_weight': 10}. Best is trial 0 with value: 0.8894654810504667.
[I 2024-08-06 13:01:02,446] Trial 1 finished with value: 0.8888482254979572 and parameters: {'n_estimators': 408, 'max_depth': 10, 'learning_rate': 0.013305763

[I 2024-08-06 13:23:56,678] Trial 15 finished with value: 0.8868365529394981 and parameters: {'n_estimators': 325, 'max_depth': 3, 'learning_rate': 0.018038461056308224, 'subsample': 0.9975393205486285, 'colsample_bytree': 0.7771685008469085, 'gamma': 0.010263441515089676, 'reg_alpha': 2.703021211556981e-05, 'reg_lambda': 1.8728546399420552e-05, 'min_child_weight': 5}. Best is trial 9 with value: 0.8899197068085737.
[I 2024-08-06 13:25:59,398] Trial 16 finished with value: 0.8900783959218608 and parameters: {'n_estimators': 456, 'max_depth': 6, 'learning_rate': 0.03301198837157549, 'subsample': 0.5247574518630387, 'colsample_bytree': 0.6308759676520247, 'gamma': 0.00019841531482647928, 'reg_alpha': 0.0022154359588011847, 'reg_lambda': 0.0016289087719378518, 'min_child_weight': 2}. Best is trial 16 with value: 0.8900783959218608.
[I 2024-08-06 13:26:22,291] Trial 17 finished with value: 0.8855875719060512 and parameters: {'n_estimators': 51, 'max_depth': 8, 'learning_rate': 0.0363059715

[I 2024-08-06 16:04:00,846] Trial 35 finished with value: 0.8889036746303456 and parameters: {'n_estimators': 291, 'max_depth': 4, 'learning_rate': 0.025422209735073874, 'subsample': 0.5508563542336649, 'colsample_bytree': 0.7413507521085029, 'gamma': 0.04108909496726704, 'reg_alpha': 6.707917298409239e-06, 'reg_lambda': 0.00040694256866027407, 'min_child_weight': 3}. Best is trial 31 with value: 0.8901429874994854.
[I 2024-08-06 16:06:10,502] Trial 36 finished with value: 0.8902071601852832 and parameters: {'n_estimators': 462, 'max_depth': 5, 'learning_rate': 0.04051994506982148, 'subsample': 0.634411736714682, 'colsample_bytree': 0.7568307726212872, 'gamma': 0.23924862096352095, 'reg_alpha': 4.901695064333086e-05, 'reg_lambda': 2.0513849580073577e-05, 'min_child_weight': 2}. Best is trial 36 with value: 0.8902071601852832.
[I 2024-08-06 16:06:58,071] Trial 37 finished with value: 0.8890185820596799 and parameters: {'n_estimators': 333, 'max_depth': 3, 'learning_rate': 0.042261304099

[I 2024-08-06 17:22:01,724] Trial 55 finished with value: 0.8900859333347142 and parameters: {'n_estimators': 422, 'max_depth': 5, 'learning_rate': 0.03401706377314162, 'subsample': 0.5708588182186456, 'colsample_bytree': 0.7674449164920305, 'gamma': 0.06118933143192967, 'reg_alpha': 2.5731247036307797e-05, 'reg_lambda': 1.3968882802044144e-05, 'min_child_weight': 7}. Best is trial 36 with value: 0.8902071601852832.
[I 2024-08-06 17:22:40,357] Trial 56 finished with value: 0.8898533050168334 and parameters: {'n_estimators': 446, 'max_depth': 4, 'learning_rate': 0.03792885535399471, 'subsample': 0.6064879425601573, 'colsample_bytree': 0.7864469159994568, 'gamma': 0.21935478686341203, 'reg_alpha': 2.3196546097284475e-06, 'reg_lambda': 1.955365523562306e-06, 'min_child_weight': 1}. Best is trial 36 with value: 0.8902071601852832.
[I 2024-08-06 17:23:40,700] Trial 57 finished with value: 0.8900544086426363 and parameters: {'n_estimators': 437, 'max_depth': 6, 'learning_rate': 0.02819202878

[I 2024-08-06 19:12:22,199] Trial 75 finished with value: 0.8902204030189124 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.040245991624918466, 'subsample': 0.5694341690413364, 'colsample_bytree': 0.7343585145866265, 'gamma': 0.05351315405636665, 'reg_alpha': 0.00012018799706203243, 'reg_lambda': 2.0349547067656788e-05, 'min_child_weight': 2}. Best is trial 63 with value: 0.8902623924969228.
[I 2024-08-06 19:13:20,694] Trial 76 finished with value: 0.8902823816191164 and parameters: {'n_estimators': 499, 'max_depth': 5, 'learning_rate': 0.04039596650741127, 'subsample': 0.5792897916748322, 'colsample_bytree': 0.7018433054706488, 'gamma': 0.06431491781468587, 'reg_alpha': 3.573178043868162e-05, 'reg_lambda': 2.06266983531361e-05, 'min_child_weight': 1}. Best is trial 76 with value: 0.8902823816191164.
[I 2024-08-06 19:14:06,878] Trial 77 finished with value: 0.8899190184116325 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.039633985479

[I 2024-08-06 19:26:40,869] Trial 95 finished with value: 0.8901884410804526 and parameters: {'n_estimators': 454, 'max_depth': 5, 'learning_rate': 0.036689901141421295, 'subsample': 0.54926000539724, 'colsample_bytree': 0.7607333744744585, 'gamma': 0.028573536519070807, 'reg_alpha': 0.0003153742097573063, 'reg_lambda': 0.00011541707854624694, 'min_child_weight': 3}. Best is trial 76 with value: 0.8902823816191164.
[I 2024-08-06 19:27:25,878] Trial 96 finished with value: 0.8900840486583558 and parameters: {'n_estimators': 439, 'max_depth': 5, 'learning_rate': 0.030263937823247048, 'subsample': 0.5657024878811617, 'colsample_bytree': 0.7486815413776646, 'gamma': 0.16940909087939965, 'reg_alpha': 0.0001327329856866033, 'reg_lambda': 6.953468245792022e-05, 'min_child_weight': 2}. Best is trial 76 with value: 0.8902823816191164.
[I 2024-08-06 19:28:24,319] Trial 97 finished with value: 0.8898414659592971 and parameters: {'n_estimators': 479, 'max_depth': 6, 'learning_rate': 0.044686527361

Best hyperparameters:  {'n_estimators': 499, 'max_depth': 5, 'learning_rate': 0.04039596650741127, 'subsample': 0.5792897916748322, 'colsample_bytree': 0.7018433054706488, 'gamma': 0.06431491781468587, 'reg_alpha': 3.573178043868162e-05, 'reg_lambda': 2.06266983531361e-05, 'min_child_weight': 1}


In [16]:
best_hyper_params=study.best_params
best_hyper_params['n_jobs']=-1
best_hyper_params

{'n_estimators': 499,
 'max_depth': 5,
 'learning_rate': 0.04039596650741127,
 'subsample': 0.5792897916748322,
 'colsample_bytree': 0.7018433054706488,
 'gamma': 0.06431491781468587,
 'reg_alpha': 3.573178043868162e-05,
 'reg_lambda': 2.06266983531361e-05,
 'min_child_weight': 1,
 'n_jobs': -1}

In [17]:
# training the model and testing
model = XGBClassifier(**best_hyper_params, use_label_encoder=False, eval_metric='logloss')

In [33]:
test_df=pd.read_csv('bank_churn_dataset/test.csv')
test_df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [34]:
test=test_df.drop(['id','CustomerId','Surname'],axis=1)
# pass the train and test features through all the transformation
X_train,encoder_pipeline=feature_transformation(X,['Geography','Gender'])
X_test=encoder_pipeline.transform(test)

In [25]:
model.fit(X_train,y)

In [31]:
model_probab=model.predict_proba(X_test)[:,1] #only need it for the positive class
model_probab

array([0.02560449, 0.8274647 , 0.02557555, ..., 0.01918652, 0.1454306 ,
       0.17866515], dtype=float32)

In [38]:
output_df=pd.DataFrame({'id':test_df['id'],'Exited':model_probab})
output_df.to_csv('Results/BankChurn.csv',index=False)

In [39]:
pd.read_csv('Results/BankChurn.csv')

Unnamed: 0,id,Exited
0,165034,0.025604
1,165035,0.827465
2,165036,0.025576
3,165037,0.237847
4,165038,0.352661
...,...,...
110018,275052,0.041731
110019,275053,0.115411
110020,275054,0.019187
110021,275055,0.145431
