In [None]:
import kagglehub

# Download the churn dataset through kagglehub and keep the returned location
# in `path`, which the CSV-loading cell below reads from.
DATASET_SLUG = "saurabhbadole/bank-customer-churn-prediction-dataset"
path = kagglehub.dataset_download(DATASET_SLUG)
print("Path to dataset files:", path)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold
from xgboost import  XGBClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,VotingClassifier
import optuna

In [None]:
df=pd.read_csv(f"{path}/Churn_Modelling.csv")

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df.to_excel("bank_customer.xlsx",index=False)

# Dataset Problems, EDA & Feature Engineering

``` 1. Some Surname values (H?, L?, Hs?) make no sense; the affected CustomerIds are collected in [invalid_surname] ```

``` 2. Many customers have Balance = 0 ```




In [None]:
# CustomerIds of the rows whose surname is one of the placeholder-looking values.
placeholder_surnames = ["H?", "L?", "Hs?"]
invalid_surname = df.loc[df['Surname'].isin(placeholder_surnames), 'CustomerId']

In [None]:
df.dtypes

In [None]:
df[df['CreditScore'] < 100]

In [None]:
df['Geography'].value_counts()

In [None]:
df['Gender'].value_counts()

In [None]:
# Sanity check: list rows whose age is outside the plausible 0-100 range.
# The two conditions must be OR-ed; an age cannot be both negative and above
# 100, so combining them with `&` always produced an empty frame.
df[(df['Age'] < 0) | (df['Age'] > 100)]

In [None]:
df[df['Balance']==0]

In [None]:
df['HasCrCard'].value_counts()

In [None]:
df[df['EstimatedSalary'] < 100]

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Horizontal bar chart of the number of customers per geography.
geography_counts = df.groupby('Geography').size()
ax = geography_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Horizontal bar chart of the number of customers per gender.
gender_counts = df.groupby('Gender').size()
ax = gender_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
ax.spines[['top', 'right']].set_visible(False)

In [None]:
df[df['Balance']==0.0]['Exited']

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# One column per geography, each holding that geography's gender counts.
gender_counts_by_geo = {}
for x_label, grp in df.groupby('Geography'):
    gender_counts_by_geo[x_label] = grp['Gender'].value_counts()
df_2dhist = pd.DataFrame(gender_counts_by_geo)

# Draw on the axes created above and label both dimensions explicitly.
sns.heatmap(df_2dhist, cmap='viridis', ax=ax)
ax.set_xlabel('Geography')
_ = ax.set_ylabel('Gender')

In [None]:
# Distribution of credit scores in 20 bins, with the top/right frame removed.
ax = df['CreditScore'].plot(kind='hist', bins=20, title='CreditScore')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# NOTE(review): this cell repeats the CreditScore histogram from the cell
# directly above - confirm whether another column was meant to be plotted.
ax = df['CreditScore'].plot(kind='hist', bins=20, title='CreditScore')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
df.sample(5)

In [None]:
avg_credit=df['CreditScore'].mean()

In [None]:
df[df['CreditScore'] > avg_credit]['CreditScore'].count()

In [None]:
# Label every customer relative to the mean credit score computed above.
# NOTE(review): `avg_credit` is taken over the full dataset, i.e. before the
# train/test split further down - confirm that is acceptable.
is_above_average = df['CreditScore'] > avg_credit
df['CreditScoreCategory'] = np.where(is_above_average, 'Above Average', 'Below Average')
df.sample(5)

In [None]:
# Horizontal bar chart of the number of customers per credit-score category.
credit_category_counts = df.groupby('CreditScoreCategory').size()
ax = credit_category_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Contingency table: credit-score category (rows) against the Exited flag (columns).
ct = pd.crosstab(df['CreditScoreCategory'], df['Exited'])

# Grouped bars, one group per category; labels are set on the returned axes.
ax = ct.plot(kind='bar', figsize=(10, 6))
ax.set_title('Distribution of CreditScoreCategory & Exited ')
ax.set_xlabel('CreditScoreCategory')
ax.set_ylabel('Count')
plt.show()

In [None]:
ct

In [None]:
pd.crosstab(df['IsActiveMember'], df['Exited']).plot(kind="bar")

In [None]:
pd.crosstab(df['HasCrCard'], df['Exited']).plot(kind="bar")

In [None]:
df.columns

In [None]:
df[df['Balance']==0.0]

In [None]:
# Flag customers whose account balance is exactly zero.
zero_balance_mask = df['Balance'] == 0.0
df['hasZeroBalance'] = np.where(zero_balance_mask, "Yes", 'No')

In [None]:
# Remove the identifier columns before modelling.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
df['CreditScoreCategory'].value_counts()

In [None]:
pd.crosstab(df['CreditScoreCategory'],df['Exited'])

In [None]:
df['hasZeroBalance'].value_counts()

In [None]:
pd.crosstab(df['hasZeroBalance'],df['Exited'])

In [None]:
# Encode the category as 1 (above average) / 0 (below average).
# Guarded on dtype: mapping an already-encoded numeric column through the
# string lookup would replace every value with NaN when the cell is re-run.
if not pd.api.types.is_numeric_dtype(df['CreditScoreCategory']):
    df['CreditScoreCategory'] = df['CreditScoreCategory'].map(
        {'Above Average': 1, "Below Average": 0}
    )

In [None]:
# Encode the zero-balance flag as 1 (Yes) / 0 (No).
# Guarded on dtype: mapping an already-encoded numeric column through the
# string lookup would replace every value with NaN when the cell is re-run.
if not pd.api.types.is_numeric_dtype(df['hasZeroBalance']):
    df['hasZeroBalance'] = df['hasZeroBalance'].map({'Yes': 1, "No": 0})

In [None]:
# Encode gender as 1 (Male) / 0 (Female).
# Guarded on dtype: mapping an already-encoded numeric column through the
# string lookup would replace every value with NaN when the cell is re-run.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'Male': 1, "Female": 0})

In [None]:
label_enc= LabelEncoder()

In [None]:
# Replace the Geography labels with the integer codes produced by `label_enc`.
# NOTE(review): only the model and the ColumnTransformer are pickled at the end
# of the notebook; `label_enc` is not, so this mapping has to be reproduced
# separately wherever the saved artefacts are used - confirm.
geography_codes = label_enc.fit_transform(df['Geography'])
df['Geography'] = geography_codes

In [None]:
df.sample(4)

In [None]:
# Target is the Exited flag; every remaining column is a feature.
y = df['Exited']
X = df.drop(columns=['Exited'])

In [None]:
# Hold out 10% of the rows for the final evaluation (fixed seed for repeatability).
# NOTE(review): the split is not stratified on `y`, whereas the tuning loop
# uses StratifiedKFold - consider passing `stratify=y` here as well.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
x_train.shape

In [None]:
# Transformer name -> column, in the order the outputs should appear.
# Continuous columns are standardised one by one.
scaled_columns = {
    "crscore_trans": 'CreditScore',
    "age_trans": 'Age',
    "tenure_trans": 'Tenure',
    "balance_trans": 'Balance',
    "estimated_salary_trans": 'EstimatedSalary',
}
# Categorical / flag columns go through an OrdinalEncoder each.
encoded_columns = {
    "geo_trans": 'Geography',
    "gender_trans": 'Gender',
    "credit_score_trans": 'CreditScoreCategory',
    'haszerobal_trans': 'hasZeroBalance',
}

# Columns not listed above are passed through unchanged.
transformation = ColumnTransformer(
    transformers=(
        [(name, StandardScaler(), [column]) for name, column in scaled_columns.items()]
        + [(name, OrdinalEncoder(), [column]) for name, column in encoded_columns.items()]
    ),
    remainder="passthrough",
)

In [None]:
# Fit the preprocessing on the training features and replace them with the
# transformed array. The transformer selects columns by name, which only works
# on a DataFrame, so the guard makes a re-run of this cell a no-op instead of
# an error once `x_train` has already been converted.
if isinstance(x_train, pd.DataFrame):
    x_train = transformation.fit_transform(x_train)

## Modelling

In [None]:
df.sample(5)

In [None]:
# Baseline XGBoost configuration (hand-picked, not tuned).
xgb_baseline_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
}
model = XGBClassifier(**xgb_baseline_params)

In [None]:
model.fit(x_train,y_train)

In [None]:
# Apply the preprocessing fitted on the training data to the hold-out features.
# The transformer selects columns by name, so it needs the original DataFrame;
# the guard keeps the cell re-runnable after `x_test` has become an array.
if isinstance(x_test, pd.DataFrame):
    x_test = transformation.transform(x_test)

In [None]:
# Hold-out accuracy of the baseline XGBoost model.
# Arguments follow the (y_true, y_pred) order of the scikit-learn signature.
accuracy_score(y_test, model.predict(x_test))

In [None]:
# Baseline CatBoost configuration (hand-picked, not tuned); training log silenced.
catboost_baseline_params = {
    "iterations": 600,
    "learning_rate": 0.01,
    "depth": 6,
    "verbose": 0,
}
model2 = CatBoostClassifier(**catboost_baseline_params)

In [None]:
model2.fit(x_train,y_train)

In [None]:
# Hold-out accuracy of the baseline CatBoost model.
# Arguments follow the (y_true, y_pred) order of the scikit-learn signature.
accuracy_score(y_test, model2.predict(x_test))

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of one sampled model.

    The trial first chooses a classifier family (random forest, gradient
    boosting, XGBoost or CatBoost), then samples that family's
    hyper-parameters, and finally scores the resulting model with
    stratified 5-fold cross-validation.

    Reads the notebook-level ``x_train`` and ``y_train``; both are viewed as
    NumPy arrays here so that the fold indices are applied positionally,
    whether or not ``y_train`` is still a pandas Series.

    Args:
        trial: Optuna trial used to draw the categorical, integer and float
            suggestions.

    Returns:
        Mean accuracy over the five validation folds.
    """
    classifier_name = trial.suggest_categorical(
        "classifier_name",
        [
            "RandomForestClassifier",
            "GradientBoostingClassifier",
            "XGBClassifier",
            "CatBoostClassifier",
        ],
    )

    if classifier_name == "RandomForestClassifier":
        n_estimators = trial.suggest_int("rf_n_estimators", 200, 800)
        criterion = trial.suggest_categorical(
            "rf_criterion", ["gini", "entropy", "log_loss"]
        )
        max_features = trial.suggest_float("rf_max_features", 0.3, 1.0)
        max_depth = trial.suggest_int("rf_max_depth", 3, 20)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            criterion=criterion,
            max_features=max_features,
            max_depth=max_depth,
            random_state=42,
            n_jobs=-1,
        )

    elif classifier_name == "GradientBoostingClassifier":
        learning_rate = trial.suggest_float("gb_learning_rate", 0.01, 0.3)
        n_estimators = trial.suggest_int("gb_n_estimators", 100, 800)
        subsample = trial.suggest_float("gb_subsample", 0.5, 1.0)
        max_depth = trial.suggest_int("gb_max_depth", 3, 15)

        model = GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            subsample=subsample,
            max_depth=max_depth,
            random_state=42,
        )

    elif classifier_name == "XGBClassifier":
        n_estimators = trial.suggest_int("xgb_n_estimators", 200, 800)
        learning_rate = trial.suggest_float("xgb_learning_rate", 0.01, 0.3)
        max_depth = trial.suggest_int("xgb_max_depth", 3, 15)
        subsample = trial.suggest_float("xgb_subsample", 0.5, 1.0)
        colsample_bytree = trial.suggest_float(
            "xgb_colsample_bytree", 0.5, 1.0
        )

        model = XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            # Binary target: use the binary log-loss metric rather than the
            # multiclass "mlogloss".
            eval_metric="logloss",
            random_state=42,
            verbosity=0,
        )

    else:  # CatBoostClassifier
        params = {
            "iterations": trial.suggest_int("cb_iterations", 500, 5000),
            "learning_rate": trial.suggest_float("cb_learning_rate", 0.01, 0.3),
            "depth": trial.suggest_int("cb_depth", 3, 10),
            "l2_leaf_reg": trial.suggest_float("cb_l2_leaf_reg", 0.0, 5.0),
            "border_count": trial.suggest_int("cb_border_count", 32, 255),
            "random_strength": trial.suggest_float("cb_random_strength", 0.0, 2.0),
            "bagging_temperature": trial.suggest_float(
                "cb_bagging_temperature", 0.0, 5.0
            ),
            "auto_class_weights": trial.suggest_categorical(
                "cb_auto_class_weights", ["Balanced", None]
            ),
        }

        model = CatBoostClassifier(
            **params,
            random_seed=42,
            thread_count=-1,
            verbose=0,
        )

    # Work on array views so `arr[idx]` is positional. Indexing a pandas Series
    # with the fold indices would be label-based and, after the shuffled
    # train/test split, would not address the intended rows.
    features = np.asarray(x_train)
    labels = np.asarray(y_train)

    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, val_idx in cv.split(features, labels):
        model.fit(features[train_idx], labels[train_idx])
        preds = model.predict(features[val_idx])

        scores.append(accuracy_score(labels[val_idx], preds))

    return np.mean(scores)


In [None]:
# Keep the labels as a plain NumPy array so that fold indices produced during
# cross-validation can be applied positionally. `np.asarray` (unlike `.values`)
# also works when this cell is re-run and `y_train` is already an array.
y_train = np.asarray(y_train)

# Seed the sampler so the hyper-parameter search can be repeated.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
study.best_params

In [None]:
# Hyper-parameters copied from earlier tuning runs, one dict per model family.
# Keys are the estimators' constructor argument names.
cat_boost_params = dict(
    iterations=2655,
    learning_rate=0.010822546006851903,
    depth=3,
    l2_leaf_reg=3.1021063306875454,
    border_count=152,
    random_strength=1.9888675230962634,
    bagging_temperature=2.282647805693784,
    auto_class_weights=None,
)

xgboost_params = dict(
    n_estimators=283,
    learning_rate=0.04573763808143029,
    max_depth=4,
    subsample=0.6031060172569973,
    colsample_bytree=0.7719731600149188,
)

gboost_params = dict(
    learning_rate=0.015748996794454576,
    n_estimators=697,
    subsample=0.8326930685923569,
    max_depth=4,
)

randomfor_params = dict(
    n_estimators=761,
    criterion='gini',
    max_features=0.9402315006750689,
    max_depth=20,
)

In [None]:
# Build the four ensemble members from the stored hyper-parameters.
# The seeds match the ones used inside `objective`, so the final fit is
# repeatable; CatBoost's per-iteration log is silenced as it was during tuning.
clf1 = CatBoostClassifier(**cat_boost_params, random_seed=42, verbose=0)
clf2 = XGBClassifier(**xgboost_params, random_state=42)
clf3 = GradientBoostingClassifier(**gboost_params, random_state=42)
clf4 = RandomForestClassifier(**randomfor_params, random_state=42, n_jobs=-1)

In [None]:
# Despite its name, `stack_model` is a majority-vote ensemble, not a stacker.
# NOTE(review): with four members a 2-2 hard vote is possible - confirm the
# tie-breaking behaviour is acceptable, or consider soft voting.
voting_members = [('catb', clf1), ('xbg', clf2), ('gb', clf3), ('rnf', clf4)]
stack_model = VotingClassifier(estimators=voting_members, voting='hard')

In [None]:
stack_model.fit(x_train,y_train)

In [None]:
# 'classifier_name': 'XGBClassifier',
#  'xgb_n_estimators': 283,
#  'xgb_learning_rate': 0.04573763808143029,
#  'xgb_max_depth': 4,
#  'xgb_subsample': 0.6031060172569973,
#  'xgb_colsample_bytree': 0.7719731600149188

#'classifier_name': 'CatBoostClassifier',
 # 'cb_iterations': 2655,
 # 'cb_learning_rate': 0.010822546006851903,
 # 'cb_depth': 3,
 # 'cb_l2_leaf_reg': 3.1021063306875454,
 # 'cb_border_count': 152,
 # 'cb_random_strength': 1.9888675230962634,
 # 'cb_bagging_temperature': 2.282647805693784,
 # 'cb_auto_class_weights': None}

# 'classifier_name': 'GradientBoostingClassifier', 'gb_learning_rate': 
# 0.015748996794454576, 'gb_n_estimators': 697, 'gb_subsample': 0.8326930685923569, 'gb_max_depth': 4

# 'classifier_name': 'RandomForestClassifier', 'rf_n_estimators': 761, 'rf_criterion': 'gini', 
# 'rf_max_features': 0.9402315006750689, 'rf_max_depth': 20



In [None]:
# Hold-out accuracy of the voting ensemble.
# Arguments follow the (y_true, y_pred) order of the scikit-learn signature.
accuracy_score(y_test, stack_model.predict(x_test))

In [None]:
import pickle

In [None]:
filename = "final_model.pkl"

# Serialise the fitted ensemble and write the resulting bytes to disk.
with open(filename, "wb") as model_file:
    model_file.write(pickle.dumps(stack_model))

In [None]:
# `transformer` holds the output file name; the object saved is the fitted
# ColumnTransformer (`transformation`).
transformer = "transformation.pkl"
with open(transformer, "wb") as transformer_file:
    transformer_file.write(pickle.dumps(transformation))