## Importing the Dataset from Kaggle

In [67]:
import kagglehub
# Download the churn dataset via kagglehub and keep the returned location;
# later cells print this value and list it as a directory.
shantanudhakadd_bank_customer_churn_prediction_path = kagglehub.dataset_download('shantanudhakadd/bank-customer-churn-prediction')

print('Data source import complete.')

Data source import complete.


In [68]:
# Show the location kagglehub returned for the dataset.
print(shantanudhakadd_bank_customer_churn_prediction_path)

/kaggle/input/bank-customer-churn-prediction


In [69]:
import os
# List the entries of the dataset directory so the data file's name can be confirmed.
contents = os.listdir(shantanudhakadd_bank_customer_churn_prediction_path)

# Print a header line followed by one directory entry per line.
print(f"Contents of the directory '{shantanudhakadd_bank_customer_churn_prediction_path}':")
for item in contents:
    print(item)

Contents of the directory '/kaggle/input/bank-customer-churn-prediction':
Churn_Modelling.csv


## Importing the Necessary Libraries

In [79]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [71]:
# Derive the CSV location from the directory returned by kagglehub instead of
# repeating the Kaggle mount point as a literal. On Kaggle this resolves to the
# same /kaggle/input/... path; elsewhere it follows wherever the dataset was
# actually downloaded.
trainPath = os.path.join(
    shantanudhakadd_bank_customer_churn_prediction_path,
    "Churn_Modelling.csv",
)
trainData = pd.read_csv(trainPath)

# Preview the first rows to sanity-check the load.
trainData.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [72]:
# Summarise column dtypes and non-null counts of the raw frame.
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


## Cleaning the Dataset and One-Hot Encoding the Gender and Geography Columns

In [73]:
# Columns removed before modelling.
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']

# Drop those columns, then one-hot encode both categorical columns in one call.
# `drop_first=True` omits the first level of each category, so Gender yields a
# single indicator column and Geography yields two.
trainData_processed = pd.get_dummies(
    trainData.drop(columns=columns_to_drop),
    columns=['Gender', 'Geography'],
    drop_first=True,
)

trainData_processed.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Gender_Male,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,False,True


In [74]:
# Confirm the processed frame's columns and dtypes after dropping and encoding.
trainData_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Age                10000 non-null  int64  
 2   Tenure             10000 non-null  int64  
 3   Balance            10000 non-null  float64
 4   NumOfProducts      10000 non-null  int64  
 5   HasCrCard          10000 non-null  int64  
 6   IsActiveMember     10000 non-null  int64  
 7   EstimatedSalary    10000 non-null  float64
 8   Exited             10000 non-null  int64  
 9   Gender_Male        10000 non-null  bool   
 10  Geography_Germany  10000 non-null  bool   
 11  Geography_Spain    10000 non-null  bool   
dtypes: bool(3), float64(2), int64(7)
memory usage: 732.6 KB


## Splitting the Data into Training and Test set

In [75]:
# Separate the target column ('Exited') from the predictor columns.
y = trainData_processed['Exited']
X = trainData_processed.drop(columns='Exited')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Note: the split is not stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data splitting complete.")


Data splitting complete.


## Training the Models using the Processed Data

In [76]:
# Each estimator is created with a fixed seed and fitted on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
print("Logistic Regression model trained.")

rand_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Random Forest model trained.")

grad_boost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
print("Gradient Boosting model trained.")


Logistic Regression model trained.
Random Forest model trained.
Gradient Boosting model trained.


## Making Predictions

In [77]:
# Class-label predictions on the held-out test features, one result per fitted
# model, unpacked in the same order as the models are listed.
y_pred_log_reg, y_pred_rand_forest, y_pred_grad_boost = (
    fitted.predict(X_test) for fitted in (log_reg, rand_forest, grad_boost)
)

print("Predictions made.")

Predictions made.


## Evaluating the Models' Performance Using Different Metrics

In [78]:
def evaluate_model(y_test, y_pred, model_name, y_pred_proba=None):
    """Print a metric summary for one model's predictions.

    Reports accuracy, precision, recall, F1 and ROC AUC, followed by the
    classification report and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    model_name : str
        Label printed in the report header. When ``y_pred_proba`` is omitted it
        also selects which notebook-level fitted model supplies the ROC AUC
        scores: "Logistic Regression", "Random Forest" or "Gradient Boosting".
    y_pred_proba : array-like, optional
        Positive-class scores aligned with ``y_test``, used for ROC AUC. Pass
        this to evaluate any model, or any evaluation set, without relying on
        notebook globals. When omitted, scores are computed from the matching
        global model on the global ``X_test``, so ``y_test`` must then be the
        labels belonging to ``X_test``.

    Raises
    ------
    ValueError
        If ``y_pred_proba`` is omitted and ``model_name`` is not one of the
        three recognised names.
    """
    if y_pred_proba is None:
        # Backward-compatible path: resolve the fitted model from its display
        # name. Only the matching global is touched, as before.
        if model_name == "Logistic Regression":
            fitted_model = log_reg
        elif model_name == "Random Forest":
            fitted_model = rand_forest
        elif model_name == "Gradient Boosting":
            fitted_model = grad_boost
        else:
            # Previously this case passed None to roc_auc_score and failed
            # there with an unrelated-looking error.
            raise ValueError(
                f"Unknown model_name {model_name!r}: pass y_pred_proba explicitly "
                "or use 'Logistic Regression', 'Random Forest' or 'Gradient Boosting'."
            )
        # ROC AUC needs scores rather than hard labels; column 1 holds the
        # probability of the positive class.
        y_pred_proba = fitted_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 30)

# Report each model in training order, using its test-set predictions.
evaluate_model(y_test=y_test, y_pred=y_pred_log_reg, model_name="Logistic Regression")
evaluate_model(y_test=y_test, y_pred=y_pred_rand_forest, model_name="Random Forest")
evaluate_model(y_test=y_test, y_pred=y_pred_grad_boost, model_name="Gradient Boosting")

--- Logistic Regression ---
Accuracy: 0.8015
Precision: 0.4677
Recall: 0.0738
F1 Score: 0.1275
ROC AUC Score: 0.6749

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1607
           1       0.47      0.07      0.13       393

    accuracy                           0.80      2000
   macro avg       0.64      0.53      0.51      2000
weighted avg       0.74      0.80      0.74      2000


Confusion Matrix:
[[1574   33]
 [ 364   29]]
------------------------------
--- Random Forest ---
Accuracy: 0.8700
Precision: 0.7671
Recall: 0.4860
F1 Score: 0.5950
ROC AUC Score: 0.8648

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.77      0.49      0.60       393

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.86     

## Churn Model Performance

I tested three models to predict customer churn: Logistic Regression, Random Forest, and Gradient Boosting.

Accuracy Results:

- Logistic Regression: 80.15% accurate
- Random Forest: 87.00% accurate
- Gradient Boosting: 86.75% accurate

Simply put:

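Random Forest and Gradient Boosting predict churn considerably more accurately than Logistic Regression. Accuracy alone is misleading here, though: only 393 of the 2,000 test customers churned, so always predicting "no churn" would already score 80.35%. Logistic Regression catches very few churners (recall 0.07), whereas Random Forest catches about half of them (recall 0.49).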