In [1]:
import kagglehub

# Kaggle dataset handle for the bank customer churn data.
DATASET_HANDLE = "shantanudhakadd/bank-customer-churn-prediction"

# Download the latest version; `path` is whatever location kagglehub returns
# for the downloaded files and is reused by the loading cell.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shantanudhakadd/bank-customer-churn-prediction?dataset_version_number=1...


100%|██████████| 262k/262k [00:00<00:00, 77.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/shantanudhakadd/bank-customer-churn-prediction/versions/1





In [2]:
# Importing Essential Libraries
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Build the CSV location with os.path.join instead of string concatenation so
# the separator is right on every platform and a trailing separator on `path`
# does not produce a doubled slash.
data_file_path = os.path.join(path, "Churn_Modelling.csv")

# Read the CSV into the working DataFrame used by the following cells.
df = pd.read_csv(data_file_path)

In [5]:
# Overview of the data: the first rows, followed by the DataFrame.info() report
print("\n Data Head:", df.head(), sep="\n")

print("\n Data Info:")
df.info()


 Data Head:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  


In [9]:
# Deleting the unwanted columns from the dataset.
# - `axis=1` is omitted: it is redundant once `columns=` is passed.
# - `errors="ignore"` keeps this cell re-runnable. `df` is reassigned here, so
#   executing the cell a second time would otherwise raise KeyError because the
#   columns are already gone.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors="ignore")
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [11]:
# Feature engineering and preprocessing
TARGET_COLUMN = 'Exited'

# Label column first, then every remaining column as a predictor.
y = df[TARGET_COLUMN]
features = df.drop(columns=[TARGET_COLUMN])

# One-hot encode Geography and Gender; drop_first=True omits the first level
# of each, so only the remaining levels get an indicator column.
X = pd.get_dummies(
    features,
    columns=['Geography', 'Gender'],
    drop_first=True,
)

print("\nFeature after One_hot encoding:", X.head(), sep="\n")


Feature after One_hot encoding:
   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42       2       0.00              1          1   
1          608   41       1   83807.86              1          0   
2          502   42       8  159660.80              3          1   
3          699   39       1       0.00              2          0   
4          850   43       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
0               1        101348.88              False            False   
1               1        112542.58              False             True   
2               0        113931.57              False            False   
3               0         93826.63              False            False   
4               1         79084.10              False             True   

   Gender_Male  
0        False  
1        False  
2        False  
3        False  
4        False  


In [14]:
# Split into training and test partitions.
# test_size=0.2 holds out 20% of the rows, random_state pins the shuffle so the
# split is repeatable, and stratify=y requests a split stratified on the label.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 8000 samples
Test set size: 2000 samples


In [15]:
# Standardise the numerical feature columns
numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

# Fit the scaler on the training rows only, so the test rows do not influence
# the statistics it learns.
scaler = StandardScaler().fit(X_train[numerical_cols])

# Apply the one fitted scaler to both partitions, overwriting the columns in place.
for split_frame in (X_train, X_test):
    split_frame[numerical_cols] = scaler.transform(split_frame[numerical_cols])

print("\nFeatutes after Scaling:")
print(X_train.head())


Featutes after Scaling:
      CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
2151     1.058568  1.715086  0.684723 -1.226059              1          1   
8392     0.913626 -0.659935 -0.696202  0.413288              1          1   
5006     1.079274 -0.184931 -1.731895  0.601687              2          1   
4117    -0.929207 -0.184931 -0.005739 -1.226059              2          1   
7182     0.427035  0.955079  0.339492  0.548318              2          0   

      IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
2151               0         1.042084              False            False   
8392               0        -0.623556               True            False   
5006               1         0.308128               True            False   
4117               0        -0.290199              False            False   
7182               1         0.135042               True            False   

      Gender_Male  
2151         True  
8392     

In [17]:
# Model 1: train a logistic regression and evaluate it on the test partition
print("       MODEL 1: Logistic Regression        ")

# fit() returns the estimator, so construction and training are chained.
log_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = log_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(f"Accuracy Score: {accuracy_lr:.4f}")
print("\nClassication Report:", classification_report(y_test, y_pred_lr), sep="\n")

       MODEL 1: Logistic Regression        
Accuracy Score: 0.8080

Classication Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1593
           1       0.59      0.19      0.28       407

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



In [18]:
# Model 2: train a random forest and evaluate it on the test partition
print("       MODEL 2: RANDOM FOREST        ")

# 100 trees, each limited to depth 10; the seed makes the forest repeatable.
forest_options = dict(n_estimators=100, max_depth=10, random_state=42)
rf_model = RandomForestClassifier(**forest_options).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Accuracy Score: {accuracy_rf:.4f}")
print("\nClassication Report:", classification_report(y_test, y_pred_rf), sep="\n")


       MODEL 2: RANDOM FOREST        
Accuracy Score: 0.8670

Classication Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1593
           1       0.82      0.44      0.58       407

    accuracy                           0.87      2000
   macro avg       0.85      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



In [20]:
# Overall Conclusion
print("       SUMMARY        ")
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# The comparison uses test-set accuracy only. A tie gets its own branch so that
# equal scores are not reported as a win for Logistic Regression.
if accuracy_rf > accuracy_lr:
    print("\nRandom Forest model performed better at predicting churn.")
elif accuracy_rf < accuracy_lr:
    print("\nLogistic Regression model performed better at predicting churn.")
else:
    print("\nBoth models reached the same accuracy at predicting churn.")

# Example prediction on the first test row. The slice iloc[0:1] (rather than
# iloc[0]) keeps the row as a one-row DataFrame instead of collapsing it to a
# Series, so the model receives 2-D input.
new_customer_data = X_test.iloc[0:1]
prediction = rf_model.predict(new_customer_data)
# predict_proba returns one row of per-class probabilities per sample; [0]
# selects the row for the single sample passed in.
probability = rf_model.predict_proba(new_customer_data)[0]

print(f"\nExample: For a test customer, the Random Forest prediction is: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Churn Probability (Class 1): {probability[1]*100:.2f}%")

       SUMMARY        
Logistic Regression Accuracy: 0.8080
Random Forest Accuracy: 0.8670

Random Forest model performed better at predicting churn.

Example: For a test customer, the Random Forest prediction is: No Churn
Churn Probability (Class 1): 2.70%
