In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, file_names in os.walk('/kaggle/input'):
    # Print the full path of each file found in this folder
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/binaryclassificationwithabankchurndatasetumgc/sample_submission.csv
/kaggle/input/binaryclassificationwithabankchurndatasetumgc/train.csv
/kaggle/input/binaryclassificationwithabankchurndatasetumgc/test.csv


# Model Workflow
- Loaded and explored data 
- Cleaned and dropped irrelevant columns
- Encoded categorical variables 
- Split features & target 
- Train-test split done with stratification 
- Standardized numerical features 
- Trained Logistic Regression model with balanced weights
- Evaluated using ROC AUC on the held-out 20% split of train.csv (not on test.csv)



# Step-1 
### Loading Dataset

In [2]:
# Read the competition's training CSV into a DataFrame
bank_churn_data = pd.read_csv("/kaggle/input/binaryclassificationwithabankchurndatasetumgc/train.csv")

# Step- 2
### Understanding Our Data

In [5]:
# Display the first 5 rows of the dataset
bank_churn_data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15788494.0,Chu,591.0,Spain,Male,30.0,4.0,112419.92,1.0,0.0,0.0,88890.05,0.0
1,1,15743893.0,Ch'ang,544.0,Spain,Female,47.0,1.0,0.0,1.0,1.0,0.0,80120.19,1.0
2,2,15738884.0,Mancini,705.0,France,Male,37.0,9.0,0.0,2.0,1.0,1.0,79919.13,0.0
3,3,15795586.0,Chikelu,670.0,France,Male,40.0,10.0,0.0,2.0,1.0,1.0,55835.66,0.0
4,4,15586518.0,Tsou,714.0,Spain,Female,46.0,1.0,107879.06,1.0,1.0,0.0,162697.93,0.0


In [4]:
# Checking dimensions of our data (rows,columns)
bank_churn_data.shape

(15000, 14)

In [8]:
# Class distribution of the target column "Exited" (number of rows per class)
bank_churn_data["Exited"].value_counts()

Exited
0.0    11966
1.0     3034
Name: count, dtype: int64

**Label Explanation**
- 0.0 → Customer stayed (did not churn)
- 1.0 → Customer exited (churned)

In [9]:
# get statistical summary of data
bank_churn_data.describe()

Unnamed: 0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7499.5,15710830.0,657.576267,37.690333,5.0158,42391.092967,1.586,0.782133,0.497733,117238.4,0.202267
std,4330.271354,1637345.0,72.135924,8.1256,2.804724,59657.264519,0.529864,0.41281,0.500012,46729.02,0.401703
min,0.0,1567894.0,431.0,18.0,0.0,0.0,1.0,0.0,0.0,447.73,0.0
25%,3749.75,15635570.0,601.0,32.0,3.0,0.0,1.0,1.0,0.0,82463.69,0.0
50%,7499.5,15690540.0,660.0,37.0,5.0,0.0,2.0,1.0,0.0,122488.6,0.0
75%,11249.25,15757870.0,708.0,42.0,7.0,108910.3825,2.0,1.0,1.0,155888.5,0.0
max,14999.0,158119800.0,850.0,72.0,10.0,187911.55,4.0,1.0,1.0,1387001.0,1.0


# Step- 3
### Clean and Prepare data

In [10]:
# Check for missing values
bank_churn_data.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
# Remove the columns this analysis treats as irrelevant
irrelevant_columns = ["id", "CustomerId", "Surname"]
bank_churn_data = bank_churn_data.drop(irrelevant_columns, axis=1)

In [12]:
# Preview the data again after dropping the columns
bank_churn_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,591.0,Spain,Male,30.0,4.0,112419.92,1.0,0.0,0.0,88890.05,0.0
1,544.0,Spain,Female,47.0,1.0,0.0,1.0,1.0,0.0,80120.19,1.0
2,705.0,France,Male,37.0,9.0,0.0,2.0,1.0,1.0,79919.13,0.0
3,670.0,France,Male,40.0,10.0,0.0,2.0,1.0,1.0,55835.66,0.0
4,714.0,Spain,Female,46.0,1.0,107879.06,1.0,1.0,0.0,162697.93,0.0


# Step - 4 
### Label Encoding
Label encoding converts categorical labels into numeric form so machine learning models can process them.

In [13]:
# Understanding Value count for Categorical
#  labels i.e "Geography" and "Gender"

In [14]:
# Number of rows per "Gender" category (still text values at this point)
bank_churn_data["Gender"].value_counts()

Gender
Male      8515
Female    6485
Name: count, dtype: int64

In [16]:
# Number of rows per "Geography" category (still text values at this point)
bank_churn_data["Geography"].value_counts()

Geography
France     8946
Spain      3338
Germany    2716
Name: count, dtype: int64

Encoding Categorical Values


In [20]:
# importing Label Encoder
from sklearn.preprocessing import LabelEncoder

In [21]:
# Create the LabelEncoder instance and bind it to the variable "label_encode"
label_encode = LabelEncoder()

In [25]:
# Fit the encoder on the "Geography" column and get one integer code per row
label1 = label_encode.fit_transform(bank_churn_data.Geography)

In [26]:
# Overwrite the text values in the "Geography" column with the integer codes
bank_churn_data["Geography"] = label1

In [27]:
# Row counts per integer code, to compare with the counts per country shown earlier
bank_churn_data["Geography"].value_counts()

Geography
0    8946
2    3338
1    2716
Name: count, dtype: int64

***Label Explanation:-***
- 0 ---> France
- 1 ---> Germany
- 2 ---> Spain

In [28]:
# Repeat the same steps for the "Gender" column. This re-fits the same
# encoder object, so it no longer holds what it learned from "Geography".
label2 = label_encode.fit_transform(bank_churn_data.Gender)

In [29]:
# Overwrite the text values in the "Gender" column with the integer codes
bank_churn_data["Gender"] = label2

In [30]:
# Row counts per integer code, to compare with the counts per gender shown earlier
bank_churn_data["Gender"].value_counts()

Gender
1    8515
0    6485
Name: count, dtype: int64

***Label Explanation:-***
- 0 ---> Female
- 1 ---> Male

# Step - 5 
### Features and Target Split

In [31]:
# Feature matrix: every column except the target "Exited"
X = bank_churn_data.drop("Exited", axis=1)

In [32]:
# Target: the "Exited" column (1.0 = customer churned, 0.0 = customer stayed)
Y = bank_churn_data["Exited"]

# Step - 6
### Train - Test Split

In [34]:
# importing dependency
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 20% of the rows for evaluation. stratify=Y keeps the class
# proportions of "Exited" similar in both parts; random_state fixes the shuffle.
X_train , X_test , Y_train , Y_test = train_test_split(
    X, Y , test_size = 0.2, stratify = Y, random_state = 0
)

In [36]:
# Report the shape of the full feature matrix and of each split
for split_name, split_frame in (("Feature", X), ("Training", X_train), ("Test", X_test)):
    print(f"{split_name} Data Dimensions: {split_frame.shape}")

Feature Data Dimensions: (15000, 10)
Training Data Dimensions: (12000, 10)
Test Data Dimensions: (3000, 10)


# Step - 7 
### Data Standardization

In [37]:
# importing dependencies
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler instance and bind it to the variable "scaler"
scaler = StandardScaler()

In [42]:
# Columns to be standardized; the encoded and 0/1 flag columns
# (Geography, Gender, HasCrCard, IsActiveMember) are left untouched
num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Fit the scaler (not the model) on X_train only, and standardize those columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# Apply the scaling parameters learned from training data to the hold-out data,
# so no statistics from the hold-out rows leak into the scaler
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [44]:
# Check if the numerical features are standardized (mean ~ 0, std ~ 1).
# The std prints slightly above 1 (1.000042), presumably because pandas' .std()
# uses the sample formula (ddof=1) while StandardScaler uses the population std.
print(X_train[num_cols].mean())
print(X_train[num_cols].std())

CreditScore        3.493502e-17
Age                4.736952e-18
Tenure             4.559316e-17
Balance            6.809368e-18
NumOfProducts     -1.953993e-17
EstimatedSalary    1.243450e-17
dtype: float64
CreditScore        1.000042
Age                1.000042
Tenure             1.000042
Balance            1.000042
NumOfProducts      1.000042
EstimatedSalary    1.000042
dtype: float64


# Step - 8
### Model Training

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Initialize logistic regression with balanced class weights, a fixed seed
# and a raised iteration cap (max_iter=1000)
model = LogisticRegression(class_weight='balanced', random_state=0, max_iter=1000)

# Train the model on the training part of the split
model.fit(X_train, Y_train)

# Predict probabilities on the hold-out split (X_test comes from train.csv, not test.csv)
y_probs = model.predict_proba(X_test)[:,1]  # second column = probability of class 1 (churn)

# Evaluate ROC AUC on the hold-out labels and print it with 4 decimals
auc_score = roc_auc_score(Y_test, y_probs)
print(f"ROC AUC Score: {auc_score:.4f}")


ROC AUC Score: 0.8785


# Step- 9
### Loading and Preparing the test data (test.csv)

In [46]:
# Load the competition test set (test.csv) that the submission is scored on
test_data = pd.read_csv("/kaggle/input/binaryclassificationwithabankchurndatasetumgc/test.csv")

In [47]:
# Preview the first 5 rows of the test set
test_data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15000,15686936.0,Ting,537.0,France,Female,53.0,4.0,0.0,2.0,1.0,1.0,119712.78
1,15001,15733032.0,K?,652.0,France,Female,39.0,9.0,0.0,1.0,1.0,0.0,54164.01
2,15002,15674928.0,Hsieh,675.0,France,Male,49.0,5.0,0.0,2.0,0.0,1.0,132973.21
3,15003,15783030.0,Mancini,753.0,Germany,Male,51.0,5.0,127864.4,1.0,0.0,1.0,183868.01
4,15004,15752953.0,Walker,627.0,Germany,Male,35.0,4.0,126663.51,2.0,1.0,0.0,79616.37


In [48]:
# keeping the "id" for submission file
test_ids = test_data["id"]

In [49]:
# Drop the same irrelevant columns that were dropped from the training data
test_data = test_data.drop(columns = ["id", "CustomerId", "Surname"])

### Label Encoding

In [65]:
# Encode "Geography" with the exact codes used for the training data instead of
# re-fitting the encoder on the test set. A re-fit derives the codes from
# whatever categories test.csv happens to contain, so a missing or new country
# would silently shift the codes away from those the model was trained on.
# NOTE: this mapping must stay in sync with the training encoding
# (LabelEncoder assigns codes in sorted order: France, Germany, Spain).
geography_codes = {"France": 0, "Germany": 1, "Spain": 2}

# Values that are already codes map to themselves, so re-running this cell is safe
encoded_geography = test_data["Geography"].map(
    lambda value: geography_codes.get(value, value)
)

# Fail loudly on anything the model has never seen (including missing values)
bad_geography = ~encoded_geography.isin(list(geography_codes.values()))
if bad_geography.any():
    unexpected_geography = sorted(set(map(str, encoded_geography[bad_geography])))
    raise ValueError(f"Unexpected Geography values in test data: {unexpected_geography}")

label_a = encoded_geography.astype(int).to_numpy()
test_data["Geography"] = label_a

In [66]:
# Encode "Gender" with the exact codes used for the training data instead of
# re-fitting the encoder on the test set. A re-fit derives the codes from
# whatever categories test.csv happens to contain, so a missing or new value
# would silently shift the codes away from those the model was trained on.
# NOTE: this mapping must stay in sync with the training encoding
# (LabelEncoder assigns codes in sorted order: Female, Male).
gender_codes = {"Female": 0, "Male": 1}

# Values that are already codes map to themselves, so re-running this cell is safe
encoded_gender = test_data["Gender"].map(
    lambda value: gender_codes.get(value, value)
)

# Fail loudly on anything the model has never seen (including missing values)
bad_gender = ~encoded_gender.isin(list(gender_codes.values()))
if bad_gender.any():
    unexpected_gender = sorted(set(map(str, encoded_gender[bad_gender])))
    raise ValueError(f"Unexpected Gender values in test data: {unexpected_gender}")

label_b = encoded_gender.astype(int).to_numpy()
test_data["Gender"] = label_b

### Data Standardization

In [61]:
# Reuse the scaler already fitted on X_train (transform only, no re-fit).
# Run this cell once per load of test_data: running it again scales the columns a second time.
test_data[num_cols] = scaler.transform(test_data[num_cols])

# Step - 10 
### Predict Probabilities

In [62]:
# Predicted probability of class 1 (churn) for every row of the test set
test_probs = model.predict_proba(test_data)[:,1]

# Step - 11
### Prepare submission dataframe

In [63]:
# Two-column submission table: the saved ids next to the predicted probabilities
submission_data = {'id': test_ids, 'Exited': test_probs}
submission = pd.DataFrame(submission_data, columns=['id', 'Exited'])

In [64]:
# Save the submission to the current working directory; index=False keeps
# the DataFrame index out of the CSV so only "id" and "Exited" are written
submission.to_csv("submission.csv", index=False)
print("Submission file created!")

Submission file created!
