<h1>Bank Customer Churn Analysis</h1>

<h4>Importing the dataset</h4>

In [79]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/Customer-Churn-Records.csv')

df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


<h6>Checking for missing values & preparing dataset</h6>

In [80]:
df.isnull().sum()   # check for missing values

#df.describe()   # summary statistics

RowNumber             0
CustomerId            0
Surname               0
CreditScore           0
Geography             0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Complain              0
Satisfaction Score    0
Card Type             0
Point Earned          0
dtype: int64

In [81]:
df = df.dropna()   # drop missing values

##or fill missing values with mean
#df = df.fillna(df.mean())

<h5>Revoming unnecessary columns</h5>

In [82]:
from sklearn.preprocessing import StandardScaler

# Load dataset (assuming df is already loaded)
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)  # Drop irrelevant columns

# Select numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns   

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform only numerical columns
df[num_cols] = scaler.fit_transform(df[num_cols])

# Display the first few rows after scaling
print(df.head())

#displaying only numerical columns
print(df[num_cols].head())

   CreditScore Geography  Gender       Age    Tenure   Balance  NumOfProducts  \
0    -0.326221    France  Female  0.293517 -1.041760 -1.225848      -0.911583   
1    -0.440036     Spain  Female  0.198164 -1.387538  0.117350      -0.911583   
2    -1.536794    France  Female  0.293517  1.032908  1.333053       2.527057   
3     0.501521    France  Female  0.007457 -1.387538 -1.225848       0.807737   
4     2.063884     Spain  Female  0.388871 -1.041760  0.785728      -0.911583   

   HasCrCard  IsActiveMember  EstimatedSalary    Exited  Complain  \
0   0.646092        0.970243         0.021886  1.976555  1.972908   
1  -1.547768        0.970243         0.216534 -0.505931  1.972908   
2   0.646092       -1.030670         0.240687  1.976555  1.972908   
3  -1.547768       -1.030670        -0.108918 -0.505931 -0.506866   
4   0.646092        0.970243        -0.365276 -0.505931 -0.506866   

   Satisfaction Score Card Type  Point Earned  
0           -0.721130   DIAMOND     -0.630839  
1 

<h4>Split the dataset</h4>

In [83]:
#split the dataset

from sklearn.model_selection import train_test_split

# Define Features (X) and Target (y)
X = df[num_cols].drop('Exited', axis=1)   #drop the target column exited
y = df[num_cols]['Exited']        #target column

# Split the dataset into training and testing sets (80% training, 20% testing) 
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0) 
y_test = np.round(y_test)  # Round values to 0/1 for classification
y_train = np.round(y_train)  # Convert to 0/1 if binary classification

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)


(8000, 11) (2000, 11) (8000,) (2000,)


<h4>Train Classification Using Different Models</h4>

In [84]:
#logistic regression
from sklearn.linear_model import LogisticRegression

#using logistic regression
log_reg = LogisticRegression()

# Fit the model
log_reg.fit(x_train, y_train)

y_pred = log_reg.predict(x_test)

# Accuracy
accuracy = log_reg.score(x_test, y_test)
print(f'Accuracy: {accuracy*100:.2f}%')



Accuracy: 0.9995
