# Banking Churn Prediction

In [2]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd



sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/sriharshakoka/Library/Application Support/sagemaker/config.yaml


In [3]:
# Name of the bucket this project works with.
bucket = "churn-prediction-2025-12345"

# Low-level SageMaker API client from boto3.
sm_boto3 = boto3.client("sagemaker")

# SageMaker SDK session; the region is whatever the session resolved
# (presumably from the local AWS CLI / boto3 configuration).
session = sagemaker.Session()
region = session.boto_region_name

In [4]:
# Echo the resolved region and the bucket name, one per line.
print(region, bucket, sep="\n")

us-east-1
churn-prediction-2025-12345


In [5]:
import os

# Location of the raw churn dataset, relative to the notebook's working directory.
data_file_path = '../data/raw/Churn_Modelling.csv'

# Fail loudly with a real exception if the file is missing. Calling exit() in a
# notebook asks the kernel to shut down instead of reporting an error in the
# cell, which loses all state and hides the actual cause from the reader.
if not os.path.exists(data_file_path):
    raise FileNotFoundError(f"Data file {data_file_path} does not exist")

# Load the dataset and show its (rows, columns) shape as the cell output.
df = pd.read_csv(data_file_path)
df.shape

(10000, 14)

In [6]:
# Preview the first and the last rows of the frame in one display call.
display(df.head(), df.tail())

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [7]:
# Summarise the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [8]:
# List the dtype of every column (all columns, not only the object-typed ones).
column_dtypes = df.dtypes
display(column_dtypes)

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [9]:
# Count missing values per column (isna is the canonical spelling of isnull).
display(df.isna().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [10]:
# Number of rows that exactly repeat an earlier row.
duplicate_rows = df.duplicated().sum()
display(duplicate_rows)

0

In [11]:
# Class distribution of the target, to see how (im)balanced it is.
exited_counts = df["Exited"].value_counts()
display(exited_counts)

Exited
0    7963
1    2037
Name: count, dtype: int64

In [12]:
# Summary statistics (count / unique / top / freq) for the object-typed columns.
display(df.describe(include=["object"]))

Unnamed: 0,Surname,Geography,Gender
count,10000,10000,10000
unique,2932,3,2
top,Smith,France,Male
freq,32,5014,5457


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
display(numeric_summary)

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [14]:
# Snapshot the column names as a plain Python list and show it.
columns = list(df.columns)
columns

['RowNumber',
 'CustomerId',
 'Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [15]:
# Keep an independent (deep) copy of the frame as loaded, before `df` is modified below.
df_copy = df.copy(deep=True)

In [16]:
# Drop the identifier-like columns (row counter, customer id, surname).
# The result is derived from the `df_copy` snapshot taken in the previous cell
# rather than from `df` itself, so this cell is idempotent: re-running it no
# longer raises KeyError because the columns were already removed from `df`.
df = df_copy.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [17]:
# Outlier removal
#
# Compute the absolute z-score of every numeric *feature* column and keep only
# the rows whose features all lie within Z_THRESHOLD standard deviations of the
# column mean.
#
# The target column is deliberately left out of the z-score computation: rows
# must never be discarded because of their label. For a binary target with a
# positive rate below roughly 10%, every positive row would have |z| > 3 and
# the filter would silently delete the whole minority class.
#
# NOTE: this cell reassigns `df`, so running it twice filters the already
# filtered frame again (the z-scores are recomputed on the reduced data).
from scipy import stats
import numpy as np

Z_THRESHOLD = 3  # rows with any feature further than this many std devs are dropped

numeric_features = df.select_dtypes(include=[np.number]).drop(columns=['Exited'])
z = np.abs(stats.zscore(numeric_features))
df = df[(z < Z_THRESHOLD).all(axis=1)]
df.shape

(9799, 11)

In [18]:
# Preview the filtered frame, then list the distinct values of the two
# categorical columns.
display(
    df.head(),
    df['Geography'].unique(),
    df['Gender'].unique(),
)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


array(['France', 'Spain', 'Germany'], dtype=object)

array(['Female', 'Male'], dtype=object)

In [20]:
# Separate the features and the target.
target_column = 'Exited'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# The target is imbalanced (far fewer exited than retained customers), so the
# split is stratified on `y` to keep the same class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [21]:
# Check which container type train_test_split returned for the training features.
type(X_train)

pandas.core.frame.DataFrame

In [22]:
# Report the shape of each split, one per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(7839, 10)
(1960, 10)
(7839,)
(1960,)


In [23]:
# Re-attach the target to each feature split so that every saved file carries
# both the features and the label.
# X_train / X_test are already DataFrames, so re-wrapping them in pd.DataFrame
# is redundant. `assign` returns a new frame (aligned on the index), which
# avoids inserting a column in place into a frame derived from the split and
# leaves X_train / X_test untouched.
df_X_train = X_train.assign(**{target_column: y_train})
df_X_test = X_test.assign(**{target_column: y_test})



In [26]:
# Make sure the output directory exists, then write both splits to CSV
# without the index column.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
df_X_train.to_csv(f'{processed_dir}/X_train.csv', index=False)
df_X_test.to_csv(f'{processed_dir}/X_test.csv', index=False)


In [25]:
# Show the bucket name configured near the top of the notebook.
display(bucket)

'churn-prediction-2025-12345'

In [None]:
# Feature preprocessing: scaling + one-hot encoding
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define the ColumnTransformer.
#   * numerical_features: wide-range numeric columns are standardised.
#   * cat_features: the two categorical columns are one-hot encoded.
#     handle_unknown='ignore' makes a category that was not seen during fit
#     encode as all zeros at transform time instead of raising an error, which
#     would otherwise break scoring on new data.
#   * pass-through: small-valued integer / binary columns are forwarded
#     unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_features', StandardScaler(), ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']),
        ('cat_features', OneHotEncoder(handle_unknown='ignore'), ['Geography', 'Gender']),
        # Pass-through the columns with small values
        ('pass-through', 'passthrough', ['NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Tenure'])
    ]
)