# Banking Churn Prediction

In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

In [None]:
# Set up the AWS handles used by the rest of the notebook
sm_boto3 = boto3.client('sagemaker')  # boto3 client for the SageMaker API
session = sagemaker.Session()
region = session.boto_region_name  # region reported by the SageMaker session (original note: taken from the awscli configuration)
bucket = "churn-prediction-2025-12345"  # bucket name; presumably the project's S3 bucket, TODO confirm

In [None]:
# Echo the resolved region and the bucket name, one per line
print(region, bucket, sep='\n')

In [None]:
import os

# Load the raw churn dataset into `df`, failing fast with a clear error
# when the CSV is not where the notebook expects it.
data_file_path = '../data/raw/Churn_Modelling.csv'
if not os.path.exists(data_file_path):
    # Raise a real exception instead of calling exit(): exit() is a helper
    # meant for interactive shells and ends the session rather than
    # reporting the problem as an error at this line.
    raise FileNotFoundError(f"Data file {data_file_path} does not exist")
df = pd.read_csv(data_file_path)
# (rows, columns) of the loaded data
df.shape

In [None]:
# Preview the first and the last rows of the dataset
display(df.head(), df.tail())

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts,
# dtypes and memory usage
df.info()

In [None]:
# Show the dtype of every column
column_dtypes = df.dtypes
display(column_dtypes)

In [None]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
display(missing_per_column)

In [None]:
# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = df.duplicated().sum()
display(duplicate_row_count)

In [None]:
# Class distribution of the target column, to see how balanced it is
exited_counts = df['Exited'].value_counts()
display(exited_counts)

In [None]:
# Summary statistics restricted to the object-dtype columns
object_summary = df.describe(include='object')
display(object_summary)

In [None]:
# Summary statistics using describe()'s defaults
default_summary = df.describe()
display(default_summary)

In [None]:
# Collect the column names into a plain Python list and show it
columns = list(df.columns)
columns

In [None]:
# Keep an independent (deep) copy of the raw data before it is modified
df_copy = df.copy(deep=True)

In [None]:
# Remove the row number, customer id and surname columns, then preview
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
df.head()

In [None]:
# Outlier removal based on z-scores:
#   1. compute the absolute z-score of every numeric column of df,
#   2. keep only the rows in which every numeric value has |z| < 3,
#      i.e. lies within 3 standard deviations of its column mean.
from scipy import stats
import numpy as np

numeric_part = df.select_dtypes(include=[np.number])
z = np.abs(stats.zscore(numeric_part))
rows_within_three_sigma = (z < 3).all(axis=1)
df = df[rows_within_three_sigma]
df.shape

In [None]:
# Preview the cleaned data, then list the distinct values of each
# categorical column
display(df.head())
for categorical_column in ('Geography', 'Gender'):
    display(df[categorical_column].unique())

In [None]:
# Separate the features from the target
target_column = 'Exited'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows for testing. The seed makes the split
# reproducible, and stratifying on the target keeps the churn ratio the
# same in both splits, which matters if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Inspect the container type of the training features returned by the split
type(X_train)

In [None]:
# Report the shape of every split: features first, then targets
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Build one frame per split that holds the features plus the target as
# the last column (rows are matched on the index)
df_X_train = X_train.assign(**{target_column: y_train})
df_X_test = X_test.assign(**{target_column: y_test})



In [None]:
# Write both splits to the processed-data folder (created if missing),
# without the index column
processed_outputs = {
    '../data/processed/train.csv': df_X_train,
    '../data/processed/test.csv': df_X_test,
}
os.makedirs('../data/processed', exist_ok=True)
for csv_path, split_frame in processed_outputs.items():
    split_frame.to_csv(csv_path, index=False)


In [None]:
# Echo the configured bucket name
display(bucket)

In [None]:
# Model training: preprocessing + random forest, tuned with a grid search
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib


# Column-wise preprocessing:
#   - the continuous features are standardised,
#   - the categorical features are one-hot encoded,
#   - Tenure, NumOfProducts, HasCrCard and IsActiveMember are passed
#     through without any transformation.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_features', StandardScaler(), ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']),
        ('cat_features', OneHotEncoder(), ['Geography', 'Gender']),
        # Pass-through the columns with small values
        ('pass-through', 'passthrough', ['NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Tenure'])
    ]
)

# Chain preprocessing and the classifier so that the transformers are
# re-fitted inside every cross-validation fold. The classifier is seeded
# (same seed as the train/test split) so that repeated runs select the
# same model.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)

# Hyperparameter grid; the `classifier__` prefix routes each entry to the
# pipeline step named 'classifier'
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15]
}

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5
)

# Fit on the raw training features: the pipeline applies the
# preprocessing itself, so no separate fit_transform is needed
grid_search.fit(X_train, y_train)

# Collect the search results and actually show them: a bare expression is
# only rendered when it is the last statement of a cell, so these need
# explicit print/display calls
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {best_score:.4f}")
display(best_model)

# Persist the best pipeline (preprocessing + classifier), creating the
# target folder if it does not exist yet
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/model.joblib')