In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
## Load the dataset
# Read the raw churn table from the project-relative Data/ folder and
# preview the first rows to sanity-check the columns.
data = pd.read_csv(filepath_or_buffer="Data/Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
## Preprocess the data
### Drop irrelevant data
# RowNumber, CustomerId and Surname are removed before modelling.
# errors="ignore" keeps this cell idempotent: because it reassigns `data`,
# re-running it would otherwise raise KeyError once the columns are gone.
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [4]:
# Partition the column names by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
# Note: `data` still contains the "Exited" label here, so it shows up in
# numeric_cols.
categorical_cols = list(data.select_dtypes(include="object").columns)
numeric_cols = list(data.select_dtypes(include=["int64", "float64"]).columns)

In [5]:
# Report how many distinct values each categorical column holds; this is the
# number of one-hot columns each of them will expand into.
print("Unique categories in the data:")
for col, n_unique in data[categorical_cols].nunique().items():
    print(f"- {col}: {n_unique} categories")

Unique categories in the data:
- Geography: 3 categories
- Gender: 2 categories


In [6]:
# Separate the "Exited" label from the predictors, then hold out 20% of the
# rows as a test set. random_state pins the shuffle so the split is repeatable.
y = data["Exited"]
X = data.drop(columns="Exited")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# One-hot encode Gender. The encoder learns its categories from the training
# rows only and is then applied unchanged to both splits.
onehot_encoder_gender = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
onehot_encoder_gender.fit(X_train[["Gender"]])
X_train_gender_encoded = onehot_encoder_gender.transform(X_train[["Gender"]])
X_test_gender_encoded = onehot_encoder_gender.transform(X_test[["Gender"]])

In [8]:
# One-hot encode Geography the same way as Gender: fit on the training rows,
# then transform the training and test splits with the fitted encoder.
onehot_encoder_geo = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
onehot_encoder_geo.fit(X_train[["Geography"]])
X_train_geo_encoded = onehot_encoder_geo.transform(X_train[["Geography"]])
X_test_geo_encoded = onehot_encoder_geo.transform(X_test[["Geography"]])

In [9]:
# Preview the encoded training Gender columns with readable names.
# No index is passed, so this frame gets a fresh 0..n-1 index rather than
# X_train's row labels.
gender_encoded_df = pd.DataFrame(
    data=X_train_gender_encoded,
    columns=onehot_encoder_gender.get_feature_names_out(["Gender"]),
)
gender_encoded_df

Unnamed: 0,Gender_Female,Gender_Male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,0.0,1.0
...,...,...
7995,0.0,1.0
7996,1.0,0.0
7997,1.0,0.0
7998,0.0,1.0


In [10]:
# Preview the encoded training Geography columns with readable names.
# No index is passed, so this frame gets a fresh 0..n-1 index rather than
# X_train's row labels.
geo_encoded_df = pd.DataFrame(
    data=X_train_geo_encoded,
    columns=onehot_encoder_geo.get_feature_names_out(["Geography"]),
)
geo_encoded_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
7995,1.0,0.0,0.0
7996,1.0,0.0,0.0
7997,1.0,0.0,0.0
7998,1.0,0.0,0.0


In [11]:
# Wrap the encoded Gender arrays in DataFrames that reuse the row labels of
# the split they came from, so they line up with X_train / X_test when the
# frames are concatenated column-wise.
gender_train_df = pd.DataFrame(
    data=X_train_gender_encoded,
    index=X_train.index,
    columns=onehot_encoder_gender.get_feature_names_out(["Gender"]),
)
gender_test_df = pd.DataFrame(
    data=X_test_gender_encoded,
    index=X_test.index,
    columns=onehot_encoder_gender.get_feature_names_out(["Gender"]),
)

In [12]:
# Wrap the encoded Geography arrays in DataFrames that reuse the row labels
# of the split they came from, so they line up with X_train / X_test when the
# frames are concatenated column-wise.
geo_train_df = pd.DataFrame(
    data=X_train_geo_encoded,
    index=X_train.index,
    columns=onehot_encoder_geo.get_feature_names_out(["Geography"]),
)
geo_test_df = pd.DataFrame(
    data=X_test_geo_encoded,
    index=X_test.index,
    columns=onehot_encoder_geo.get_feature_names_out(["Geography"]),
)

In [13]:
# Swap the raw Geography / Gender text columns for their one-hot encoded
# counterparts in both splits.
# Caution: this cell overwrites X_train / X_test, so running it a second time
# fails because the raw columns are no longer there to drop.
X_train = pd.concat(
    [X_train.drop(columns=["Geography", "Gender"]), gender_train_df, geo_train_df],
    axis="columns",
)
X_test = pd.concat(
    [X_test.drop(columns=["Geography", "Gender"]), gender_test_df, geo_test_df],
    axis="columns",
)

In [14]:
# Inspect the training features after the one-hot columns were appended.
X_train

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gender_Female,Gender_Male,Geography_France,Geography_Germany,Geography_Spain
9254,686,32,6,0.00,2,1,1,179093.26,0.0,1.0,1.0,0.0,0.0
1561,632,42,4,119624.60,2,1,1,195978.86,0.0,1.0,0.0,1.0,0.0
1670,559,24,3,114739.92,1,1,0,85891.02,0.0,1.0,0.0,0.0,1.0
6087,561,27,9,135637.00,1,1,0,153080.40,1.0,0.0,1.0,0.0,0.0
6669,517,56,9,142147.32,1,0,0,39488.04,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,768,54,8,69712.74,1,1,1,69381.05,0.0,1.0,1.0,0.0,0.0
5191,682,58,1,0.00,1,1,1,706.50,1.0,0.0,1.0,0.0,0.0
5390,735,38,1,0.00,3,0,0,92220.12,1.0,0.0,1.0,0.0,0.0
860,667,43,8,190227.46,1,1,0,97508.04,0.0,1.0,1.0,0.0,0.0


In [15]:
# Inspect the test features after the one-hot columns were appended.
X_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Gender_Female,Gender_Male,Geography_France,Geography_Germany,Geography_Spain
6252,596,32,3,96709.07,2,0,0,41788.37,0.0,1.0,0.0,1.0,0.0
4684,623,43,1,0.00,2,1,1,146379.30,0.0,1.0,1.0,0.0,0.0
1731,601,44,4,0.00,2,1,0,58561.31,1.0,0.0,0.0,0.0,1.0
4742,506,59,8,119152.10,2,1,1,170679.74,0.0,1.0,0.0,1.0,0.0
4521,560,27,7,124995.98,1,1,1,114669.79,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,602,53,5,98268.84,1,0,1,45038.29,1.0,0.0,0.0,1.0,0.0
8285,609,25,10,0.00,1,0,1,109895.16,0.0,1.0,1.0,0.0,0.0
7853,730,47,7,0.00,1,1,0,33373.26,1.0,0.0,1.0,0.0,0.0
1095,692,29,4,0.00,1,1,0,76755.99,0.0,1.0,1.0,0.0,0.0


In [16]:
# Standardise every feature column. The scaler is fitted on the training
# split only and then applied to both splits, so the test rows do not
# influence the fitted statistics.
# X_train / X_test are overwritten with the scaler's output from here on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
## Save the encoders and scaler
# Create the output folder first: open(..., "wb") does not create missing
# parent directories, so on a fresh checkout the writes would otherwise fail
# with FileNotFoundError.
preprocessing_dir = Path("artifacts/preprocessing")
preprocessing_dir.mkdir(parents=True, exist_ok=True)

# Pickle each fitted transformer under the same file name as before so that
# existing loaders keep working.
for artifact_name, fitted_transformer in (
    ("OHE_gender.pkl", onehot_encoder_gender),
    ("OHE_geo.pkl", onehot_encoder_geo),
    ("scaler.pkl", scaler),
):
    with open(preprocessing_dir / artifact_name, "wb") as file:
        pickle.dump(fitted_transformer, file)

# ANN Implementation

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime




In [19]:
# Number of feature columns; this is the input width the network is built with.
X_train.shape[1]

13

In [20]:
## Build our ANN Model
# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation and batch shuffling are repeatable across
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

# The input width is declared with an explicit Input object instead of
# `input_shape=` on the first Dense layer, which newer Keras releases warn about.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  ## One value per feature column
    Dense(64, activation="relu"),  ## HL1 Connected with input layer
    Dense(32, activation="relu"),  ## HL2
    Dense(1, activation="sigmoid"),  ## Output layer
])




In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                896       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3009 (11.75 KB)
Trainable params: 3009 (11.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
import tensorflow
from tensorflow.keras import losses as keras_losses
from tensorflow.keras import optimizers as keras_optimizers

# Adam optimizer with a learning rate of 0.01, paired with binary
# cross-entropy to match the single sigmoid output unit.
opt = keras_optimizers.Adam(learning_rate=0.01)
loss = keras_losses.BinaryCrossentropy()
loss

<keras.src.losses.BinaryCrossentropy at 0x191aaed08d0>

In [23]:
## compile the model
model.compile(optimizer = opt, loss = loss, metrics=['accuracy'])

In [24]:
## Set up the Tensorboard
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Each run logs into its own timestamped sub-folder of logs/fit/ so that
# successive runs do not overwrite one another.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorflow_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [25]:
# Stop training once the validation loss has gone 10 epochs without
# improving, and roll the model back to the best weights seen.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

In [26]:
### Train the model
# Train for at most 100 epochs; the early-stopping callback may end the run
# sooner, and the TensorBoard callback writes logs to log_dir.
# NOTE(review): the held-out test split is used as validation data here, so
# early stopping selects weights on the test set. Consider carving a separate
# validation split out of the training data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tensorflow_callback, early_stopping_callback],
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [27]:
# Persist the trained network to a single file for later loading.
model.save(filepath='artifacts/model/model.h5')

  saving_api.save_model(


In [28]:
## Load tensorboard extension
# Registers the %tensorboard magic with this kernel; it must run before the
# %tensorboard cell below it.
%load_ext tensorboard

In [30]:
%tensorboard --logdir logs/fit/20260106-213320

Reusing TensorBoard on port 6006 (pid 17420), started 0:00:11 ago. (Use '!kill 17420' to kill it.)