In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

### Load and Preprocess the Dataset

In [2]:
# Load the dataset
# Reads the churn CSV from the notebook's working directory into a DataFrame.
data = pd.read_csv('Churn_Modelling.csv')
# Display the first few rows of the dataset
# (bare last expression, so the notebook renders the preview as the cell output)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# preprocess the dataset
# RowNumber, CustomerId and Surname are removed before modelling.
# errors='ignore' keeps the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise a KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Encode categorical variables - Gender
# The fitted encoder is kept in its own variable so it can be pickled later and
# reused to encode new inputs the same way.
label_encoder_gender = LabelEncoder()
# Overwrites the text column with its integer codes.
# NOTE(review): re-running this cell on the already-encoded column would refit the
# encoder on integers instead of the original labels — start from a fresh `data`.
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# one-hot encode the 'Geography' column
# import OneHotEncoder from sklearn
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
geography_encoded = one_hot_encoder.fit_transform(data[['Geography']])
# Create a DataFrame for the encoded geography.
# index=data.index keeps the indicator rows aligned with `data`: pd.concat(axis=1)
# joins on index labels, so a fresh 0..n-1 index would misalign (and introduce NaN
# rows) if `data` ever carried a non-default index, e.g. after filtering rows.
geography_df = pd.DataFrame(
    geography_encoded.toarray(),
    columns=one_hot_encoder.get_feature_names_out(['Geography']),
    index=data.index,
)
# Concatenate the encoded geography DataFrame with the original data
data = pd.concat([data, geography_df], axis=1)
# Drop the original 'Geography' column
data = data.drop(['Geography'], axis=1)
# Display the first few rows of the updated dataset
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [6]:
# Persist both fitted encoders as pickle files so they can be reloaded later.
# Maps each output file name to the encoder object written into it.
encoder_files = {
    'label_encoder_gender.pkl': label_encoder_gender,
    'one_hot_encoder_geography.pkl': one_hot_encoder,
}
for pkl_path, fitted_encoder in encoder_files.items():
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_encoder, f)

In [7]:
# get the dependent and independent variables
# X: every column except 'Exited'; y: the 'Exited' column (the dependent variable).
X = data.drop(['Exited'], axis=1)
y = data['Exited']

In [8]:
# train-test split
# test_size=0.2 holds out 20% of the rows; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# scale the features
scaler = StandardScaler()
# Fit on the training split only, then apply the same fitted transform to the
# test split, so the scaling parameters are never estimated from test rows.
# Both names are rebound to the scaler's output here.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# save the scaler as a pickle file
# Saved so that new inputs can be scaled with the parameters fitted on X_train.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### ANN Implementation

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import datetime

In [12]:
# build ANN model
# The input width is declared with an explicit Input object rather than an
# `input_shape=` argument on the first Dense layer; Keras warns about the latter
# for Sequential models and recommends an Input as the first entry instead.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)), # One input per feature column
    Dense(64, activation='relu'), # First hidden layer
    Dense(32, activation='relu'), # Second hidden layer
    Dense(1, activation='sigmoid') # Output layer: a single value in (0, 1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# summary of the model
# Prints a layer-by-layer overview of the network built above.
model.summary()

In [14]:
# compile the model
# binary_crossentropy pairs with the single sigmoid output unit of the network;
# accuracy is tracked as an additional metric during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# setup the tensorboard
# import tensorboard
from tensorflow.keras.callbacks import TensorBoard
# Each run logs to its own timestamped sub-directory of logs/fit/ so that runs
# do not overwrite one another.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Uses the TensorBoard class imported just above.
# histogram_freq=1 records weight histograms every epoch.
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [17]:
# setup early stopping
# Stop once val_loss has not improved for 10 consecutive epochs, and roll the
# model back to the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [18]:
# Train the model
# epochs=100 is an upper bound: the early-stopping callback can end training sooner.
# NOTE(review): the held-out test split doubles as the validation set here, and
# early stopping selects the restored weights based on it — metrics on X_test are
# therefore not an independent estimate. Consider carving a separate validation
# split out of the training data.
history = model.fit(
    X_train, y_train, epochs=100,
    callbacks=[tensorboard_callback, early_stopping],
    validation_data=(X_test, y_test)
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7914 - loss: 0.5026 - val_accuracy: 0.8345 - val_loss: 0.3835
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8352 - loss: 0.3919 - val_accuracy: 0.8565 - val_loss: 0.3523
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8586 - loss: 0.3478 - val_accuracy: 0.8580 - val_loss: 0.3448
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8554 - loss: 0.3536 - val_accuracy: 0.8585 - val_loss: 0.3383
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8640 - loss: 0.3418 - val_accuracy: 0.8565 - val_loss: 0.3401
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8627 - loss: 0.3375 - val_accuracy: 0.8575 - val_loss: 0.3387
Epoch 7/100
[1m250/25

In [19]:
# save the model
# The .h5 suffix selects the HDF5 file format.
# NOTE(review): recent Keras releases flag HDF5 as legacy and recommend the native
# `.keras` format — confirm for the installed version; changing the file name
# here also requires updating the matching load_model() call.
model.save('churn_model.h5')



In [21]:
# load the tensorboard
# Registers the %tensorboard magic for this notebook session.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [23]:
%tensorboard --logdir logs/fit/20250715-173915

Reusing TensorBoard on port 6007 (pid 7528), started 0:00:30 ago. (Use '!kill 7528' to kill it.)

In [25]:
from tensorflow.keras.models import load_model

In [26]:
# load the model and all pickle files
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Preprocessing objects written out earlier in this notebook
label_encoder_gender = _load_pickle('label_encoder_gender.pkl')
one_hot_encoder = _load_pickle('one_hot_encoder_geography.pkl')
scaler = _load_pickle('scaler.pkl')

# Trained network
model = load_model('churn_model.h5')



In [27]:
# prediction on new data
# One example customer, described with the raw (un-encoded) input columns.
new_data = pd.DataFrame({
    'CreditScore': [600],
    'Geography': ['France'],
    'Gender': ['Male'],
    'Age': [40],
    'Tenure': [5],
    'Balance': [10000],
    'NumOfProducts': [2],
    'HasCrCard': [1],
    'IsActiveMember': [1],
    'EstimatedSalary': [50000]
})

# Gender text -> integer code, using the reloaded label encoder
new_data['Gender'] = label_encoder_gender.transform(new_data['Gender'])

# Geography text -> one indicator column per category known to the encoder
new_data_geography_encoded = one_hot_encoder.transform(new_data[['Geography']])
geography_columns = one_hot_encoder.get_feature_names_out(['Geography'])
new_data_geography_df = pd.DataFrame(
    new_data_geography_encoded.toarray(),
    columns=geography_columns,
)

# Append the indicator columns, then remove the raw text column
new_data = pd.concat([new_data, new_data_geography_df], axis=1).drop(['Geography'], axis=1)

# Apply the reloaded scaler to the assembled feature row
new_data_scaled = scaler.transform(new_data)

# Model output is thresholded at 0.5 to obtain a 0/1 label
predictions = model.predict(new_data_scaled)
predictions_binary = (predictions > 0.5).astype(int)
print("Predictions (1 = Churn, 0 = No Churn):", predictions_binary.flatten())


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 996ms/step
Predictions (1 = Churn, 0 = No Churn): [0]
