In [12]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

### Loading Data

In [13]:
# Read the raw churn CSV from the project's data folder; the bare name on the
# last line renders the loaded frame as the cell output.
data = pd.read_csv(filepath_or_buffer="data/Churn_Modelling.csv")
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


### Data Preprocessing

In [14]:
# Remove the RowNumber, CustomerId and Surname columns before modelling.
# Note: `data` is rebound in place, so re-running this cell without reloading
# the CSV raises KeyError because the columns are already gone.
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [15]:
# Learn the Gender categories, then overwrite the column with their integer codes
# (the rendered output shows Female as 0 and Male as 1).
# The fitted encoder is kept so the same mapping can be applied later.
gender_encoder = LabelEncoder().fit(data["Gender"])
data["Gender"] = gender_encoder.transform(data["Gender"])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [16]:
# One-hot encode Geography. sparse_output=False makes the encoder return a dense
# NumPy array; the double brackets pass a 2-D (single-column) frame as required.
geo_encoder = OneHotEncoder(sparse_output=False)
geo_encoder.fit(data[["Geography"]])
ohe_geo = geo_encoder.transform(data[["Geography"]])
ohe_geo

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], shape=(10000, 3))

In [17]:
# Show the column labels the fitted one-hot encoder generates for its output columns.
geo_encoder.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [18]:
# Wrap the one-hot array in a DataFrame labelled with the encoder's feature names.
# The frame reuses data.index so its rows stay matched to `data`: pd.concat(axis=1)
# aligns on index labels, and a fresh 0..n-1 index would silently misalign (and
# introduce NaNs) if `data` ever carried a non-default index, e.g. after row filtering.
ohe_geo_dataframe = pd.DataFrame(
    data=ohe_geo,
    columns=geo_encoder.get_feature_names_out(),
    index=data.index,
)

In [19]:
# Render the one-hot encoded Geography frame as the cell output.
ohe_geo_dataframe

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [20]:
# Place the one-hot Geography columns to the right of the existing columns.
final_df = pd.concat(objs=[data, ohe_geo_dataframe], axis="columns")

In [21]:
# Preview the first five rows of the combined frame.
final_df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,France,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,France,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [22]:
# The text Geography column is no longer needed once its one-hot columns exist.
# Note: re-running this cell on its own raises KeyError, since the column is gone.
final_df = final_df.drop(columns="Geography")
final_df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


### Saving Encoders for Reproducibility

In [23]:
# Persist both fitted encoders so the same category mappings can be reloaded later.
# open(..., "wb") does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout, so make sure the folder exists first.
os.makedirs("artifacts/preprocessing", exist_ok=True)

with open("artifacts/preprocessing/geography_encoder.pkl", "wb") as file:
    pickle.dump(geo_encoder, file)

with open("artifacts/preprocessing/gender_encoder.pkl", "wb") as file:
    pickle.dump(gender_encoder, file)

### Splitting Data

In [24]:
# Separate the target column (Exited) from the predictor columns.
y = final_df["Exited"]
X = final_df.drop(columns="Exited")

In [25]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# class balance of Exited matters for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=67,
)

### Scaling Features

In [26]:
# Learn the scaling statistics from the training rows only, then apply the same
# fitted transform to both splits so no test-set information enters the scaler.
# After this cell X_train and X_test are NumPy arrays rather than DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train, X_test

(array([[-1.25752149,  0.91163733,  2.28831459, ..., -1.00125078,
         -0.57658047,  1.73262835],
        [-1.49613004,  0.91163733, -0.18076512, ...,  0.99875078,
         -0.57658047, -0.57715782],
        [ 0.51648116, -1.09692743, -0.84551735, ...,  0.99875078,
         -0.57658047, -0.57715782],
        ...,
        [ 0.11188407,  0.91163733,  0.00916408, ...,  0.99875078,
         -0.57658047, -0.57715782],
        [ 1.51241248,  0.91163733,  0.19909329, ..., -1.00125078,
         -0.57658047,  1.73262835],
        [-1.1226558 ,  0.91163733, -0.65558815, ...,  0.99875078,
         -0.57658047, -0.57715782]], shape=(8000, 12)),
 array([[-0.36533302,  0.91163733, -0.65558815, ...,  0.99875078,
         -0.57658047, -0.57715782],
        [ 1.66802675, -1.09692743, -0.37069433, ...,  0.99875078,
         -0.57658047, -0.57715782],
        [-0.3134616 ,  0.91163733,  0.2940579 , ..., -1.00125078,
         -0.57658047,  1.73262835],
        ...,
        [-0.01260735, -1.09692743,  

In [27]:
# Sanity check: (rows, features) of the scaled training and test arrays.
X_train.shape, X_test.shape

((8000, 12), (2000, 12))

### Saving Scaler

In [28]:
# Persist the fitted scaler so the same scaling can be reapplied later.
# open(..., "wb") does not create missing parent directories, so ensure the
# target folder exists instead of failing with FileNotFoundError.
os.makedirs("artifacts/preprocessing", exist_ok=True)

with open("artifacts/preprocessing/scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

### Training ANN

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [30]:
# Feed-forward binary classifier: two ReLU hidden layers and a single sigmoid output unit.
# The input width is declared with an explicit Input object instead of passing
# `input_shape` to the first Dense layer, which Keras warns against for
# Sequential models.
model = Sequential([
        tf.keras.Input(shape = (X_train.shape[1], )), # one value per feature column
        Dense(64, activation = 'relu'), # hidden layer 1
        Dense(32, activation = 'relu'), # hidden layer 2
        Dense(1, activation = 'sigmoid') # output layer
    ]
    )

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
# Print the model's layer-by-layer summary.
model.summary()

In [32]:
# Binary cross-entropy pairs with the single sigmoid output unit; Adam is used
# with an explicit learning rate of 0.01.
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

In [33]:
# Attach the optimizer and loss to the model and track accuracy during training.
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [34]:
# Give every training run its own timestamped log folder so runs do not overwrite each other.
train_date_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f"logs/train_{train_date_time}"

# TensorBoard callback writing to that folder; histogram_freq=1 records
# histograms every epoch.
tensorflow_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [35]:
# Early stopping: halt training once val_loss has failed to improve for 5
# consecutive epochs, then roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [36]:
# Train for up to 100 epochs; early stopping may end the run sooner, and
# TensorBoard logs are written through the callback.
# NOTE(review): the test split doubles as validation data here, so early stopping
# selects weights using the test set - confirm a separate validation split is not wanted.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tensorflow_callback, early_stopping],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8342 - loss: 0.3954 - val_accuracy: 0.8615 - val_loss: 0.3673
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8526 - loss: 0.3548 - val_accuracy: 0.8615 - val_loss: 0.3495
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8571 - loss: 0.3468 - val_accuracy: 0.8625 - val_loss: 0.3403
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8574 - loss: 0.3441 - val_accuracy: 0.8610 - val_loss: 0.3465
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8594 - loss: 0.3402 - val_accuracy: 0.8575 - val_loss: 0.3601
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8559 - loss: 0.3389 - val_accuracy: 0.8635 - val_loss: 0.3455
Epoch 7/100
[1m250/25

In [37]:
# Save the trained model in the native .keras format.
# The output folder is created first so saving does not fail on a fresh
# checkout where artifacts/model does not exist yet.
os.makedirs("artifacts/model", exist_ok=True)
model.save("artifacts/model/model.keras")

### Loading TensorBoard Extension

In [38]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [40]:
# Open TensorBoard inline, pointed at this run's log folder ({log_dir} is
# interpolated from the Python variable of the same name).
%tensorboard --logdir {log_dir}

Reusing TensorBoard on port 6010 (pid 7880), started 0:00:09 ago. (Use '!kill 7880' to kill it.)