In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
import pickle

In [9]:
# Read the churn dataset from a path relative to the notebook's directory,
# then preview the first rows.
data = pd.read_csv(
    '../../data/Churn_Modelling.csv',
)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Preprocess data

In [10]:
## Drop the columns treated as irrelevant for modelling
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [11]:
# Preview the first rows of the frame after the column drop.
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [12]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [13]:
# Number of rows per distinct Geography value.
data['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [14]:
# Column labels of every object-dtype column; these are handled as
# categorical features below.
cat_features = data.select_dtypes(include=['object']).columns
cat_features

Index(['Geography', 'Gender'], dtype='object')

In [15]:
# Report how many distinct values each categorical column has, and list them.
for col in cat_features:
    column = data[col]
    print(f" {col} has {column.nunique()} unique vales being : {column.unique()}")

 Geography has 3 unique vales being : ['France' 'Spain' 'Germany']
 Gender has 2 unique vales being : ['Female' 'Male']


In [16]:
# Replace the Gender strings with integer class labels. The fitted encoder is
# kept in `label_encoder` so the same mapping can be persisted and reused.
label_encoder = LabelEncoder()
label_encoder.fit(data['Gender'])
data['Gender'] = label_encoder.transform(data['Gender'])

In [17]:
# One-hot encode Geography. drop='first' omits the column of the first
# category, and sparse_output=False makes the encoder return a dense array.
One_encoder = OneHotEncoder(drop='first', sparse_output=False)
geography_column = data[['Geography']]
geo_encoder = One_encoder.fit(geography_column).transform(geography_column)

In [18]:
# Wrap the encoded array in a DataFrame with the encoder's feature names.
# Reusing data.index keeps the rows aligned with `data` when the two frames
# are concatenated column-wise, even if `data` does not carry the default
# 0..n-1 index.
geo_df = pd.DataFrame(
    geo_encoder,
    columns=One_encoder.get_feature_names_out(['Geography']),
    index=data.index,
)

In [19]:
# Display the one-hot encoded Geography columns.
geo_df

Unnamed: 0,Geography_Germany,Geography_Spain
0,0.0,0.0
1,0.0,1.0
2,0.0,0.0
3,0.0,0.0
4,0.0,1.0
...,...,...
9995,0.0,0.0
9996,0.0,0.0
9997,0.0,0.0
9998,1.0,0.0


In [20]:
# Swap the raw Geography column for its one-hot encoded columns.
# NOTE: this rebinds `data`; re-running the cell without re-running the ones
# above fails because 'Geography' has already been dropped.
data_without_geo = data.drop(columns=['Geography'])
data = pd.concat([data_without_geo, geo_df], axis=1)

In [21]:
# Display the fully encoded frame (pandas truncates the rendered rows).
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,1.0,0.0


In [22]:
## Save the fitted encoders (the scaler is pickled after it has been fitted)
for path, encoder in (
    ('label_encoder_gender.pkl', label_encoder),
    ('onehot_encoder_geo.pkl', One_encoder),
):
    with open(path, 'wb') as file:
        pickle.dump(encoder, file)

In [23]:
## Feature matrix X (every column except the target) and target vector y
y = data['Exited']
X = data.drop(columns='Exited')

In [24]:
## Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits so that no test-set statistics enter the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Persist the fitted scaler next to the notebook.
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# ANN Implementation

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime


#### Build ANN Model

In [27]:
# Feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.
# The input width is declared with an explicit Input object instead of passing
# `input_shape` to the first Dense layer, which newer Keras versions flag with
# a UserWarning.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(64, activation='relu'),     # hidden layer 1
    Dense(32, activation='relu'),     # hidden layer 2
    Dense(1, activation='sigmoid'),   # output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
# Print the model's layer-by-layer summary.
model.summary()

In [29]:
import tensorflow

# Adam with an explicit learning rate of 0.01, paired with binary
# cross-entropy as the training loss.
loss = tensorflow.keras.losses.BinaryCrossentropy()
opt = tensorflow.keras.optimizers.Adam(
    learning_rate=0.01,
)

In [30]:
## Compile the model with the optimizer and loss defined above; track accuracy
model.compile(
    optimizer=opt,
    loss=loss,
    metrics=['accuracy'],
)

In [31]:
## Set up TensorBoard logging
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

# Each run gets its own timestamped sub-directory *inside* logs/fit. The
# trailing slash matters: without it the runs are written to sibling
# directories named "logs/fit<timestamp>", which a TensorBoard instance
# started with `--logdir logs/fit` never sees.
log_dir = "logs/fit/" + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
# histogram_freq=1 records weight histograms every epoch.
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [32]:
## Early stopping: halt once val_loss has gone 10 epochs without improving,
## and roll the model back to the weights of its best epoch
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [33]:
## Train for up to 100 epochs; early stopping may end the run sooner.
## NOTE(review): the held-out test split doubles as the validation set here,
## so the reported validation metrics also steer early stopping.
training_callbacks = [tensorflow_callback, early_stopping_callback]
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=training_callbacks,
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8195 - loss: 0.4339 - val_accuracy: 0.8570 - val_loss: 0.3497
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8526 - loss: 0.3575 - val_accuracy: 0.8465 - val_loss: 0.3586
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8617 - loss: 0.3405 - val_accuracy: 0.8605 - val_loss: 0.3487
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8594 - loss: 0.3364 - val_accuracy: 0.8500 - val_loss: 0.3511
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8681 - loss: 0.3297 - val_accuracy: 0.8575 - val_loss: 0.3420
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8642 - loss: 0.3314 - val_accuracy: 0.8575 - val_loss: 0.3488
Epoch 7/100
[1m250/25

In [34]:
# Save the trained network to model.h5 in the current working directory.
model.save('model.h5')



In [35]:
## Load the TensorBoard notebook extension (enables the %tensorboard magic)
%load_ext tensorboard

In [36]:
# Launch TensorBoard inline, reading event files from logs/fit.
%tensorboard --logdir logs/fit

## Hyperparameter Tuning

In [37]:
# Install scikeras into the kernel's environment, then import its
# scikit-learn compatible wrapper for Keras models.
# NOTE(review): the install is unpinned and, as pip's own message says, may
# need a kernel restart before the import succeeds. Consider pinning the
# version and moving the install to a setup cell at the top of the notebook.
%pip install scikeras
from scikeras.wrappers import KerasClassifier

Note: you may need to restart the kernel to use updated packages.


In [38]:
def create_model(neurons=32,layers=1):
    """Build and compile a fully connected binary classifier.

    Args:
        neurons: Number of units in every hidden layer.
        layers: Number of hidden layers. The first hidden layer is always
            added, so any value below 1 still yields one hidden layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) ending in a single sigmoid unit.

    Note:
        The input width is read from the notebook-level ``X_train``, which
        must exist before this function is called.
    """
    model = Sequential()
    # Declare the input explicitly rather than passing `input_shape` to the
    # first Dense layer, which newer Keras versions flag with a UserWarning.
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))
    model.add(Dense(neurons, activation='relu'))
    # Any remaining hidden layers beyond the first.
    for _ in range(layers-1):
        model.add(Dense(neurons, activation='relu'))

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [42]:
# Wrap the Keras builder as a scikit-learn estimator so it can be tuned with
# GridSearchCV. The builder is passed through `model=`; `build_fn=` is the
# deprecated spelling of the same argument in scikeras. `layers` and
# `neurons` are forwarded to create_model, and the values given here are
# placeholders that the grid search overrides.
# NOTE: this rebinds `model`, which previously referred to the trained network.
model = KerasClassifier(
    model=create_model,
    layers=1,
    neurons=1,
    epochs=50,
    batch_size=10,
    verbose=0,
)

In [40]:
# Search space for the grid search: 3 x 3 x 2 x 2 = 36 combinations.
param_grid = dict(
    neurons=[32, 64, 128],
    layers=[1, 2, 3],
    batch_size=[10, 20],
    epochs=[50, 100],
)

In [43]:
# Exhaustive 3-fold cross-validated search over param_grid, using all
# available CPU cores (n_jobs=-1).
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [44]:
# Report the best mean cross-validation score and the parameters behind it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print(f"Best: {best_score} using {best_params}")

Best: 0.8567495267142619 using {'batch_size': 20, 'epochs': 50, 'layers': 1, 'neurons': 64}


In [45]:
# Rebuild the winning architecture from the grid search: one hidden layer of
# 64 units. These values are copied by hand from the printed best parameters,
# so update them if the search is re-run and selects something else.
# The input is declared with an explicit Input object instead of passing
# `input_shape` to the Dense layer, which newer Keras versions flag with a
# UserWarning.
best_model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),  # neurons=64, layers=1
    Dense(1, activation='sigmoid')
])
best_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [46]:
# Log this run under its own timestamped directory in logs/best_model.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = "logs/best_model/" + run_stamp

# Same callback configuration as the first training run: per-epoch histograms,
# and early stopping on val_loss with a patience of 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# epochs and batch_size are the values the grid search selected.
history = best_model.fit(
    X_train,
    y_train,
    batch_size=20,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[tensorboard_callback, early_stopping],
)

# Write the trained model into the shared artifacts folder.
best_model.save('../../Artifacts/classification/model.h5')


Epoch 1/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6822 - loss: 0.5707 - val_accuracy: 0.8225 - val_loss: 0.4039
Epoch 2/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8237 - loss: 0.4131 - val_accuracy: 0.8430 - val_loss: 0.3740
Epoch 3/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8382 - loss: 0.3842 - val_accuracy: 0.8570 - val_loss: 0.3576
Epoch 4/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8460 - loss: 0.3705 - val_accuracy: 0.8590 - val_loss: 0.3523
Epoch 5/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8588 - loss: 0.3488 - val_accuracy: 0.8575 - val_loss: 0.3485
Epoch 6/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8653 - loss: 0.3388 - val_accuracy: 0.8615 - val_loss: 0.3484
Epoch 7/50
[1m400/400[0m 

