### Determining the optimal number of hidden layers and neurons for an Artificial Neural Network (ANN) 
This can be challenging and often requires experimentation. However, there are some guidelines and methods that can help you in making an informed decision:

- Start Simple: Begin with a simple architecture and gradually increase complexity if needed.
- Grid Search/Random Search: Use grid search or random search to try different architectures.
- Cross-Validation: Use cross-validation to evaluate the performance of different architectures.
- Heuristics and Rules of Thumb: Some heuristics and empirical rules can provide starting points, such as:
  -    The number of neurons in the hidden layer should be between the size of the input layer and the size of the output layer.
  -  A common practice is to start with 1-2 hidden layers.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import pickle

In [2]:
data=pd.read_csv('Churn_Modelling.csv')
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

onehot_encoder_geo = OneHotEncoder(handle_unknown='ignore')
geo_encoded = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded, columns=onehot_encoder_geo.get_feature_names_out(['Geography']))

data = pd.concat([data.drop('Geography', axis=1), geo_encoded_df], axis=1)

X = data.drop('Exited', axis=1)
y = data['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save encoders and scaler for later use
with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

with open('onehot_encoder_geo.pkl', 'wb') as file:
    pickle.dump(onehot_encoder_geo, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [3]:
## Define a function to create the model and try different parameters(KerasClassifier)

def create_model(neurons=32,layers=1):
    model=Sequential()
    model.add(Dense(neurons,activation='relu',input_shape=(X_train.shape[1],)))

    for _ in range(layers-1):
        model.add(Dense(neurons,activation='relu'))

    model.add(Dense(1,activation='sigmoid'))
    model.compile(optimizer='adam',loss="binary_crossentropy",metrics=['accuracy'])

    return model



In [4]:
## Create a Keras classifier
model=KerasClassifier(layers=1,neurons=32,build_fn=create_model,verbose=1)

In [None]:

#  1.Define the grid search parameters
param_grid = {
    'neurons': [16, 32, 64, 128],
    'layers': [1, 2],
    'epochs': [50, 100]
}

In [6]:
# Perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3,verbose=1)
grid_result = grid.fit(X_train, y_train)

# Print the best parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - accuracy: 0.7443 - loss: 0.5552
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890us/step - accuracy: 0.8115 - loss: 0.4409
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step - accuracy: 0.8268 - loss: 0.4087
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step - accuracy: 0.8289 - loss: 0.3988
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 806us/step - accuracy: 0.8404 - loss: 0.3785
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - accuracy: 0.8402 - loss: 0.3801
Epoch 7/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step - accuracy: 0.8499 - loss: 0.3639
Epoch 8/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 810us/step - accuracy: 0.8530 - loss: 0.3613
Epoch 9/100
[1m

## RandomSearchCV

In [30]:
# Importing required libraries
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import tensorflow as tf

def create_model(units=64, dropout_rate=0.0, learning_rate=0.001):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),  # Explicitly define the input shape
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [31]:
# Wrapping the model with KerasClassifier
model = KerasClassifier(dropout_rate=0.0, learning_rate=0.001, units=64,model=create_model, verbose=0)

In [32]:
# 2. RandomizedSearchCV
param_dist = {
    'units': [32, 64, 128, 256],
    'dropout_rate': uniform(0, 0.5),
    'learning_rate': uniform(0.0001, 0.01),
    'epochs': [10, 20, 30]
}

In [33]:
# RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=3, verbose=1)

random_search_result = random_search.fit(X_train, y_train)
print("Best parameters:", random_search_result.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters: {'dropout_rate': 0.4832321291990546, 'epochs': 20, 'learning_rate': 0.004361333099891205, 'units': 128}


In [34]:
print("Best parameters:", random_search_result.best_params_,random_search_result.best_score_)

Best parameters: {'dropout_rate': 0.4832321291990546, 'epochs': 20, 'learning_rate': 0.004361333099891205, 'units': 128} 0.8578746673920074


## Bayesian Optimization (using hyperopt)

In [49]:
# 3. Bayesian Optimization (using hyperopt)
from hyperopt import fmin, tpe, hp, Trials

def objective(params):
    # Ensure model creation only uses model-specific parameters (dropout_rate, learning_rate, units)
    model = create_model(dropout_rate=params['dropout_rate'],
                         learning_rate=params['learning_rate'],
                         units=int(params['units']))
    
    # Fit the model on the training data, using epochs and batch_size for training
    history = model.fit(X_train, y_train, batch_size=32,
                        epochs=int(params['epochs']), verbose=0)
    
    # Return negative accuracy to minimize the objective
    accuracy = history.history['accuracy'][-1]
    return -accuracy

In [50]:
# Define the hyperparameter search space
space = {
    'units': hp.choice('units', [32, 64, 128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0, 0.5),
    'learning_rate': hp.loguniform('learning_rate', -5, -2),
    'epochs': hp.choice('epochs', [10, 20, 30])
}

In [51]:
trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
print("Best Bayesian Optimization parameters:", best_params)

100%|██████████| 20/20 [02:08<00:00,  6.44s/trial, best loss: -0.8693749904632568]
Best Bayesian Optimization parameters: {'dropout_rate': 0.04752716966784132, 'epochs': 2, 'learning_rate': 0.007022834576888465, 'units': 2}


Bayesian Optimization itself does not directly train the model; rather, it optimizes the hyperparameters of the model by minimizing (or maximizing) a loss function that you define in the objective function. The optimization process does not inherently involve training the model with the best hyperparameters or calculating accuracy directly.

In [53]:
from sklearn.metrics import accuracy_score

# Create the best model with the optimized parameters
best_model = create_model(dropout_rate=best_params['dropout_rate'],
                           learning_rate=best_params['learning_rate'],
                           units=int(best_params['units']))

# Train the model on the training data using the best number of epochs
best_model.fit(X_train, y_train, batch_size=32, epochs=int(best_params['epochs']))

# Predict on the test set
y_pred = best_model.predict(X_test)

# Convert probabilities to binary values (0 or 1)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate accuracy on the test set
best_accuracy = accuracy_score(y_test, y_pred_binary)

# Output the final accuracy
print(f"Best Accuracy: {best_accuracy}")


Epoch 1/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5603 - loss: 0.8279
Epoch 2/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 943us/step - accuracy: 0.7978 - loss: 0.4940
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Best Accuracy: 0.841


## Monte Carlo Dropout

In [58]:
# 4. Monte Carlo Dropout
class MonteCarloDropout(tf.keras.layers.Dropout):
    def call(self, inputs, training=None):
        return super().call(inputs, training=True)

def create_model_with_mc(dropout_rate=0.0, learning_rate=0.001, units=64):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(units, activation='relu'),
        MonteCarloDropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [59]:
mc_model = create_model_with_mc(dropout_rate=0.2, learning_rate=0.001, units=64)

In [60]:
mc_model.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7830 - loss: 0.5077
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 997us/step - accuracy: 0.8208 - loss: 0.4308
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8338 - loss: 0.4064
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8465 - loss: 0.3749
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8459 - loss: 0.3673
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8519 - loss: 0.3693
Epoch 7/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8557 - loss: 0.3608
Epoch 8/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8537 - loss: 0.3567
Epoch 9/10
[1m250/250[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x1fa26438150>

In [61]:
predictions = [mc_model.predict(X_test) for _ in range(100)]

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 841us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 961us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 923us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 932us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━

In [64]:
import numpy as np
uncertainty = np.var(predictions, axis=0)
print("Uncertainty estimates:", uncertainty)

Uncertainty estimates: [[0.00148102]
 [0.00027513]
 [0.00150224]
 ...
 [0.00947487]
 [0.00427217]
 [0.00571837]]
