In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
import imblearn
import keras_tuner as kt

In [2]:
dataframe = pd.read_csv("train.csv")
dataframe.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [3]:
# Create new columns based on the Gender column
dataframe['Is_Male'] = dataframe['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
dataframe['Is_Female'] = dataframe['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

# Create new columns based on the Geography column
dataframe['Is_Germany'] = dataframe['Geography'].apply(lambda x: 1 if x == 'Germany' else 0)
dataframe['Is_Spain'] = dataframe['Geography'].apply(lambda x: 1 if x == 'Spain' else 0)
dataframe['Is_France'] = dataframe['Geography'].apply(lambda x: 1 if x == 'France' else 0)

In [4]:
# drop non-int columns
dataframe.drop(['Geography', 'Gender'], axis = 1, inplace = True)

In [5]:
y = dataframe.Exited.values
X = dataframe.drop(columns=['id','CustomerId','Surname','Exited'])

In [6]:
# SMOTE

from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

In [7]:
# Train Split
X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, random_state=42, stratify=y_smote)

In [8]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=13))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [10]:
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

Reloading Tuner from .\untitled_project\tuner0.json


In [11]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 56 Complete [00h 01m 49s]
val_accuracy: 0.8625051975250244

Best val_accuracy So Far: 0.8995803594589233
Total elapsed time: 00h 37m 21s


In [12]:
# # # Define the deep learning model 
# nn_model = tf.keras.models.Sequential()
# nn_model.add(tf.keras.layers.Dense(units=32, activation="relu", input_dim=13))
# nn_model.add(tf.keras.layers.Dense(units=16, activation="relu"))
# nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Compile the Sequential model together and customise metrics
# nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Train the model
# fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# # Evaluate the model using the test data
# model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [19]:
top_hyper = tuner.get_best_hyperparameters(1)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 26, 'num_layers': 4, 'units_0': 26, 'units_1': 21, 'units_2': 21, 'units_3': 11, 'units_4': 11, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0018'}


In [20]:
top_model = tuner.get_best_models(1)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2034/2034 - 1s - loss: 0.2374 - accuracy: 0.8996 - 1s/epoch - 550us/step
Loss: 0.2373967319726944, Accuracy: 0.8995803594589233


In [21]:
# Get the top hyperparameters
top_hyper = tuner.get_best_hyperparameters(3)

# Build and train a model using the top hyperparameters
best_model = tuner.hypermodel.build(top_hyper[0])
best_model.fit(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x25201a327d0>

In [22]:
X_scaled = X_scaler.transform(X)

predictions = best_model.predict(X_scaled)

predicted_labels = (predictions > 0.5).astype(int)



In [23]:
predicted_df = pd.DataFrame(predicted_labels)

In [24]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc

conf_matrix = confusion_matrix(y, predicted_labels)

class_report = classification_report(y, predicted_labels)

accuracy = accuracy_score(y, predicted_labels)

# print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAccuracy Score:", accuracy)


Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91    130113
           1       0.69      0.62      0.65     34921

    accuracy                           0.86    165034
   macro avg       0.80      0.77      0.78    165034
weighted avg       0.86      0.86      0.86    165034


Accuracy Score: 0.8606044815007816


In [25]:
import joblib

filename = 'smote_nn_model'
joblib.dump(best_model, filename)

['smote_nn_model']