In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import numpy as np
import pandas as pd
import pickle
import optuna
import tensorflow as tf
from feature_engine.encoding import OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from tensorflow.keras import layers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the encoder and scaler
with open('oh_encoder_geo.pkl','rb') as file:
    oh_encoder_geo=pickle.load(file)

with open('label_encoder_gender.pkl', 'rb') as file:
    label_encoder_gender = pickle.load(file)

with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [3]:
df= pd.read_csv(r"C:\Users\furka\JupyterNotebookProjects\End-to-End-Churn-Modelling-Binary-Classification\Churn_Modelling.csv")

In [4]:
## Drop the unnecessary columns
df=df.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [5]:
# Drop the null values.
# Since the number of missing data is low, we can directly delete the missing data instead of filling it in. 
# Because it will not affect the model. We can use dropna() for this.
df = df.dropna()
# Dropping the duplicate values
df = df.drop_duplicates()

print('Rows containing missing data and duplicate data were deleted.')

Rows containing missing data and duplicate data were deleted.


In [6]:
# Separate into train and test set
# Remember to set the seed (random_state for this sklearn function)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Exited'], axis=1), # predictive variables
    df['Exited'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

print('The dataset separate dataset into train and test successfully')
X_train.shape, X_test.shape

The dataset separate dataset into train and test successfully


((8000, 10), (2000, 10))

In [7]:
# Encoding and Feature Scaling
X_train = label_encoder_gender.transform(X_train)
X_test = label_encoder_gender.transform(X_test)

X_train = oh_encoder_geo.transform(X_train)
X_test = oh_encoder_geo.transform(X_test)

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [8]:
# Define a function to create the model with dynamic parameters
def create_model(trial):
    neurons = trial.suggest_categorical('neurons', [8, 16, 32, 64, 128])
    layers = trial.suggest_int('layers', 1, 3)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    
    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(X_train.shape[1],)))
    
    for _ in range(layers - 1):
        model.add(Dense(neurons, activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy'])
    return model

In [9]:
# Define an objective function
def objective(trial):
    # Create and compile the model
    model = create_model(trial)
    
    # Get hyperparameters
    epochs = trial.suggest_categorical('epochs', [50, 100])
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64])
    
    # Define EarlyStopping callback
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True
    )
    
    # K-Fold cross-validation
    kfold = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kfold.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Train the model
        history = model.fit(
            X_tr, y_tr,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping],
            verbose=0
        )
        
        # Evaluate on validation set
        val_score = model.evaluate(X_val, y_val, verbose=0)[1]  # Get accuracy
        scores.append(val_score)
    
    return np.mean(scores)

In [10]:
# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20, n_jobs=8)

# Best parameters
print("Best parameters found: ", study.best_params)
print("Best score: ", study.best_value)

[I 2024-11-03 21:59:19,749] A new study created in memory with name: no-name-a9d75a9e-7cbb-44f1-ab6b-abbb1e3f8d3b
[I 2024-11-03 21:59:47,949] Trial 1 finished with value: 0.7956252296765646 and parameters: {'neurons': 16, 'layers': 3, 'learning_rate': 1.2477923824405202e-05, 'epochs': 50, 'batch_size': 64}. Best is trial 1 with value: 0.7956252296765646.
[I 2024-11-03 22:00:07,891] Trial 7 finished with value: 0.7973750034968058 and parameters: {'neurons': 8, 'layers': 3, 'learning_rate': 4.655427676519257e-05, 'epochs': 50, 'batch_size': 16}. Best is trial 7 with value: 0.7973750034968058.
[I 2024-11-03 22:00:19,089] Trial 8 finished with value: 0.7968751589457194 and parameters: {'neurons': 16, 'layers': 1, 'learning_rate': 1.2882176570689467e-05, 'epochs': 100, 'batch_size': 64}. Best is trial 7 with value: 0.7973750034968058.
[I 2024-11-03 22:00:26,603] Trial 9 finished with value: 0.8663754463195801 and parameters: {'neurons': 16, 'layers': 3, 'learning_rate': 0.004957239331389894

Best parameters found:  {'neurons': 128, 'layers': 3, 'learning_rate': 2.7745708726617632e-05, 'epochs': 100, 'batch_size': 8}
Best score:  0.8705002466837565
