In [1]:
# Importing necessities for the analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline  
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import joblib
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("max_colwidth", None)

In [13]:
# Loading the data
df_churn=pd.read_csv('data/Churn_prepared.csv',index_col='customerID')

In [4]:
sampler = TPESampler(seed=1)
pruner = MedianPruner(n_startup_trials=10)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scaler= StandardScaler()

In [5]:
# Prepare your training data for h1n1
X = df_churn.drop(columns='Churn')
y = df_churn['Churn']


# LogReg study

In [22]:


def objective_logistic(trial):
    # Define the hyperparameters to search
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cg']),
        'max_iter': trial.suggest_int('max_iter', 100, 2000),
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),
        'fit_intercept': True,
    }

    # Create a pipeline with scaling and logistic regression
    pipe = Pipeline([
        ('scaler', StandardScaler()), 
        ('model', LogisticRegression(**params))  
    ])
    
    # Perform cross-validation and calculate the average AUC score
    auc_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean AUC score across all folds
    return np.mean(auc_scores)

# Create and run the Optuna study
study_logistic = optuna.create_study(direction='maximize',sampler=sampler,pruner=pruner)
study_logistic.optimize(objective_logistic, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_logistic = study_logistic.best_params

model_logreg = LogisticRegression(**best_params_logistic, random_state=1)
joblib.dump(model_logreg, 'logistic_regression_model.pkl')

print(f"Best hyperparameters for Logistic Regression: {best_params_logistic}")
print("Model saved as 'logistic_regression_model.pkl'")


[I 2024-10-11 22:59:04,551] A new study created in memory with name: no-name-59436bd4-bb71-40e6-9dcb-c5c0535d1ace
[I 2024-10-11 22:59:04,769] Trial 0 finished with value: 0.8414226553411528 and parameters: {'penalty': 'l2', 'C': 0.00436431819436026, 'solver': 'lbfgs', 'max_iter': 1889, 'tol': 0.0015807259463275515}. Best is trial 0 with value: 0.8414226553411528.
[I 2024-10-11 22:59:04,896] Trial 2 finished with value: 0.8433327945280829 and parameters: {'penalty': 'l2', 'C': 0.043741786309968334, 'solver': 'sag', 'max_iter': 1719, 'tol': 0.008458015727467473}. Best is trial 2 with value: 0.8433327945280829.
[I 2024-10-11 22:59:05,043] Trial 3 finished with value: 0.8434180934325847 and parameters: {'penalty': 'l2', 'C': 0.05170740006355034, 'solver': 'newton-cg', 'max_iter': 1842, 'tol': 0.000759379569611599}. Best is trial 3 with value: 0.8434180934325847.
[I 2024-10-11 22:59:05,113] Trial 4 finished with value: 0.8374682901295769 and parameters: {'penalty': 'l2', 'C': 0.000398156132

Best hyperparameters for Logistic Regression: {'penalty': 'l2', 'C': 1175.2447403855879, 'solver': 'lbfgs', 'max_iter': 1249, 'tol': 7.420432894756062e-05}
Model saved as 'logistic_regression_model.pkl'


# Random Forest Study

In [21]:

# Define the objective function for tuning Random Forest
def objective_random_forest(trial):
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),  # Number of trees
        'max_depth': trial.suggest_int('max_depth', 3, 20),  # Max tree depth
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),  # Min samples to split
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),  # Min samples at a leaf
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # Max features to split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),  # Bootstrap sampling
    }

    # Create a Random Forest model with the hyperparameters
    model = RandomForestClassifier(**params, random_state=42)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for Random Forest
study_rf = optuna.create_study(direction='maximize',sampler=sampler,pruner=pruner)
study_rf.optimize(objective_random_forest, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_rf = study_rf.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_rf = RandomForestClassifier(**best_params_rf, random_state=1)
joblib.dump(model_rf, 'random_forest_model.pkl')

print(f"Best hyperparameters for Random Forest: {best_params_rf}")
print("Model saved as 'random_forest_model.pkl'")


[I 2024-10-11 22:50:43,672] A new study created in memory with name: no-name-18bb6fe0-0ff7-42e7-b55a-b68a75c9a070
[I 2024-10-11 22:50:47,385] Trial 0 finished with value: 0.8458276563633065 and parameters: {'n_estimators': 220, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.8458276563633065.
[I 2024-10-11 22:50:50,328] Trial 1 finished with value: 0.8479117519904718 and parameters: {'n_estimators': 315, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 17, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.8479117519904718.
[I 2024-10-11 22:50:52,041] Trial 3 finished with value: 0.8414833944749377 and parameters: {'n_estimators': 250, 'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}. Best is trial 1 with value: 0.8479117519904718.
[I 2024-10-11 22:50:57,255] Trial 2 finished with value: 0.8433411862047466 

Best hyperparameters for Random Forest: {'n_estimators': 679, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 18, 'max_features': 'log2', 'bootstrap': True}
Model saved as 'random_forest_model.pkl'


# Support Vector Machine Study

In [20]:


# Define the objective function for tuning SVC with scaling
def objective_svc(trial):
    # Define the hyperparameters to search
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel
    
    params = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),  # Regularization 
        'degree': trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3,  # Degree for 'poly' kernel
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),  # Kernel coefficient
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),  # Tolerance for stopping
    }

    # Create a pipeline with scaling and SVC
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling
        ('svc', SVC(kernel=kernel, **params, probability=True, random_state=1))  
    ])

    # Perform cross-validation and calculate the average ROC AUC score
    accuracy_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean ROC AUC score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for SVC
study_svc = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_svc.optimize(objective_svc, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_svc = study_svc.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_svc = SVC(kernel=best_params_svc['kernel'], **{k: v for k, v in best_params_svc.items() if k != 'kernel'}, probability=True, random_state=1)
joblib.dump(model_svc, 'svc_model.pkl')

print(f"Best hyperparameters for SVC: {best_params_svc}")
print("Model saved as 'svc_model.pkl'")


[I 2024-10-11 22:10:17,342] A new study created in memory with name: no-name-e05b6fc6-7fce-4a39-81b1-4ac23ef4d482
[I 2024-10-11 22:10:38,733] Trial 0 finished with value: 0.7939595236213529 and parameters: {'kernel': 'poly', 'C': 0.04324043046420094, 'degree': 4, 'gamma': 'auto', 'tol': 0.009449520298876924}. Best is trial 0 with value: 0.7939595236213529.
[I 2024-10-11 22:10:54,981] Trial 4 finished with value: 0.7919812686430632 and parameters: {'kernel': 'poly', 'C': 0.6329967857634339, 'degree': 4, 'gamma': 'auto', 'tol': 0.006304917026991936}. Best is trial 0 with value: 0.7939595236213529.
[I 2024-10-11 22:11:23,853] Trial 1 finished with value: 0.738320672674568 and parameters: {'kernel': 'rbf', 'C': 983.8741611453303, 'gamma': 'scale', 'tol': 0.009561952178630623}. Best is trial 0 with value: 0.7939595236213529.
[I 2024-10-11 22:11:54,864] Trial 5 finished with value: 0.8220680929252824 and parameters: {'kernel': 'rbf', 'C': 0.04809907307106654, 'gamma': 'auto', 'tol': 0.005135

Best hyperparameters for SVC: {'kernel': 'sigmoid', 'C': 0.0340906254862345, 'gamma': 'auto', 'tol': 0.0009294937426134367}
Model saved as 'svc_model.pkl'


# XGB Gradient Boosting Classifier Study

In [19]:


# Define the objective function for tuning XGBoost
def objective_xgb(trial):
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),  # Number of boosting rounds
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # Max depth of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),  # Learning rate (eta)
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Subsampling ratio
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # Feature subsampling
        'objective': 'binary:logistic',  # Binary classification
        'eval_metric': 'logloss',  # Evaluation metric
        'use_label_encoder': False  # Avoid unnecessary warning
    }

    # Create an XGBoost model with the hyperparameters
    model = xgb.XGBClassifier(**params, random_state=1)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for XGBoost
study_xgb = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_xgb.optimize(objective_xgb, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_xgb = study_xgb.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_xgb = xgb.XGBClassifier(**best_params_xgb, random_state=42)
joblib.dump(model_xgb, 'xgb_model.pkl')

print(f"Best hyperparameters for XGBoost: {best_params_xgb}")
print("Model saved as 'xgb_model.pkl'")


[I 2024-10-11 22:09:19,640] A new study created in memory with name: no-name-de127a60-cb00-4a43-9eb4-6a9a0060e4d6
[I 2024-10-11 22:09:24,036] Trial 0 finished with value: 0.829077270359034 and parameters: {'n_estimators': 314, 'max_depth': 4, 'learning_rate': 0.13747971619980964, 'subsample': 0.7469002847257884, 'colsample_bytree': 0.9405515834851348}. Best is trial 0 with value: 0.829077270359034.
[I 2024-10-11 22:09:25,970] Trial 7 finished with value: 0.8477354175160003 and parameters: {'n_estimators': 212, 'max_depth': 3, 'learning_rate': 0.06199109265288598, 'subsample': 0.7007994294299513, 'colsample_bytree': 0.590737343928041}. Best is trial 7 with value: 0.8477354175160003.
[I 2024-10-11 22:09:26,087] Trial 4 finished with value: 0.7983660672054788 and parameters: {'n_estimators': 406, 'max_depth': 9, 'learning_rate': 0.2822280517717014, 'subsample': 0.5854537739602208, 'colsample_bytree': 0.937046608063131}. Best is trial 7 with value: 0.8477354175160003.
[I 2024-10-11 22:09:2

Best hyperparameters for XGBoost: {'n_estimators': 149, 'max_depth': 3, 'learning_rate': 0.04078736842557374, 'subsample': 0.6130865171818347, 'colsample_bytree': 0.5225836933340453}
Model saved as 'xgb_model.pkl'


# Multi Layer Preceptron Study

In [9]:
def objective_mlp(trial):
    # Define the hyperparameters to search
    params = {
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(10,), (20,), (30,), (50,), (100,)]),  # Number of neurons in each layer
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh']),  # Activation function
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),  # Optimization algorithm
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),  # L2 penalty (regularization term)
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'adaptive']),  # Learning rate schedule
        'max_iter': trial.suggest_int('max_iter', 100, 1000),  # Maximum number of iterations
    }

    # Create a pipeline with scaling and MLP
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling
        ('mlp', MLPClassifier(**params, random_state=1))  # MLP model
    ])
    
    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for MLP
study_mlp = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_mlp.optimize(objective_mlp, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_mlp = study_mlp.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_mlp = MLPClassifier(**best_params_mlp, random_state=1)
joblib.dump(model_mlp, 'mlp_model.pkl')

print(f"Best hyperparameters for MLP: {best_params_mlp}")
print("Model saved as 'mlp_model.pkl'")

[I 2024-10-11 21:01:32,890] A new study created in memory with name: no-name-7425d946-c907-4321-8237-74333cfaeacf
[I 2024-10-11 21:01:39,083] Trial 0 finished with value: 0.8391812478844034 and parameters: {'hidden_layer_sizes': (20,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 1.1294939637442101e-05, 'learning_rate': 'adaptive', 'max_iter': 774}. Best is trial 0 with value: 0.8391812478844034.
[I 2024-10-11 21:01:43,548] Trial 1 finished with value: 0.8380288019985651 and parameters: {'hidden_layer_sizes': (20,), 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.008711343119478517, 'learning_rate': 'constant', 'max_iter': 915}. Best is trial 0 with value: 0.8391812478844034.
[I 2024-10-11 21:01:49,531] Trial 2 finished with value: 0.8420772552790888 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.025096087857997686, 'learning_rate': 'adaptive', 'max_iter': 265}. Best is trial 2 with value: 0.8420772552790888.
[I 2024-10-11 21:02:18

Best hyperparameters for MLP: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.08733302858561147, 'learning_rate': 'adaptive', 'max_iter': 368}
Model saved as 'mlp_model.pkl'


# Naive Bayes Model Study

In [18]:
# Define the objective function for tuning Gaussian Naive Bayes
def objective_naive_bayes(trial):
    # Naive Bayes typically has fewer hyperparameters, but we can tune 'var_smoothing'
    params = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-1, log=True)  # Smoothing parameter
    }

    # Create a Gaussian Naive Bayes model with the hyperparameters
    model = GaussianNB(**params)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for Naive Bayes
study_nb = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_nb.optimize(objective_naive_bayes, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_nb = study_nb.best_params
model_gnb = GaussianNB(**best_params_nb)
joblib.dump(model_gnb, 'naive_bayes_model.pkl')

print(f"Best hyperparameters for Naive Bayes: {best_params_nb}")
print("Model saved as 'naive_bayes_model.pkl'")

[I 2024-10-11 21:32:15,730] A new study created in memory with name: no-name-391e7393-e9dd-4b2e-bfc5-80318c30b68e
[I 2024-10-11 21:32:15,850] Trial 0 finished with value: 0.8302505737367774 and parameters: {'var_smoothing': 4.004061308678203e-08}. Best is trial 0 with value: 0.8302505737367774.
[I 2024-10-11 21:32:15,974] Trial 3 finished with value: 0.8318932434202445 and parameters: {'var_smoothing': 2.8275567503327517e-08}. Best is trial 3 with value: 0.8318932434202445.
[I 2024-10-11 21:32:15,984] Trial 2 finished with value: 0.6637058925407826 and parameters: {'var_smoothing': 0.02936515628211917}. Best is trial 3 with value: 0.8318932434202445.
[I 2024-10-11 21:32:16,015] Trial 1 finished with value: 0.6644854798427322 and parameters: {'var_smoothing': 0.026602426419061055}. Best is trial 3 with value: 0.8318932434202445.
[I 2024-10-11 21:32:16,044] Trial 5 finished with value: 0.8326487852486519 and parameters: {'var_smoothing': 4.787512729442204e-09}. Best is trial 5 with value

Best hyperparameters for Naive Bayes: {'var_smoothing': 1.011436483981849e-08}
Model saved as 'naive_bayes_model.pkl'
