In [1]:
# Importing necessities for the analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline  
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import joblib
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("max_colwidth", None)

In [3]:
# Loading the data
df_churn=pd.read_csv('data/Churn_prepared.csv',index_col='customerID')

In [4]:
sampler = TPESampler(seed=1)
pruner = MedianPruner(n_startup_trials=10)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scaler= StandardScaler()

In [5]:
# Prepare your training data for h1n1
X = df_churn.drop(columns=['Churn'])
y = df_churn['Churn']


# LogReg study

In [6]:


def objective_logistic(trial):
    # Define the hyperparameters to search
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cg']),
        'max_iter': trial.suggest_int('max_iter', 100, 2000),
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),
        'fit_intercept': True,
    }

    # Create a pipeline with scaling and logistic regression
    pipe = Pipeline([
        ('scaler', StandardScaler()), 
        ('model', LogisticRegression(**params))  
    ])
    
    # Perform cross-validation and calculate the average AUC score
    auc_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean AUC score across all folds
    return np.mean(auc_scores)

# Create and run the Optuna study
study_logistic = optuna.create_study(direction='maximize',sampler=sampler,pruner=pruner)
study_logistic.optimize(objective_logistic, n_trials=250)

# Get the best hyperparameters
best_params_logistic = study_logistic.best_params

model_logreg = LogisticRegression(**best_params_logistic)
joblib.dump(model_logreg, 'logistic_regression_model.pkl')

print(f"Best hyperparameters for Logistic Regression: {best_params_logistic}")
print("Model saved as 'logistic_regression_model.pkl'")


[I 2024-10-12 00:25:37,079] A new study created in memory with name: no-name-11445746-abee-4a4f-8201-b5774a808afb
[I 2024-10-12 00:25:40,346] Trial 0 finished with value: 0.8445030297283507 and parameters: {'penalty': 'l2', 'C': 0.21685829435749204, 'solver': 'lbfgs', 'max_iter': 378, 'tol': 0.0009242936090932093}. Best is trial 0 with value: 0.8445030297283507.
[I 2024-10-12 00:25:42,316] Trial 1 finished with value: 0.8409828155218968 and parameters: {'penalty': 'l2', 'C': 0.003090876773389772, 'solver': 'newton-cg', 'max_iter': 896, 'tol': 0.006852509784467199}. Best is trial 0 with value: 0.8445030297283507.
[I 2024-10-12 00:25:42,549] Trial 2 finished with value: 0.8409873459638876 and parameters: {'penalty': 'l2', 'C': 0.004321335623994385, 'solver': 'lbfgs', 'max_iter': 893, 'tol': 0.005587339594629072}. Best is trial 0 with value: 0.8445030297283507.
[I 2024-10-12 00:25:42,768] Trial 3 finished with value: 0.8398778055077054 and parameters: {'penalty': 'l2', 'C': 0.001327686406

Best hyperparameters for Logistic Regression: {'penalty': 'l2', 'C': 526.6887895805678, 'solver': 'sag', 'max_iter': 1766, 'tol': 0.008215829858949546}
Model saved as 'logistic_regression_model.pkl'


# Random Forest Study

In [8]:

# Define the objective function for tuning Random Forest
def objective_random_forest(trial):
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),  # Number of trees
        'max_depth': trial.suggest_int('max_depth', 3, 60),  # Max tree depth
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),  # Min samples to split
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),  # Min samples at a leaf
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # Max features to split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),  # Bootstrap sampling
    }

    # Create a Random Forest model with the hyperparameters
    model = RandomForestClassifier(**params, random_state=42)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for Random Forest
study_rf = optuna.create_study(direction='maximize',sampler=sampler,pruner=pruner)
study_rf.optimize(objective_random_forest, n_trials=200)

# Get the best hyperparameters
best_params_rf = study_rf.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_rf = RandomForestClassifier(**best_params_rf)
joblib.dump(model_rf, 'random_forest_model.pkl')

print(f"Best hyperparameters for Random Forest: {best_params_rf}")
print("Model saved as 'random_forest_model.pkl'")


[I 2024-10-12 00:31:30,375] A new study created in memory with name: no-name-1e3d3468-99a7-44f0-bb27-2d477b4c5a67
[I 2024-10-12 00:31:35,618] Trial 0 finished with value: 0.8418454292860138 and parameters: {'n_estimators': 164, 'max_depth': 59, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.8418454292860138.
[I 2024-10-12 00:31:48,045] Trial 1 finished with value: 0.8467864050852343 and parameters: {'n_estimators': 862, 'max_depth': 16, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.8467864050852343.
[I 2024-10-12 00:32:47,488] Trial 2 finished with value: 0.8479900890835894 and parameters: {'n_estimators': 646, 'max_depth': 35, 'min_samples_split': 8, 'min_samples_leaf': 20, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 0.8479900890835894.
[I 2024-10-12 00:33:32,211] Trial 3 finished with value: 0.84630532957

Best hyperparameters for Random Forest: {'n_estimators': 919, 'max_depth': 11, 'min_samples_split': 20, 'min_samples_leaf': 16, 'max_features': 'log2', 'bootstrap': True}
Model saved as 'random_forest_model.pkl'


# Support Vector Machine Study

In [9]:


# Define the objective function for tuning SVC with scaling
def objective_svc(trial):
    # Define the hyperparameters to search
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel
    
    params = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),  # Regularization 
        'degree': trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3,  # Degree for 'poly' kernel
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),  # Kernel coefficient
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),  # Tolerance for stopping
    }

    # Create a pipeline with scaling and SVC
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling
        ('svc', SVC(kernel=kernel, **params, probability=True, random_state=1))  
    ])

    # Perform cross-validation and calculate the average ROC AUC score
    accuracy_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean ROC AUC score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for SVC
study_svc = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_svc.optimize(objective_svc, n_trials=200)

# Get the best hyperparameters
best_params_svc = study_svc.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_svc = SVC(kernel=best_params_svc['kernel'], **{k: v for k, v in best_params_svc.items() if k != 'kernel'}, probability=True, random_state=1)
joblib.dump(model_svc, 'svc_model.pkl')

print(f"Best hyperparameters for SVC: {best_params_svc}")
print("Model saved as 'svc_model.pkl'")


[I 2024-10-12 01:07:43,770] A new study created in memory with name: no-name-8cf6d267-2d13-445b-8359-bd6d214752e2
[I 2024-10-12 01:08:04,632] Trial 0 finished with value: 0.7366837736519875 and parameters: {'kernel': 'poly', 'C': 53.40419868113833, 'degree': 4, 'gamma': 'auto', 'tol': 0.006791009296817732}. Best is trial 0 with value: 0.7366837736519875.
[I 2024-10-12 01:08:45,959] Trial 1 finished with value: 0.7400010152943859 and parameters: {'kernel': 'rbf', 'C': 696.147439581564, 'gamma': 'auto', 'tol': 0.005747540335576316}. Best is trial 1 with value: 0.7400010152943859.
[I 2024-10-12 01:08:56,118] Trial 2 finished with value: 0.7416759534899369 and parameters: {'kernel': 'sigmoid', 'C': 141.2157115190222, 'gamma': 'scale', 'tol': 0.008644929821115943}. Best is trial 2 with value: 0.7416759534899369.
[I 2024-10-12 01:09:10,255] Trial 3 finished with value: 0.8218147309559655 and parameters: {'kernel': 'poly', 'C': 0.291654109080507, 'degree': 3, 'gamma': 'auto', 'tol': 0.0043030

Best hyperparameters for SVC: {'kernel': 'sigmoid', 'C': 0.03987717436103337, 'gamma': 'scale', 'tol': 0.0030448080756325384}
Model saved as 'svc_model.pkl'


# XGB Gradient Boosting Classifier Study

In [10]:


# Define the objective function for tuning XGBoost
def objective_xgb(trial):
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),  # Number of boosting rounds
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # Max depth of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),  # Learning rate (eta)
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Subsampling ratio
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # Feature subsampling
        'objective': 'binary:logistic',  # Binary classification
        'eval_metric': 'logloss',  # Evaluation metric
        'use_label_encoder': False  # Avoid unnecessary warning
    }

    # Create an XGBoost model with the hyperparameters
    model = xgb.XGBClassifier(**params, random_state=1)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for XGBoost
study_xgb = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_xgb.optimize(objective_xgb, n_trials=300)

# Get the best hyperparameters
best_params_xgb = study_xgb.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_xgb = xgb.XGBClassifier(**best_params_xgb)
joblib.dump(model_xgb, 'xgb_model.pkl')

print(f"Best hyperparameters for XGBoost: {best_params_xgb}")
print("Model saved as 'xgb_model.pkl'")


[I 2024-10-12 01:55:29,784] A new study created in memory with name: no-name-8740cc91-4677-4eee-9e1f-1f1334505c34
[I 2024-10-12 01:55:31,865] Trial 0 finished with value: 0.8332896127048807 and parameters: {'n_estimators': 564, 'max_depth': 6, 'learning_rate': 0.03900761237770102, 'subsample': 0.8109031158702078, 'colsample_bytree': 0.7720050594069691}. Best is trial 0 with value: 0.8332896127048807.
[I 2024-10-12 01:55:33,674] Trial 1 finished with value: 0.8144698976831964 and parameters: {'n_estimators': 689, 'max_depth': 4, 'learning_rate': 0.15279028525569635, 'subsample': 0.6110245698999613, 'colsample_bytree': 0.7596759121830166}. Best is trial 0 with value: 0.8332896127048807.
[I 2024-10-12 01:55:35,352] Trial 2 finished with value: 0.8354675366960352 and parameters: {'n_estimators': 807, 'max_depth': 3, 'learning_rate': 0.07162886734797544, 'subsample': 0.9364611882000555, 'colsample_bytree': 0.9223548038010347}. Best is trial 2 with value: 0.8354675366960352.
[I 2024-10-12 01

Best hyperparameters for XGBoost: {'n_estimators': 340, 'max_depth': 3, 'learning_rate': 0.01791817293518791, 'subsample': 0.7421838787460009, 'colsample_bytree': 0.8266311138487283}
Model saved as 'xgb_model.pkl'


# Multi Layer Preceptron Study

In [12]:
def objective_mlp(trial):
    # Define the hyperparameters to search
    params = {
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(10,), (20,), (30,), (50,), (100,),(50,25),(100,50)]),  # Number of neurons in each layer
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh']),  # Activation function
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),  # Optimization algorithm
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),  # L2 penalty (regularization term)
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'adaptive']),  # Learning rate schedule
        'max_iter': trial.suggest_int('max_iter', 100, 1000),  # Maximum number of iterations
    }

    # Create a pipeline with scaling and MLP
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling
        ('mlp', MLPClassifier(**params, random_state=1))  # MLP model
    ])
    
    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for MLP
study_mlp = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_mlp.optimize(objective_mlp, n_trials=100, n_jobs=-1)

# Get the best hyperparameters
best_params_mlp = study_mlp.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_mlp = MLPClassifier(**best_params_mlp, random_state=1)
joblib.dump(model_mlp, 'mlp_model.pkl')

print(f"Best hyperparameters for MLP: {best_params_mlp}")
print("Model saved as 'mlp_model.pkl'")

[I 2024-10-12 09:58:55,337] A new study created in memory with name: no-name-a65197fd-32f9-4269-93e0-5817d3e1d088
[I 2024-10-12 09:59:10,059] Trial 7 finished with value: 0.842200327209935 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.011029700649408844, 'learning_rate': 'constant', 'max_iter': 782}. Best is trial 7 with value: 0.842200327209935.
[I 2024-10-12 10:00:24,822] Trial 1 finished with value: 0.7810422802002317 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'adam', 'alpha': 5.703370363652065e-05, 'learning_rate': 'constant', 'max_iter': 395}. Best is trial 7 with value: 0.842200327209935.
[I 2024-10-12 10:00:31,010] Trial 0 finished with value: 0.842075698667624 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.005021971619204941, 'learning_rate': 'adaptive', 'max_iter': 507}. Best is trial 7 with value: 0.842200327209935.
[I 2024-10-12 10:00:36,73

Best hyperparameters for MLP: {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.019403237378823834, 'learning_rate': 'constant', 'max_iter': 529}
Model saved as 'mlp_model.pkl'


# Naive Bayes Model Study

In [13]:
# Define the objective function for tuning Gaussian Naive Bayes
def objective_naive_bayes(trial):
    # Naive Bayes typically has fewer hyperparameters, but we can tune 'var_smoothing'
    params = {
        'var_smoothing': trial.suggest_float('var_smoothing', 1e-9, 1e-1, log=True)  # Smoothing parameter
    }

    # Create a Gaussian Naive Bayes model with the hyperparameters
    model = GaussianNB(**params)

    # Perform cross-validation and calculate the average accuracy score
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean accuracy score across all folds
    return np.mean(accuracy_scores)

# Create and run the Optuna study for Naive Bayes
study_nb = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_nb.optimize(objective_naive_bayes, n_trials=200)

# Get the best hyperparameters
best_params_nb = study_nb.best_params
model_gnb = GaussianNB(**best_params_nb)
joblib.dump(model_gnb, 'naive_bayes_model.pkl')

print(f"Best hyperparameters for Naive Bayes: {best_params_nb}")
print("Model saved as 'naive_bayes_model.pkl'")

[I 2024-10-12 10:29:22,000] A new study created in memory with name: no-name-f3751f95-ad6d-4309-8cae-6fdc8a7b6429
[I 2024-10-12 10:29:25,200] Trial 0 finished with value: 0.778342099251646 and parameters: {'var_smoothing': 3.6584792285228627e-06}. Best is trial 0 with value: 0.778342099251646.
[I 2024-10-12 10:29:27,283] Trial 1 finished with value: 0.8115220321830954 and parameters: {'var_smoothing': 1.7058872118266142e-07}. Best is trial 1 with value: 0.8115220321830954.
[I 2024-10-12 10:29:27,504] Trial 2 finished with value: 0.714250856652235 and parameters: {'var_smoothing': 0.0014631591306579173}. Best is trial 1 with value: 0.8115220321830954.
[I 2024-10-12 10:29:27,778] Trial 3 finished with value: 0.7766791386978457 and parameters: {'var_smoothing': 6.820149230257266e-06}. Best is trial 1 with value: 0.8115220321830954.
[I 2024-10-12 10:29:28,050] Trial 4 finished with value: 0.8329761978835002 and parameters: {'var_smoothing': 8.198807053370584e-09}. Best is trial 4 with valu

Best hyperparameters for Naive Bayes: {'var_smoothing': 9.74537267887857e-09}
Model saved as 'naive_bayes_model.pkl'
