In [13]:
# Importing necessities for the analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline  
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import joblib
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from imblearn.ensemble import BalancedBaggingClassifier

In [14]:
# Display configuration: no cap on the number of columns shown and no
# truncation of cell contents when frames are rendered.
pd.set_option(
    "display.max_columns", None,
    "max_colwidth", None,
)

In [15]:
# Read the prepared churn dataset, using the customerID column as the row index
df_churn = pd.read_csv(
    'data/Churn_prepared.csv',
    index_col='customerID',
)

In [16]:
# Shared tuning configuration reused by the studies in this notebook.

# Cross-validation scheme: 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

# Stand-alone scaler instance (the pipelines visible in this notebook build
# their own StandardScaler rather than using this one).
scaler = StandardScaler()

# Optuna components: a seeded TPE sampler and a median pruner.
# NOTE(review): none of the objectives visible in this notebook report
# intermediate values, so the pruner presumably never triggers - confirm.
pruner = MedianPruner(n_startup_trials=10)
sampler = TPESampler(seed=1)

In [17]:
# Split the churn data into the target column (y) and the feature columns (X)
y = df_churn['Churn']
X = df_churn.drop(columns=['Churn'])


# Logistic Regression Study

In [None]:


def objective_logistic(trial):
    """Optuna objective: mean cross-validated ROC AUC of a scaled logistic regression.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``penalty``, ``C``, ``solver``, ``max_iter`` and ``tol``.

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # Define the hyperparameters to search
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l2']),
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cg']),
        'max_iter': trial.suggest_int('max_iter', 100, 2000),
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),
        'fit_intercept': True,
    }

    # Create a pipeline with scaling and logistic regression.
    # The estimator is seeded because the 'sag' solver shuffles the data;
    # without a fixed random_state the same hyperparameters could score
    # differently from one trial (or notebook run) to the next.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(**params, random_state=1))
    ])

    # Perform cross-validation and collect the per-fold AUC scores
    auc_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean AUC score across all folds
    return np.mean(auc_scores)

# Run the Optuna search for logistic regression (the objective is maximised)
study_logistic = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)
study_logistic.optimize(objective_logistic, n_trials=250)

# Hyperparameters of the best trial
best_params_logistic = study_logistic.best_params

# Persist an unfitted estimator configured with those hyperparameters
model_logreg = LogisticRegression(random_state=1, **best_params_logistic)
joblib.dump(model_logreg, 'logistic_regression_model.pkl')

print(f"Best hyperparameters for Logistic Regression: {best_params_logistic}")
print("Model saved as 'logistic_regression_model.pkl'")


# Random Forest Study

In [None]:

# Define the objective function for tuning Random Forest
def objective_random_forest(trial):
    """Optuna objective: mean cross-validated ROC AUC of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the forest hyperparameters listed below.

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),  # Number of trees
        'max_depth': trial.suggest_int('max_depth', 3, 60),  # Max tree depth
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),  # Min samples to split
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),  # Min samples at a leaf
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # Max features to split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),  # Bootstrap sampling
    }

    # Create a Random Forest model with the hyperparameters.
    # Seed 1 matches the seed used when the best configuration is rebuilt and
    # saved, so the tuned score describes the same model that gets persisted.
    model = RandomForestClassifier(**params, random_state=1)

    # Perform cross-validation and collect the per-fold ROC AUC scores
    auc_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean ROC AUC score across all folds
    return np.mean(auc_scores)

# Run the Optuna search for Random Forest (the objective is maximised)
study_rf = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)
study_rf.optimize(objective_random_forest, n_trials=200)

# Hyperparameters of the best trial
best_params_rf = study_rf.best_params

# Persist an unfitted estimator configured with those hyperparameters
model_rf = RandomForestClassifier(random_state=1, **best_params_rf)
joblib.dump(model_rf, 'random_forest_model.pkl')

print(f"Best hyperparameters for Random Forest: {best_params_rf}")
print("Model saved as 'random_forest_model.pkl'")


# Support Vector Machine Study

In [None]:


# Define the objective function for tuning SVC with scaling
def objective_svc(trial):
    """Optuna objective: mean cross-validated ROC AUC of a scaled SVC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``kernel``, ``C``, ``degree`` (only for the
        ``'poly'`` kernel), ``gamma`` and ``tol``.

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # Define the hyperparameters to search
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])  # Kernel

    params = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),  # Regularization
        'degree': trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3,  # Degree for 'poly' kernel
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),  # Kernel coefficient
        'tol': trial.suggest_float('tol', 1e-6, 1e-2),  # Tolerance for stopping
    }

    # Create a pipeline with scaling and SVC.
    # probability=True is intentionally not set here: the 'roc_auc' scorer ranks
    # samples with decision_function when the estimator provides one, so enabling
    # probability estimates would only add an internal 5-fold calibration to every
    # fit without changing the score. random_state is kept for parity with the
    # saved model; it only affects the probability estimates.
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Scaling
        ('svc', SVC(kernel=kernel, **params, random_state=1))
    ])

    # Perform cross-validation and collect the per-fold ROC AUC scores
    auc_scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

    # Return the mean ROC AUC score across all folds
    return np.mean(auc_scores)

# Run the Optuna search for SVC (the objective is maximised)
study_svc = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)
study_svc.optimize(objective_svc, n_trials=200)

# Hyperparameters of the best trial
best_params_svc = study_svc.best_params

# Persist an unfitted estimator configured with those hyperparameters.
# The best-trial dict already carries 'kernel' next to the other settings,
# so it can be unpacked as a whole.
model_svc = SVC(
    **best_params_svc,
    probability=True,
    random_state=1,
)
joblib.dump(model_svc, 'svc_model.pkl')

print(f"Best hyperparameters for SVC: {best_params_svc}")
print("Model saved as 'svc_model.pkl'")


# XGB Gradient Boosting Classifier Study

In [None]:


# Objective function used by Optuna to tune the XGBoost classifier
def objective_xgb(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``subsample`` and ``colsample_bytree``.

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # Settings explored by the search (sampled in this order)
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),  # boosting rounds
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # tree depth limit
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),  # eta
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # row subsampling ratio
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # column subsampling ratio
    }

    # Settings held constant for every trial.
    # NOTE(review): 'use_label_encoder' was added to silence a warning; it may be
    # deprecated or ignored by the installed xgboost release - confirm.
    fixed = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }

    classifier = xgb.XGBClassifier(random_state=1, **searched, **fixed)

    # Per-fold ROC AUC, averaged into the value Optuna maximises
    fold_auc = cross_val_score(classifier, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)
    return np.mean(fold_auc)

# Run the Optuna search for XGBoost (the objective is maximised)
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)
study_xgb.optimize(objective_xgb, n_trials=300)

# Hyperparameters of the best trial
best_params_xgb = study_xgb.best_params

# Persist an unfitted estimator configured with those hyperparameters
model_xgb = xgb.XGBClassifier(random_state=1, **best_params_xgb)
joblib.dump(model_xgb, 'xgb_model.pkl')

print(f"Best hyperparameters for XGBoost: {best_params_xgb}")
print("Model saved as 'xgb_model.pkl'")


# Multi-Layer Perceptron Study

In [None]:
def objective_mlp(trial):
    """Optuna objective: mean cross-validated ROC AUC of a scaled MLP classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``hidden_layer_sizes``, ``activation``, ``solver``,
        ``alpha``, ``learning_rate`` and ``max_iter``.

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # Candidate architectures: one or two hidden layers
    layer_choices = [(10,), (20,), (30,), (50,), (100,), (50, 25), (100, 50)]

    # Sampled configuration (sampled in this order)
    params = {
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', layer_choices),
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh']),
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),  # L2 penalty
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'adaptive']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }

    # Standardise the features, then fit the seeded network
    network = MLPClassifier(random_state=1, **params)
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('mlp', network),
    ])

    # Per-fold ROC AUC, averaged into the value Optuna maximises
    fold_auc = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)
    return np.mean(fold_auc)

# Create and run the Optuna study for MLP.
# Trials run sequentially, like the other studies in this notebook:
# cross_val_score already parallelises over the folds, and running trials in
# parallel would make the seeded sampler's suggestions non-reproducible.
study_mlp = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study_mlp.optimize(objective_mlp, n_trials=100)

# Get the best hyperparameters
best_params_mlp = study_mlp.best_params

# Save the model (with best hyperparameters, not fitted) using joblib
model_mlp = MLPClassifier(**best_params_mlp, random_state=1)
joblib.dump(model_mlp, 'mlp_model.pkl')

print(f"Best hyperparameters for MLP: {best_params_mlp}")
print("Model saved as 'mlp_model.pkl'")

# Naive Bayes Model Study

In [None]:
# Objective function used by Optuna to tune Gaussian Naive Bayes
def objective_naive_bayes(trial):
    """Optuna objective: mean cross-validated ROC AUC of Gaussian Naive Bayes.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``var_smoothing`` (log scale).

    Returns
    -------
    float
        Mean ``roc_auc`` score over the folds of the notebook-level ``skf``
        splitter, evaluated on the notebook-level ``X`` and ``y``.
    """
    # The only hyperparameter tuned for this model
    smoothing = trial.suggest_float('var_smoothing', 1e-9, 1e-1, log=True)
    classifier = GaussianNB(var_smoothing=smoothing)

    # Per-fold ROC AUC, averaged into the value Optuna maximises
    fold_auc = cross_val_score(classifier, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)
    return np.mean(fold_auc)

# Run the Optuna search for Naive Bayes (the objective is maximised)
study_nb = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)
study_nb.optimize(objective_naive_bayes, n_trials=200)

# Hyperparameters of the best trial
best_params_nb = study_nb.best_params

# Persist an unfitted estimator configured with those hyperparameters
model_gnb = GaussianNB(**best_params_nb)
joblib.dump(model_gnb, 'naive_bayes_model.pkl')

print(f"Best hyperparameters for Naive Bayes: {best_params_nb}")
print("Model saved as 'naive_bayes_model.pkl'")