In [1]:
# ============================================================
#                OPTUNA HPO for 5 MODELS
#     RandomForest, XGBoost, CatBoost, SVM, MLPClassifier
# ============================================================

import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import pandas as pd

# ============================================================
# Load dataset
# ============================================================

df = pd.read_csv("cleaned_dataset.csv")

# Update target encoding
df["Dataset Label"] = df["Dataset Label"].map({1: 1, 2: 0})

X = df.drop("Dataset Label", axis=1)
y = df["Dataset Label"]

# ============================================================
# Utility: CV accuracy scorer
# ============================================================

def cv(model):
    return cross_val_score(model, X, y, cv=5, scoring="accuracy").mean()

# ============================================================
# RANDOM FOREST OPTUNA OBJECTIVE
# ============================================================

def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 800)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        class_weight="balanced",
        random_state=42
    )

    return cv(model)


# ============================================================
# XGBOOST OPTUNA OBJECTIVE
# ============================================================

def objective_xgb(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 100, 800),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "eval_metric": "logloss"
    }
    model = XGBClassifier(**params)
    return cv(model)


# ============================================================
# CATBOOST OPTUNA OBJECTIVE
# ============================================================

def objective_cat(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 200, 800),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "loss_function": "Logloss",
        "verbose": 0
    }
    model = CatBoostClassifier(**params)
    return cv(model)


# ============================================================
# SVM OPTUNA OBJECTIVE
# ============================================================

def objective_svm(trial):
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.001, 1.0)

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="rbf", C=C, gamma=gamma))
    ])

    return cv(model)

# ============================================================
# MLP CLASSIFIER OPTUNA OBJECTIVE (Neural Network)
# ============================================================

def objective_mlp(trial):
    hidden_layer_sizes = trial.suggest_categorical(
        "hidden_layer_sizes",
        [(32,), (64,), (128,), (64,32), (128,64)]
    )
    alpha = trial.suggest_float("alpha", 1e-5, 1e-1)
    learning_rate_init = trial.suggest_float("learning_rate_init", 0.0005, 0.1)

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            alpha=alpha,
            learning_rate_init=learning_rate_init,
            max_iter=800,
            random_state=42
        ))
    ])

    return cv(model)

# ============================================================
# RUN ALL OPTIMIZATIONS (50 trials each)
# ============================================================

print("Running Random Forest optimization…")
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=50)

print("Running XGBoost optimization…")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=50)

print("Running CatBoost optimization…")
study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(objective_cat, n_trials=50)

print("Running SVM optimization…")
study_svm = optuna.create_study(direction="maximize")
study_svm.optimize(objective_svm, n_trials=50)

print("Running MLP optimization…")
study_mlp = optuna.create_study(direction="maximize")
study_mlp.optimize(objective_mlp, n_trials=50)

# ============================================================
# PRINT BEST RESULTS
# ============================================================

print("\nBEST HYPERPARAMETERS:")
print("Random Forest:", study_rf.best_params, "Accuracy:", study_rf.best_value)
print("XGBoost:", study_xgb.best_params, "Accuracy:", study_xgb.best_value)
print("CatBoost:", study_cat.best_params, "Accuracy:", study_cat.best_value)
print("SVM:", study_svm.best_params, "Accuracy:", study_svm.best_value)
print("MLP:", study_mlp.best_params, "Accuracy:", study_mlp.best_value)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-25 18:40:31,781] A new study created in memory with name: no-name-960f3ecf-5d3a-4edb-bfa9-31e72ff84146


Running Random Forest optimization…


[I 2025-11-25 18:40:33,094] Trial 0 finished with value: 0.6723990568818154 and parameters: {'n_estimators': 179, 'max_depth': 9, 'min_samples_split': 12}. Best is trial 0 with value: 0.6723990568818154.
[I 2025-11-25 18:40:37,446] Trial 1 finished with value: 0.6432508104921898 and parameters: {'n_estimators': 708, 'max_depth': 3, 'min_samples_split': 8}. Best is trial 0 with value: 0.6723990568818154.
[I 2025-11-25 18:40:42,137] Trial 2 finished with value: 0.7083407014441497 and parameters: {'n_estimators': 586, 'max_depth': 15, 'min_samples_split': 2}. Best is trial 2 with value: 0.7083407014441497.
[I 2025-11-25 18:40:46,328] Trial 3 finished with value: 0.6758178603006189 and parameters: {'n_estimators': 580, 'max_depth': 10, 'min_samples_split': 12}. Best is trial 2 with value: 0.7083407014441497.
[I 2025-11-25 18:40:47,717] Trial 4 finished with value: 0.6484232242852932 and parameters: {'n_estimators': 210, 'max_depth': 4, 'min_samples_split': 16}. Best is trial 2 with value: 

Running XGBoost optimization…


[I 2025-11-25 18:44:28,541] Trial 0 finished with value: 0.692911877394636 and parameters: {'max_depth': 9, 'learning_rate': 0.1837306481148283, 'n_estimators': 735, 'subsample': 0.625910014088867, 'colsample_bytree': 0.6383967633278813}. Best is trial 0 with value: 0.692911877394636.
[I 2025-11-25 18:44:29,509] Trial 1 finished with value: 0.6912614205717653 and parameters: {'max_depth': 4, 'learning_rate': 0.12034504637571866, 'n_estimators': 701, 'subsample': 0.9550129523447862, 'colsample_bytree': 0.8387380321458641}. Best is trial 0 with value: 0.692911877394636.
[I 2025-11-25 18:44:30,453] Trial 2 finished with value: 0.7015620394930739 and parameters: {'max_depth': 8, 'learning_rate': 0.09769010568006306, 'n_estimators': 544, 'subsample': 0.911267080577699, 'colsample_bytree': 0.9633183683780175}. Best is trial 2 with value: 0.7015620394930739.
[I 2025-11-25 18:44:30,978] Trial 3 finished with value: 0.6895225464190982 and parameters: {'max_depth': 4, 'learning_rate': 0.05975293

Running CatBoost optimization…


[I 2025-11-25 18:45:01,021] Trial 0 finished with value: 0.6930002947244326 and parameters: {'iterations': 268, 'depth': 4, 'learning_rate': 0.212338064434299}. Best is trial 0 with value: 0.6930002947244326.
[I 2025-11-25 18:45:04,208] Trial 1 finished with value: 0.6946212791040377 and parameters: {'iterations': 492, 'depth': 6, 'learning_rate': 0.033516995092517335}. Best is trial 1 with value: 0.6946212791040377.
[I 2025-11-25 18:45:13,071] Trial 2 finished with value: 0.699808429118774 and parameters: {'iterations': 632, 'depth': 8, 'learning_rate': 0.22638760126754498}. Best is trial 2 with value: 0.699808429118774.
[I 2025-11-25 18:45:25,442] Trial 3 finished with value: 0.691290893015031 and parameters: {'iterations': 319, 'depth': 10, 'learning_rate': 0.23839189612260386}. Best is trial 2 with value: 0.699808429118774.
[I 2025-11-25 18:45:41,331] Trial 4 finished with value: 0.6963159445918066 and parameters: {'iterations': 414, 'depth': 10, 'learning_rate': 0.1863445949881709

Running SVM optimization…


[I 2025-11-25 18:54:10,057] Trial 2 finished with value: 0.6724137931034483 and parameters: {'C': 8.788279314689957, 'gamma': 0.42629454613763057}. Best is trial 0 with value: 0.6740937223695844.
[I 2025-11-25 18:54:10,141] Trial 3 finished with value: 0.6671824344238138 and parameters: {'C': 6.457590398376756, 'gamma': 0.6485026262617875}. Best is trial 0 with value: 0.6740937223695844.
[I 2025-11-25 18:54:10,206] Trial 4 finished with value: 0.7033009136457411 and parameters: {'C': 2.7303480796753763, 'gamma': 0.12310341791590373}. Best is trial 4 with value: 0.7033009136457411.
[I 2025-11-25 18:54:10,272] Trial 5 finished with value: 0.6689360447981137 and parameters: {'C': 7.4865546414694535, 'gamma': 0.176827669052681}. Best is trial 4 with value: 0.7033009136457411.
[I 2025-11-25 18:54:10,337] Trial 6 finished with value: 0.6638667845564398 and parameters: {'C': 6.182999979482638, 'gamma': 0.34435562032597633}. Best is trial 4 with value: 0.7033009136457411.
[I 2025-11-25 18:54:1

Running MLP optimization…


[I 2025-11-25 18:54:14,055] Trial 0 finished with value: 0.6655025051576776 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.002991850728747885, 'learning_rate_init': 0.06961437246755196}. Best is trial 0 with value: 0.6655025051576776.
[I 2025-11-25 18:54:14,420] Trial 1 finished with value: 0.6655172413793103 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 0.030713546668370062, 'learning_rate_init': 0.09734251476654317}. Best is trial 1 with value: 0.6655172413793103.
[I 2025-11-25 18:54:14,872] Trial 2 finished with value: 0.6707338638373121 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 0.05290272773063227, 'learning_rate_init': 0.04910956234803101}. Best is trial 2 with value: 0.6707338638373121.
[I 2025-11-25 18:54:15,286] Trial 3 finished with value: 0.6551871500147362 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.021415606095076492, 'learning_rate_init': 0.0823188468500001}. Best is trial 2 with value: 0.6707338638373121.
[I 2025-11-25 


BEST HYPERPARAMETERS:
Random Forest: {'n_estimators': 790, 'max_depth': 20, 'min_samples_split': 6} Accuracy: 0.7151783082817567
XGBoost: {'max_depth': 9, 'learning_rate': 0.20454894441576676, 'n_estimators': 395, 'subsample': 0.6233378206061393, 'colsample_bytree': 0.6243579078960152} Accuracy: 0.7220748600058945
CatBoost: {'iterations': 665, 'depth': 10, 'learning_rate': 0.0102405941841716} Accuracy: 0.7169319186560565
SVM: {'C': 0.526672142694957, 'gamma': 0.9149504274572957} Accuracy: 0.7238726790450929
MLP: {'hidden_layer_sizes': (64, 32), 'alpha': 0.09846357856193613, 'learning_rate_init': 0.033778265542581946} Accuracy: 0.7015325670498084
