In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

In [2]:
SEED = 42
N_SPLIT = 3
BEST_SCENARIO = {
    'log': False,
    'oversampling': True,
    'oversampling_method': RandomOverSampler
}

DATA_PATH = "../data/preprocessed/feature_engineered_data.csv"

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,infection_group,"age, months",who_score,sex,"height, m","weight, kg","wbc, 10^6/L","neutrophils, 10^6/L","lymphocytes, 10^6/L","monocytes, 10^6/L",...,days_past_onset,rotavirus vaccine doses,NLR,PLR,MLR,NMR,NPR,LMR,red2white,age_group
0,1,12,1,0,0.81,10.3,21890,17380,2070,1160,...,1.0,2,8.396135,132.36715,0.560386,14.982759,0.063431,1.784483,0.000186,1
1,1,24,1,1,0.84,12.9,23900,11980,6790,4900,...,5.0,1,1.764359,50.073638,0.721649,2.444898,0.035235,1.385714,0.000125,3
2,1,24,3,0,0.82,5.8,12600,8830,2710,860,...,3.0,0,3.258303,170.110701,0.317343,10.267442,0.019154,3.151163,0.000345,3
3,1,6,1,1,0.62,6.0,15030,6840,6600,1230,...,1.0,0,1.036364,54.69697,0.186364,5.560976,0.018947,5.365854,0.000251,1
4,1,11,1,1,0.72,8.0,17350,8880,6250,1560,...,2.0,0,1.4208,82.88,0.2496,5.692308,0.017143,4.00641,0.000259,1


In [5]:
models = {
    'SVC': SVC,
    'XGBClassifier': XGBClassifier,
    'LGBMClassifier': LGBMClassifier,
    'CatBoostClassifier': CatBoostClassifier,
    'LogisticRegression': LogisticRegression,
    'RidgeClassifier': RidgeClassifier,
    'SGDClassifier': SGDClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'ExtraTreesClassifier': ExtraTreesClassifier,
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier,
    'AdaBoostClassifier': AdaBoostClassifier
}


num_kolom = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_kolom += ["NLR", "PLR", "MLR", "NMR", "NPR", "LMR", "red2white"]

num_kolom.remove("infection_group")
num_kolom

['age, months',
 'who_score',
 'sex',
 'height, m',
 'weight, kg',
 'wbc, 10^6/L',
 'neutrophils, 10^6/L',
 'lymphocytes, 10^6/L',
 'monocytes, 10^6/L',
 'eosinophils, 10^6/L',
 'basophils, 10^6/L',
 'red blood cell, 10^6/uL',
 'platelets, cells/uL',
 'highest_temp',
 'duration_days',
 'days_past_onset',
 'rotavirus vaccine doses',
 'NLR',
 'PLR',
 'MLR',
 'NMR',
 'NPR',
 'LMR',
 'red2white',
 'age_group',
 'NLR',
 'PLR',
 'MLR',
 'NMR',
 'NPR',
 'LMR',
 'red2white']

In [6]:
X, y = data.drop(columns=["infection_group"]), data["infection_group"]
results = {models_name: [] for models_name in models.keys()}

for model_name, model in tqdm(models.items()):
    skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    accs = []
    precs = []
    recs = []
    f1s = []
    aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        scaler = StandardScaler()
        X_train[num_kolom] = scaler.fit_transform(X_train[num_kolom])
        X_test[num_kolom] = scaler.transform(X_test[num_kolom])
        
        if BEST_SCENARIO['log']:
            X_train = X_train.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)
            X_test = X_test.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)

        if BEST_SCENARIO['oversampling']:
            oversample = BEST_SCENARIO['oversampling_method'](random_state=SEED)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        try:
            model_instance = model(random_state=SEED, verbose=0)
        except:
            model_instance = model(random_state=SEED)
        model_instance.fit(X_train.to_numpy(), y_train)
        y_pred = model_instance.predict(X_test.to_numpy())

        k_acc = accuracy_score(y_test, y_pred)
        k_prec = precision_score(y_test, y_pred)
        k_rec = recall_score(y_test, y_pred)
        k_f1 = f1_score(y_test, y_pred)
        k_fpr, k_tpr, k_thresholds = roc_curve(y_test, y_pred)
        k_auc = auc(k_fpr, k_tpr)
        
        accs.append(k_acc)
        precs.append(k_prec)
        recs.append(k_rec)
        f1s.append(k_f1)
        aucs.append(k_auc)
    
    results[model_name].append({
        'acc': accs,
        'prec': precs,
        'rec': recs,
        'f1': f1s,
        'auc': aucs
    })
    print(results)
    print(f"{model_name} done!")
    print(f"AUC mean: {np.mean([result['auc'] for result in results[model_name]])}")

  8%|▊         | 1/12 [00:00<00:01,  6.80it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [], 'LGBMClassifier': [], 'CatBoostClassifier': [], 'LogisticRegression': [], 'RidgeClassifier': [], 'SGDClassifier': [], 'RandomForestClassifier': [], 'ExtraTreesClassifier': [], 'HistGradientBoostingClassifier': [], 'GradientBoostingClassifier': [], 'AdaBoostClassifier': []}
SVC done!
AUC mean: 0.6357193106419113
{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.618

 25%|██▌       | 3/12 [00:00<00:01,  5.78it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455

 50%|█████     | 6/12 [00:06<00:07,  1.19s/it]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455

 67%|██████▋   | 8/12 [00:07<00:03,  1.30it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455

 75%|███████▌  | 9/12 [00:07<00:01,  1.52it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455

 92%|█████████▏| 11/12 [00:07<00:00,  1.96it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455

100%|██████████| 12/12 [00:08<00:00,  1.46it/s]

{'SVC': [{'acc': [0.6, 0.7636363636363637, 0.6296296296296297], 'prec': [0.7352941176470589, 0.8157894736842105, 0.7741935483870968], 'rec': [0.6578947368421053, 0.8378378378378378, 0.6486486486486487], 'f1': [0.6944444444444444, 0.8266666666666667, 0.7058823529411765], 'auc': [0.5642414860681115, 0.7244744744744744, 0.6184419713831479]}], 'XGBClassifier': [{'acc': [0.8181818181818182, 0.7454545454545455, 0.8333333333333334], 'prec': [0.85, 0.7948717948717948, 0.9117647058823529], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8717948717948718, 0.8157894736842105, 0.8732394366197183], 'auc': [0.7708978328173375, 0.6966966966966968, 0.8306836248012718]}], 'LGBMClassifier': [{'acc': [0.8, 0.7454545454545455, 0.8148148148148148], 'prec': [0.8292682926829268, 0.7948717948717948, 0.8857142857142857], 'rec': [0.8947368421052632, 0.8378378378378378, 0.8378378378378378], 'f1': [0.8607594936708861, 0.8157894736842105, 0.8611111111111112], 'auc': [0.741486068111455




In [12]:
avg_results = {
    model_name: {
        'acc': np.mean([result['acc'] for result in results[model_name]]),
        'prec': np.mean([result['prec'] for result in results[model_name]]),
        'rec': np.mean([result['rec'] for result in results[model_name]]),
        'f1': np.mean([result['f1'] for result in results[model_name]]),
        'auc': np.mean([result['auc'] for result in results[model_name]])
    } for model_name in results.keys()
}

all_model = [model_name for model_name in results.keys()]
all_acc = [avg_results[model_name]['acc'] for model_name in results.keys()]
all_prec = [avg_results[model_name]['prec'] for model_name in results.keys()]
all_rec = [avg_results[model_name]['rec'] for model_name in results.keys()]
all_f1 = [avg_results[model_name]['f1'] for model_name in results.keys()]
all_auc = [avg_results[model_name]['auc'] for model_name in results.keys()]

result_df = pd.DataFrame({
    'model': all_model,
    'acc': all_acc,
    'prec': all_prec,
    'rec': all_rec,
    'f1': all_f1,
    'auc': all_auc
})

result_df.sort_values(by='auc', ascending=False, inplace=True)
result_df

Unnamed: 0,model,acc,prec,rec,f1,auc
7,RandomForestClassifier,0.835354,0.853576,0.91963,0.883892,0.787702
10,GradientBoostingClassifier,0.805051,0.851852,0.865813,0.85854,0.770597
1,XGBClassifier,0.79899,0.852212,0.856804,0.853608,0.766093
9,HistGradientBoostingClassifier,0.774523,0.845564,0.821242,0.832801,0.748312
2,LGBMClassifier,0.786756,0.836618,0.856804,0.845887,0.746485
3,CatBoostClassifier,0.786195,0.82366,0.874822,0.846127,0.735886
11,AdaBoostClassifier,0.731762,0.859342,0.732101,0.788194,0.732063
4,LogisticRegression,0.682941,0.812092,0.697487,0.745761,0.676086
8,ExtraTreesClassifier,0.743883,0.769041,0.893314,0.82622,0.657986
5,RidgeClassifier,0.652637,0.790251,0.670697,0.721776,0.643627


In [13]:
top_3 = result_df.head(3)
top_3

Unnamed: 0,model,acc,prec,rec,f1,auc
7,RandomForestClassifier,0.835354,0.853576,0.91963,0.883892,0.787702
10,GradientBoostingClassifier,0.805051,0.851852,0.865813,0.85854,0.770597
1,XGBClassifier,0.79899,0.852212,0.856804,0.853608,0.766093


In [30]:
import optuna

def rf_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=SEED
    )
    
    skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        scaler = StandardScaler()
        X_train[num_kolom] = scaler.fit_transform(X_train[num_kolom])
        X_test[num_kolom] = scaler.transform(X_test[num_kolom])
        
        if BEST_SCENARIO['log']:
            X_train = X_train.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)
            X_test = X_test.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)

        if BEST_SCENARIO['oversampling']:
            oversample = BEST_SCENARIO['oversampling_method'](random_state=SEED)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        model.fit(X_train.to_numpy(), y_train)
        y_pred = model.predict(X_test.to_numpy())
        k_fpr, k_tpr, k_thresholds = roc_curve(y_test, y_pred)
        k_auc = auc(k_fpr, k_tpr)
        aucs.append(k_auc)
    
    return np.mean(aucs)

def xgb_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 500)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1)
    subsample = trial.suggest_float("subsample", 0.1, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1)
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=SEED
    )
    
    skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        scaler = StandardScaler()
        X_train[num_kolom] = scaler.fit_transform(X_train[num_kolom])
        X_test[num_kolom] = scaler.transform(X_test[num_kolom])
        
        if BEST_SCENARIO['log']:
            X_train = X_train.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)
            X_test = X_test.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)

        if BEST_SCENARIO['oversampling']:
            oversample = BEST_SCENARIO['oversampling_method'](random_state=SEED)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        model.fit(X_train.to_numpy(), y_train)
        y_pred = model.predict(X_test.to_numpy())
        k_fpr, k_tpr, k_thresholds = roc_curve(y_test, y_pred)
        k_auc = auc(k_fpr, k_tpr)
        aucs.append(k_auc)
    
    return np.mean(aucs)

def gb_objective(trial):
    
    n_estimator = trial.suggest_int("n_estimator", 10, 500)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1)
    subsample = trial.suggest_float("subsample", 0.1, 1)
    
    model = GradientBoostingClassifier(
        n_estimators=n_estimator,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        random_state=SEED
    )
    
    skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        scaler = StandardScaler()
        X_train[num_kolom] = scaler.fit_transform(X_train[num_kolom])
        X_test[num_kolom] = scaler.transform(X_test[num_kolom])
        
        if BEST_SCENARIO['log']:
            X_train = X_train.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)
            X_test = X_test.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)

        if BEST_SCENARIO['oversampling']:
            oversample = BEST_SCENARIO['oversampling_method'](random_state=SEED)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        model.fit(X_train.to_numpy(), y_train)
        y_pred = model.predict(X_test.to_numpy())
        k_fpr, k_tpr, k_thresholds = roc_curve(y_test, y_pred)
        k_auc = auc(k_fpr, k_tpr)
        aucs.append(k_auc)
        
    return np.mean(aucs)

def catboost_objective(trial):
    n_estimator = trial.suggest_int("n_estimator", 10, 5000)
    max_depth = trial.suggest_int("max_depth", 2, 16)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1)
    subsample = trial.suggest_float("subsample", 0.1, 1)
    
    model = CatBoostClassifier(
        n_estimators=n_estimator,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        random_state=SEED,
        verbose=0
    )
    
    skf = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    aucs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        scaler = StandardScaler()
        X_train[num_kolom] = scaler.fit_transform(X_train[num_kolom])
        X_test[num_kolom] = scaler.transform(X_test[num_kolom])
        
        if BEST_SCENARIO['log']:
            X_train = X_train.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)
            X_test = X_test.apply(lambda x: np.log(x) if x.name in num_kolom and x.min() > 0 else x)

        if BEST_SCENARIO['oversampling']:
            oversample = BEST_SCENARIO['oversampling_method'](random_state=SEED)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        model.fit(X_train.to_numpy(), y_train)
        y_pred = model.predict(X_test.to_numpy())
        k_fpr, k_tpr, k_thresholds = roc_curve(y_test, y_pred)
        k_auc = auc(k_fpr, k_tpr)
        aucs.append(k_auc)
        
    return np.mean(aucs)

In [20]:
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(rf_objective, n_trials=1000)

[I 2024-05-02 19:22:47,974] A new study created in memory with name: no-name-747c31a5-80ab-4462-a77b-b4ec89fdf12f
[I 2024-05-02 19:22:48,275] Trial 0 finished with value: 0.7143381771864744 and parameters: {'n_estimators': 82, 'max_depth': 29, 'min_samples_split': 5, 'min_samples_leaf': 10, 'criterion': 'gini'}. Best is trial 0 with value: 0.7143381771864744.
[I 2024-05-02 19:22:48,643] Trial 1 finished with value: 0.742335989240014 and parameters: {'n_estimators': 103, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 1 with value: 0.742335989240014.
[I 2024-05-02 19:22:49,215] Trial 2 finished with value: 0.7147642999036187 and parameters: {'n_estimators': 173, 'max_depth': 12, 'min_samples_split': 4, 'min_samples_leaf': 9, 'criterion': 'entropy'}. Best is trial 1 with value: 0.742335989240014.
[I 2024-05-02 19:22:49,408] Trial 3 finished with value: 0.7639582306764968 and parameters: {'n_estimators': 45, 'max_depth': 30, 'min_samp

In [21]:
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(xgb_objective, n_trials=1000)

[I 2024-05-02 19:30:53,860] A new study created in memory with name: no-name-0fd0917e-c97a-48bc-90eb-0c83412fd725
[I 2024-05-02 19:30:54,022] Trial 0 finished with value: 0.7445448544519752 and parameters: {'n_estimators': 108, 'max_depth': 3, 'learning_rate': 0.982170885195049, 'subsample': 0.8512561038678367, 'colsample_bytree': 0.656602763366533}. Best is trial 0 with value: 0.7445448544519752.
[I 2024-05-02 19:30:54,206] Trial 1 finished with value: 0.7123354004158958 and parameters: {'n_estimators': 283, 'max_depth': 26, 'learning_rate': 0.8225340956063627, 'subsample': 0.4219431409994202, 'colsample_bytree': 0.8859225235898712}. Best is trial 0 with value: 0.7445448544519752.
[I 2024-05-02 19:30:54,382] Trial 2 finished with value: 0.7080377591213504 and parameters: {'n_estimators': 269, 'max_depth': 24, 'learning_rate': 0.6369044814432597, 'subsample': 0.24860640246194832, 'colsample_bytree': 0.8495331975661908}. Best is trial 0 with value: 0.7445448544519752.
[I 2024-05-02 19:3

In [22]:
xgb_study.best_params

{'n_estimators': 281,
 'max_depth': 9,
 'learning_rate': 0.017957138799569422,
 'subsample': 0.9475936609928898,
 'colsample_bytree': 0.9593180704193125}

In [23]:
xgb_study.best_value

0.8227817600882616

In [24]:
gb_study = optuna.create_study(direction="maximize")
gb_study.optimize(gb_objective, n_trials=1000)

[I 2024-05-02 19:42:27,324] A new study created in memory with name: no-name-48ae5780-782a-47c3-9382-e969cfa66970
[I 2024-05-02 19:42:28,140] Trial 0 finished with value: 0.7647531432361153 and parameters: {'n_estimator': 170, 'max_depth': 4, 'learning_rate': 0.15715077305018446, 'subsample': 0.7460022012161085}. Best is trial 0 with value: 0.7647531432361153.
[I 2024-05-02 19:42:28,952] Trial 1 finished with value: 0.6681720420110513 and parameters: {'n_estimator': 239, 'max_depth': 21, 'learning_rate': 0.8293205109137636, 'subsample': 0.3883311424735181}. Best is trial 0 with value: 0.7647531432361153.
[I 2024-05-02 19:42:30,147] Trial 2 finished with value: 0.7493995543531148 and parameters: {'n_estimator': 85, 'max_depth': 26, 'learning_rate': 0.13615327212659645, 'subsample': 0.6986280446409815}. Best is trial 0 with value: 0.7647531432361153.
[I 2024-05-02 19:42:31,042] Trial 3 finished with value: 0.7372868224261414 and parameters: {'n_estimator': 115, 'max_depth': 20, 'learning

In [25]:
gb_study.best_params

{'n_estimator': 33,
 'max_depth': 3,
 'learning_rate': 0.10653740408400243,
 'subsample': 0.9991709860605422}

In [26]:
gb_study.best_value

0.826741602283398

In [31]:
cb_study = optuna.create_study(direction="maximize")
cb_study.optimize(catboost_objective, n_trials=100)

[I 2024-05-02 21:46:46,039] A new study created in memory with name: no-name-75c2caed-6cb5-48cd-adea-3a9ca52324db
[I 2024-05-02 21:47:30,510] Trial 0 finished with value: 0.6741346609767662 and parameters: {'n_estimator': 882, 'max_depth': 12, 'learning_rate': 0.13233206962775454, 'subsample': 0.1489959565735559}. Best is trial 0 with value: 0.6741346609767662.
[I 2024-05-02 21:47:34,541] Trial 1 finished with value: 0.7104968745835617 and parameters: {'n_estimator': 517, 'max_depth': 8, 'learning_rate': 0.869354277641319, 'subsample': 0.45541304763081636}. Best is trial 1 with value: 0.7104968745835617.
[I 2024-05-02 21:47:42,215] Trial 2 finished with value: 0.7620755430352953 and parameters: {'n_estimator': 2886, 'max_depth': 2, 'learning_rate': 0.7276853260631109, 'subsample': 0.3250456125991657}. Best is trial 2 with value: 0.7620755430352953.
[I 2024-05-02 21:47:49,055] Trial 3 finished with value: 0.7656665953879577 and parameters: {'n_estimator': 2330, 'max_depth': 3, 'learning

KeyboardInterrupt: 