In [11]:
import optuna

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, train_test_split
import seaborn as sns
from sklearn.metrics import f1_score
import warnings 

warnings.simplefilter('ignore')

In [5]:
# Fetch the "penguins" example dataset through seaborn.
df = sns.load_dataset(name="penguins")

In [7]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [8]:
# Show the frame as the cell output.
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [15]:
# Column(s) the model has to predict.
targets = ["sex"]
# Columns handed to CatBoost as categorical features.
cat_features = ["species", "island"]
# Model inputs: every column of the frame that is not a target.
filtered_features = [column for column in df.columns if column not in targets]

In [16]:
def fit_catboost(trial, train, val):
    """Fit a CatBoost classifier using hyper-parameters sampled from ``trial``.

    Args:
        trial: Object exposing Optuna's ``suggest_*`` API (a live trial or a
            ``FixedTrial``) from which the search-space values are drawn.
        train: ``(features, labels)`` pair the model is fitted on.
        val: ``(features, labels)`` pair passed as ``eval_set``; it is used for
            early stopping and is the data the returned predictions are made on.

    Returns:
        ``(model, predictions)`` where ``predictions`` is ``model.predict``
        evaluated on the validation features.
    """
    train_features, train_labels = train
    val_features, val_labels = val

    # Fixed settings together with the sampled values; the suggest_* calls run
    # in the order in which the keys are listed.
    settings = dict(
        iterations=400,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.05),
        l2_leaf_reg=trial.suggest_int("l2_leaf_reg", 2, 50),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        auto_class_weights=trial.suggest_categorical(
            "auto_class_weights", ["SqrtBalanced", "Balanced", "None"]
        ),
        depth=trial.suggest_int("depth", 3, 9),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type=trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        used_ram_limit="14gb",
        eval_metric="Accuracy",
    )

    # "subsample" is sampled only for Bernoulli bootstrap and
    # "bagging_temperature" only for Bayesian bootstrap; MVS gets neither.
    bootstrap = settings["bootstrap_type"]
    if bootstrap == "Bernoulli":
        settings["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    elif bootstrap == "Bayesian":
        settings["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)

    classifier = CatBoostClassifier(
        thread_count=-1,
        random_seed=22,
        cat_features=cat_features,
        **settings,
    )

    # Training stops once the eval-set metric has not improved for 5 rounds.
    classifier.fit(
        train_features,
        train_labels,
        eval_set=(val_features, val_labels),
        early_stopping_rounds=5,
        verbose=0,
        plot=False,
    )

    return classifier, classifier.predict(val_features)

In [33]:
def objective(trial, return_models=False):
    """Score one hyper-parameter configuration by F1 on held-out data.

    Args:
        trial: Optuna trial (or ``FixedTrial``) forwarded to ``fit_catboost``,
            which samples the hyper-parameters from it.
        return_models: When true, the fitted models are returned as well.

    Returns:
        The mean F1 score (positive class ``"Male"``) over the evaluated folds,
        or ``(mean_f1, models)`` when ``return_models`` is true.
    """
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # errors="ignore" makes the drop a no-op when the target columns are
    # already absent from the selected features.
    X_train = df[filtered_features].drop(targets, axis=1, errors="ignore")
    y_train = df[targets]

    scores, models = [], []

    for train_idx, valid_idx in kf.split(X_train):
        train_data = X_train.iloc[train_idx], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        # Hand the trial over so fit_catboost can sample the hyper-parameters.
        model, y_pred = fit_catboost(trial, train_data, valid_data)

        # scikit-learn metrics expect (y_true, y_pred) in that order. Binary F1
        # is symmetric in its two arguments, so the value itself is unchanged.
        scores.append(
            f1_score(valid_data[1], y_pred, average="binary", pos_label="Male")
        )
        models.append(model)
        # NOTE(review): the loop exits after the first fold, so only one of the
        # n_splits folds is ever evaluated -- presumably to keep trials fast;
        # confirm this is intended.
        break

    result = np.mean(scores)

    if return_models:
        return result, models
    return result

In [None]:
# Seed the sampler so the search can be reproduced after a kernel restart
# (an unseeded sampler proposes different trials on every run).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(
    objective,
    n_trials=600,
    show_progress_bar=True,
)

In [37]:
# Best objective value found by the study.
study.best_value

0.9464285714285714

In [38]:
# Hyper-parameters of the best trial.
study.best_params

{'learning_rate': 0.048772277009571235,
 'l2_leaf_reg': 24,
 'colsample_bylevel': 0.6363723336164767,
 'auto_class_weights': 'None',
 'depth': 8,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bayesian',
 'bagging_temperature': 6.481080893132568}

In [39]:
# Re-run the objective with the best parameters frozen in a FixedTrial and
# keep the fitted model(s) this time.
valid_scores, models = objective(
    trial=optuna.trial.FixedTrial(params=study.best_params),
    return_models=True,
)

In [42]:
# Take the first fitted model returned by the re-fit above.
model = models[0]

In [44]:
from optuna_integration import CatBoostPruningCallback

In [57]:
def fit_catboost(trial, train, val):
    """Fit a CatBoost classifier with Optuna pruning attached.

    Args:
        trial: Object exposing Optuna's ``suggest_*`` API (a live trial or a
            ``FixedTrial``); it is also handed to the pruning callback.
        train: ``(features, labels)`` pair the model is fitted on.
        val: ``(features, labels)`` pair passed as ``eval_set``; it is used for
            early stopping and is the data the returned predictions are made on.

    Returns:
        ``(model, predictions)`` where ``predictions`` is ``model.predict``
        evaluated on the validation features.
    """
    train_features, train_labels = train
    val_features, val_labels = val

    # Fixed settings together with the sampled values; the suggest_* calls run
    # in the order in which the keys are listed.
    settings = dict(
        iterations=400,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.05),
        l2_leaf_reg=trial.suggest_int("l2_leaf_reg", 2, 50),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        auto_class_weights=trial.suggest_categorical(
            "auto_class_weights", ["SqrtBalanced", "Balanced", "None"]
        ),
        depth=trial.suggest_int("depth", 3, 9),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type=trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        used_ram_limit="14gb",
        eval_metric="Accuracy",
    )

    # "subsample" is sampled only for Bernoulli bootstrap and
    # "bagging_temperature" only for Bayesian bootstrap; MVS gets neither.
    bootstrap = settings["bootstrap_type"]
    if bootstrap == "Bernoulli":
        settings["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    elif bootstrap == "Bayesian":
        settings["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)

    classifier = CatBoostClassifier(
        thread_count=-1,
        random_seed=22,
        cat_features=cat_features,
        **settings,
    )

    # The callback watches the same metric name that is set as eval_metric.
    pruner_hook = CatBoostPruningCallback(trial, "Accuracy")

    # Training stops once the eval-set metric has not improved for 5 rounds.
    classifier.fit(
        train_features,
        train_labels,
        eval_set=(val_features, val_labels),
        early_stopping_rounds=5,
        callbacks=[pruner_hook],
        verbose=0,
        plot=False,
    )

    # Called explicitly after fit so the callback can act on a pruning
    # decision taken during training.
    pruner_hook.check_pruned()

    return classifier, classifier.predict(val_features)

In [58]:
def objective(trial, return_models=False):
    """Score one hyper-parameter configuration by F1 on held-out data.

    Args:
        trial: Optuna trial (or ``FixedTrial``) forwarded to ``fit_catboost``,
            which samples the hyper-parameters from it.
        return_models: When true, the fitted models are returned as well.

    Returns:
        The mean F1 score (positive class ``"Male"``) over the evaluated folds,
        or ``(mean_f1, models)`` when ``return_models`` is true.
    """
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # errors="ignore" makes the drop a no-op when the target columns are
    # already absent from the selected features.
    X_train = df[filtered_features].drop(targets, axis=1, errors="ignore")
    y_train = df[targets]

    scores, models = [], []

    for train_idx, valid_idx in kf.split(X_train):
        train_data = X_train.iloc[train_idx], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        # Hand the trial over so fit_catboost can sample the hyper-parameters.
        model, y_pred = fit_catboost(trial, train_data, valid_data)

        # scikit-learn metrics expect (y_true, y_pred) in that order. Binary F1
        # is symmetric in its two arguments, so the value itself is unchanged.
        scores.append(
            f1_score(valid_data[1], y_pred, average="binary", pos_label="Male")
        )
        models.append(model)
        # NOTE(review): the loop exits after the first fold, so only one of the
        # n_splits folds is ever evaluated -- presumably to keep trials fast;
        # confirm this is intended.
        break

    result = np.mean(scores)

    if return_models:
        return result, models
    return result

In [59]:
# Same search as before, now with a median pruner; pruning is disabled for the
# first 5 reported steps of each trial.
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)

# n_jobs=-1 lets Optuna run trials in parallel.
study.optimize(objective, n_trials=1000, n_jobs=-1)

[I 2024-08-07 16:08:17,259] A new study created in memory with name: no-name-266a2e31-2a35-43f6-ad76-4e012f942fa6
[I 2024-08-07 16:08:17,546] Trial 1 finished with value: 0.8739495798319328 and parameters: {'learning_rate': 0.020036204367077495, 'l2_leaf_reg': 33, 'colsample_bylevel': 0.2621996203626756, 'auto_class_weights': 'None', 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.8739495798319328.
[I 2024-08-07 16:08:17,554] Trial 6 finished with value: 0.85 and parameters: {'learning_rate': 0.04694927039853952, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.05655485951573276, 'auto_class_weights': 'Balanced', 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.8739495798319328.
[I 2024-08-07 16:08:17,737] Trial 7 finished with value: 0.875 and parameters: {'learning_rate': 0.04656021344250694, 'l2_leaf_reg': 14, 'colsample_bylevel': 0.5721123057963309, 'auto_class_weights': 'Balanced', 'depth': 7, 'boo

In [60]:
# Best objective value found by the pruned study.
study.best_value

0.9401709401709402

In [61]:
# Re-run the objective with the best parameters frozen in a FixedTrial and
# keep the fitted model(s) this time.
valid_scores, models = objective(
    trial=optuna.trial.FixedTrial(params=study.best_params),
    return_models=True,
)

In [62]:
# Take the first fitted model returned by the re-fit above.
model = models[0]