# Задача

Сравнительный анализ фреймворков для поиска гиперпараметров моделей машинного обучения. В ходе работы будут рассмотрены 5 фреймворков:
* Ray Tune
* Optuna
* Hyperopt
* Bayesian Optimization
* Talos

Каждый из фреймворков будет использован для поиска гиперпараметров заданной модели машинного обучения. Будет проведено 10 запусков для каждого фреймворка по 25 испытаний в каждом запуске. 

# Подготовка



## Подключение библиотек

In [48]:
# Python
import os
import sys
import copy
import time
import random
import logging
from numbers import Number
from typing import Tuple, List, Type

# File work
import joblib

# Data work
import pandas as pd

# Math
import numpy as np

# ML
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Frameworks
import ray
from ray import train, tune

import optuna

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from bayes_opt import BayesianOptimization, Events

import talos

# My
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.insert(1, module_path)
from src.managers.managers_builders import (
    AFrameworkManagerBuilder, ModelManagerBuilder,
    RayTuneManagerBuilder, OptunaManagerBuilder,
    HyperoptManagerBuilder, BayesianManagerBuilder, TalosManagerBuilder
)
from src.managers.fw_manager import FrameworkManager
from src.managers.fw_manager import Logger

## Создание общих вспомогательных объектов

In [11]:
class CFG:
    '''
    Configuration holder for the constants, flags and paths shared by the notebook.
    '''
    # Technical
    # Seed passed to seed_everything() and used as random_state of the train/test split.
    seed = 2024

    # Train
    # Fraction of the dataset held out as the test split.
    test_size = 0.2

    # Frameworks
    # Both values are forwarded to the manager builder's set_config(); per the
    # notebook introduction they are the number of search runs per framework
    # and the number of trials inside each run.
    max_iter = 10
    n_trials = 25

    # Path
    # All paths are relative to the notebook's working directory.
    general_folder_path = "../"
    data_path = f"{general_folder_path}data/"
    results_path = f"{general_folder_path}Results/"

In [12]:
# Seed every random number generator used here so that runs are reproducible
def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global generator and ``PYTHONHASHSEED``.

    Note: ``PYTHONHASHSEED`` is read when an interpreter starts, so exporting it
    here influences child processes only, not the already running interpreter.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)  # hash seed inherited by child processes
    np.random.seed(seed)  # NumPy's global generator
    random.seed(seed)  # Python's built-in generator

In [13]:
seed_everything(CFG.seed)

# Данные

## Обработка признаков

In [14]:
def sort_categories(counts: dict):
    """Return a new dict with the entries of ``counts`` ordered by value, largest first.

    Entries with equal values keep their original relative order.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [15]:
def get_to_replace_same(dt, col_name):
    """Build a mapping that merges category names differing only by letter case.

    For every name in ``dt[col_name]`` that contains upper-case characters and
    whose fully lower-cased form is also present in the column, the lower-cased
    form is mapped to that name (e.g. ``{"kia": "Kia"}``).

    Args:
        dt: DataFrame holding the column.
        col_name: Name of the categorical column to inspect.

    Returns:
        dict mapping the lower-case spelling to the capitalised spelling,
        suitable for ``Series.replace``. If several capitalised spellings share
        one lower-case form, the one appearing last in the column wins;
        capitalised spellings are not merged with each other.
    """
    # Missing values (NaN/None) and other non-string entries have no .lower();
    # skip them instead of failing with AttributeError.
    names = [name for name in dt[col_name].unique() if isinstance(name, str)]
    # A set gives O(1) membership tests instead of scanning the array per name.
    known_names = set(names)

    to_replace = {}
    for name in names:
        lowered = name.lower()
        # Already lower-case names are replacement sources, never targets.
        if lowered == name:
            continue
        if lowered in known_names:
            to_replace[lowered] = name
    return to_replace

In [16]:
def get_to_replace_low(dt, col_name, count):
    """Map every category occurring fewer than ``count`` times to ``"other"``.

    Args:
        dt: DataFrame holding the column.
        col_name: Name of the categorical column to inspect.
        count: Occurrence threshold; the comparison is strict (``< count``).

    Returns:
        dict ``{rare_category: "other"}`` suitable for ``Series.replace``.
    """
    frequencies = dt[col_name].value_counts()
    rare = frequencies[frequencies < count]
    return dict.fromkeys(rare.index, "other")

In [17]:
filtered_dt = pd.read_csv(CFG.data_path + "filtered_dt.csv")

In [18]:
filtered_dt.head(5)

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,sellingprice,sale_day_of_week,sale_year,sale_day,sale_month
0,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,2,2014,16,12
1,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,2,2014,16,12
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,4,2015,15,1
3,2015,Volvo,S60,T5,Sedan,automatic,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,4,2015,29,1
4,2015,Nissan,Altima,2.5 S,Sedan,automatic,ca,1.0,5554.0,gray,black,enterprise vehicle exchange / tra / rental / t...,15350.0,10900.0,2,2014,30,12


In [19]:
# Columns that are passed to the model unchanged.
numerical_features = ['year', 'condition', 'odometer', 'mmr',
                      'sale_day_of_week', 'sale_year',
                      'sale_day', 'sale_month']
# Columns that are target-encoded (see TE) before training.
categorical_features = ['make', 'model', 'trim', 'body', 'transmission',
                        'state', 'color', 'interior', 'seller']
# Regression target.
target_col = "sellingprice"

Посмотрим на количество уникальных элементов в каждом категориальном признаке

In [20]:
for col in categorical_features:
    print(f"Category count for {col} = {len(filtered_dt[col].unique())}")

Category count for make = 86
Category count for model = 863
Category count for trim = 1751
Category count for body = 82
Category count for transmission = 2
Category count for state = 38
Category count for color = 20
Category count for interior = 17
Category count for seller = 13557


In [21]:
final_dt = copy.copy(filtered_dt)

Так как количество категорий в большинстве столбцов крайне велико, попробуем уменьшить это количество.

In [22]:
col_to_squeeze = ["make", "model", "trim", "body", "seller"]

Сначала объединим однотипные названия: названия, отличающиеся только регистром букв, будем считать одним и тем же названием. Например, при наличии 'kia' и 'Kia' вариант 'kia' будет переименован в 'Kia'.

In [23]:
# Merge spellings that differ only by letter case (lower-case -> capitalised)
# and report how many distinct categories remain in each column.
for col in col_to_squeeze:
    to_replace_same = get_to_replace_same(final_dt, col)
    final_dt[col] = final_dt[col].replace(to_replace_same)
    print(f"Category count for {col} = {len(final_dt[col].unique())}")

Category count for make = 56
Category count for model = 750
Category count for trim = 1692
Category count for body = 44
Category count for seller = 13557


Далее заменим категории, которые встречаются в столбце реже некоторого порога, на 'other'. В качестве порога будем брать 1-й квартиль (0.25-квантиль) распределения частот категорий; сравнение строгое, поэтому категории, встречающиеся ровно столько раз, сколько задаёт порог, сохраняются.

In [24]:
# Collapse rare categories into 'other'. The threshold is the first quartile of
# the per-category occurrence counts; only categories occurring strictly fewer
# times are replaced, so a threshold of 1.0 (as recorded for `seller`) changes nothing.
for col in col_to_squeeze:
    threshold = final_dt[col].value_counts().describe()["25%"]
    to_replace_low = get_to_replace_low(final_dt, col, threshold)
    final_dt[col] = final_dt[col].replace(to_replace_low)
    print(f"Category count for {col} = {len(final_dt[col].unique())}; threshold = {threshold}")

Category count for make = 43; threshold = 146.0
Category count for model = 565; threshold = 21.0
Category count for trim = 1358; threshold = 3.0
Category count for body = 34; threshold = 31.75
Category count for seller = 13557; threshold = 1.0


In [25]:
final_dt.to_csv(CFG.data_path + "final_dt.csv", index=False)

Количество категорий всё ещё достаточно велико, поэтому для кодирования категорий будем использовать [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) из библиотеки `category_encoders`.

## Подготовка признаков

In [26]:
def TE(df: pd.DataFrame, columns: List[str], target: pd.Series) -> Tuple[pd.DataFrame, TargetEncoder]:
    """Target-encode ``columns`` of ``df`` and return the new frame with the fitted encoder.

    The encoded columns replace the original ones and are appended after the
    remaining columns; the original index of ``df`` is preserved.

    Note: the encoder is fitted on every row passed in. To avoid target
    leakage, pass only the training rows and reuse the returned encoder to
    transform validation/test data.

    Args:
        df: Source frame containing ``columns``.
        columns: Names of the categorical columns to encode.
        target: Target values used to fit the encoder (one per row of ``df``).

    Returns:
        Tuple ``(encoded_frame, fitted_encoder)``.
    """
    index = df.index
    te = TargetEncoder()
    values = te.fit_transform(df[columns], target)
    labels = te.get_feature_names_in()
    # fit_transform may return a DataFrame that still carries df's index. Drop
    # that index so the positional concat below pairs row i with row i even
    # when df does not have a default 0..n-1 index.
    encoded = pd.DataFrame(values, columns=labels).reset_index(drop=True)
    df = df.drop(columns, axis=1)
    df = df.reset_index(drop=True)
    df = pd.concat([df, encoded], axis=1)
    # Restore the caller's original row labels.
    df = df.set_index(index)
    return (df, te)

In [27]:
final_dt = pd.read_csv(CFG.data_path + "final_dt.csv")

In [28]:
# Target-encode all categorical features.
# NOTE(review): the encoder is fitted on the whole dataset, before the
# train/test split performed further below, so test-set targets leak into the
# encoded features — consider fitting it on the training part only.
dt_for_model, te = TE(final_dt, categorical_features, final_dt[target_col])

In [29]:
dt_for_model

Unnamed: 0,year,condition,odometer,mmr,sellingprice,sale_day_of_week,sale_year,sale_day,sale_month,make,model,trim,body,transmission,state,color,interior,seller
0,2015,5.0,16639.0,20500.0,21500.0,2,2014,16,12,11761.293257,14707.642202,10597.426576,14814.735029,12836.655635,12893.840993,13735.835712,14358.740658,15842.746703
1,2015,5.0,9393.0,20800.0,21500.0,2,2014,16,12,11761.293257,14707.642202,10597.426576,14814.735029,12836.655635,12893.840993,13735.835712,12609.020926,15842.746703
2,2014,45.0,1331.0,31900.0,30000.0,4,2015,15,1,16477.142100,15640.454848,21345.451257,11273.231886,12836.655635,12893.840993,13157.923234,14358.740658,23996.281287
3,2015,41.0,14282.0,27500.0,27750.0,4,2015,29,1,11785.773575,14053.358541,16443.305335,11273.231886,12836.655635,12893.840993,13735.835712,14358.740658,25809.736308
4,2015,1.0,5554.0,15350.0,10900.0,2,2014,30,12,11795.655477,11541.751915,11524.462756,11273.231886,12836.655635,12893.840993,13157.923234,14358.740658,10873.012337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520871,2012,26.0,35858.0,9950.0,10400.0,3,2015,8,7,11795.655477,8828.523083,9702.039848,11273.231886,12836.655635,15996.685701,13735.835712,10975.509134,13804.760984
520872,2011,39.0,66403.0,20300.0,22800.0,2,2015,7,7,16477.142100,17743.155810,21303.846154,11273.231886,12836.655635,12993.662563,13735.835712,16190.062581,17261.007463
520873,2012,5.0,54393.0,30200.0,30800.0,3,2015,8,7,22156.127775,25080.111111,8519.894737,18867.148874,12836.655635,13714.283554,13735.835712,14358.740658,15108.756361
520874,2015,38.0,16658.0,15100.0,11100.0,4,2015,9,7,11795.655477,11541.751915,11524.462756,11273.231886,12836.655635,12192.684217,13735.835712,14358.740658,10873.012337


In [30]:
te.mapping

{'make': make
  1     11761.293257
  2     16477.142100
  3     11785.773575
  4     11795.655477
  5     11173.525034
  6     15400.617030
  7     14099.456266
  8     10921.537006
  9     10961.865022
  10    14077.134811
  11    13104.237208
  12    19546.182652
  13    18742.443584
  14     8389.256485
  15    17807.563462
  16    14265.028393
  17    10211.929422
  18    12302.873204
  19    16341.546305
  20    18498.494461
  21    11993.231132
  22     9402.642084
  23    12449.329127
  24    15464.543644
  25     9671.165459
  26    11158.897540
  27    10380.831787
  28    11182.318359
  29    11447.195454
  30    15910.126257
  31    22156.127775
  32     6247.088608
  33     3911.862061
  34     8095.756216
  35     3564.413650
  36     4162.594187
  37    14915.858612
  38    13008.133970
  39    15330.034722
  40     3667.002119
  41     3952.171239
  42     1056.433225
  43     1887.789586
 -1     12761.838716
 -2     12761.838716
 dtype: float64,
 'model': model
  1     

Нормализация данных не нужна, т.к. мы будем использовать модели, основанные на решающих деревьях: разбиения в дереве строятся по порогам отдельных признаков, поэтому результат не зависит от масштаба признаков и от их монотонных преобразований.

## Разделение на выборки

In [31]:
# Hold out CFG.test_size of the rows as the final test set; the split is seeded
# with CFG.seed so that it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(dt_for_model.drop(target_col, axis=1),
                                                  dt_for_model[target_col],
                                                  test_size=CFG.test_size,
                                                  random_state=CFG.seed)

In [32]:
print("Sizes")
pd.DataFrame({"Total": len(dt_for_model),
              "Train": len(x_train),
              "Test": len(x_test)}, index=[""])

Sizes


Unnamed: 0,Total,Train,Test
,520876,416700,104176


In [33]:
x_train.head(5)

Unnamed: 0,year,condition,odometer,mmr,sale_day_of_week,sale_year,sale_day,sale_month,make,model,trim,body,transmission,state,color,interior,seller
237587,2014,5.0,17783.0,30000.0,2,2015,10,2,14265.028393,21010.85763,24754.839024,14814.735029,12836.655635,17055.35248,13157.923234,14358.740658,21502.490887
387247,2014,42.0,21855.0,11900.0,3,2015,27,5,11795.655477,8828.523083,15283.317237,11273.231886,12836.655635,14840.368846,13157.923234,14358.740658,21105.627834
87695,2002,21.0,144526.0,2475.0,3,2015,18,2,14265.028393,7914.138856,11636.572031,14814.735029,12836.655635,13359.658676,11346.306478,14358.740658,2071.070553
77940,2002,19.0,89256.0,1450.0,3,2015,21,1,11158.89754,1354.315961,7108.972205,11273.231886,12836.655635,12577.367374,11106.267208,14358.740658,7621.634615
93249,2006,19.0,169154.0,1450.0,2,2015,3,2,11761.293257,13423.727606,10597.426576,11273.231886,12836.655635,14700.685369,13157.923234,10975.509134,3143.010204


# Бейзлайн

В качестве бейзлайна возьмём DecisionTreeRegressor со стандартными параметрами.

In [None]:
scoring_fn = ["neg_mean_absolute_error", "r2"]

In [None]:
# Baseline: a decision tree with default hyperparameters.
# (Disabled alternative kept for reference: RandomForestRegressor(n_jobs=-1).)
# NOTE(review): no random_state is passed, so the fitted tree may differ
# slightly between runs — confirm whether CFG.seed should be used here.
model = DecisionTreeRegressor()
# Cross-validate on the training split only, with parallel workers (n_jobs=-1),
# collecting train and validation scores for every metric in scoring_fn.
cv_result = cross_validate(model, x_train, y_train, n_jobs=-1,
                           return_train_score=True,
                           scoring=scoring_fn)

In [None]:
cv_result

{'fit_time': array([5.06074214, 5.00669289, 5.0627439 , 5.04172421, 5.02370834]),
 'score_time': array([0.09408522, 0.07607007, 0.09508634, 0.09208465, 0.07406783]),
 'test_neg_mean_absolute_error': array([-1275.24061675, -1287.80111591, -1281.76099112, -1279.51899448,
        -1281.66652268]),
 'train_neg_mean_absolute_error': array([-0., -0., -0., -0., -0.]),
 'test_r2': array([0.93655342, 0.93605488, 0.93523462, 0.93554734, 0.93554192]),
 'train_r2': array([1., 1., 1., 1., 1.])}

In [None]:
for key, value in cv_result.items():
    print(f"Avg {key}: {value.mean()}")

Avg fit_time: 5.039122295379639
Avg score_time: 0.08627882003784179
Avg test_neg_mean_absolute_error: -1281.197648188145
Avg train_neg_mean_absolute_error: 0.0
Avg test_r2: 0.9357864369930058
Avg train_r2: 1.0


In [None]:
model.fit(x_train, y_train)

In [None]:
predicted = model.predict(x_test)

In [None]:
print(f"Test MAE: {mean_absolute_error(y_test, predicted)}")
print(f"Test R2: {r2_score(y_test, predicted)}")

Test MAE: 1277.133696820765
Test R2: 0.9353456208449169


In [None]:
print(f"Tree depth: {model.get_depth()}")
print(f"Tree leaves number: {model.get_n_leaves()}")

Tree depth: 53
Tree leaves number: 367991


# Фреймворки

## Общее для фреймворков

### Логирование

Структура файлов выглядит примерно так:

    root/
        Results/
            Framework1/
                results.csv
                models/
                    model_0.joblib
                    model_1.joblib
                    ...
                trials/
                    trials_0.csv
                    trials_1.csv
                    ...
            Framework2/
                ...
            ...

Описание файлов:
* `results.csv` - хранит информацию по итогам каждого запуска:
    * Время:
        * Общее время поиска
        * Среднее время обучения модели
        * Среднее время валидации
        * Среднее время одной попытки
        * Финальное время на Test
    * Качество R2:
        * Среднее на train
        * Среднее на Val
        * Финальное на Test
    * Качество MAE:
        * Среднее на train
        * Среднее на Val
        * Финальное на Test
    * Лучшие найденные параметры модели (каждый параметр в отдельном столбце)
* `models/model_N.joblib` - экземпляр лучшей модели, найденной в запуске с номером N
* `trials/trials_N.csv` - история поиска в запуске с номером N (параметры, качество модели)


### Logger

In [39]:
class NotebookLogger(Logger):
    """Logger implementation that prints every message to the notebook cell output."""
    def log(self, message: str) -> None:
        # print() writes to stdout, which Jupyter shows under the running cell.
        print(message)

### Parameters grid

Варьировать будем:
* `criterion` - критерий, используемый для обучения дерева. Доступные значения:
    * `squared_error`
    * `friedman_mse`
    * `absolute_error` (в пространство поиска ниже не включён)
    * `poisson`
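* `max_depth` - максимальная глубина (высота) дерева. По умолчанию не задана, и дерево растёт, пока все листья не станут "чистыми" или пока в узлах не окажется меньше `min_samples_split` объектов
* `min_samples_split` - минимальное количество объектов в узле, при котором узел ещё может быть разделён
* `min_samples_leaf` - минимальное количество объектов, которое должно оставаться в каждом листе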

In [34]:
# Search space handed to every framework manager (see build_manager).
# - list of strings: explicit categorical choices;
# - [low, high, step]: integer range with both ends inclusive — this is how the
#   option-counting cell expands it (np.arange(low, high + 1, step));
#   presumably the framework managers interpret it the same way — verify.
params = {
    "criterion": ['squared_error', 'friedman_mse', 'poisson'],
    "max_depth": [5, 60, 5], 
    "min_samples_split": [2, 20, 1],
    "min_samples_leaf": [1, 10, 1]
}

In [35]:
# Size of the discrete search space: expand every numeric [low, high, step]
# triple into its grid and multiply the option counts of all parameters.
count = 1
for key, value in params.items():
    values = value
    # Numeric entries are ranges; string entries are already explicit option lists.
    if isinstance(value[0], Number):
        # +1 makes the upper bound inclusive.
        values = np.arange(value[0], value[1] + 1, value[2])
    print(f"Options count for {key}: {len(values)}")
    count *= len(values)
print(f"Total options count: {count}")

Options count for criterion: 3
Options count for max_depth: 12
Options count for min_samples_split: 19
Options count for min_samples_leaf: 10
Total options count: 6840


### Builder

In [50]:
def build_manager(builder_class: Type[AFrameworkManagerBuilder], folder_name: str) -> FrameworkManager:
    """Create a framework manager configured with the notebook's model, data and search space.

    Args:
        builder_class: Builder type of the framework to use.
        folder_name: Sub-folder of ``CFG.results_path`` that receives this
            framework's output.

    Returns:
        The manager produced by the configured builder.
    """
    builder = builder_class(ModelManagerBuilder())
    # Every framework gets the same model class, data splits, search space and
    # run configuration; only the output folder differs.
    (
        builder.set_model(DecisionTreeRegressor)
        .set_train_data(x_train, y_train)
        .set_test_data(x_test, y_test)
        .set_params(params)
        .set_path(f"{CFG.results_path}{folder_name}/")
        .set_config(CFG.max_iter, CFG.n_trials)
    )
    return builder.build()
    

## Ray-Tune

In [51]:
manager_ray = build_manager(RayTuneManagerBuilder, "RayTune")

In [52]:
# Keep Ray Tune from attaching its automatic logger callbacks.
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"
# Restart Ray from a clean state if an earlier cell run left it initialised.
if ray.is_initialized():
    ray.shutdown()
ray.init(
    ignore_reinit_error=True,
    _temp_dir=os.path.abspath("./Temp/Ray"),
    logging_level=logging.ERROR,
)
manager_ray.search(NotebookLogger())

In [53]:
# Load the per-run summary table and order its two-level columns by the
# top-level label so that related columns are shown next to each other.
results_ray = manager_ray.loader.load_results()
results_ray = results_ray.sort_index(axis=1, level=[0])
print(f"Current iterations count: {manager_ray.loader.iter_count}")
display(results_ray)
# Trial history for index 0 — presumably the first search run; verify against the loader.
display(manager_ray.loader.load_trials(0))

Current iterations count: 10


Unnamed: 0_level_0,MAE,MAE,MAE,Params,Params,Params,Params,R2,R2,R2,Time,Time,Time,Time
Unnamed: 0_level_1,Test,Train,Val,criterion,max_depth,min_samples_leaf,min_samples_split,Test,Train,Val,Iteration,Test,Train,Val
0,923.20325,707.337548,1064.915165,poisson,10,6,19,0.965636,0.979307,0.95689,124.645052,0.018,6.761841,0.059383
1,923.139323,685.828787,1061.097043,friedman_mse,10,9,19,0.965715,0.979889,0.956963,122.04034,0.016,6.569225,0.060213
2,923.199568,686.885762,1072.279096,friedman_mse,10,5,18,0.96568,0.979892,0.956208,128.538316,0.017,6.685416,0.062696
3,923.642418,661.60332,1082.948034,poisson,10,7,15,0.965596,0.981747,0.955819,127.670259,0.019003,8.024953,0.065893
4,923.597067,702.682382,1068.301449,poisson,10,6,13,0.965577,0.97955,0.956717,121.501925,0.016,6.64959,0.060069
5,923.208597,685.875801,1062.317447,friedman_mse,10,9,18,0.965708,0.979968,0.956888,126.81731,0.017999,7.079557,0.064978
6,923.642418,661.605324,1082.991071,poisson,10,7,15,0.965596,0.981747,0.955815,118.415039,0.015,7.388335,0.064801
7,923.436054,656.765014,1085.517764,poisson,10,7,18,0.965627,0.981931,0.955636,121.562659,0.015001,7.040447,0.059897
8,923.290461,693.761274,1052.36242,friedman_mse,10,8,16,0.965696,0.98011,0.957887,135.974717,0.032,9.339295,0.186241
9,1042.048273,673.709257,1094.953327,squared_error,50,8,19,0.958706,0.980743,0.95497,128.499998,0.032002,6.711639,0.067985


Unnamed: 0,Duration,Value,criterion,max_depth,min_samples_split,min_samples_leaf
0,30.705266,0.959513,squared_error,45,19,9
1,25.358171,0.955983,poisson,5,7,2
2,31.47033,0.955498,squared_error,30,15,4
3,27.76322,0.957583,poisson,40,15,7
4,35.017379,0.95627,friedman_mse,5,8,4
5,19.908,0.956659,squared_error,30,18,4
6,40.532516,0.959043,poisson,45,18,8
7,42.547483,0.95729,friedman_mse,40,18,5
8,38.364422,0.959529,poisson,40,18,9
9,32.364381,0.965963,friedman_mse,10,17,6


## Optuna

### SaveLoader

Т.к. Optuna предоставляет функционал для отрисовки различных графиков на основе объекта Study, будем дополнительно сохранять данный объект на каждой итерации. Таким образом, структура файлов в случае использования Optuna будет выглядеть следующим образом:

    root/
        Results/
            Optuna/
                results.csv
                models/
                    ...
                trials/
                    ...
                studies/
                    study_0.joblib
                    study_1.joblib
                    ...

### Search

In [54]:
manager_optuna = build_manager(OptunaManagerBuilder, "Optuna")

In [55]:
manager_optuna.search(NotebookLogger())

In [56]:
# Load the per-run summary table and order its two-level columns by the
# top-level label so that related columns are shown next to each other.
results_optuna = manager_optuna.loader.load_results()
results_optuna = results_optuna.sort_index(axis=1, level=[0])
print(f"Current iterations count: {manager_optuna.loader.iter_count}")
display(results_optuna)
# Trial history for index 0 — presumably the first search run; verify against the loader.
display(manager_optuna.loader.load_trials(0))

Current iterations count: 10


Unnamed: 0_level_0,MAE,MAE,MAE,Params,Params,Params,Params,R2,R2,R2,Time,Time,Time,Time
Unnamed: 0_level_1,Test,Train,Val,criterion,max_depth,min_samples_leaf,min_samples_split,Test,Train,Val,Iteration,Test,Train,Val
0,923.20325,707.337548,1064.915165,poisson,10,6,19,0.965636,0.979307,0.95689,124.645052,0.018,6.761841,0.059383
1,923.139323,685.828787,1061.097043,friedman_mse,10,9,19,0.965715,0.979889,0.956963,122.04034,0.016,6.569225,0.060213
2,923.199568,686.885762,1072.279096,friedman_mse,10,5,18,0.96568,0.979892,0.956208,128.538316,0.017,6.685416,0.062696
3,923.642418,661.60332,1082.948034,poisson,10,7,15,0.965596,0.981747,0.955819,127.670259,0.019003,8.024953,0.065893
4,923.597067,702.682382,1068.301449,poisson,10,6,13,0.965577,0.97955,0.956717,121.501925,0.016,6.64959,0.060069
5,923.208597,685.875801,1062.317447,friedman_mse,10,9,18,0.965708,0.979968,0.956888,126.81731,0.017999,7.079557,0.064978
6,923.642418,661.605324,1082.991071,poisson,10,7,15,0.965596,0.981747,0.955815,118.415039,0.015,7.388335,0.064801
7,923.436054,656.765014,1085.517764,poisson,10,7,18,0.965627,0.981931,0.955636,121.562659,0.015001,7.040447,0.059897
8,923.290461,693.761274,1052.36242,friedman_mse,10,8,16,0.965696,0.98011,0.957887,135.974717,0.032,9.339295,0.186241
9,1042.048273,673.709257,1094.953327,squared_error,50,8,19,0.958706,0.980743,0.95497,128.499998,0.032002,6.711639,0.067985


Unnamed: 0,Duration,Value,criterion,max_depth,min_samples_split,min_samples_leaf
0,30.705266,0.959513,squared_error,45,19,9
1,25.358171,0.955983,poisson,5,7,2
2,31.47033,0.955498,squared_error,30,15,4
3,27.76322,0.957583,poisson,40,15,7
4,35.017379,0.95627,friedman_mse,5,8,4
5,19.908,0.956659,squared_error,30,18,4
6,40.532516,0.959043,poisson,45,18,8
7,42.547483,0.95729,friedman_mse,40,18,5
8,38.364422,0.959529,poisson,40,18,9
9,32.364381,0.965963,friedman_mse,10,17,6


## Hyperopt

### Search

In [60]:
manager_hyperopt = build_manager(HyperoptManagerBuilder, "Hyperopt")

In [61]:
manager_hyperopt.search(NotebookLogger())

In [62]:
# Load the per-run summary table and order its two-level columns by the
# top-level label so that related columns are shown next to each other.
results_hyperopt = manager_hyperopt.loader.load_results()
results_hyperopt = results_hyperopt.sort_index(axis=1, level=[0])
print(f"Current iterations count: {manager_hyperopt.loader.iter_count}")
display(results_hyperopt)
# Trial history for index 0 — presumably the first search run; verify against the loader.
display(manager_hyperopt.loader.load_trials(0))

Current iterations count: 10


Unnamed: 0_level_0,MAE,MAE,MAE,Params,Params,Params,Params,R2,R2,R2,Time,Time,Time,Time
Unnamed: 0_level_1,Test,Train,Val,criterion,max_depth,min_samples_leaf,min_samples_split,Test,Train,Val,Iteration,Test,Train,Val
0,923.20325,707.337548,1064.915165,poisson,10,6,19,0.965636,0.979307,0.95689,124.645052,0.018,6.761841,0.059383
1,923.139323,685.828787,1061.097043,friedman_mse,10,9,19,0.965715,0.979889,0.956963,122.04034,0.016,6.569225,0.060213
2,923.199568,686.885762,1072.279096,friedman_mse,10,5,18,0.96568,0.979892,0.956208,128.538316,0.017,6.685416,0.062696
3,923.642418,661.60332,1082.948034,poisson,10,7,15,0.965596,0.981747,0.955819,127.670259,0.019003,8.024953,0.065893
4,923.597067,702.682382,1068.301449,poisson,10,6,13,0.965577,0.97955,0.956717,121.501925,0.016,6.64959,0.060069
5,923.208597,685.875801,1062.317447,friedman_mse,10,9,18,0.965708,0.979968,0.956888,126.81731,0.017999,7.079557,0.064978
6,923.642418,661.605324,1082.991071,poisson,10,7,15,0.965596,0.981747,0.955815,118.415039,0.015,7.388335,0.064801
7,923.436054,656.765014,1085.517764,poisson,10,7,18,0.965627,0.981931,0.955636,121.562659,0.015001,7.040447,0.059897
8,923.290461,693.761274,1052.36242,friedman_mse,10,8,16,0.965696,0.98011,0.957887,135.974717,0.032,9.339295,0.186241
9,1042.048273,673.709257,1094.953327,squared_error,50,8,19,0.958706,0.980743,0.95497,128.499998,0.032002,6.711639,0.067985


Unnamed: 0,Duration,Value,criterion,max_depth,min_samples_split,min_samples_leaf
0,30.705266,0.959513,squared_error,45,19,9
1,25.358171,0.955983,poisson,5,7,2
2,31.47033,0.955498,squared_error,30,15,4
3,27.76322,0.957583,poisson,40,15,7
4,35.017379,0.95627,friedman_mse,5,8,4
5,19.908,0.956659,squared_error,30,18,4
6,40.532516,0.959043,poisson,45,18,8
7,42.547483,0.95729,friedman_mse,40,18,5
8,38.364422,0.959529,poisson,40,18,9
9,32.364381,0.965963,friedman_mse,10,17,6


## Bayesian Optimization

### Manager

Данная библиотека позволяет задавать лишь границы изменения параметров, при этом параметры всегда будут вещественными числами. Поэтому задавать пространство поиска будем следующим образом:
* `criterion`: [0, 2], будем предсказывать индекс в исходном массиве, число будем округлять до ближайшего целого
* `max_depth`: [5, 60], полученное число будем округлять до ближайшего целого кратного 5
* `min_samples_leaf`: [1, 10], полученное число будем округлять до ближайшего целого
* `min_samples_split`: [0, 1], будем предсказывать относительное значение от предсказанного min_samples_leaf до 20, результат округлим до ближайшего целого.

### Search

In [63]:
manager_bayesian = build_manager(BayesianManagerBuilder, "Bayesian")

In [64]:
manager_bayesian.search(NotebookLogger())

In [65]:
# Load the per-run summary table and order its two-level columns by the
# top-level label so that related columns are shown next to each other.
results_bayesian = manager_bayesian.loader.load_results()
results_bayesian = results_bayesian.sort_index(axis=1, level=[0])
print(f"Current iterations count: {manager_bayesian.loader.iter_count}")
display(results_bayesian)
# Trial history for index 0 — presumably the first search run; verify against the loader.
display(manager_bayesian.loader.load_trials(0))

Current iterations count: 10


Unnamed: 0_level_0,MAE,MAE,MAE,Params,Params,Params,Params,R2,R2,R2,Time,Time,Time,Time
Unnamed: 0_level_1,Test,Train,Val,criterion,max_depth,min_samples_leaf,min_samples_split,Test,Train,Val,Iteration,Test,Train,Val
0,923.20325,707.337548,1064.915165,poisson,10,6,19,0.965636,0.979307,0.95689,124.645052,0.018,6.761841,0.059383
1,923.139323,685.828787,1061.097043,friedman_mse,10,9,19,0.965715,0.979889,0.956963,122.04034,0.016,6.569225,0.060213
2,923.199568,686.885762,1072.279096,friedman_mse,10,5,18,0.96568,0.979892,0.956208,128.538316,0.017,6.685416,0.062696
3,923.642418,661.60332,1082.948034,poisson,10,7,15,0.965596,0.981747,0.955819,127.670259,0.019003,8.024953,0.065893
4,923.597067,702.682382,1068.301449,poisson,10,6,13,0.965577,0.97955,0.956717,121.501925,0.016,6.64959,0.060069
5,923.208597,685.875801,1062.317447,friedman_mse,10,9,18,0.965708,0.979968,0.956888,126.81731,0.017999,7.079557,0.064978
6,923.642418,661.605324,1082.991071,poisson,10,7,15,0.965596,0.981747,0.955815,118.415039,0.015,7.388335,0.064801
7,923.436054,656.765014,1085.517764,poisson,10,7,18,0.965627,0.981931,0.955636,121.562659,0.015001,7.040447,0.059897
8,923.290461,693.761274,1052.36242,friedman_mse,10,8,16,0.965696,0.98011,0.957887,135.974717,0.032,9.339295,0.186241
9,1042.048273,673.709257,1094.953327,squared_error,50,8,19,0.958706,0.980743,0.95497,128.499998,0.032002,6.711639,0.067985


Unnamed: 0,Duration,Value,criterion,max_depth,min_samples_split,min_samples_leaf
0,30.705266,0.959513,squared_error,45,19,9
1,25.358171,0.955983,poisson,5,7,2
2,31.47033,0.955498,squared_error,30,15,4
3,27.76322,0.957583,poisson,40,15,7
4,35.017379,0.95627,friedman_mse,5,8,4
5,19.908,0.956659,squared_error,30,18,4
6,40.532516,0.959043,poisson,45,18,8
7,42.547483,0.95729,friedman_mse,40,18,5
8,38.364422,0.959529,poisson,40,18,9
9,32.364381,0.965963,friedman_mse,10,17,6


## Talos

### Search

In [66]:
manager_talos = build_manager(TalosManagerBuilder, "Talos")

In [67]:
# NOTE(review): the other frameworks call search(NotebookLogger()); here no
# logger is passed — confirm that this is intentional (e.g. search() has a
# default logger) and not an omission.
manager_talos.search()

In [68]:
# Load the per-run summary table and order its two-level columns by the
# top-level label so that related columns are shown next to each other.
results_talos = manager_talos.loader.load_results()
results_talos = results_talos.sort_index(axis=1, level=[0])
print(f"Current iterations count: {manager_talos.loader.iter_count}")
display(results_talos)
# Trial history for index 0 — presumably the first search run; verify against the loader.
display(manager_talos.loader.load_trials(0))

Current iterations count: 10


Unnamed: 0_level_0,MAE,MAE,MAE,Params,Params,Params,Params,R2,R2,R2,Time,Time,Time,Time
Unnamed: 0_level_1,Test,Train,Val,criterion,max_depth,min_samples_leaf,min_samples_split,Test,Train,Val,Iteration,Test,Train,Val
0,923.20325,707.337548,1064.915165,poisson,10,6,19,0.965636,0.979307,0.95689,124.645052,0.018,6.761841,0.059383
1,923.139323,685.828787,1061.097043,friedman_mse,10,9,19,0.965715,0.979889,0.956963,122.04034,0.016,6.569225,0.060213
2,923.199568,686.885762,1072.279096,friedman_mse,10,5,18,0.96568,0.979892,0.956208,128.538316,0.017,6.685416,0.062696
3,923.642418,661.60332,1082.948034,poisson,10,7,15,0.965596,0.981747,0.955819,127.670259,0.019003,8.024953,0.065893
4,923.597067,702.682382,1068.301449,poisson,10,6,13,0.965577,0.97955,0.956717,121.501925,0.016,6.64959,0.060069
5,923.208597,685.875801,1062.317447,friedman_mse,10,9,18,0.965708,0.979968,0.956888,126.81731,0.017999,7.079557,0.064978
6,923.642418,661.605324,1082.991071,poisson,10,7,15,0.965596,0.981747,0.955815,118.415039,0.015,7.388335,0.064801
7,923.436054,656.765014,1085.517764,poisson,10,7,18,0.965627,0.981931,0.955636,121.562659,0.015001,7.040447,0.059897
8,923.290461,693.761274,1052.36242,friedman_mse,10,8,16,0.965696,0.98011,0.957887,135.974717,0.032,9.339295,0.186241
9,1042.048273,673.709257,1094.953327,squared_error,50,8,19,0.958706,0.980743,0.95497,128.499998,0.032002,6.711639,0.067985


Unnamed: 0,Duration,Value,criterion,max_depth,min_samples_split,min_samples_leaf
0,30.705266,0.959513,squared_error,45,19,9
1,25.358171,0.955983,poisson,5,7,2
2,31.47033,0.955498,squared_error,30,15,4
3,27.76322,0.957583,poisson,40,15,7
4,35.017379,0.95627,friedman_mse,5,8,4
5,19.908,0.956659,squared_error,30,18,4
6,40.532516,0.959043,poisson,45,18,8
7,42.547483,0.95729,friedman_mse,40,18,5
8,38.364422,0.959529,poisson,40,18,9
9,32.364381,0.965963,friedman_mse,10,17,6
