# XGboost optuna tuning job

Nesta seção faremos o fine tuning dos hyper parametros do xgboost utilizando a lib optuna.

In [1]:
import sys
from pathlib import Path

module_path = str(Path.cwd().parents[0])

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv('../data/processed/HotelReservationsLabelP.csv')
df.head(1)

Unnamed: 0,label_avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,no_total_people,no_total_nights,season_group
0,1,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,2,3,Autumn


In [4]:
from scripts import data_preprocess
# Definir a coluna alvo
target_column = 'label_avg_price_per_room'

# Pré-processamento dos dados
preprocessor, X, y = data_preprocess.preprocess(df, target_column)

y = y - 1
preprocessor

In [5]:
X

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,no_total_people,no_total_nights,season_group
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,2,3,Autumn
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,1,2,5,Autumn
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,0,1,3,Winter
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,0,2,2,Spring
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,0,2,2,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,1,3,8,Summer
36271,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,2,2,4,Autumn
36272,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,2,2,8,Summer
36273,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,0,2,3,Spring


In [6]:
y

0        0
1        1
2        0
3        1
4        1
        ..
36270    2
36271    1
36272    1
36273    1
36274    2
Name: label_avg_price_per_room, Length: 36275, dtype: int64

In [7]:
X_prep = preprocessor.fit_transform(X)
X_prep

array([[2., 0., 1., ..., 0., 0., 0.],
       [2., 0., 2., ..., 0., 0., 0.],
       [1., 0., 2., ..., 0., 0., 1.],
       ...,
       [2., 0., 2., ..., 0., 1., 0.],
       [2., 0., 0., ..., 1., 0., 0.],
       [2., 0., 1., ..., 0., 0., 1.]])

In [8]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

feature_names = list(numeric_features) + list(categorical_feature_names)
X_df = pd.DataFrame(X_prep, columns=feature_names)
X_df

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,season_group_Autumn,season_group_Spring,season_group_Summer,season_group_Winter
0,2.0,0.0,1.0,2.0,0.0,224.0,2017.0,10.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,2.0,0.0,2.0,3.0,0.0,5.0,2018.0,11.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,1.0,0.0,2.0,1.0,0.0,1.0,2018.0,2.0,28.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,2.0,0.0,0.0,2.0,0.0,211.0,2018.0,5.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,2.0,0.0,1.0,1.0,0.0,48.0,2018.0,4.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3.0,0.0,2.0,6.0,0.0,85.0,2018.0,8.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
36271,2.0,0.0,1.0,3.0,0.0,228.0,2018.0,10.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
36272,2.0,0.0,2.0,6.0,0.0,148.0,2018.0,7.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
36273,2.0,0.0,0.0,3.0,0.0,63.0,2018.0,4.0,21.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [9]:
from sklearn.model_selection import train_test_split
from scripts import data_oversampler

X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

X_resampled, y_resampled = data_oversampler.oversample(X_train, y_train)
X_resampled

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,season_group_Autumn,season_group_Spring,season_group_Summer,season_group_Winter
0,2.0,1.0,2.0,1.0,0.0,26.0,2017.0,10.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,2.0,1.0,1.0,1.0,0.0,98.0,2018.0,7.0,16.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2.0,0.0,0.0,3.0,0.0,433.0,2018.0,9.0,8.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2.0,0.0,2.0,5.0,0.0,195.0,2018.0,8.0,8.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,2.0,0.0,188.0,2018.0,6.0,15.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32662,2.0,0.0,0.0,3.0,0.0,20.0,2018.0,10.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
32663,1.0,0.0,0.0,3.0,0.0,4.0,2018.0,7.0,19.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
32664,2.0,0.0,0.0,3.0,0.0,80.0,2018.0,10.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
32665,2.0,1.0,0.0,3.0,0.0,40.0,2018.0,6.0,29.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [10]:
y_resampled

0        2
1        2
2        0
3        0
4        2
        ..
32662    2
32663    2
32664    2
32665    2
32666    2
Name: label_avg_price_per_room, Length: 32667, dtype: int64

In [12]:
import logging
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score

dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

log_file_path = "../logs/xgboost_logs"
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# Callback function para o log dos trials do tuning job
def logging_callback(study, trial):
    logging.info(f'Trial {trial.number} finished with value: {trial.value} and parameters: {trial.params}. Best is trial {study.best_trial.number} with value: {study.best_value}.')
   

def objective(trial):

    # Definição dos hyperparametros
    param = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'max_depth': trial.suggest_int('max_depth', 19, 40),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0), #0.9978867864846833
        'eta': trial.suggest_float('eta', 0.001, 0.03) # 0.018279850742369694
    }
    num_round = trial.suggest_int('num_round', 300, 900)

    # Treino do modelo
    model = xgb.train(param, dtrain, num_round)

    # Predidct do modelo
    preds = model.predict(dtest)
    
    # Avaliação com base na acurácia
    accuracy = accuracy_score(y_test, preds)
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, callbacks=[logging_callback])

logging.info(f'Best hyperparameters: {study.best_params}')
logging.info(f'Best value: {study.best_value}')                
print("Best hyperparameters: ", study.best_params)
print("Best value: ", study.best_value)

[I 2024-06-23 12:02:15,640] A new study created in memory with name: no-name-5e7482b3-f250-40ec-a1ea-a69d49a64b62
[I 2024-06-23 12:02:30,828] Trial 0 finished with value: 0.881461061337009 and parameters: {'max_depth': 32, 'subsample': 0.8683453581112338, 'colsample_bytree': 0.9371859836502532, 'eta': 0.01697458067220914, 'num_round': 416}. Best is trial 0 with value: 0.881461061337009.
[I 2024-06-23 12:02:48,807] Trial 1 finished with value: 0.8806340454858718 and parameters: {'max_depth': 33, 'subsample': 0.672429077709872, 'colsample_bytree': 0.7529061125580415, 'eta': 0.008633627623831056, 'num_round': 521}. Best is trial 0 with value: 0.881461061337009.
[I 2024-06-23 12:03:03,224] Trial 2 finished with value: 0.8822880771881461 and parameters: {'max_depth': 20, 'subsample': 0.615116806798843, 'colsample_bytree': 0.8220469967981066, 'eta': 0.01982273029741734, 'num_round': 722}. Best is trial 2 with value: 0.8822880771881461.
[I 2024-06-23 12:03:21,638] Trial 3 finished with value:

Best hyperparameters:  {'max_depth': 25, 'subsample': 0.7764487695729669, 'colsample_bytree': 0.9289367870204254, 'eta': 0.018242939171707197, 'num_round': 562}
Best value:  0.8850447966919366
