# Decision tree tuning job
Nesta seção faremos o fine tuning dos hyper parametros do decision tree utilizando a lib optuna

In [11]:
import sys
from pathlib import Path

module_path = str(Path.cwd().parents[0])

if module_path not in sys.path:
    sys.path.append(module_path)

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv('../data/processed/HotelReservationsLabelP.csv')
df.head(1)

Unnamed: 0,label_avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,no_total_people,no_total_nights,season_group
0,1,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,2,3,Autumn


In [13]:
from scripts import data_preprocess
# Definir a coluna alvo
target_column = 'label_avg_price_per_room'

# Pré-processamento dos dados
preprocessor, X, y = data_preprocess.preprocess(df, target_column)

y = y - 1
preprocessor

In [14]:
X.head(1)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,no_total_people,no_total_nights,season_group
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,2,3,Autumn


In [15]:
y.head(1)

0    0
Name: label_avg_price_per_room, dtype: int64

In [16]:
X_prep = preprocessor.fit_transform(X)
X_prep

array([[2., 0., 1., ..., 0., 0., 0.],
       [2., 0., 2., ..., 0., 0., 0.],
       [1., 0., 2., ..., 0., 0., 1.],
       ...,
       [2., 0., 2., ..., 0., 1., 0.],
       [2., 0., 0., ..., 1., 0., 0.],
       [2., 0., 1., ..., 0., 0., 1.]])

In [17]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

feature_names = list(numeric_features) + list(categorical_feature_names)
X_df = pd.DataFrame(X_prep, columns=feature_names)
X_df.head(1)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,season_group_Autumn,season_group_Spring,season_group_Summer,season_group_Winter
0,2.0,0.0,1.0,2.0,0.0,224.0,2017.0,10.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [18]:
from sklearn.model_selection import train_test_split
from scripts import data_oversampler

X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

X_resampled, y_resampled = data_oversampler.oversample(X_train, y_train)
X_resampled.head(1)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,season_group_Autumn,season_group_Spring,season_group_Summer,season_group_Winter
0,2.0,1.0,2.0,1.0,0.0,26.0,2017.0,10.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [19]:
y_resampled.head(1)

0    2
Name: label_avg_price_per_room, dtype: int64

In [47]:
import logging
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

log_file_path = "../logs/dt_logs"
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# Callback function para o log dos trials do tuning job
def logging_callback(study, trial):
    logging.info(f'Trial {trial.number} finished with value: {trial.value} and parameters: {trial.params}. Best is trial {study.best_trial.number} with value: {study.best_value}.')
   

def objective(trial): # Definição dos hyperparametros
    criterion = 'gini'
    splitter = 'best'
    max_depth = None
    min_samples_split = 6 #trial.suggest_int('min_samples_split', 2, 200)
    min_samples_leaf = 1 #trial.suggest_int('min_samples_leaf', 1, 200)
    max_features = None
    min_impurity_decrease = 2.8594812578230564e-05 #trial.suggest_float('min_impurity_decrease', 0.000000001, 1)
    ccp_alpha = 5.395001099118109e-05 #trial.suggest_float('ccp_alpha', 0.000000001, 1)

    dt = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha,
        random_state=42
    )

    dt.fit(X_train, y_train)
    preds = dt.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10000, callbacks=[logging_callback])

logging.info(f'Best hyperparameters: {study.best_params}')
logging.info(f'Best value: {study.best_value}')                
print("Best hyperparameters: ", study.best_params)
print("Best value: ", study.best_value)

[I 2024-06-23 14:31:36,740] A new study created in memory with name: no-name-00aa6b82-6a55-4f92-ba6d-f49dd2a86a7a
[I 2024-06-23 14:31:36,917] Trial 0 finished with value: 0.3736733287388008 and parameters: {'min_impurity_decrease': 0.47282212622738023, 'ccp_alpha': 0.5207053681934386}. Best is trial 0 with value: 0.3736733287388008.
[I 2024-06-23 14:31:37,093] Trial 1 finished with value: 0.3736733287388008 and parameters: {'min_impurity_decrease': 0.4520619619405532, 'ccp_alpha': 0.5804457717347624}. Best is trial 0 with value: 0.3736733287388008.
[I 2024-06-23 14:31:37,333] Trial 2 finished with value: 0.3736733287388008 and parameters: {'min_impurity_decrease': 0.5802318382989307, 'ccp_alpha': 0.6325751934798952}. Best is trial 0 with value: 0.3736733287388008.
[I 2024-06-23 14:31:37,415] Trial 3 finished with value: 0.3736733287388008 and parameters: {'min_impurity_decrease': 0.6359655932995998, 'ccp_alpha': 0.8758464710567493}. Best is trial 0 with value: 0.3736733287388008.
[I 20

Best hyperparameters:  {'min_impurity_decrease': 2.8594812578230564e-05, 'ccp_alpha': 5.395001099118109e-05}
Best value:  0.8381805651274983
