In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Adicionar caminho do módulo
module_path = str(Path.cwd().parents[0])
if module_path not in sys.path:
    sys.path.append(module_path)

# Carregar dados
df = pd.read_csv('../data/processed/HotelReservationsLabelP.csv')

# Função de classificação de preços
def classify_price(price):
    if price == 1:
        return 0.0
    elif price == 2:
        return 1.0
    else:
        return 2.0

df['label_avg_price_per_room'] = df['label_avg_price_per_room'].apply(classify_price)
print(df.head())
from scripts import data_preprocess

# Definir a coluna alvo
target_column = 'label_avg_price_per_room'

# Pré-processamento dos dados
preprocessor, X, y = data_preprocess.preprocess(df, target_column)
print("Dados pré-processados (X):")
print(X.head())
print("Target (y):")
print(y.head())


   label_avg_price_per_room  no_of_adults  no_of_children  \
0                       0.0             2               0   
1                       1.0             2               0   
2                       0.0             1               0   
3                       1.0             2               0   
4                       1.0             2               0   

   no_of_weekend_nights  no_of_week_nights type_of_meal_plan  \
0                     1                  2       Meal Plan 1   
1                     2                  3      Not Selected   
2                     2                  1       Meal Plan 1   
3                     0                  2       Meal Plan 1   
4                     1                  1      Not Selected   

   required_car_parking_space room_type_reserved  lead_time  arrival_year  \
0                           0        Room_Type 1        224          2017   
1                           0        Room_Type 1          5          2018   
2                

In [2]:
from sklearn.model_selection import train_test_split
from scripts import data_oversampler

X_prep = preprocessor.fit_transform(X)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

feature_names = list(numeric_features) + list(categorical_feature_names)
X_df = pd.DataFrame(X_prep, columns=feature_names)
y_df = y.reset_index(drop=True).to_frame()

print(X_df.head())

final_df = pd.concat([y_df, X_df], axis=1)
final_df.rename(columns={0: 'label_avg_price_per_room'}, inplace=True)
df_target = final_df['label_avg_price_per_room']
df_feat = final_df.drop(columns='label_avg_price_per_room')
X = np.array(df_feat).astype('float32')
y = np.array(df_target).astype('float32')
y = y.reshape(-1, 1)

print(final_df.head())

# Divisão dos dados
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
X_resampled, y_resampled = data_oversampler.oversample(X_train, y_train)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)


   no_of_adults  no_of_children  no_of_weekend_nights  no_of_week_nights  \
0           2.0             0.0                   1.0                2.0   
1           2.0             0.0                   2.0                3.0   
2           1.0             0.0                   2.0                1.0   
3           2.0             0.0                   0.0                2.0   
4           2.0             0.0                   1.0                1.0   

   required_car_parking_space  lead_time  arrival_year  arrival_month  \
0                         0.0      224.0        2017.0           10.0   
1                         0.0        5.0        2018.0           11.0   
2                         0.0        1.0        2018.0            2.0   
3                         0.0      211.0        2018.0            5.0   
4                         0.0       48.0        2018.0            4.0   

   arrival_date  repeated_guest  ...  room_type_reserved_Room_Type 7  \
0           2.0             0.0 

In [1]:
# Esse ta funcionando, 85% de precisão.
from dotenv import load_dotenv
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.sklearn import SKLearn
import os

load_dotenv()

# Carregar variáveis de ambiente
role = os.getenv('SAGEMAKER_ROLE')
BUCKET = os.getenv('BUCKET')
MODEL_SUBFOLDER = os.getenv('MODEL_SUBFOLDER')
DATASET_SUBFOLDER = os.getenv('DATASET_SUBFOLDER')
KEY_TRAIN = os.getenv('KEY_TRAIN')
KEY_TEST = os.getenv('KEY_TEST')
OUTPUT_LOCATION = os.getenv('OUTPUT_LOCATION')
S3_TRAIN_DATA = os.getenv('S3_TRAIN_DATA')
S3_TEST_DATA = os.getenv('S3_TEST_DATA')

# Inicializar sessão do SageMaker
session = sagemaker.Session(boto3.Session())

# Recuperar a URI da imagem do Random Forest
container = sagemaker.image_uris.retrieve(framework='sklearn', region=boto3.Session().region_name, version='0.23-1')

# Definir entradas de treinamento e validação
train_input = TrainingInput(s3_data=S3_TRAIN_DATA, content_type='csv')
validation_input = TrainingInput(s3_data=S3_TEST_DATA, content_type='csv')

import subprocess

# Executar o script train.py e capturar a saída
process = subprocess.Popen(['python3', 'train.py'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           text=True)

# Iterar sobre a saída e exibir no notebook
for line in process.stdout:
    print(line, end='')

for line in process.stderr:
    print(line, end='')

# Aguardar o término do processo
process.wait()



# Configurar o estimador do SageMaker para Random Forest
random_forest = SKLearn(
    entry_point='train_script.py',
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    framework_version='0.23-1',
    hyperparameters={
        'n_estimators': 262,
        'max_depth': 45,
        'min_samples_split': 4,
        'min_samples_leaf': 1,
        'bootstrap': 'True'  # Corrigir valor booleano
    },
    output_path=OUTPUT_LOCATION,
    sagemaker_session=session
)

# Treinamento do modelo
random_forest.fit({'train': train_input, 'validation': validation_input})


sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\pacie\AppData\Local\sagemaker\sagemaker\config.yaml
Python não encontrado; execute sem argumentos para instalar na Microsoft Store ou desabilite este atalho a partir de Configurações > Gerenciar Aliases de Execução do Aplicativo.


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-06-24-12-54-47-950


2024-06-24 12:54:51 Starting - Starting the training job...
2024-06-24 12:55:05 Starting - Preparing the instances for training...
2024-06-24 12:55:36 Downloading - Downloading input data...
2024-06-24 12:56:06 Downloading - Downloading the training image...
2024-06-24 12:56:46 Training - Training image download completed. Training in progress..2024-06-24 12:56:58,741 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-06-24 12:56:58,744 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-24 12:56:58,792 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-06-24 12:56:58,984 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-24 12:56:58,996 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-24 12:56:59,009 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-24 12: