# Compass.UOL - Sprint 4 & 5
## Equipe 3 - José Pedro, Pedro Montenegro, Natália Cardoso, Renan Mazzilli


## Treinamento do modelo XGBoost no SageMaker

### Imports e instalações necessárias

In [None]:
# Instalar as bibliotecas necessárias para o projeto localmente
%pip install pandas boto3 joblib scikit-learn sagemaker seaborn

# Importar as bibliotecas necessárias para o projeto no ambiente local
import json
import time

import boto3
import pandas as pd
import sagemaker
from sagemaker import image_uris
from sklearn.model_selection import train_test_split

### Configuração base do treinamento e declaração de variáveis globais

In [50]:
# Load the notebook settings from the JSON configuration file.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Expose the configuration entries as module-level variables.
table_name = config['table_name']
model_metrics_table = config['model_metrics_table']
dataset_path = config['dataset_path']
processed_dataset_path = config['processed_dataset_path']
model_path = config['model_path']
profile_name = config['profile_name']
aws_region = config['aws_region']

# SageMaker execution role and S3 bucket name, both read from the config.
role = config['role_name']
bucket_name = config['bucket_name']

# One boto3 session bound to the configured profile and region. Every AWS
# client below is created from it so they all share the same credentials and
# region (the S3 client previously fell back to the default credentials).
session = boto3.Session(profile_name=profile_name, region_name=aws_region)
dynamodb_client = session.client('dynamodb')
iam_client = session.client('iam')
s3_client = session.client('s3')

# SageMaker session and low-level SageMaker client, built on the same boto3
# session so training jobs run under the configured profile and region.
sagemaker_session = sagemaker.Session(boto_session=session)
sagemaker_client = sagemaker_session.boto_session.client('sagemaker')

# S3 prefix where the training job writes its output artifacts.
output_location = f's3://{bucket_name}/output'

### Carregando dados do DynamoDB e realizando o *split* entre dados de treino e dados de teste para o treinamento do modelo

In [52]:
# Load every item of a DynamoDB table into a DataFrame.
def fetch_data_from_dynamodb(table_name, boto_session=None):
    """Return all items of the DynamoDB table *table_name* as a DataFrame.

    A single ``scan`` call returns only one page of results, so the scan is
    repeated with ``ExclusiveStartKey`` until the response no longer carries
    a ``LastEvaluatedKey``.

    Args:
        table_name: Name of the DynamoDB table to read.
        boto_session: Optional session object used to create the DynamoDB
            resource. When omitted, a default ``boto3.Session()`` is created,
            which is what this function always did before.

    Returns:
        A ``pandas.DataFrame`` with one row per item; empty if the table
        has no items.
    """
    session = boto_session if boto_session is not None else boto3.Session()
    table = session.resource('dynamodb').Table(table_name)

    items = []
    scan_kwargs = {}
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get('Items', []))

        last_key = response.get('LastEvaluatedKey')
        if last_key is None:
            # No continuation key: this was the final page.
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    return pd.DataFrame(items)
# Prepare the data for training and validating the model on SageMaker,
# following the XGBoost documentation.
def prepare_data(df):
    """Split *df* into train/test features and integer labels.

    The label column ``label_avg_price_per_room`` is cast to ``int`` and
    removed from the feature frame. The input DataFrame is left untouched
    (the cast is applied to a derived Series, not written back into *df*).

    Args:
        df: DataFrame containing the ``label_avg_price_per_room`` column.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` sequence produced by
        ``train_test_split`` with a 25% test share and ``random_state=42``.
    """
    label_column = 'label_avg_price_per_room'
    y = df[label_column].astype(int)
    X = df.drop(columns=[label_column])
    return train_test_split(X, y, test_size=0.25, random_state=42)

# Load the data from the DynamoDB table named in config.json (table_name)
# instead of a hard-coded table name.
df = fetch_data_from_dynamodb(table_name)

# Split into training and validation sets.
X_train, X_test, y_train, y_test = prepare_data(df)

# Show the label distribution of each split.
print(y_train.value_counts())
print(y_test.value_counts())

# Map the labels to the range [0, num_class).
label_mapping = {value: idx for idx, value in enumerate(sorted(y_train.unique()))}
y_train = y_train.map(label_mapping)
y_test = y_test.map(label_mapping)

# A label that only occurs in the test split has no entry in label_mapping and
# becomes NaN, which would be written to the CSV as an empty label. Fail here
# with a clear message instead.
if y_test.isna().any():
    raise ValueError(
        'The test split contains labels that are absent from the training split; '
        'cannot map them to the range [0, num_class).'
    )

# Convert boolean values to integers so the CSV holds 0/1 rather than
# True/False.
X_train = X_train.map(lambda x: int(x) if isinstance(x, bool) else x)
X_test = X_test.map(lambda x: int(x) if isinstance(x, bool) else x)

# Put the label in the first column and save locally without a header row.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

train_data.to_csv('train.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)

# Upload both files to S3.
s3_client.upload_file('train.csv', bucket_name, 'train/train.csv')
s3_client.upload_file('test.csv', bucket_name, 'test/test.csv')

s3_train_data = f's3://{bucket_name}/train/train.csv'
s3_test_data = f's3://{bucket_name}/test/test.csv'

label_avg_price_per_room
2    371
1    246
3    220
Name: count, dtype: int64
label_avg_price_per_room
2    132
1     79
3     68
Name: count, dtype: int64


### Treinamento com Container Image do Modelo XGBoost

In [53]:
# Retrieve the XGBoost image URI for the region the SageMaker session uses,
# so the container and the training job are always in the same region.
# NOTE(review): version='latest' is kept as-is; consider pinning an explicit
# XGBoost version for reproducible training - confirm against the SageMaker docs.
container = image_uris.retrieve(
    framework='xgboost',
    region=sagemaker_session.boto_region_name,
    version='latest',
)

# Configure the XGBoost estimator.
xgboost = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session
)

# Hyperparameters. num_class is derived from the label mapping built during
# data preparation so it always matches the number of distinct labels.
xgboost.set_hyperparameters(
    num_round=100,
    objective='multi:softprob',
    num_class=len(label_mapping)
)

# Training and validation inputs, referenced by their S3 URIs.
train_input = sagemaker.inputs.TrainingInput(s3_data=s3_train_data, content_type='csv')
validation_input = sagemaker.inputs.TrainingInput(s3_data=s3_test_data, content_type='csv')

# Train the model.
data_channels = {'train': train_input, 'validation': validation_input}
xgboost.fit(data_channels)

# Wait for the training job to reach a terminal state and report the outcome.
# The job name does not change, so it is read once outside the loop.
training_job_name = xgboost.latest_training_job.name
while True:
    job_description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
    status = job_description['TrainingJobStatus']
    if status == 'Completed':
        print("Training completed successfully.")
        break
    if status == 'Failed':
        print("Training failed. Check the logs for details.")
        failure_reason = job_description.get('FailureReason')
        if failure_reason:
            print(f"Failure reason: {failure_reason}")
        break
    if status == 'Stopped':
        # 'Stopped' is also terminal; without this branch the loop never ends.
        print("Training job was stopped before completion.")
        break
    print(f"Training job status: {status}. Waiting for completion...")
    time.sleep(60)



INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-2024-05-24-18-50-31-520


2024-05-24 18:50:31 Starting - Starting the training job...
2024-05-24 18:50:47 Starting - Preparing the instances for training...
2024-05-24 18:51:12 Downloading - Downloading input data...
2024-05-24 18:51:32 Downloading - Downloading the training image...
2024-05-24 18:52:28 Training - Training image download completed. Training in progress.
2024-05-24 18:52:28 Uploading - Uploading generated training model
Arguments: train
[2024-05-24:18:52:21:INFO] Running standalone xgboost training.
[2024-05-24:18:52:21:INFO] File size need to be processed in the node: 0.24mb. Available memory size in the node: 23881.56mb
[2024-05-24:18:52:21:INFO] Determined delimiter of CSV input is ','
[18:52:21] S3DistributionType set as FullyReplicated
[18:52:21] 837x34 matrix with 28458 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[2024-05-24:18:52:21:INFO] Determined delimiter of CSV input is ','
[18:5

## Deploy de teste do modelo XGBoost para obtenção de métricas

In [None]:
# Imports used by this evaluation cell.
import json
from decimal import Decimal
from io import StringIO

import numpy as np
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def _parse_predictions(raw_response):
    """Convert the raw endpoint response into a NumPy array of predictions.

    The response is first parsed as JSON. The endpoint used here answers with
    comma-separated bracketed rows and no enclosing brackets
    (``[p0, p1, p2],[p0, p1, p2],...``), which is not valid JSON on its own,
    so on failure the text is wrapped in ``[...]`` and parsed again. This
    replaces the previous ``eval`` fallback, which would execute whatever
    text the endpoint returned.

    Raises:
        ValueError: If the response cannot be decoded or parsed either way.
    """
    if isinstance(raw_response, (bytes, bytearray)):
        text = raw_response.decode('utf-8')
    else:
        text = raw_response

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = json.loads(f'[{text}]')
    return np.array(parsed)


# Deploy the model as an endpoint.
predictor = xgboost.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large'
)

# Everything that uses the endpoint runs inside try/finally so the endpoint is
# deleted even when prediction or metric computation raises.
try:
    # Convert Decimal values to float. DataFrame.map is the non-deprecated
    # replacement for applymap and is already used during data preparation.
    X_test_float = X_test.map(lambda x: float(x) if isinstance(x, Decimal) else x)

    # Send the requests as CSV.
    predictor.serializer = CSVSerializer()

    # Convert the test features to a NumPy array.
    test_data_no_label = X_test_float.to_numpy()

    # Run the predictions.
    response = predictor.predict(test_data_no_label)

    # Show the raw response format.
    print("Raw response:")
    print(response)

    # Decode the response into an array of per-class probabilities.
    try:
        predictions_array = _parse_predictions(response)
    except ValueError as e:
        print(f"Error decoding response: {e}")
        predictions_array = None

    # Only compute metrics when the predictions were decoded successfully.
    if predictions_array is not None:
        predicted_labels = np.argmax(predictions_array, axis=1)

        # Map the class indices back to the original label values.
        inverse_label_mapping = {v: k for k, v in label_mapping.items()}
        predicted_labels = np.vectorize(inverse_label_mapping.get)(predicted_labels)
        y_test_mapped = y_test.map(inverse_label_mapping)

        # Compute the metrics.
        accuracy = accuracy_score(y_test_mapped, predicted_labels)
        precision = precision_score(y_test_mapped, predicted_labels, average='weighted')
        recall = recall_score(y_test_mapped, predicted_labels, average='weighted')
        f1 = f1_score(y_test_mapped, predicted_labels, average='weighted')
        conf_matrix = confusion_matrix(y_test_mapped, predicted_labels)

        # Print the metrics.
        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1 Score: {f1}')
        print('Confusion Matrix:')
        print(conf_matrix)
    else:
        print("Failed to decode predictions array.")
finally:
    # Delete the endpoint to avoid additional charges.
    predictor.delete_endpoint()


INFO:sagemaker:Creating model with name: xgboost-2024-05-23-04-11-33-142
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-05-23-04-11-33-142
INFO:sagemaker:Creating endpoint with name xgboost-2024-05-23-04-11-33-142


-----!Raw response:
b'[0.9900056719779968, 0.009982947260141373, 1.1388174243620597e-05],[0.8370394706726074, 0.16161483526229858, 0.0013457279419526458],[0.7032768130302429, 0.2613142728805542, 0.035408854484558105],[0.6987595558166504, 0.2988429367542267, 0.0023975346703082323],[0.9917477369308472, 0.008210496045649052, 4.173181878286414e-05],[0.8185534477233887, 0.1702447235584259, 0.011201783083379269],[0.3845750391483307, 0.6021490097045898, 0.013275918550789356],[0.0030306403059512377, 0.5789738297462463, 0.4179955720901489],[0.00449243513867259, 0.9930773973464966, 0.002430172171443701],[0.5013061165809631, 0.49178487062454224, 0.0069090500473976135],[0.11467360705137253, 0.7762438654899597, 0.10908257961273193],[0.9358463883399963, 0.032888755202293396, 0.031264856457710266],[0.3822445273399353, 0.6028234958648682, 0.014931981451809406],[0.11627139896154404, 0.7387983798980713, 0.14493021368980408],[0.45775306224823, 0.13528740406036377, 0.40695950388908386],[0.4587438106536865

  X_test_float = X_test.applymap(lambda x: float(x) if isinstance(x, Decimal) else x)
INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2024-05-23-04-11-33-142


Accuracy: 0.7133757961783439
Precision: 0.7188837234525053
Recall: 0.7133757961783439
F1 Score: 0.7147494945196413
Confusion Matrix:
[[78 24  4]
 [23 87 13]
 [ 1 25 59]]


INFO:sagemaker:Deleting endpoint with name: xgboost-2024-05-23-04-11-33-142
