In [1]:
import sagemaker
import boto3
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import pandas as pd
from sagemaker.serializers import CSVSerializer
import sklearn.metrics as metrics
import numpy as np

In [2]:
# S3 locations: input CSVs and the prefix where training output is written.
bucket = 's3://datascience-sagemaker-talitasantos'
treinamento_arquivo = bucket + '/treinamento.csv'
validacao_arquivo = bucket + '/validacao.csv'
saida = bucket + '/saida'

# Execution role as reported by the SageMaker SDK for this session.
role = sagemaker.get_execution_role()

# Instance type handed to the SageMaker calls in the cells below.
tipo_instancia = 'ml.m5.large'

# Image of the training algorithm.
xgboost_container = '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

# Prefix for the job name.
base_name = "cobranca-xgboost"

# Hyperparameters (values are kept as strings, exactly as originally declared).
hp = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

In [3]:
# Build the classifier (estimator) that will run the training job.
# The job-name prefix has to be passed as `base_job_name`; the previous
# `base_name=` keyword is not an Estimator parameter, so the
# "cobranca-xgboost" prefix was never used for the job name.
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type=tipo_instancia,
    volume_size=5,  # 5 GB
    output_path=saida,
    base_job_name=base_name,
    hyperparameters=hp,
)

In [4]:
# Describe the training and validation CSV channels, keyed by channel name.
canais = {
    'train': TrainingInput(s3_data=treinamento_arquivo, content_type='csv'),
    'validation': TrainingInput(s3_data=validacao_arquivo, content_type='csv'),
}

# Run the training job on both channels.
estimator.fit(inputs=canais)

2022-04-13 00:40:55 Starting - Starting the training job...
2022-04-13 00:41:18 Starting - Preparing the instances for trainingProfilerReport-1649810454: InProgress
.........
2022-04-13 00:42:39 Downloading - Downloading input data......
2022-04-13 00:43:39 Training - Downloading the training image...
2022-04-13 00:44:19 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-04-13:00:44:20:INFO] Running standalone xgboost training.[0m
[34m[2022-04-13:00:44:20:INFO] File size need to be processed in the node: 0.26mb. Available memory size in the node: 203.16mb[0m
[34m[2022-04-13:00:44:20:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:44:20] S3DistributionType set as FullyReplicated[0m
[34m[00:44:20] 6871x5 matrix with 34355 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-04-13:00:44:20:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:44:20] S3DistributionType

In [5]:
# Deploy the trained model to a single-instance endpoint; request payloads
# are serialized as CSV.
# NOTE(review): no cell in this view deletes the endpoint afterwards - confirm
# that cleanup happens elsewhere.
serializador_csv = CSVSerializer()
predictor = estimator.deploy(
    endpoint_name='datascience-credito-xgboost',
    instance_type=tipo_instancia,
    initial_instance_count=1,
    serializer=serializador_csv,
)

-----!

In [6]:
# Show the name of the endpoint the predictor is bound to.
predictor.endpoint_name

'datascience-credito-xgboost'

In [9]:
dados_teste = pd.read_csv('dados/teste.csv')

# Column 0 is treated as the label (it is the y_true column in the evaluation
# cell); every remaining column is sent to the endpoint as a feature.
atributos_teste = dados_teste.iloc[:, 1:].to_numpy()

# Send the rows in batches instead of issuing one endpoint call per row:
# the original loop made len(dados_teste) separate HTTP requests.
TAMANHO_LOTE = 500  # rows per request; keeps each payload small
n_lotes = max(1, int(np.ceil(len(atributos_teste) / TAMANHO_LOTE)))

pontuacoes = []
for lote in np.array_split(atributos_teste, n_lotes):
    if len(lote) == 0:
        # Nothing to score (empty test file).
        continue
    resposta = predictor.predict(lote).decode('utf8')
    # Accept either comma- or newline-separated scores in the response body.
    pontuacoes.extend(
        float(valor)
        for valor in resposta.replace('\n', ',').split(',')
        if valor.strip()
    )

predictions = np.array(pontuacoes)

# Guard against a response format that does not yield one score per input row.
assert len(predictions) == len(dados_teste), (
    f'expected {len(dados_teste)} predictions, got {len(predictions)}'
)

In [10]:
# Display the raw scores returned by the endpoint (one float per test row).
predictions

array([0.72364473, 0.19540872, 0.49424705, 0.65756559, 0.49721897,
       0.80037445, 0.7920208 , 0.8134582 , 0.40800068, 0.35018262,
       0.76159823, 0.70976967, 0.82950914, 0.48459858, 0.21546346,
       0.79409051, 0.75126588, 0.64431465, 0.16383848, 0.09328946,
       0.64911729, 0.80343235, 0.23370428, 0.8429575 , 0.22750813,
       0.18629546, 0.81430757, 0.25118485, 0.77832001, 0.13779278,
       0.64378685, 0.48804352, 0.36500689, 0.40471932, 0.48488134,
       0.82383406, 0.77439868, 0.27134734, 0.7920208 , 0.65359724,
       0.44603002, 0.80636388, 0.67924148, 0.48325774, 0.81776905,
       0.79886466, 0.22264259, 0.24088533, 0.8412196 , 0.19307512,
       0.67400217, 0.55664337, 0.75690848, 0.82950914, 0.83141261,
       0.77942705, 0.74372399, 0.10097948, 0.70436728, 0.63706279,
       0.71108752, 0.788809  , 0.89280975, 0.34523568, 0.71588856,
       0.59542078, 0.54490542, 0.41093674, 0.53263026, 0.11771972,
       0.75623614, 0.19048724, 0.77116275, 0.43602222, 0.82265

In [11]:
# Turn scores into hard 0/1 labels: strictly greater than 0.5 becomes 1,
# everything else becomes 0. Note that `predictions` is rebound to the labels.
acima_do_limiar = predictions > 0.5
predictions = acima_do_limiar.astype(int)
predictions

array([1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [12]:
# Compare the first column of the test frame (used as the true labels) with
# the predicted labels and print the per-class metrics report.
rotulos_reais = dados_teste.iloc[:, 0]
relatorio = metrics.classification_report(rotulos_reais, predictions)
print(relatorio)

              precision    recall  f1-score   support

           0       0.75      0.73      0.74       469
           1       0.76      0.78      0.77       512

    accuracy                           0.76       981
   macro avg       0.76      0.76      0.76       981
weighted avg       0.76      0.76      0.76       981

