In [1]:
!pip install sagemaker==1.72.0



In [2]:
import os
import sagemaker
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
## read in data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
valid = test = pd.read_csv('data/valid.csv')

In [5]:
# First, use original variables
train = train.drop(['stays_in_night', 'total'],1) 
test = test.drop(['stays_in_night', 'total'],1) 
valid = valid.drop(['stays_in_night', 'total'],1) 

In [6]:
#split X and Y#
train_Y = train['is_canceled']
train_X = train.drop(['is_canceled'],1)

test_Y = test['is_canceled']
test_X = test.drop(['is_canceled'],1)

valid_Y = valid['is_canceled']
valid_X = valid.drop(['is_canceled'],1)

In [7]:
data_dir = 'data/xgboost'
if not os.path.exists(data_dir):  
    os.makedirs(data_dir)

In [8]:
## save data
test_X.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([train_Y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([valid_Y, valid_X], axis=1).to_csv(os.path.join(data_dir, 'valid.csv'), header=False, index=False)

In [9]:
## use XGBoost to cunstruct a model

In [10]:
session = sagemaker.Session() # Store the current SageMaker session

In [11]:
# S3 prefix (which folder will we use)
prefix = 'captsone-xgboost'

In [12]:
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
valid_location = session.upload_data(os.path.join(data_dir, 'valid.csv'), key_prefix=prefix)

In [13]:
#role and container
role = get_execution_role()

In [14]:
import boto3
region = boto3.Session().region_name

In [15]:
container = get_image_uri(region, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [16]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [17]:
xgb.set_hyperparameters(max_depth=4, # change these set_hyperparameters to see the influence
                        eta=0.1,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        num_class = 2,
                        objective='multi:softmax', # try different models
                        early_stopping_rounds=10,
                        num_round=80)

In [18]:
## Fit the XGBoost model
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=valid_location, content_type='csv')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-05-29 13:57:28 Starting - Starting the training job...
2021-05-29 13:57:30 Starting - Launching requested ML instances......
2021-05-29 13:58:38 Starting - Preparing the instances for training.........
2021-05-29 14:00:25 Downloading - Downloading input data...
2021-05-29 14:00:55 Training - Training image download completed. Training in progress.
2021-05-29 14:00:55 Uploading - Uploading generated training model.[34mArguments: train[0m
[34m[2021-05-29:14:00:54:INFO] Running standalone xgboost training.[0m
[34m[2021-05-29:14:00:54:INFO] File size need to be processed in the node: 0.08mb. Available memory size in the node: 8413.74mb[0m
[34m[2021-05-29:14:00:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:00:54] S3DistributionType set as FullyReplicated[0m
[34m[14:00:54] 1467x13 matrix with 19071 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-05-29:14:00:54:INFO] Determined delimiter of CSV input is ','[0m


In [19]:
## Testing the model
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [20]:
xgb_transformer.wait()

.............................[34mArguments: serve[0m
[34m[2021-05-29 14:10:17 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-05-29 14:10:17 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-05-29 14:10:17 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-05-29 14:10:17 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-05-29 14:10:17 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:10:17:INFO] Model loaded successfully for worker : 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:10:17:INFO] Model loaded successfully for worker : 21[0m
[34m[2021-05-29 14:10:17 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-05-29 14:10:17 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:10:17:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-2

In [21]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-445297930402/xgboost-2021-05-29-14-05-31-244/test.csv.out to data/xgboost/test.csv.out


In [22]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [23]:
def results(predictions, test_Y):
    tp = fp = fn = tn = 0
    
    for i in range(len(test_Y)):
        # true positive
        if test_Y[i]==predictions[i]==1: tp = tp+1
        # true negative
        elif test_Y[i]==predictions[i]==0: tn = tn+1
        # false negative
        elif test_Y[i]==1: fn = fn+1
        # false positive
        else: fp = fp+1
    
    return tp, fp, fn, tn
    

In [24]:
# How many canceled rooms are predicted correctly?
tp, _, _, _ = results(predictions, test_Y)
print('true-positive:', tp)
print('accuracy:', accuracy_score(test_Y, predictions))


true-positive: 0
accuracy: 0.7841269841269841


In [32]:
# Tre prediction results are not good. True-Positive is always 0. True to transform the factors.

In [25]:
# In second attempt, use the added variables
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
valid = test = pd.read_csv('data/valid.csv')


train = train.drop(['stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies'],1) 
test = test.drop(['stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies'],1)
valid = valid.drop(['stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies'],1)


train_Y = train['is_canceled']
train_X = train.drop(['is_canceled'],1)

test_Y = test['is_canceled']
test_X = test.drop(['is_canceled'],1)

valid_Y = valid['is_canceled']
valid_X = valid.drop(['is_canceled'],1)


test_X.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([train_Y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([valid_Y, valid_X], axis=1).to_csv(os.path.join(data_dir, 'valid.csv'), header=False, index=False)

In [26]:
session = sagemaker.Session()

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
valid_location = session.upload_data(os.path.join(data_dir, 'valid.csv'), key_prefix=prefix)

role = get_execution_role()
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

xgb = sagemaker.estimator.Estimator(container, 
                                    role,                                  
                                    train_instance_count=1,                 
                                    train_instance_type='ml.m4.xlarge',    
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

xgb.set_hyperparameters(max_depth=4, 
                        eta=0.1,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        num_class = 2,
                        objective='multi:softmax', 
                        early_stopping_rounds=10,
                        num_round=80)


s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=valid_location, content_type='csv')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-05-29 14:18:26 Starting - Starting the training job...
2021-05-29 14:18:28 Starting - Launching requested ML instances......
2021-05-29 14:19:38 Starting - Preparing the instances for training......
2021-05-29 14:20:49 Downloading - Downloading input data...
2021-05-29 14:21:20 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-05-29:14:21:42:INFO] Running standalone xgboost training.[0m
[34m[2021-05-29:14:21:42:INFO] File size need to be processed in the node: 0.07mb. Available memory size in the node: 8430.09mb[0m
[34m[2021-05-29:14:21:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:21:42] S3DistributionType set as FullyReplicated[0m
[34m[14:21:42] 1467x10 matrix with 14670 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-05-29:14:21:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:21:42] S3DistributionType set as FullyReplicated[0m
[34m[14:21:42] 315x10 matrix with

In [27]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


..............................
[34mArguments: serve[0m
[34m[2021-05-29 14:27:31 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-05-29 14:27:31 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-05-29 14:27:31 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-05-29 14:27:31 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-05-29 14:27:31 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-05-29 14:27:31 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:27:31:INFO] Model loaded successfully for worker : 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29 14:27:31 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2021-05-29:14:27:31:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:27:31:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05

In [28]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-445297930402/xgboost-2021-05-29-14-22-39-898/test.csv.out to data/xgboost/test.csv.out


In [29]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]
tp, _, _, _ = results(predictions, test_Y)
tp

0

In [30]:
print('accuracy:', accuracy_score(test_Y, predictions))

accuracy: 0.7841269841269841


In [31]:
# The resultes are not good at all. Try to simplify the factors.
# This time, I use only 'hotel', 'lead_time', 'arrival_date_week_number', 'reserved_room_type', 'customer_type', 'adr', 'stays_in_night', 'total'
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
valid = test = pd.read_csv('data/valid.csv')


keep_variables = ['hotel', 'is_canceled', 'lead_time', 'arrival_date_week_number', 'reserved_room_type', 
                  'customer_type', 'adr', 'stays_in_night', 'total']
train = train[keep_variables]
test= test[keep_variables]
valid = valid[keep_variables]


train_Y = train['is_canceled']
train_X = train.drop(['is_canceled'],1)

test_Y = test['is_canceled']
test_X = test.drop(['is_canceled'],1)

valid_Y = valid['is_canceled']
valid_X = valid.drop(['is_canceled'],1)


test_X.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([train_Y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([valid_Y, valid_X], axis=1).to_csv(os.path.join(data_dir, 'valid.csv'), header=False, index=False)


In [32]:
session = sagemaker.Session()

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
valid_location = session.upload_data(os.path.join(data_dir, 'valid.csv'), key_prefix=prefix)

role = get_execution_role()
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

xgb = sagemaker.estimator.Estimator(container, 
                                    role,                                 
                                    train_instance_count=1,                  
                                    train_instance_type='ml.m4.xlarge',   
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

xgb.set_hyperparameters(max_depth=4, 
                        eta=0.1,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        num_class = 2,
                        objective='multi:softmax', 
                        early_stopping_rounds=10,
                        num_round=80)


s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=valid_location, content_type='csv')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-05-29 14:34:47 Starting - Starting the training job...
2021-05-29 14:34:49 Starting - Launching requested ML instances......
2021-05-29 14:36:15 Starting - Preparing the instances for training.........
2021-05-29 14:37:33 Downloading - Downloading input data...
2021-05-29 14:38:03 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-05-29:14:38:26:INFO] Running standalone xgboost training.[0m
[34m[2021-05-29:14:38:26:INFO] File size need to be processed in the node: 0.07mb. Available memory size in the node: 8428.23mb[0m
[34m[2021-05-29:14:38:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:38:26] S3DistributionType set as FullyReplicated[0m
[34m[14:38:26] 1467x8 matrix with 11736 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-05-29:14:38:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:38:26] S3DistributionType set as FullyReplicated[0m
[34m[14:38:26] 315x8 matrix wit

In [33]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


...............................[34mArguments: serve[0m
[34m[2021-05-29 14:48:02 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-05-29 14:48:02 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-05-29 14:48:02 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-05-29 14:48:02 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:48:02:INFO] Model loaded successfully for worker : 21[0m
[34m[2021-05-29 14:48:02 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-05-29 14:48:02 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:48:02:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05-29:14:48:02:INFO] Model loaded successfully for worker : 23[0m
[34m[2021-05-29 14:48:02 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-05

In [34]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-445297930402/xgboost-2021-05-29-14-43-00-144/test.csv.out to data/xgboost/test.csv.out


In [35]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]
tp, _, _, _ = results(predictions, test_Y)
tp

0

In [36]:
print('accuracy:', accuracy_score(test_Y, predictions))

accuracy: 0.3873015873015873
