In [None]:
!pip install sagemaker==1.72.0

In [1]:
import os
import shutil

import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

In [2]:
## read in data
# Each split is loaded from its own file. `test` and `valid` must stay separate
# frames so the final accuracy is measured on data not used during training.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
valid = pd.read_csv('data/valid.csv')

In [3]:
# Split every data set into its label column (Y) and the remaining feature columns (X).
# `columns=` is used instead of a positional axis argument, which newer pandas
# versions no longer accept.
train_Y = train['is_canceled']
train_X = train.drop(columns=['is_canceled'])

test_Y = test['is_canceled']
test_X = test.drop(columns=['is_canceled'])

valid_Y = valid['is_canceled']
valid_X = valid.drop(columns=['is_canceled'])

In [4]:
# Local directory that holds the CSV files written for the XGBoost job.
data_dir = 'data/xgboost'
# exist_ok=True makes the cell safe to re-run and avoids a separate existence check.
os.makedirs(data_dir, exist_ok=True)

In [5]:
## save data
# SageMaker's built-in XGBoost reads CSV files that have no header row and carry
# the label in the first column, so every file is written without a header and
# Y is concatenated in front of X. The test file contains features only.
test_X.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([train_Y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([valid_Y, valid_X], axis=1).to_csv(os.path.join(data_dir, 'valid.csv'), header=False, index=False)

In [6]:
## use XGBoost to construct a model

In [None]:
# Session object reused by the later cells for S3 uploads, the region name and the default bucket.
session = sagemaker.Session() # Store the current SageMaker session

In [None]:
# S3 key prefix (the "folder") used for the uploaded CSV files and for the training output path.
# NOTE(review): 'captsone' looks like a typo for 'capstone'; left unchanged because
# the value determines where objects are stored in S3 - confirm before renaming.
prefix = 'captsone-xgboost'

In [None]:
# Upload the three local CSV files under the shared S3 prefix and keep the returned locations.
test_location, train_location, valid_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'train.csv', 'valid.csv')
)

In [None]:
# IAM role for the jobs, and the XGBoost container image for the session's region
role = get_execution_role()
aws_region = session.boto_region_name
container = get_image_uri(aws_region, 'xgboost')

In [None]:
# S3 location passed to the estimator as its output path: <default bucket>/<prefix>/output
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Estimator that runs the XGBoost container on a single ml.m4.xlarge instance
xgb = sagemaker.estimator.Estimator(
    container,                           # container image to run
    role,                                # IAM role used by the job
    train_instance_count=1,              # number of compute instances
    train_instance_type='ml.m4.xlarge',  # kind of compute instance
    output_path=output_location,
    sagemaker_session=session,
)


In [None]:
# Hyperparameters for the training job, collected in one mapping and applied in a single call
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 50,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
## Fit the XGBoost model
# Describe the training and validation channels as CSV inputs, then start training.
channels = {
    'train': sagemaker.s3_input(s3_data=train_location, content_type='csv'),
    'validation': sagemaker.s3_input(s3_data=valid_location, content_type='csv'),
}
s3_input_train = channels['train']
s3_input_validation = channels['validation']

xgb.fit(channels)

In [None]:
## Testing the model
# Create a transformer from the fitted estimator, run it over the uploaded test
# file line by line, and wait on the job before continuing.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)
xgb_transformer.wait()

In [None]:
# Wait on the transform job (presumably blocks until it finishes).
# NOTE(review): the previous cell already ends with the same wait() call, so this cell looks redundant.
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
# Load the downloaded transform output (no header row) and round each value to an integer label
predictions_raw = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(score) for score in predictions_raw.squeeze().values]

In [None]:
# Share of test rows whose rounded prediction equals the true `is_canceled` label.
# The label series is named `test_Y` (capital Y) where it is created.
from sklearn.metrics import accuracy_score
accuracy_score(test_Y, predictions)

In [None]:
## Clean up

# Remove the local working directories created by this notebook.
# `cache_dir` is not defined anywhere in the notebook. An undefined name in a `!`
# command is handed to the shell unexpanded, where it evaluates to an empty string,
# so `rm $cache_dir/*` would become `rm /*`. It is therefore only cleaned up when
# the kernel actually has such a variable.
for _dir in (data_dir, globals().get('cache_dir')):
    # Skip missing directories so the cell can be re-run without errors.
    if _dir and os.path.isdir(_dir):
        shutil.rmtree(_dir)