In [1]:
import sagemaker
from sagemaker import get_execution_role

import boto3

# Create the SageMaker session and resolve the IAM role attached to this notebook.
sess = sagemaker.Session()
role = get_execution_role()

# Look up the built-in XGBoost image in the region this notebook is running in,
# rather than a hard-coded 'us-east-1', so the image matches the training region.
container = sagemaker.amazon.amazon_estimator.get_image_uri(boto3.Session().region_name, "xgboost", "latest")



In [4]:
# Inputs for the training job. Note that the validation channel reads the
# labelled copy of the test split ('test-trans-vec-csv-1').
s3_train_data = 's3://mastering-ml-aws/chapter4/train-trans-vec-csv-1/'
s3_validation_data = 's3://mastering-ml-aws/chapter4/test-trans-vec-csv-1/'

# The '-no-label' variant of the test split, used as batch transform input.
s3_test_data = 's3://mastering-ml-aws/chapter4/test-trans-vec-csv-no-label/'

# Prefix that both the estimator and the transformer write their output to.
s3_output_location = 's3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/'

In [3]:
# Generic estimator around the XGBoost container image, run under the notebook's role.
sagemaker_model = sagemaker.estimator.Estimator(
    container,
    role,
    sagemaker_session=sess,
    input_mode='File',
    output_path=s3_output_location,
    # Training fleet: one ml.c4.4xlarge instance with a volume size of 30.
    train_instance_type='ml.c4.4xlarge',
    train_instance_count=1,
    train_volume_size=30,
    # Upper bound on the training run time.
    train_max_run=360000,
)


In [5]:
def _csv_channel(s3_prefix):
    """Build a fully replicated CSV input channel over every object under ``s3_prefix``."""
    return sagemaker.session.s3_input(s3_prefix,
                                      distribution='FullyReplicated',
                                      content_type='text/csv',
                                      s3_data_type='S3Prefix')


# Binary classifier settings: 50 boosting rounds of depth-5 trees.
sagemaker_model.set_hyperparameters(
    objective='binary:logistic',
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    silent=0,
)

# Both channels are read as CSV from their S3 prefixes.
train_data = _csv_channel(s3_train_data)
validation_data = _csv_channel(s3_validation_data)
data_channels = {
    'train': train_data,
    'validation': validation_data,
}

# Launch the training job and stream its logs into the cell output.
sagemaker_model.fit(inputs=data_channels, logs=True)


INFO:sagemaker:Creating training-job with name: xgboost-2019-04-27-20-39-02-968


2019-04-27 20:39:03 Starting - Starting the training job...
2019-04-27 20:39:05 Starting - Launching requested ML instances......
2019-04-27 20:40:11 Starting - Preparing the instances for training...
2019-04-27 20:41:03 Downloading - Downloading input data......
2019-04-27 20:41:47 Training - Training image download completed. Training in progress.
[31mArguments: train[0m
[31m[2019-04-27:20:41:48:INFO] Running standalone xgboost training.[0m
[31m[2019-04-27:20:41:48:INFO] File size need to be processed in the node: 4600.52mb. Available memory size in the node: 22216.55mb[0m
[31m[2019-04-27:20:41:48:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:41:48] S3DistributionType set as FullyReplicated[0m
[31m[20:42:16] 9601250x100 matrix with 960125000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-04-27:20:42:16:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:42:16] S3DistributionType set as FullyReplicated[

[31m[20:47:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[38]#011train-error:0.170609#011validation-error:0.169888[0m
[31m[20:47:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[39]#011train-error:0.170609#011validation-error:0.169888[0m
[31m[20:47:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[40]#011train-error:0.169668#011validation-error:0.169047[0m
[31m[20:47:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[41]#011train-error:0.170609#011validation-error:0.169888[0m
[31m[20:47:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[42]#011train-error:0.169668#011validation-error:0.169047[0m
[31m[20:48:07] src/tree/updater_prune.cc:74: tree pruning e

In [45]:
# Batch-score the test data with the trained model on a single ml.m4.2xlarge;
# results are written under the same prefix as the training output.
transformer = sagemaker_model.transformer(
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    output_path=s3_output_location,
)

# Each line of the CSV input is treated as one record.
transformer.transform(s3_test_data, split_type='Line', content_type='text/csv')

# Block until the transform job has finished.
transformer.wait()


INFO:sagemaker:Creating model with name: xgboost-2019-04-27-20-39-02-968
INFO:sagemaker:Creating transform job with name: xgboost-2019-04-28-01-26-38-372


..............................................!


In [46]:
# Show the S3 prefix the transformer was configured to write its results to.
transformer.output_path

's3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/'

In [47]:
!aws s3 ls s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/ | head

                           PRE xgboost-2019-04-27-20-39-02-968/
2019-04-27 23:24:16          0 _SUCCESS.out
2019-04-28 01:29:58     361031 part-00000-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:29:58     361045 part-00001-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:29:59     361060 part-00002-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:29:59     361067 part-00003-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:29:59     361081 part-00004-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:29:59     361082 part-00005-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:30:01     361076 part-00006-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out
2019-04-28 01:30:00     361054 part-00007-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out

[Errno 32] Broken pipe
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
BrokenPipeError: [Errno 32] Broken 

In [68]:
import pandas as pd

# Read the first part file written by the batch transform job: one score per line,
# no header row. The prefix is `s3_output_location`, the output path the transformer
# was created with; the previously used name `output_path` is not defined anywhere
# in this notebook, so the cell failed on a fresh kernel.
scores_df = pd.read_csv(
    s3_output_location + 'part-00000-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out',
    header=None,
    names=['score'])


In [69]:
# Preview the first five scores (five rows is the default for head()).
scores_df.head()

Unnamed: 0,score
0,0.224591
1,0.166977
2,0.040747
3,0.166977
4,0.166977


In [52]:
# Recursively list the output prefix and keep only the keys containing "model",
# to locate the artifact written by the training job.
!aws s3 ls --recursive s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/ | grep model

2019-04-27 20:48:59       9688 chapter4/sagemaker/output/xgboost/xgboost-2019-04-27-20-39-02-968/output/model.tar.gz


In [53]:
# Download the model archive from the training job's output folder to /tmp.
!aws s3 cp s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/xgboost-2019-04-27-20-39-02-968/output/model.tar.gz /tmp/model.tar.gz

Completed 9.5 KiB/9.5 KiB (114.4 KiB/s) with 1 file(s) remainingdownload: s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/xgboost-2019-04-27-20-39-02-968/output/model.tar.gz to ../../../../../tmp/model.tar.gz


In [56]:
# Unpack the archive into the current working directory; a later cell opens the
# extracted `xgboost-model` file by relative path.
!tar xvf /tmp/model.tar.gz

xgboost-model


In [57]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/6a/49/7e10686647f741bd9c8918b0decdb94135b542fe372ca1100739b8529503/xgboost-0.82-py2.py3-none-manylinux1_x86_64.whl (114.0MB)
[K    100% |████████████████████████████████| 114.0MB 390kB/s eta 0:00:01 2% |█                               | 3.2MB 53.3MB/s eta 0:00:03    12% |████                            | 14.5MB 44.4MB/s eta 0:00:03    38% |████████████▏                   | 43.4MB 41.9MB/s eta 0:00:02    59% |███████████████████             | 67.7MB 40.2MB/s eta 0:00:02
Installing collected packages: xgboost
Successfully installed xgboost-0.82
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [58]:
import xgboost
import pickle as pkl

# Load the extracted model file with pickle. The `with` block closes the file
# handle as soon as the model is loaded instead of leaking it.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# artifacts produced by a training job you trust.
with open('xgboost-model', 'rb') as model_file:
    model_local = pkl.load(model_file)


In [67]:
!aws s3 cp s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/part-00000-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out - | head

0.224591374397
0.166976705194
0.0407470166683
0.166976705194
0.166976705194
0.166976705194
0.166976705194
0.166976705194
0.111299999058
0.124251760542
download failed: s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost/part-00000-19e45462-84f7-46ac-87bf-d53059e0c60c-c000.csv.out to - [Errno 32] Broken pipe


In [78]:
!aws s3 ls s3://mastering-ml-aws/chapter4/test-trans-vec-csv-1/ | head

2019-04-27 01:20:51          0 _SUCCESS
2019-04-27 01:20:35    9700260 part-00000-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:36    9711918 part-00001-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:36    9611016 part-00002-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:36    9685788 part-00003-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:35    9665286 part-00004-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:35    9648000 part-00005-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:35    9644382 part-00006-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:35    9638754 part-00007-25f35551-ffff-41d8-82a9-75f429553035-c000.csv
2019-04-27 01:20:35    9553932 part-00008-25f35551-ffff-41d8-82a9-75f429553035-c000.csv

[Errno 32] Broken pipe
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
BrokenPipeError: [Errno 32] Broken pipe


In [86]:
# Column layout of the labelled CSV: the 'click' label first, then f0 .. f99.
column_names = ['click'] + ['f' + str(idx) for idx in range(100)]

# Load a single part file of the validation data; the file has no header row.
validation_df = pd.read_csv(
    s3_validation_data + 'part-00000-25f35551-ffff-41d8-82a9-75f429553035-c000.csv',
    names=column_names,
    header=None,
)


In [87]:
# Show a bounded preview instead of rendering the whole frame.
validation_df.head(10)

Unnamed: 0,click,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
# Score the validation sample locally with the unpickled model. Only the feature
# columns are passed in; the leading 'click' label column is skipped.
feature_frame = validation_df.loc[:, column_names[1:]]
matrix = xgboost.DMatrix(feature_frame)
validation_df['score'] = model_local.predict(matrix)


In [89]:
# Preview the first five rows, now including the locally computed 'score' column.
validation_df.head(5)


Unnamed: 0,click,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,score
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166977
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166977
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166977
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166977
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166977


In [90]:
# Feature columns as a NumPy array. `.values` replaces `DataFrame.as_matrix()`,
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
validation_df[column_names[1:]].values


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [91]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the local scores against the 'click' labels.
roc_auc_score(y_true=validation_df['click'], y_score=validation_df['score'])


0.6698020721606562