In [1]:
# Importing sagemaker raises a ContextualVersionConflict because site-packages contains
# two versions of urllib3. Even though the right urllib3 version (1.23) is the active one,
# urllib3 1.25.3 has to be removed manually and the kernel restarted to make sagemaker work.
# This could probably be moved to post_setup.sh, but it is left here in case someone knows
# of a better solution.

import urllib3

# Fail early, with an actionable message, if the active urllib3 is not the expected one.
assert urllib3.__version__ == '1.23', (
    f"Expected urllib3 1.23 but found {urllib3.__version__}; remove the conflicting "
    "urllib3 install (see the command below) and restart the kernel."
)

# Manual fix, to be run once and followed by a kernel restart:
# !sudo rm -rf /opt/miniconda/envs/py3/lib/python3.6/site-packages/urllib3-1.25.3.dist-info

In [2]:
import os
import datetime

import boto3
import sagemaker
from sagemaker.tensorflow import TensorFlow
import tensorflow

from config import DATASET_URL, IMAGE_NAME, IAM_ROLE, MODEL_URL_PREFIX

In [3]:
# --- Training schedule ---------------------------------------------------------------
training_sample_count = 60000  # Standard size of training data for MNIST
batch_size = 64
num_epochs = 10

# Number of batches needed for `num_epochs` passes over the training samples;
# forwarded to the training script as the 'training_steps' hyperparameter.
training_step_count = int(num_epochs * training_sample_count / batch_size)

# --- Code and artifact locations -----------------------------------------------------
entry_point = "model.py"

# The timestamp is taken once, when this cell runs, and is embedded in both S3 prefixes.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
output_path = f"{MODEL_URL_PREFIX}{timestamp}/output"
code_location = f"{MODEL_URL_PREFIX}{timestamp}/code"

# --- AWS session ---------------------------------------------------------------------
# Credentials come from the *_PROD environment variables; os.environ.get() yields None
# for a variable that is not set.
boto_session = boto3.Session(
    region_name="us-east-1",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID_PROD"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY_PROD"),
)

# --- Estimator arguments -------------------------------------------------------------
# Shared keyword arguments for the TensorFlow estimator; the training cells below
# unpack this dict with TensorFlow(**kwargs).
kwargs = {
    "entry_point": entry_point,
    "image_name": IMAGE_NAME,
    "role": IAM_ROLE,
    "sagemaker_session": sagemaker.Session(boto_session=boto_session),
    "train_instance_count": 1,
    "train_instance_type": "ml.m5.xlarge",
    "framework_version": "1.13",
    # Hyperparameters handed to the entry-point script.
    "hyperparameters": {
        "dataset-url": DATASET_URL,
        "training_steps": training_step_count,
        "batch_size": batch_size,
        "evaluation_steps": 10,
    },
    "py_version": "py3",
    "output_path": output_path,
    "code_location": code_location,
    # Parameter-server distribution is switched on here, including for the
    # single-instance run.
    "distributions": {"parameter_server": {"enabled": True}},
}

# Training on a single sagemaker instance

In [4]:
# Build the estimator from the shared `kwargs` defined in the configuration cell and
# start the training job. `%time` reports the CPU and wall time of the fit() call.
# No input channels are passed (inputs=None); presumably model.py loads the data itself
# from the 'dataset-url' hyperparameter -- TODO confirm against model.py.
mnist_estimator = TensorFlow(**kwargs)
%time  mnist_estimator.fit(inputs=None)  # we're bypassing the conventional sagemaker input methods

2019-07-02 18:43:48 Starting - Starting the training job...
2019-07-02 18:43:50 Starting - Launching requested ML instances......
2019-07-02 18:44:50 Starting - Preparing the instances for training...
2019-07-02 18:45:49 Downloading - Downloading input data
2019-07-02 18:45:49 Training - Downloading the training image......
2019-07-02 18:46:47 Training - Training image download completed. Training in progress..
[31m2019-07-02 18:46:50,617 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-07-02 18:46:50,623 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:46:50,966 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:46:50,981 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:46:50,991 sagemaker-containers INFO     Invoking user script
[0m
[31mTraining Env:
[0m
[31m{
    "additional_f

[31mInstructions for updating:[0m
[31mUse `tf.data.experimental.map_and_batch(...)`.[0m
[31m2019-07-02 18:47:01,343 - tensorflow - INFO - Calling model_fn.[0m
[31m2019-07-02 18:47:01,804 - tensorflow - INFO - Done calling model_fn.[0m
[31m2019-07-02 18:47:01,806 - tensorflow - INFO - Create CheckpointSaverHook.[0m
[31m2019-07-02 18:47:01.842278: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:47:01.842335: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:47:01.880407: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:47:01.880461: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:47:01.900579: E tensorflow/core/p

[31m2019-07-02 18:47:35,264 - tensorflow - INFO - global_step/sec: 27.9899[0m
[31m2019-07-02 18:47:35,266 - tensorflow - INFO - loss = 0.7298758, step = 801 (3.573 sec)[0m
[31m2019-07-02 18:47:38,687 - tensorflow - INFO - global_step/sec: 29.2218[0m
[31m2019-07-02 18:47:38,688 - tensorflow - INFO - loss = 0.9928016, step = 901 (3.422 sec)[0m
[31m2019-07-02 18:47:42,760 - tensorflow - INFO - Saving checkpoints for 1000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:47:43.947977: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:47:43.948022: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:47:44,386 - tensorflow - INFO - Calling model_fn.[0m
[31m2019

[31m2019-07-02 18:48:40,210 - tensorflow - INFO - global_step/sec: 30.2274[0m
[31m2019-07-02 18:48:40,211 - tensorflow - INFO - loss = 0.9276402, step = 2401 (3.309 sec)[0m
[31m2019-07-02 18:48:44,324 - tensorflow - INFO - Saving checkpoints for 2500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:48:45.664410: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:48:45.664449: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:48:45,971 - tensorflow - INFO - Calling model_fn.[0m
[31m2019-07-02 18:48:46,122 - tensorflow - INFO - Done calling model_fn.[0m
[31m2019-07-02 18:48:46,146 - tensorflow - INFO - Starting evaluation at 2019-07-02-18:48:46[0m
[31m201

[31m2019-07-02 18:49:39,601 - tensorflow - INFO - global_step/sec: 26.7599[0m
[31m2019-07-02 18:49:39,602 - tensorflow - INFO - loss = 0.31026232, step = 3801 (3.737 sec)[0m
[31m2019-07-02 18:49:42,407 - tensorflow - INFO - global_step/sec: 35.6401[0m
[31m2019-07-02 18:49:42,408 - tensorflow - INFO - loss = 0.085260645, step = 3901 (2.806 sec)[0m
[31m2019-07-02 18:49:45,947 - tensorflow - INFO - Saving checkpoints for 4000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:49:47.524771: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:49:47.524809: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:49:47,912 - tensorflow - INFO - Calling model_fn.[0m
[31

[31m2019-07-02 18:50:40,765 - tensorflow - INFO - global_step/sec: 29.3148[0m
[31m2019-07-02 18:50:40,767 - tensorflow - INFO - loss = 0.10550964, step = 5301 (3.412 sec)[0m
[31m2019-07-02 18:50:43,876 - tensorflow - INFO - global_step/sec: 32.1487[0m
[31m2019-07-02 18:50:43,877 - tensorflow - INFO - loss = 0.29030272, step = 5401 (3.110 sec)[0m
[31m2019-07-02 18:50:47,098 - tensorflow - INFO - Saving checkpoints for 5500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:50:48.286933: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:50:48.286976: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:50:48,691 - tensorflow - INFO - Calling model_fn.[0m
[31m

[31m2019-07-02 18:51:42,942 - tensorflow - INFO - global_step/sec: 31.698[0m
[31m2019-07-02 18:51:42,944 - tensorflow - INFO - loss = 0.08908516, step = 6801 (3.156 sec)[0m
[31m2019-07-02 18:51:46,743 - tensorflow - INFO - global_step/sec: 26.3095[0m
[31m2019-07-02 18:51:46,744 - tensorflow - INFO - loss = 0.046961665, step = 6901 (3.800 sec)[0m
[31m2019-07-02 18:51:50,072 - tensorflow - INFO - Saving checkpoints for 7000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:51:51.615627: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:51:51.615665: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:51:52,303 - tensorflow - INFO - Calling model_fn.[0m
[31m

[31m2019-07-02 18:52:43,101 - tensorflow - INFO - global_step/sec: 30.3914[0m
[31m2019-07-02 18:52:43,102 - tensorflow - INFO - loss = 0.019097788, step = 8301 (3.291 sec)[0m
[31m2019-07-02 18:52:46,673 - tensorflow - INFO - global_step/sec: 27.9919[0m
[31m2019-07-02 18:52:46,674 - tensorflow - INFO - loss = 0.15632853, step = 8401 (3.572 sec)[0m
[31m2019-07-02 18:52:50,323 - tensorflow - INFO - Saving checkpoints for 8500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-43-48-315/model/model.ckpt.[0m
[31m2019-07-02 18:52:51.755041: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:52:51.755078: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:52:52,223 - tensorflow - INFO - Calling model_fn.[0m
[31


2019-07-02 18:53:37 Uploading - Uploading generated training model
2019-07-02 18:53:37 Completed - Training job completed
Billable seconds: 470
CPU times: user 1.24 s, sys: 132 ms, total: 1.37 s
Wall time: 10min 17s


Accuracy should be around 0.95, with a wall time of 10 min 17 s.

# Training on 5 sagemaker instances

With just minimal code changes, we can run the same total number of steps, but distributed over 5 instances.

In [5]:
kwargs['train_instance_count'] = 5
mnist_estimator = TensorFlow(**kwargs)
%time  mnist_estimator.fit(inputs=None)

2019-07-02 18:54:06 Starting - Starting the training job...
2019-07-02 18:54:07 Starting - Launching requested ML instances......
2019-07-02 18:55:08 Starting - Preparing the instances for training......
2019-07-02 18:56:12 Downloading - Downloading input data
2019-07-02 18:56:12 Training - Downloading the training image....
[31m2019-07-02 18:57:09,640 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-07-02 18:57:09,646 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:57:09,807 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[31m2019-07-02 18:57:09,807 sagemaker_tensorflow_container.training INFO     Launching parameter server process[0m
[31m2019-07-02 18:57:09,807 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[31m2019-07-02 18:57:09,851 sagemaker_tensorflo

[31m2019-07-02 18:57:10,680 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-07-02 18:57:10,686 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:57:10,836 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[31m2019-07-02 18:57:10,837 sagemaker_tensorflow_container.training INFO     Launching parameter server process[0m
[31m2019-07-02 18:57:10,837 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[31m2019-07-02 18:57:10,879 sagemaker_tensorflow_container.training INFO     Launching worker process[0m
[31m2019-07-02 18:57:10,998 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:57:11,016 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-07-02 18:57:11,026 sagemaker-container


2019-07-02 18:57:10 Training - Training image download completed. Training in progress.[33m2019-07-02 18:57:26,315 - tensorflow - INFO - Done calling model_fn.[0m
[33m2019-07-02 18:57:26,316 - tensorflow - INFO - Create CheckpointSaverHook.[0m
[33m2019-07-02 18:57:26,570 - tensorflow - INFO - Graph was finalized.[0m
[33m2019-07-02 18:57:26,638 - tensorflow - INFO - Running local_init_op.[0m
[33m2019-07-02 18:57:26,644 - tensorflow - INFO - Done running local_init_op.[0m
[33m2019-07-02 18:57:26,814 - tensorflow - INFO - loss = 2.3823955, step = 15[0m
[31m2019-07-02 18:57:26,470 - tensorflow - INFO - loss = 20.751148, step = 0[0m
[31m2019-07-02 18:57:30,032 - tensorflow - INFO - loss = 1.9236612, step = 193 (3.563 sec)[0m
[33m2019-07-02 18:57:30,317 - tensorflow - INFO - loss = 2.0668314, step = 213 (3.503 sec)[0m
[31m2019-07-02 18:57:33,845 - tensorflow - INFO - loss = 1.993006, step = 397 (3.813 sec)[0m
[33m2019-07-02 18:57:34,103 - tensorflow - INFO - loss = 1.91

[32m2019-07-02 18:57:53,286 - tensorflow - INFO - Running local_init_op.[0m
[32m2019-07-02 18:57:53,294 - tensorflow - INFO - Done running local_init_op.[0m
[32m2019-07-02 18:57:53,505 - tensorflow - INFO - loss = 0.53786814, step = 1997[0m
[32m2019-07-02 18:57:54,418 - tensorflow - INFO - global_step/sec: 116.426[0m
[33m2019-07-02 18:57:54,053 - tensorflow - INFO - loss = 0.27410915, step = 2067 (4.163 sec)[0m
[32m2019-07-02 18:57:55,557 - tensorflow - INFO - global_step/sec: 92.1522[0m
[31m2019-07-02 18:57:55.610185: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-07-02 18:57:55.610227: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-07-02 18:57:55,833 - tensorflow - INFO - Skip the current checkpoint eval due to throttle secs (10 secs).[0m
[31m2019-07-02 18:57:55,862 - tensorflow - INFO - loss = 0.2369

[34m2019-07-02 18:58:14,811 - tensorflow - INFO - loss = 0.10091041, step = 4308 (4.109 sec)[0m
[32m2019-07-02 18:58:13,957 - tensorflow - INFO - global_step/sec: 114.702[0m
[32m2019-07-02 18:58:14,963 - tensorflow - INFO - global_step/sec: 102.364[0m
[32m2019-07-02 18:58:15,475 - tensorflow - INFO - loss = 0.10776798, step = 4357 (4.815 sec)[0m
[32m2019-07-02 18:58:16,165 - tensorflow - INFO - global_step/sec: 88.1931[0m
[31m2019-07-02 18:58:17,105 - tensorflow - INFO - Saving checkpoints for 4537 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-54-05-832/model/model.ckpt.[0m
[32m2019-07-02 18:58:17,124 - tensorflow - INFO - global_step/sec: 111.582[0m
[34m2019-07-02 18:58:18,429 - tensorflow - INFO - loss = 0.12154327, step = 4667 (3.617 sec)[0m
[33m2019-07-02 18:58:18,543 - tensorflow - INFO - loss = 0.2301814, step = 4681 (4.395 sec)[0m
[32m2019-07-02 18:58:18,279 - ten

[32m2019-07-02 18:58:34,176 - tensorflow - INFO - global_step/sec: 117.36[0m
[32m2019-07-02 18:58:34,972 - tensorflow - INFO - global_step/sec: 126.909[0m
[32m2019-07-02 18:58:35,816 - tensorflow - INFO - global_step/sec: 123.242[0m
[32m2019-07-02 18:58:36,821 - tensorflow - INFO - global_step/sec: 106.421[0m
[32m2019-07-02 18:58:37,052 - tensorflow - INFO - loss = 0.047711678, step = 6698 (4.370 sec)[0m
[34m2019-07-02 18:58:38,520 - tensorflow - INFO - loss = 0.1076369, step = 6833 (4.086 sec)[0m
[32m2019-07-02 18:58:38,188 - tensorflow - INFO - global_step/sec: 83.4335[0m
[32m2019-07-02 18:58:38,918 - tensorflow - INFO - global_step/sec: 143.811[0m
[31m2019-07-02 18:58:38,590 - tensorflow - INFO - Skip the current checkpoint eval due to throttle secs (10 secs).[0m
[33m2019-07-02 18:58:38,968 - tensorflow - INFO - loss = 0.23757617, step = 6899 (3.968 sec)[0m
[32m2019-07-02 18:58:39,717 - tensorflow - INFO - global_step/sec: 132.641[0m
[31m2019-07-02 18:58:40,33

[35m2019-07-02 18:58:55,783 - tensorflow - INFO - loss = 0.039712176, step = 8762 (4.049 sec)[0m
[35m2019-07-02 18:58:59,427 - tensorflow - INFO - loss = 0.23454589, step = 9121 (3.644 sec)[0m
[33m2019-07-02 18:58:59,526 - tensorflow - INFO - loss = 0.104218215, step = 9138 (4.581 sec)[0m
[32m2019-07-02 18:58:59,552 - tensorflow - INFO - global_step/sec: 110.101[0m
[31m2019-07-02 18:58:59,723 - tensorflow - INFO - Saving checkpoints for 9169 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190702-184348/output/petastorm-sagemaker-2019-07-02-18-54-05-832/model/model.ckpt.[0m
[32m2019-07-02 18:59:00,622 - tensorflow - INFO - global_step/sec: 101.887[0m
[34m2019-07-02 18:59:01,727 - tensorflow - INFO - loss = 0.05827143, step = 9360 (3.779 sec)[0m
[32m2019-07-02 18:59:01,678 - tensorflow - INFO - global_step/sec: 96.6374[0m
[31m2019-07-02 18:59:02.260222: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response co

Final accuracy is again about 0.95; this time the wall time was 8 min 24 s.