In [1]:
# SageMaker constrains the urllib3 version. If a VersionConflict is raised when importing
# sagemaker, the likely cause is two urllib3 versions installed side by side (1.23 and 1.25).
# To fix it, remove urllib3 1.25.3 explicitly by running the commented-out command at the
# bottom of this cell, then restart the kernel.

import urllib3

# Fail fast and report the version actually found, instead of a bare AssertionError.
assert urllib3.__version__ == '1.23', (
    f"Expected urllib3 1.23 but found {urllib3.__version__}; "
    "remove the conflicting install (see the command below) and restart the kernel."
)

# !sudo rm -rf /opt/miniconda/envs/py3/lib/python3.6/site-packages/urllib3-1.25.3.dist-info

In [2]:
import os
import datetime

import boto3
import sagemaker
from sagemaker.tensorflow import TensorFlow
import tensorflow

from config import DATASET_URL, IMAGE_NAME, IAM_ROLE

In [3]:
# --- Training schedule --------------------------------------------------------
training_sample_count = 60000  # Standard size of training data for MNIST
batch_size = 64
num_epochs = 10

# Number of batches needed to pass over the training set `num_epochs` times.
training_step_count = int(training_sample_count * num_epochs / batch_size)

# --- Job artefacts ------------------------------------------------------------
entry_point = 'MNIST_model.py'

# The timestamp is taken when this cell runs, so each run of the cell gets its own S3 prefix.
timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
output_path   = f"s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/{timestamp}/output"
code_location = f"s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/{timestamp}/code"

# --- AWS session --------------------------------------------------------------
# Credentials come from the *_PROD environment variables; `.get` yields None when unset.
boto_session = boto3.Session(
    region_name="us-east-1",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID_PROD"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY_PROD"),
)

# --- Estimator arguments shared by the training cells below -------------------
kwargs = {
    'entry_point': entry_point,
    'image_name': IMAGE_NAME,
    'role': IAM_ROLE,
    'sagemaker_session': sagemaker.Session(boto_session=boto_session),
    'train_instance_count': 1,
    'train_instance_type': 'ml.m5.xlarge',
    'framework_version': '1.13',
    # Passed through to the entry-point script as hyperparameters.
    'hyperparameters': {
        'dataset-url': DATASET_URL,
        'training_steps': training_step_count,
        'batch_size': batch_size,
        'evaluation_steps': 10,
    },
    'py_version': 'py3',
    'output_path': output_path,
    'code_location': code_location,
    'distributions': {'parameter_server': {'enabled': True}},
}

# Training on a single sagemaker instance

In [4]:
# Build the estimator from the shared `kwargs` (train_instance_count=1) and launch the job.
# `%time` reports the notebook-side CPU and wall time of the blocking `fit` call.
# No SageMaker input channels are passed (inputs=None); the dataset location travels in the
# 'dataset-url' hyperparameter instead -- presumably read by the entry-point script (confirm there).
mnist_estimator = TensorFlow(**kwargs)
%time  mnist_estimator.fit(inputs=None)  # we're bypassing the conventional sagemaker input methods

2019-06-26 21:10:59 Starting - Starting the training job...
2019-06-26 21:11:01 Starting - Launching requested ML instances......
2019-06-26 21:12:05 Starting - Preparing the instances for training...
2019-06-26 21:12:55 Downloading - Downloading input data
2019-06-26 21:12:55 Training - Downloading the training image......
2019-06-26 21:13:50 Training - Training image download completed. Training in progress.
[31m2019-06-26 21:13:52,749 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-06-26 21:13:52,754 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-26 21:13:53,079 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-26 21:13:53,094 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-26 21:13:53,104 sagemaker-containers INFO     Invoking user script
[0m
[31mTraining Env:
[0m
[31m{
    "additional_fr

[31mInstructions for updating:[0m
[31mUse `tf.data.experimental.map_and_batch(...)`.[0m
[31m2019-06-26 21:14:03,149 - tensorflow - INFO - Calling model_fn.[0m
[31m2019-06-26 21:14:03,613 - tensorflow - INFO - Done calling model_fn.[0m
[31m2019-06-26 21:14:03,615 - tensorflow - INFO - Create CheckpointSaverHook.[0m
[31m2019-06-26 21:14:03.637421: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:14:03.637464: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:14:03.699061: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:14:03.699115: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:14:03.720134: E tensorflow/core/p

[31m2019-06-26 21:14:37,067 - tensorflow - INFO - global_step/sec: 27.5637[0m
[31m2019-06-26 21:14:37,068 - tensorflow - INFO - loss = 0.9618072, step = 801 (3.628 sec)[0m
[31m2019-06-26 21:14:40,748 - tensorflow - INFO - global_step/sec: 27.1685[0m
[31m2019-06-26 21:14:40,749 - tensorflow - INFO - loss = 0.7621321, step = 901 (3.680 sec)[0m
[31m2019-06-26 21:14:43,717 - tensorflow - INFO - Saving checkpoints for 1000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:14:44.978491: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:14:44.978530: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:14:45,757 - tensorflow - INFO - Calling model_fn.[0m
[31m2019

[31m2019-06-26 21:15:29,701 - tensorflow - INFO - global_step/sec: 38.9243[0m
[31m2019-06-26 21:15:29,703 - tensorflow - INFO - loss = 0.5168744, step = 2101 (2.570 sec)[0m
[31m2019-06-26 21:15:34,039 - tensorflow - INFO - global_step/sec: 23.0524[0m
[31m2019-06-26 21:15:34,040 - tensorflow - INFO - loss = 0.5262003, step = 2201 (4.337 sec)[0m
[31m2019-06-26 21:15:36,897 - tensorflow - INFO - global_step/sec: 34.9889[0m
[31m2019-06-26 21:15:36,898 - tensorflow - INFO - loss = 0.7034991, step = 2301 (2.858 sec)[0m
[31m2019-06-26 21:15:40,365 - tensorflow - INFO - global_step/sec: 28.8333[0m
[31m2019-06-26 21:15:40,366 - tensorflow - INFO - loss = 0.4942026, step = 2401 (3.468 sec)[0m
[31m2019-06-26 21:15:44,019 - tensorflow - INFO - Saving checkpoints for 2500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:15:45.280406: E te

[31m2019-06-26 21:16:31,377 - tensorflow - INFO - global_step/sec: 33.915[0m
[31m2019-06-26 21:16:31,379 - tensorflow - INFO - loss = 0.35189295, step = 3601 (2.949 sec)[0m
[31m2019-06-26 21:16:34,989 - tensorflow - INFO - global_step/sec: 27.688[0m
[31m2019-06-26 21:16:34,990 - tensorflow - INFO - loss = 0.11012103, step = 3701 (3.612 sec)[0m
[31m2019-06-26 21:16:38,254 - tensorflow - INFO - global_step/sec: 30.631[0m
[31m2019-06-26 21:16:38,255 - tensorflow - INFO - loss = 0.12154582, step = 3801 (3.264 sec)[0m
[31m2019-06-26 21:16:41,823 - tensorflow - INFO - global_step/sec: 28.0152[0m
[31m2019-06-26 21:16:41,824 - tensorflow - INFO - loss = 0.43901384, step = 3901 (3.570 sec)[0m
[31m2019-06-26 21:16:45,458 - tensorflow - INFO - Saving checkpoints for 4000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:16:46.836968: E t

[31m2019-06-26 21:17:30,664 - tensorflow - INFO - global_step/sec: 32.1706[0m
[31m2019-06-26 21:17:30,665 - tensorflow - INFO - loss = 0.10022905, step = 5101 (3.108 sec)[0m
[31m2019-06-26 21:17:34,348 - tensorflow - INFO - global_step/sec: 27.1462[0m
[31m2019-06-26 21:17:34,349 - tensorflow - INFO - loss = 0.0522857, step = 5201 (3.683 sec)[0m
[31m2019-06-26 21:17:37,749 - tensorflow - INFO - global_step/sec: 29.4026[0m
[31m2019-06-26 21:17:37,750 - tensorflow - INFO - loss = 0.26348737, step = 5301 (3.401 sec)[0m
[31m2019-06-26 21:17:42,219 - tensorflow - INFO - global_step/sec: 22.3721[0m
[31m2019-06-26 21:17:42,220 - tensorflow - INFO - loss = 0.18897027, step = 5401 (4.470 sec)[0m
[31m2019-06-26 21:17:45,764 - tensorflow - INFO - Saving checkpoints for 5500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:17:46.923170: E

[31m2019-06-26 21:18:31,418 - tensorflow - INFO - global_step/sec: 35.1417[0m
[31m2019-06-26 21:18:31,420 - tensorflow - INFO - loss = 0.07913907, step = 6601 (2.846 sec)[0m
[31m2019-06-26 21:18:34,990 - tensorflow - INFO - global_step/sec: 27.9943[0m
[31m2019-06-26 21:18:34,991 - tensorflow - INFO - loss = 0.1787745, step = 6701 (3.572 sec)[0m
[31m2019-06-26 21:18:37,957 - tensorflow - INFO - global_step/sec: 33.7009[0m
[31m2019-06-26 21:18:37,959 - tensorflow - INFO - loss = 0.18999982, step = 6801 (2.968 sec)[0m
[31m2019-06-26 21:18:41,650 - tensorflow - INFO - global_step/sec: 27.0811[0m
[31m2019-06-26 21:18:41,652 - tensorflow - INFO - loss = 0.23937987, step = 6901 (3.693 sec)[0m
[31m2019-06-26 21:18:45,215 - tensorflow - INFO - Saving checkpoints for 7000 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:18:47.004822: E

[31m2019-06-26 21:19:31,465 - tensorflow - INFO - global_step/sec: 31.9705[0m
[31m2019-06-26 21:19:31,467 - tensorflow - INFO - loss = 0.24131145, step = 8101 (3.129 sec)[0m
[31m2019-06-26 21:19:35,168 - tensorflow - INFO - global_step/sec: 27.0036[0m
[31m2019-06-26 21:19:35,170 - tensorflow - INFO - loss = 0.059389316, step = 8201 (3.703 sec)[0m
[31m2019-06-26 21:19:38,830 - tensorflow - INFO - global_step/sec: 27.3107[0m
[31m2019-06-26 21:19:38,831 - tensorflow - INFO - loss = 0.11306437, step = 8301 (3.661 sec)[0m
[31m2019-06-26 21:19:42,601 - tensorflow - INFO - global_step/sec: 26.5145[0m
[31m2019-06-26 21:19:42,602 - tensorflow - INFO - loss = 0.08806586, step = 8401 (3.771 sec)[0m
[31m2019-06-26 21:19:45,922 - tensorflow - INFO - Saving checkpoints for 8500 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-10-59-310/model/model.ckpt.[0m
[31m2019-06-26 21:19:47.230463:

Billable seconds: 466
CPU times: user 1.19 s, sys: 92.1 ms, total: 1.28 s
Wall time: 9min 47s


Accuracy should be around 0.95 and wall time = 9m 47s

# Training on 5 sagemaker instances

With just minimal code changes, we can run the same total number of steps, distributed over 5 instances.

In [5]:
kwargs['train_instance_count'] = 5
mnist_estimator = TensorFlow(**kwargs)
%time  mnist_estimator.fit(inputs=None)

2019-06-26 21:31:55 Starting - Starting the training job...
2019-06-26 21:31:57 Starting - Launching requested ML instances......
2019-06-26 21:32:59 Starting - Preparing the instances for training...
2019-06-26 21:33:50 Downloading - Downloading input data...
2019-06-26 21:33:57 Training - Downloading the training image...
2019-06-26 21:34:55 Training - Training image download completed. Training in progress..
[33m2019-06-26 21:34:54,676 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[33m2019-06-26 21:34:54,681 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[33m2019-06-26 21:34:54,839 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[33m2019-06-26 21:34:54,839 sagemaker_tensorflow_container.training INFO     Launching parameter server process[0m
[33m2019-06-26 21:34:54,839 sagemaker_tensorflow_container.training INFO     Running distributed 

[35m2019-06-26 21:34:56,815 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[35m2019-06-26 21:34:56,820 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2019-06-26 21:34:56,976 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[35m2019-06-26 21:34:56,976 sagemaker_tensorflow_container.training INFO     Launching parameter server process[0m
[35m2019-06-26 21:34:56,976 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[35m2019-06-26 21:34:57,023 sagemaker_tensorflow_container.training INFO     Launching worker process[0m
[35m2019-06-26 21:34:57,123 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2019-06-26 21:34:57,138 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2019-06-26 21:34:57,149 sagemaker-container

[31m2019-06-26 21:35:12.074966: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:35:12.075008: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:35:12,630 - tensorflow - INFO - loss = 2.300427, step = 98[0m
[32m2019-06-26 21:35:13,546 - tensorflow - INFO - loss = 2.0723224, step = 192 (3.203 sec)[0m
[32m2019-06-26 21:35:14,538 - tensorflow - INFO - global_step/sec: 70.5494[0m
[31m2019-06-26 21:35:16,125 - tensorflow - INFO - loss = 2.0523276, step = 331 (3.495 sec)[0m
[32m2019-06-26 21:35:16,287 - tensorflow - INFO - global_step/sec: 64.0203[0m
[32m2019-06-26 21:35:17,722 - tensorflow - INFO - global_step/sec: 75.9386[0m
[31m2019-06-26 21:35:18,190 - tensorflow - INFO - Saving checkpoints for 502 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-2

[32m2019-06-26 21:35:33,833 - tensorflow - INFO - global_step/sec: 107.587[0m
[31m2019-06-26 21:35:34,022 - tensorflow - INFO - Saving checkpoints for 2015 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-31-55-362/model/model.ckpt.[0m
[32m2019-06-26 21:35:35,067 - tensorflow - INFO - global_step/sec: 102.079[0m
[35m2019-06-26 21:35:36,016 - tensorflow - INFO - loss = 2.0619946, step = 2198 (3.808 sec)[0m
[34m2019-06-26 21:35:36,326 - tensorflow - INFO - loss = 1.9372189, step = 2231 (4.311 sec)[0m
[32m2019-06-26 21:35:36,220 - tensorflow - INFO - global_step/sec: 87.655[0m
[32m2019-06-26 21:35:36,680 - tensorflow - INFO - loss = 1.9413054, step = 2265 (4.763 sec)[0m
[33m2019-06-26 21:35:36,864 - tensorflow - INFO - loss = 2.0099142, step = 2284 (4.387 sec)[0m
[31m2019-06-26 21:35:36.792769: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404

[33m2019-06-26 21:35:52,987 - tensorflow - INFO - loss = 0.72511417, step = 4034 (3.826 sec)[0m
[32m2019-06-26 21:35:53,526 - tensorflow - INFO - global_step/sec: 116.211[0m
[32m2019-06-26 21:35:54,077 - tensorflow - INFO - loss = 0.90772045, step = 4158 (3.918 sec)[0m
[32m2019-06-26 21:35:54,769 - tensorflow - INFO - global_step/sec: 98.0899[0m
[32m2019-06-26 21:35:55,885 - tensorflow - INFO - global_step/sec: 90.5361[0m
[31m2019-06-26 21:35:55.642318: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:35:55.642357: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:35:56,100 - tensorflow - INFO - Calling model_fn.[0m
[31m2019-06-26 21:35:56,391 - tensorflow - INFO - Done calling model_fn.[0m
[31m2019-06-26 21:35:56,420 - tensorflow - INFO - Starting evaluation at 2019-06-26-21:35:56[0m
[32

[32m2019-06-26 21:36:13,881 - tensorflow - INFO - global_step/sec: 123.399[0m
[34m2019-06-26 21:36:14,779 - tensorflow - INFO - loss = 0.21234241, step = 6239 (4.371 sec)[0m
[32m2019-06-26 21:36:15,210 - tensorflow - INFO - global_step/sec: 80.5341[0m
[32m2019-06-26 21:36:16,178 - tensorflow - INFO - global_step/sec: 110.521[0m
[31m2019-06-26 21:36:15.772889: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:36:15.772938: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew. Attempting to adjust the signer.[0m
[31m2019-06-26 21:36:16,029 - tensorflow - INFO - Skip the current checkpoint eval due to throttle secs (10 secs).[0m
[35m2019-06-26 21:36:16,286 - tensorflow - INFO - loss = 0.11375193, step = 6389 (3.959 sec)[0m
[32m2019-06-26 21:36:17,129 - tensorflow - INFO - loss = 0.11963223, step = 6494 (5.048 sec)[0m
[32m2019-06-26 21:36:17,131 - tens

[33m2019-06-26 21:36:35,082 - tensorflow - INFO - loss = 0.35664505, step = 8334 (3.995 sec)[0m
[31m2019-06-26 21:36:35,769 - tensorflow - INFO - loss = 0.37629604, step = 8427 (12.709 sec)[0m
[34m2019-06-26 21:36:35,922 - tensorflow - INFO - loss = 0.11274399, step = 8452 (4.509 sec)[0m
[31m2019-06-26 21:36:37,358 - tensorflow - INFO - Saving checkpoints for 8629 into s3://com.climate.production.analytics/dsw/scratch/sagemaker/models/peta_sage_hack/20190626-211056/output/petastorm-sagemaker-2019-06-26-21-31-55-362/model/model.ckpt.[0m
[32m2019-06-26 21:36:38,903 - tensorflow - INFO - global_step/sec: 86.2717[0m
[33m2019-06-26 21:36:39,074 - tensorflow - INFO - loss = 0.1515798, step = 8770 (3.992 sec)[0m
[31m2019-06-26 21:36:39.459756: E tensorflow/core/platform/s3/aws_logging.cc:60] No response body. Response code: 404[0m
[31m2019-06-26 21:36:39.459801: W tensorflow/core/platform/s3/aws_logging.cc:57] If the signature check failed. This could be because of a time skew.

[32m2019-06-26 21:37:30,886 sagemaker_tensorflow_container.training INFO     master algo-1 is down, stopping parameter server[0m
[32mFor details of how to construct your training script see:[0m
[32mhttps://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/tensorflow#adapting-your-local-tensorflow-script[0m
[32m2019-06-26 21:37:30,886 sagemaker-containers INFO     Reporting training SUCCESS[0m
[33m2019-06-26 21:38:02,427 sagemaker_tensorflow_container.training INFO     master algo-1 is down, stopping parameter server[0m
[33mFor details of how to construct your training script see:[0m
[33mhttps://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/tensorflow#adapting-your-local-tensorflow-script[0m
[33m2019-06-26 21:38:02,428 sagemaker-containers INFO     Reporting training SUCCESS[0m
[35m2019-06-26 21:38:03,110 sagemaker_tensorflow_container.training INFO     master algo-1 is down, stopping parameter server[0m
[35mFor details of how to construct 

Final accuracy is again about 0.95; this time the wall time was 6m 50s.