In [12]:
# Notebook Instance Imports
import os
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator

# Identify Data Location

In [13]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

inputs = {"data":training_files}

print(inputs)

{'data': 's3://canopy-production-ml/training_inputs/'}


# Custom Docker for Training

In [3]:
%cd docker_test_folder

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [4]:
!pwd

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [5]:
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [10]:
! docker build -t tf-custom-container-test .

Sending build context to Docker daemon  35.33kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-cpu-py37-ubuntu18.04
 ---> a6b6eae32037
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 8294093ffa21
Step 3/5 : RUN pip3 install rasterio keras
 ---> Using cache
 ---> db9d1558d609
Step 4/5 : COPY cb_feature_train1_aws.py /opt/ml/code/train.py
 ---> 4f8682a4daf2
Step 5/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Running in 0f7eb0b242f9
Removing intermediate container 0f7eb0b242f9
 ---> beb1b3a5cfec
Successfully built beb1b3a5cfec
Successfully tagged tf-custom-container-test:latest


# For Local Container test - Success

In [11]:
from sagemaker.estimator import Estimator

estimator = Estimator(image_uri='tf-custom-container-test',
                      role='arn:aws:iam::963659202518:role/service-role/AmazonSageMaker-ExecutionRole-20210306T191865',
                      instance_count=1,
                      instance_type='local')

estimator.fit(inputs)

Creating qb0fdmh9hb-algo-1-hleiy ... 
Creating qb0fdmh9hb-algo-1-hleiy ... done
Attaching to qb0fdmh9hb-algo-1-hleiy
[36mqb0fdmh9hb-algo-1-hleiy |[0m 2021-03-07 16:53:43.886754: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mqb0fdmh9hb-algo-1-hleiy |[0m 2021-03-07 16:53:43.886942: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36mqb0fdmh9hb-algo-1-hleiy |[0m 2021-03-07 16:53:43.918195: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mqb0fdmh9hb-algo-1-hleiy |[0m 2021-03-07 16:53:45,598 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36mqb0fdmh9hb-algo-1-hleiy |[0m 2021-03-07 16:53:45,605 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mqb0f

KeyboardInterrupt: 

# Publish Container to ECR

In [17]:
%%sh

# Specify an algorithm name
algorithm_name=pc-tf-custom-container-test

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly

$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon  35.33kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-cpu-py37-ubuntu18.04
 ---> a6b6eae32037
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 8294093ffa21
Step 3/5 : RUN pip3 install rasterio keras
 ---> Using cache
 ---> db9d1558d609
Step 4/5 : COPY cb_feature_train1_aws.py /opt/ml/code/train.py
 ---> Using cache
 ---> 4f8682a4daf2
Step 5/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> beb1b3a5cfec
Successfully built beb1b3a5cfec
Successfully tagged pc-tf-custom-container-test:latest
The push refers to repository [963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test]
a224793cc443: Preparing
ad96c940e8e7: Preparing
0859a4046b5c: Preparing
13a6259d0a5f: Preparing
7c9b17058a17: Preparing
e64228f78c01: Preparing
a4b459577f83: Preparing
2c6530437d13: Preparing
24a74e1f08ab: Preparing
1dc8a537c9f8: Preparing
805fb593f0a3: Preparing
aa1d9c35ff1e: 

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [12]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_repository = 'sagemaker-byoc-test'
tag = ':latest'

region = boto3.session.Session().region_name

uri_suffix = 'amazonaws.com'
if region in ['cn-north-1', 'cn-northwest-1']:
    uri_suffix = 'amazonaws.com.cn'

byoc_image_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region, uri_suffix, ecr_repository + tag)

byoc_image_uri
# This should return something like
# 111122223333.dkr.ecr.us-east-2.amazonaws.com/sagemaker-byoc-test:latest

'963659202518.dkr.ecr.us-east-1.amazonaws.com/sagemaker-byoc-test:latest'

# For ECR Run

## Identify Data Location

In [14]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

inputs = {"data":training_files}

print(inputs)

{'data': 's3://canopy-production-ml/training_inputs/'}


In [15]:
image_uri = '963659202518.dkr.ecr.us-east-1.amazonaws.com/sagemaker-byoc-test:latest'

# create estimator
estimator = Estimator(image_uri=image_uri,
                       instance_type='ml.p2.xlarge',
                       output_path='s3://canopy-production-ml-output',
                       base_job_name='pc-tf-custom-container-test-job',
                       instance_count=1,
                       role=get_execution_role(), # Passes to the container the AWS role that you are using on this notebook
                       py_version='py37')

In [16]:
estimator.fit(inputs)

ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: Cannot find repository: sagemaker-byoc-test in registry ID: 963659202518 Please check if your ECR repository exists and role arn:aws:iam::963659202518:role/service-role/AmazonSageMaker-ExecutionRole-20210306T191865 has proper pull permissions for SageMaker: ecr:BatchCheckLayerAvailability, ecr:BatchGetImage, ecr:GetDownloadUrlForLayer