In [1]:
# Notebook Instance Imports
import os
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
import time
import io
import json
import pandas as pd

profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile()
)

# Identify Data Location for Docker Local Test

In [34]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/train_val_test/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

inputs = {"data":training_files}
# hyperparameters = {"wandb_key":"abfa0dec9fc06fbfa6392496f40a22a8d47e58cf",
#                    "epochs":"10",
#                    "s3_chkpt_dir":"ckpt",
#                    "starting_weights",
#                    "starting_epoch",
#                    "batch_size":"20",
#                    "learning_rate",
#                    "bands_all",
#                    "band_list"
#                    "flip_left_right",
#                    "flip_up_down",
#                    "rot90",
#                    "enable_shuffle",
#                   "patience"}

hyperparameters = {
    "wandb_key": "ded96d05c0cfafc1f209276af6c21cb7ac61e5de",
    "epochs": "10",
    "s3_chkpt_dir": "ckpt",
    "augment": "False",
    "flip_left_right": "False",
    "flip_up_down": "False",
    "rot90": "True"
    #"bands": "2,3,4,8,12",
    #"starting_checkpoint": "ckpt/tf-custom-container-test-2021-03-15-18-18-39-877/last_chkpt.h5"
}

print(inputs)
print(hyperparameters)

{'data': 's3://canopy-production-ml/training_inputs/train_val_test/'}
{'wandb_key': 'ded96d05c0cfafc1f209276af6c21cb7ac61e5de', 'epochs': '10', 's3_chkpt_dir': 'ckpt', 'augment': 'False', 'flip_left_right': 'False', 'flip_up_down': 'False', 'rot90': 'True'}


# Custom Docker for Training

In [3]:
!pwd

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging


In [4]:
%cd docker_test_folder

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [5]:
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [39]:
! docker build -t tf-custom-container-test .

Sending build context to Docker daemon  153.1kB
Step 1/6 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04
 ---> 993791d9475c
Step 2/6 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 24f6ef66670c
Step 3/6 : RUN pip3 install rasterio wandb tensorflow-addons
 ---> Using cache
 ---> 2e0e92ca4149
Step 4/6 : COPY cb_feature_train3_aws.py /opt/ml/code/train.py
 ---> 0c802efcf55c
Step 5/6 : COPY data_loader.py /opt/ml/code/data_loader.py
 ---> 70ec557a634b
Step 6/6 : ENV SAGEMAKER_PROGRAM train.py
 ---> Running in 1f6466b69e5f
Removing intermediate container 1f6466b69e5f
 ---> b65aacad47d5
Successfully built b65aacad47d5
Successfully tagged tf-custom-container-test:latest


# For Local Container test - Success

In [40]:
from sagemaker.estimator import Estimator

estimator = Estimator(image_uri='tf-custom-container-test',
                      role='arn:aws:iam::963659202518:role/service-role/AmazonSageMaker-ExecutionRole-20210306T191865',
                      instance_count=1,
                      instance_type='local',
                     hyperparameters=hyperparameters)


estimator.fit(inputs)

Creating ikx8w1hx57-algo-1-39mtm ... 
Creating ikx8w1hx57-algo-1-39mtm ... done
Attaching to ikx8w1hx57-algo-1-39mtm
[36mikx8w1hx57-algo-1-39mtm |[0m 2021-03-17 22:57:06.526650: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mikx8w1hx57-algo-1-39mtm |[0m 2021-03-17 22:57:06.526867: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36mikx8w1hx57-algo-1-39mtm |[0m 2021-03-17 22:57:06.531551: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[36mikx8w1hx57-algo-1-39mtm |[0m 2021-03-17 22:57:06.569714: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mikx8w1hx57-algo-1-39mtm |[0m 2021-03-17 22:57:08,545 sagemaker-training-toolkit INFO     Imported framework sagemaker_

KeyboardInterrupt: 

# Publish Container to ECR

In [8]:
%%sh

# Specify an algorithm name
algorithm_name=pc-tf-custom-container-test

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-east-1}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly

$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon    151kB
Step 1/6 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04
 ---> 993791d9475c
Step 2/6 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 24f6ef66670c
Step 3/6 : RUN pip3 install rasterio wandb tensorflow-addons
 ---> Using cache
 ---> 2e0e92ca4149
Step 4/6 : COPY cb_feature_train3_aws.py /opt/ml/code/train.py
 ---> Using cache
 ---> cf475618a1f9
Step 5/6 : COPY data_loader.py /opt/ml/code/data_loader.py
 ---> Using cache
 ---> e3d24cf93a48
Step 6/6 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> 73562464146b
Successfully built 73562464146b
Successfully tagged pc-tf-custom-container-test:latest
The push refers to repository [963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test]
22e88d6da3e3: Preparing
cf11aff593d0: Preparing
800c99fed61a: Preparing
0c867ea799c5: Preparing
3132871073ea: Preparing
f1b83ac14212: Preparing
6466a3f31741: 

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [9]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_repository = 'pc-tf-custom-container-test'
tag = ':latest'

region = boto3.session.Session().region_name

uri_suffix = 'amazonaws.com'
if region in ['cn-north-1', 'cn-northwest-1']:
    uri_suffix = 'amazonaws.com.cn'

image_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region, uri_suffix, ecr_repository + tag)

image_uri
# This should return something like
# 111122223333.dkr.ecr.us-east-2.amazonaws.com/sagemaker-byoc-test:latest

'963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test:latest'

# For ECR Run

## Identify Data Location

In [10]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/train_val_full/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

inputs = {"data":training_files}
# hyperparameters = {"wandb_key":"abfa0dec9fc06fbfa6392496f40a22a8d47e58cf",
#                    "epochs":"20",
#                    "s3_chkpt_dir":"ckpt",
#                    "batch_size":"20",}

hyperparameters = {
    "wandb_key": "ded96d05c0cfafc1f209276af6c21cb7ac61e5de",
    "epochs": "2",
    "s3_chkpt_dir": "ckpt",
    "batch_size": "100",
    #"last_checkpoint": "ckpt/pc-tf-custom-container-test-job-2021-03-12-23-55-45-390/model_resnet_epoch_3.h5"
}

print(inputs)
print(hyperparameters)

{'data': 's3://canopy-production-ml/training_inputs/train_val_full/'}
{'wandb_key': 'ded96d05c0cfafc1f209276af6c21cb7ac61e5de', 'epochs': '2', 's3_chkpt_dir': 'ckpt', 'batch_size': '100'}


In [11]:
profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile(start_unix_time=int(time.time()), duration=600)
)

In [14]:
job_name = 'pc-tf-custom-container-test-job-no-augment'

# create estimator
estimator = Estimator(image_uri=image_uri,
                       instance_type='ml.p3.16xlarge',
                       output_path='s3://canopy-production-ml-output',
                       base_job_name=job_name,
                       instance_count=1,
                       role=get_execution_role(), # Passes to the container the AWS role that you are using on this notebook
                       py_version='py37',
                     profiler_config=profiler_config,
                     checkpoint_s3_uri=
                      f's3://canopy-production-ml-output/ckpt/{job_name}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}',
                     hyperparameters=hyperparameters,
                      max_wait=60*60*24*3,
                      max_run=60*60*24*3,
                      use_spot_instances=True)



In [15]:
estimator.fit(inputs)

2021-03-17 20:41:04 Starting - Starting the training job...
2021-03-17 20:41:11 Starting - Launching requested ML instancesProfilerReport-1616013664: InProgress
............
2021-03-17 20:43:36 Starting - Preparing the instances for training......
2021-03-17 20:44:36 Downloading - Downloading input data...
2021-03-17 20:44:56 Training - Downloading the training image.......................[34m2021-03-17 20:48:41.311304: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-03-17 20:48:41.317805: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2021-03-17 20:48:41.318571: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:121] SageMaker Profiler Timeline Writer read the following config parameters :[0m
[34m2021-03-17 20:48:41.318589: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:122] Base Folder : /opt/ml/output/profiler

KeyboardInterrupt: 

# Profiler Viewing

In [63]:
estimator.output_path

's3://canopy-production-ml-output'

In [18]:
estimator.latest_training_job.job_name

'pc-tf-custom-container-test-job-2021-03-12-21-36-45-193'

In [None]:
"job_name":"pc-tf-custom-container-test-job-2021-03-12-21-36-45-193"

In [65]:
rule_output_path = estimator.output_path + "/"+ estimator.latest_training_job.job_name + "/rule-output"

In [66]:
rule_output_path

's3://canopy-production-ml-output/pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output'

In [67]:
! aws s3 ls {rule_output_path} --recursive

2021-03-09 06:45:11     350803 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-report.html
2021-03-09 06:45:11     202858 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-report.ipynb
2021-03-09 06:45:07        192 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/BatchSize.json
2021-03-09 06:45:07      53300 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/CPUBottleneck.json
2021-03-09 06:45:07        126 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/Dataloader.json
2021-03-09 06:45:07        130 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-r

In [24]:
def read_s3_obj(s3_key):
    s3 = boto3.resource('s3')
    obj = s3.Object('canopy-production-ml-output', s3_key)
    obj_bytes = io.BytesIO(obj.get()['Body'].read())
    return obj_bytes

In [33]:
key = 'pc-tf-custom-container-test-job-2021-03-07-19-37-23-579/rule-output/ProfilerReport-1615145843/profiler-output/profiler-reports/Dataloader.json'
data = json.load(read_s3_obj(key))

In [34]:
data

{'RuleTriggered': 0,
 'Violations': 0,
 'Details': {},
 'Datapoints': 0,
 'RuleParameters': 'min_threshold:70\nmax_threshold:200'}

# Search for H5 Files

In [27]:
import boto3
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('canopy-production-ml-output')
files = my_bucket.objects.all()
file_list = []
for file in files:
    if file.key.endswith('.h5'):
         file_list.append(file.key)

In [28]:
file_list

[]

In [29]:
!pwd

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [37]:
!ls

cb_feature_Launch_Training_Job2.ipynb  docker_test_folder   labels_test_v1.csv
cb_feature_Launch_Training_Job3.ipynb  entry_point_test.py  test_script.py
cb_feature_train1_aws.py	       labels.json	    val_labels.csv


In [30]:
df = pd.read_csv("/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/val_labels.csv")

In [33]:
from io import StringIO # python3; python2: BytesIO 
import boto3

bucket = 'canopy-production-ml-output' # already created on S3
csv_buffer = StringIO()
csv_buffer = df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'ckpt/test.csv').put(Body=csv_buffer.getvalue())

AttributeError: 'NoneType' object has no attribute 'getvalue'

In [11]:
help(sagemaker.session)

Help on module sagemaker.session in sagemaker:

NAME
    sagemaker.session - Placeholder docstring

CLASSES
    builtins.object
        LogState
        Session
    
    class LogState(builtins.object)
     |  Placeholder docstring
     |  
     |  Data descriptors defined here:
     |  
     |  __dict__
     |      dictionary for instance variables (if defined)
     |  
     |  __weakref__
     |      list of weak references to the object (if defined)
     |  
     |  ----------------------------------------------------------------------
     |  Data and other attributes defined here:
     |  
     |  COMPLETE = 5
     |  
     |  JOB_COMPLETE = 4
     |  
     |  STARTING = 1
     |  
     |  TAILING = 3
     |  
     |  WAIT_IN_PROGRESS = 2
    
    class Session(builtins.object)
     |  Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
     |  
     |  This class provides convenient methods for manipulating entities and resources that Amazon
    

## Sandbox

In [21]:
!pip3 install rasterio

Collecting rasterio
  Downloading rasterio-1.2.1-cp36-cp36m-manylinux1_x86_64.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 12.2 MB/s eta 0:00:01     |████████████████████████▋       | 14.7 MB 12.2 MB/s eta 0:00:01
Collecting cligj>=0.5
  Downloading cligj-0.7.1-py3-none-any.whl (7.1 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Collecting affine
  Downloading affine-2.3.0-py2.py3-none-any.whl (15 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: snuggs, cligj, click-plugins, affine, rasterio
Successfully installed affine-2.3.0 click-plugins-1.1.1 cligj-0.7.1 rasterio-1.2.1 snuggs-1.4.7


In [22]:
import rasterio


rasterio_env = rasterio.Env(
    #session=aws_session,
    GDAL_DISABLE_READDIR_ON_OPEN='NO',
    CPL_VSIL_CURL_USE_HEAD='NO',
    GDAL_GEOREF_SOURCES='INTERNAL',
    GDAL_TIFF_INTERNAL_MASK='NO'
)

In [26]:
import numpy as np
import tensorflow as tf


with rasterio_env as env:
    path_to_s3_img = 's3://canopy-production-ml/chips/cloudfree-merge-polygons/split/train/15/15_1400_2800.tif'
    with rasterio.open(path_to_s3_img, mode='r', sharing=False, GEOREF_SOURCES='INTERNAL') as src:
        train_img = src.read()
    # Normalize image
train_img = tf.image.convert_image_dtype(train_img, tf.float32)

In [28]:
train_img.shape

TensorShape([18, 100, 100])

In [29]:
with rasterio_env as env:
    path_to_s3_img = '/vsis3/canopy-production-ml/chips/cloudfree-merge-polygons/split/train/15/15_1400_2800.tif'
    with rasterio.open(path_to_s3_img, mode='r', sharing=False, GEOREF_SOURCES='INTERNAL') as src:
        train_img = src.read()
    # Normalize image
train_img = tf.image.convert_image_dtype(train_img, tf.float32)

In [30]:
train_img.shape

TensorShape([18, 100, 100])

In [31]:
help(rasterio.Env)

Help on class Env in module rasterio.env:

class Env(builtins.object)
 |  Abstraction for GDAL and AWS configuration
 |  
 |  The GDAL library is stateful: it has a registry of format drivers,
 |  an error stack, and dozens of configuration options.
 |  
 |  Rasterio's approach to working with GDAL is to wrap all the state
 |  up using a Python context manager (see PEP 343,
 |  https://www.python.org/dev/peps/pep-0343/). When the context is
 |  entered GDAL drivers are registered, error handlers are
 |  configured, and configuration options are set. When the context
 |  is exited, drivers are removed from the registry and other
 |  configurations are removed.
 |  
 |  Example
 |  -------
 |  .. code-block:: python
 |  
 |      with rasterio.Env(GDAL_CACHEMAX=128000000) as env:
 |          # All drivers are registered, GDAL's raster block cache
 |          # size is set to 128 MB.
 |          # Commence processing...
 |          ...
 |          # End of processing.
 |  
 |      # At thi