# ---------------PROJECT EAGLE-EYE-----------------

# Training With SSD

# Training Using TensorFlow Object Detection API On SageMaker

## 1. Setup environment

In [1]:
import os
import sagemaker
from sagemaker.estimator import Framework, Estimator

# Execution role reported by the SageMaker SDK for this notebook; printed for reference.
role = sagemaker.get_execution_role()
print(role)

# Training data channels: channel name -> S3 prefix holding the TFRecord files.
inputs = dict(
    train='s3://eagle-eye-dataset/OD_using_TFOD_API/experiment3/tfrecords/',
)

# S3 prefix under which TensorBoard event files are stored.
tensorboard_s3_prefix = (
    's3://eagle-eye-dataset/OD_using_TFOD_API/experiment3/tensorboard'
)

arn:aws:iam::743025358310:role/service-role/AmazonSageMaker-ExecutionRole-20210528T211254


## 2. Build and push container

In [2]:
%%bash
# Clone the TensorFlow models repository into docker/models.
# Skip the clone when a checkout is already present so the cell can be re-run
# (git refuses to clone into an existing, non-empty directory).
if [ ! -d docker/models/.git ]; then
    git clone https://github.com/tensorflow/models.git docker/models
fi

# getting model_main and exporter_main files from TF2 Object Detection GitHub repository
# The trailing slash makes cp fail loudly if source_dir is missing, instead of
# creating a regular file called "source_dir".
cp docker/models/research/object_detection/exporter_main_v2.py source_dir/
cp docker/models/research/object_detection/model_main_tf2.py source_dir/

Cloning into 'docker/models'...


In [4]:
# Name passed to the build-and-push script for the training image.
image_name = "tf2-object-detection"

In [5]:
# Run the project's build-and-push script, passing the image name as its only argument.
# NOTE(review): the recorded output below shows
#   "./docker/build_and_push.sh: line 19: Login: command not found"
# which suggests the script executes the text printed by its registry login step
# as a command — verify line 19 of build_and_push.sh.
!sh ./docker/build_and_push.sh $image_name

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

./docker/build_and_push.sh: line 19: Login: command not found
Building image with name tf2-object-detection
Sending build context to Docker daemon  661.6MB
Step 1/12 : FROM tensorflow/tensorflow:2.5.0-gpu
2.5.0-gpu: Pulling from tensorflow/tensorflow

[1Ba5e7af40: Pulling fs layer 
[1B39a868b3: Pulling fs layer 
[1Bb10cca85: Pulling fs layer 
[1Bdd082e0f: Pulling fs layer 
[1Bc5fa5e16: Pulling fs layer 
[1B367c7b71: Pulling fs layer 
[1B536b8e20: Pulling fs layer 
[1Bdd54cd82: Pulling fs layer 
[1B800e18af: Pulling fs layer 
[1B68b08545: Pulling fs layer 
[1Bb9465126: Pulling fs layer 
[1B115fe9db: Pulling fs layer 
[1Bc77ca4f1: Pulling fs layer 
[1B5acf85d5: Pulling fs layer 
[1Bcc2a81ed: Pulling fs layer 
[1B6769a9b0: Pull complete 089kB/1.089kBB[16A[2K[16A[2K[16A[2K[16A[2K[12A[2K[16A[2K[16A[2K[16A[2K[10A[2K[9A[2K[8A[2K[9A[2K[8A[2K[9A[2K[7A[2K[8A[2K[8A[2K[

In [6]:
import os
# Load the full image URI from docker/ecr_image_fullname.txt
# (presumably written by build_and_push.sh — confirm against the script).
# Only the first line is used. strip() removes the trailing newline (LF or CRLF)
# without cutting off a real character when the file has no final newline,
# which the previous `[:-1]` slice would have done.
with open(os.path.join('docker', 'ecr_image_fullname.txt'), 'r') as f:
    container = f.readline().strip()

# Fail here with a clear message rather than later with an empty image URI.
if not container:
    raise ValueError(
        "docker/ecr_image_fullname.txt is empty; re-run the build-and-push cell"
    )

print(container)

743025358310.dkr.ecr.us-west-2.amazonaws.com/tf2-object-detection:20210625104612


## 3. Getting pre-trained model from model zoo

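Download the pre-trained base model (Faster R-CNN ResNet-50 v1, 640x640, COCO17) from the TF2 model zoo and extract its checkpoint files into `source_dir/checkpoint`.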

In [None]:
%%bash
# Stop at the first failing command so a failed download is reported directly
# instead of surfacing later as a tar error.
set -euo pipefail

# -p makes the cell re-runnable: no error if the directories already exist.
mkdir -p /tmp/checkpoint
mkdir -p source_dir/checkpoint

# Download the pre-trained model archive.
wget -O /tmp/faster_rcnn.tar.gz http://download.tensorflow.org/models/object_detection/tf2/20200711/faster_rcnn_resnet50_v1_640x640_coco17_tpu-8.tar.gz

# Extract only the archive's checkpoint/ directory; --strip-components 2 drops the
# "<model name>/checkpoint/" prefix so the files land directly in source_dir/checkpoint.
tar -zxvf /tmp/faster_rcnn.tar.gz --strip-components 2 --directory source_dir/checkpoint faster_rcnn_resnet50_v1_640x640_coco17_tpu-8/checkpoint

## 4. Create SageMaker Custom Framework and Launch Training job

Here we define a custom framework estimator using the Amazon SageMaker Python SDK and use that class to launch the training job in the container built above; the estimator takes care of managing the training job for us.

In [3]:
class CustomFramework(Framework):
    """Bare-bones ``Framework`` subclass used to run training in a custom image.

    Construction is delegated to ``Framework.__init__``; the distribution and
    model-creation hooks are stubbed out and always return ``None``.
    """

    def __init__(self, entry_point, source_dir=None, hyperparameters=None,
                 distributions=None, **kwargs):
        """Forward the estimator settings to the base class.

        Args:
            entry_point: Passed positionally to ``Framework.__init__``.
            source_dir: Passed positionally to ``Framework.__init__``.
            hyperparameters: Passed positionally to ``Framework.__init__``.
            distributions: Accepted but not used by this class.
            **kwargs: Forwarded unchanged to ``Framework.__init__``.
        """
        super().__init__(entry_point, source_dir, hyperparameters, **kwargs)

    def _configure_distribution(self, distributions):
        """Do nothing; the ``distributions`` argument is ignored."""
        return None

    def create_model(self, model_server_workers=None, role=None,
                     vpc_config_override=None, entry_point=None,
                     source_dir=None, dependencies=None, image_uri=None,
                     **kwargs):
        """Return ``None``; this estimator does not build a deployable model.

        All arguments are accepted and ignored.
        NOTE(review): presumably overridden only because the base class
        requires an implementation — confirm against the SDK version in use.
        """
        return None

In [4]:
from sagemaker.debugger import TensorBoardOutputConfig

# Hyperparameters handed to the estimator for the training job.
hyperparameters = dict(
    model_dir="/opt/training",
    pipeline_config_path="pipeline.config",
    num_train_steps=1000,
    sample_1_of_n_eval_examples=1,
)

# TensorBoard output: the container-local path is the same directory as
# model_dir above, and its contents go to the S3 prefix from the setup cell.
tensorboard_output_config = TensorBoardOutputConfig(
    container_local_output_path='/opt/training/',
    s3_output_path=tensorboard_s3_prefix,
)

# Estimator for a single ml.p3.2xlarge instance running the custom image,
# with run_training.sh from source_dir/ as the entry point.
estimator = CustomFramework(
    entry_point='run_training.sh',
    source_dir='source_dir/',
    image_uri=container,
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    tensorboard_output_config=tensorboard_output_config,
    disable_profiler=True,
    base_job_name='tf2-object-detection',
)

In [None]:
# Start the training job, passing the S3 channel mapping ({'train': ...}) defined in the setup cell.
estimator.fit(inputs)

### TensorBoard Events Path

In [7]:
# Ask the estimator for the TensorBoard artifacts path of its latest training job.
job_artifacts_path = estimator.latest_job_tensorboard_artifacts_path()
# Bare expression so the notebook displays the path as the cell output.
job_artifacts_path

's3://eagle-eye-dataset/OD_using_TFOD_API/tensorboard/ssd/tf2-object-detection-2021-06-18-02-41-00-395/tensorboard-output'