In [1]:
import sagemaker
import os

In [2]:
import pathlib as pl

In [3]:
import datetime

In [4]:
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import TensorBoardOutputConfig

In [5]:
# Run configuration. These values are forwarded to the training job as the
# "model", "epochs" and "steps-per-epoch" hyperparameters; MODEL is also used
# as a suffix of the TensorBoard log directory name.
MODEL: str = "MobileNetV2"
# NOTE(review): 3 epochs x 1 step per epoch looks like a smoke-test setting —
# confirm before launching a real training run.
EPOCHS: int = 3
STEPS_PER_EPOCH: int = 1

In [6]:
# IAM role used when creating the estimator, and the SageMaker session used
# for S3 uploads and default-bucket lookups. The two calls are independent of
# each other.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

In [7]:
# Resolve the S3 location of the dataset.
#
# Set `upload_data` to True to (re-)upload the local dataset directory; leave
# it False to reuse the copy that is already in the session's default bucket.
upload_data = False

# Key prefix shared by both branches so they always resolve to the same S3
# location ("data" is also the default prefix of `Session.upload_data`).
data_key_prefix = "data"

if upload_data:
    data_input_path = sess.upload_data('data/severstal-binary', key_prefix=data_key_prefix)
else:
    # Derive the bucket from the session instead of hard-coding an
    # account/region-specific bucket name: keeps the notebook portable and
    # keeps the AWS account ID out of the source.
    data_input_path = f"s3://{sess.default_bucket()}/{data_key_prefix}"
print(data_input_path)

s3://sagemaker-us-east-2-475496805360/data


In [8]:
# Per-channel S3 prefixes underneath the dataset root.
training_input_path = f"{data_input_path}/train"
validation_input_path = f"{data_input_path}/validate"

In [9]:
# Timestamp (local time of the notebook kernel) that makes each run's
# TensorBoard log directory unique.
time_now = f"{datetime.datetime.now():%Y-%m-%d_%Hh%Mm%Ss}"

# s3://<default bucket>/tensorboard_logs_v2/fit/<timestamp>_<model>
tensorboard_logs_dir = "/".join(
    [
        f"s3://{sess.default_bucket()}",
        "tensorboard_logs_v2",
        "fit",
        f"{time_now}_{MODEL}",
    ]
)

In [10]:
# TensorBoard output configuration pointing at this run's S3 log directory.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=tensorboard_logs_dir,
)

In [11]:
# Hyperparameters handed to the estimator for the entry-point script.
# NOTE(review): "log-dir" is the same S3 path that is also given to
# TensorBoardOutputConfig — confirm that both mechanisms are intended.
training_hyperparameters = {
    'epochs': EPOCHS,
    "steps-per-epoch": STEPS_PER_EPOCH,
    "log-dir": tensorboard_logs_dir,
    "model": MODEL,
}

# TensorFlow estimator for a single-instance training job.
tf_estimator = TensorFlow(
    # What to run
    entry_point='fine_tune_eff_net.py',
    hyperparameters=training_hyperparameters,
    # Where to run it
    role=role,
    instance_count=1,
    # NOTE(review): the original comment mentioned a 'local' alternative here —
    # presumably SageMaker local mode; confirm.
    instance_type='ml.g4dn.xlarge',
    # Framework / container selection
    framework_version='2.3',
    py_version='py37',
    script_mode=True,
    # TensorBoard log export
    tensorboard_output_config=tensorboard_output_config,
)

In [12]:
# Channel name -> S3 prefix. The training log shows these channels mounted in
# the container under /opt/ml/input/data/<channel name>.
input_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}

# Launch the training job with both channels attached.
tf_estimator.fit(input_channels)

2020-12-07 09:41:12 Starting - Starting the training job...
2020-12-07 09:41:14 Starting - Launching requested ML instances......
2020-12-07 09:42:16 Starting - Preparing the instances for training...
2020-12-07 09:43:11 Downloading - Downloading input data...
2020-12-07 09:43:36 Training - Downloading the training image......
2020-12-07 09:44:42 Training - Training image download completed. Training in progress..[34m2020-12-07 09:44:43.754161: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.2[0m
[34m2020-12-07 09:44:46,318 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-12-07 09:44:46,818 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "training": "/opt/ml/input/data/training",
        "validation": "/opt/ml/input/data/validation"
  