# Training a Model
***
This notebook is for training a model with default hyperparameters and the mobilenet-v2-100-224 model.

To Do:
- Adding explicit verification and testing channels
- Making sure the hyperparameters are what we want.
- Ensuring output log file is saved to s3.

In [36]:
import sagemaker, boto3, json
from sagemaker.session import Session

# Resolve the IAM execution role of this notebook and echo it for the record.
# NOTE(review): the printed ARN (and the bucket name below) embed the AWS
# account id in the saved cell output -- consider clearing outputs before sharing.
role = sagemaker.get_execution_role()
print(role)

# One SageMaker session is reused for the role lookup and the default bucket.
sagemaker_session = Session()
# ARN of the caller identity; this is the value later passed as the Estimator role.
# NOTE(review): `role` above is only printed in the visible cells -- confirm
# whether both lookups are needed.
aws_role = sagemaker_session.get_caller_identity_arn()
# Region configured for the default boto3 session (not read again in the visible cells).
aws_region = boto3.Session().region_name
# Default SageMaker bucket of the session; later used as the training output bucket.
bucket = sagemaker_session.default_bucket()
# boto3 S3 resource handle; only referenced by commented-out code in the visible cells.
s3 = boto3.resource('s3')


print("using bucket %s" % bucket)

arn:aws:iam::447099698275:role/service-role/AmazonSageMaker-ExecutionRole-20230114T111418
using bucket sagemaker-us-east-2-447099698275


In [37]:
from sagemaker import image_uris, model_uris, script_uris, hyperparameters

# Here, we are retrieving the URIs to all the default Docker images, training scripts, and pretrained models
# that are provided by Jumpstart
# Note we are just retrieving links to these things. They will be assembled later in the Estimator, Model, or Predictor
# Estimators are for training.
# Models are for deployment.
# Predictors are for inference.

# JumpStart model identifier and version spec used for every lookup below.
model_id = "tensorflow-ic-imagenet-mobilenet-v2-100-224-classification-4"
model_version = "*"
training_instance_type = "ml.m5.xlarge"

# Training container image matching the model and the chosen instance type.
# `region` and `framework` are passed explicitly as None, exactly as before.
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=model_id,
    model_version=model_version,
    image_scope="training",
    instance_type=training_instance_type,
)

# Training script bundle. The JumpStart lookup is intentionally bypassed in
# favour of a fixed tarball; the original lookup is kept here for reference:
#   train_source_uri = script_uris.retrieve(
#       model_id=model_id, model_version=model_version, script_scope="training"
#   )
train_source_uri = "s3://sagemaker-script-bucket-mirror/transfer-learning/sourcedir.tar.gz"

# Pre-trained model tarball that the training job fine-tunes.
train_model_uri = model_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    model_scope="training",
)

print(train_source_uri)

s3://sagemaker-script-bucket-mirror/transfer-learning/sourcedir.tar.gz


In [42]:
# Training images: the dataset sits at the root of its own bucket.
training_data_bucket = "lantern-rd-pictures"
training_dataset_s3_path = "s3://{}/".format(training_data_bucket)

# Held-out test images, also at the root of a dedicated bucket.
test_data_bucket = "lantern-rd-test-pictures"
test_dataset_s3_path = "s3://{}/".format(test_data_bucket)

# Training artifacts are written under a fixed prefix of the session's default bucket.
output_bucket = bucket
output_prefix = "slf-classifier-training"
s3_output_location = "s3://{}/{}/output".format(output_bucket, output_prefix)
print(s3_output_location)

s3://sagemaker-us-east-2-447099698275/slf-classifier-training/output


In [39]:
# Import the JumpStart hyperparameter utilities under an alias: the name
# `hyperparameters` is rebound to the plain dict below (which is what the
# Estimator cell consumes), so without the alias this cell would leave the
# module unreachable.
from sagemaker import hyperparameters as jumpstart_hyperparameters

# Retrieve the default hyper-parameters for fine-tuning the model
# (kept disabled; the explicit values below are used instead):
# hyperparameters = jumpstart_hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)

# Explicit hyperparameter values passed to the training job.
# All values are strings, as written in the original cell.
hyperparameters = {
    "train_only_top_layer" : "True",
    "epochs" : "18",
    "batch_size" : "32",
    "optimizer" : "adam",
    "learning_rate" : "0.001",
    "beta_1" : "0.9",
    "beta_2" : "0.999",
    "momentum" : "0.9",
    "epsilon" : "0.0000001",
    "rho" : "0.95",
    "initial_accumulator_value" : "0.1",
    "reinitialize_top_layer" : "True",
    "early_stopping" : "False",
    "early_stopping_patience" : "5",
    "early_stopping_min_delta" : "0.0",
    "dropout_rate" : "0.2",
    "regularizers_l2" : "0.0001",
    "label_smoothing" : "0.1",
    "image_resize_interpolation" : "bilinear",
    "augmentation" : "True",
    "augmentation_random_flip" : "horizontal_and_vertical",
    "augmentation_random_rotation" : "0.5",
    "augmentation_random_zoom" : "0.1",
    "binary_mode" : "False",
    "eval_metric" : "accuracy",
    "validation_split_ratio" : "0.2",
    "random_seed" : "123"
}
print(hyperparameters)

{'train_only_top_layer': 'True', 'epochs': '18', 'batch_size': '32', 'optimizer': 'adam', 'learning_rate': '0.001', 'beta_1': '0.9', 'beta_2': '0.999', 'momentum': '0.9', 'epsilon': '0.0000001', 'rho': '0.95', 'initial_accumulator_value': '0.1', 'reinitialize_top_layer': 'True', 'early_stopping': 'False', 'early_stopping_patience': '5', 'early_stopping_min_delta': '0.0', 'dropout_rate': '0.2', 'regularizers_l2': '0.0001', 'label_smoothing': '0.1', 'image_resize_interpolation': 'bilinear', 'augmentation': 'True', 'augmentation_random_flip': 'horizontal_and_vertical', 'augmentation_random_rotation': '0.5', 'augmentation_random_zoom': '0.1', 'binary_mode': 'False', 'eval_metric': 'accuracy', 'validation_split_ratio': '0.2', 'random_seed': '123'}


In [40]:
from sagemaker.tuner import ContinuousParameter

#######################################################
# Use AMT (Automatic Model Tuning) for tuning and selecting the best model.
# Setting this to True launches a tuning job (hp_tuner) instead of fitting
# ic_estimator directly; False runs a single training job.
use_amt = False
######################################################

# Define objective metric per framework, based on which the best model will be selected.
# The framework entry is chosen by matching the prefix of model_id.
metric_definitions_per_model = {
    "tensorflow": {
        "metrics": [{"Name": "val_accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"}],
        "type": "Maximize",
    },
    "pytorch": {
        "metrics": [{"Name": "val_accuracy", "Regex": "val Acc: ([0-9\\.]+)"}],
        "type": "Maximize",
    },
}

# You can select from the hyperparameters supported by the model, and configure ranges of values to be searched for training the optimal model.(https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)
# The key must name a hyperparameter the training job actually receives: the
# hyperparameters dict in this notebook uses "learning_rate", so the range is
# declared under that same key.
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.0001, 0.1, scaling_type="Logarithmic")
}

# Increase the total number of training jobs run by AMT, for increased accuracy (and training time).
max_jobs = 6
# Change parallel training jobs run by AMT to reduce total training time, constrained by your account limits.
# if max_jobs=max_parallel_jobs then Bayesian search turns to Random.
max_parallel_jobs = 2

In [41]:
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner

# Base name for the job, derived from the model id.
# NOTE(review): name_from_base already appends a unique suffix, and the result
# is then passed as `base_job_name` / `base_tuning_job_name` below, which
# presumably get suffixed again by the SDK -- confirm the final job name is
# what is wanted.
training_job_name = name_from_base(f"slf-{model_id}-transfer-learning")

# Create SageMaker Estimator instance
# Wires together the image, script bundle, pre-trained model, hyperparameters
# and output location assembled in the earlier cells.
ic_estimator = Estimator(
    role=aws_role,
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type=training_instance_type,
    # Upper bound on job runtime; per the SageMaker SDK this is in seconds (360000 s = 100 h).
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_location,
    base_job_name=training_job_name,
)

if use_amt:
    # Pick the metric definition whose framework key prefixes model_id.
    # Raises StopIteration if no key matches.
    metric_definitions = next(
        value for key, value in metric_definitions_per_model.items() if model_id.startswith(key)
    )

    # Tuner wrapping the estimator: objective is the first metric's name,
    # searched over hyperparameter_ranges.
    hp_tuner = HyperparameterTuner(
        ic_estimator,
        metric_definitions["metrics"][0]["Name"],
        hyperparameter_ranges,
        metric_definitions["metrics"],
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
        objective_type=metric_definitions["type"],
        base_tuning_job_name=training_job_name,
    )

    # Launch a SageMaker Tuning job to search for the best hyperparameters
    # NOTE(review): only the "training" channel is passed here, while the plain
    # training branch below also passes "test" -- confirm this is intentional.
    hp_tuner.fit({"training": training_dataset_s3_path})
else:
    # Launch a SageMaker Training job by passing s3 path of the training data
    # (plus the test dataset as a second channel); logs=True streams the job log
    # into the cell output.
    ic_estimator.fit({"training": training_dataset_s3_path, "test": test_dataset_s3_path}, logs=True)
    # Disabled attempt at saving the log to S3 (see the To Do list at the top).
    # NOTE(review): as written the key string has no f-prefix, so the {...}
    # placeholders would be stored literally, and it repeats the bucket name
    # inside the key; whether ic_estimator.logs() returns the log text also
    # needs confirming before re-enabling.
    #s3.Object(bucket, "{output_bucket}/{output_prefix}/output/{training_job_name}/log.txt").put(Body=ic_estimator.logs())

2023-02-09 16:08:40 Starting - Starting the training job...
2023-02-09 16:09:03 Starting - Preparing the instances for trainingProfilerReport-1675958919: InProgress
......
2023-02-09 16:10:03 Downloading - Downloading input data...
2023-02-09 16:10:41 Training - Downloading the training image......
2023-02-09 16:11:23 Training - Training image download completed. Training in progress.[34m2023-02-09 16:11:22.134941: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2023-02-09 16:11:22.135102: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2023-02-09 16:11:22.160609: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2023-02-09 16:11:24,086 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_containe

In [5]:
# Record the SageMaker SDK version this notebook was run with.
# NOTE(review): this cell's execution count (5) is out of sequence with the
# cells above, i.e. it was run in an earlier pass of the kernel.
import sagemaker
print(sagemaker.__version__)

2.120.0


In [None]:
# Download a single object from S3 to a local file.
# NOTE(review): `s3_bucket`, `image_key` and `filename` are not defined in any
# visible cell, so this unexecuted cell raises NameError on Restart & Run All --
# define them (or remove the cell) before relying on it.
boto3.client("s3").download_file(s3_bucket, f"{image_key}", filename)