This notebook was developed using the `Python 3 (TensorFlow 2.3 Python 3.7 CPU Optimized)` kernel on an `ml.t3.medium` instance.

In [None]:
!pip install -q sagemaker-experiments

In [None]:
import sagemaker
import json
import boto3

# Execution role reported by the SageMaker SDK; handed to the estimator further down.
role = sagemaker.get_execution_role()
# Session object used below for the S3 uploads and to look up region and bucket.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()
# Common key prefix for everything this notebook writes to S3 (data, code, output, checkpoints).
prefix = 'sagemaker-studio-book/chapter09'

In [None]:
import numpy as np
import os
from time import gmtime, strftime
import time
import uuid

In [None]:
# Vocabulary size and padded review length; the training script declares the same values.
max_features = 20000
maxlen = 400

data_dir = os.path.join(os.getcwd(), 'imdb_data')
train_dir = os.path.join(data_dir, 'train')
train_file = os.path.join(train_dir, 'x_train.npy')
train_label_file = os.path.join(train_dir, 'y_train.npy')
test_dir = os.path.join(data_dir, 'test')
test_file = os.path.join(test_dir, 'x_test.npy')
test_label_file = os.path.join(test_dir, 'y_test.npy')

# The training script loads the label files as well, so the local cache only counts
# as complete when all four arrays are present.
expected_files = (train_file, train_label_file, test_file, test_label_file)

if not all(os.path.isfile(file_path) for file_path in expected_files):
    print('Data not available locally. Creating...')
    import tensorflow as tf
    from tensorflow.keras.preprocessing import sequence
    # Public Keras entry point; `tensorflow.python.*` is a private namespace.
    from tensorflow.keras.datasets import imdb

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    # Pad/truncate every review to exactly `maxlen` tokens.
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    # makedirs also creates the parent `data_dir`.
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    np.save(train_file, x_train)
    np.save(train_label_file, y_train)
    np.save(test_file, x_test)
    np.save(test_label_file, y_test)
else:
    print('Data available locally.')

In [None]:
# S3 key prefixes for the training and test arrays.
traindata_s3_prefix = f'{prefix}/imdb_data/train'
testdata_s3_prefix = f'{prefix}/imdb_data/test'

# Upload the local folders; the returned values are reused later as the
# 'train' and 'test' entries of `data_channels`.
train_s3 = sess.upload_data(path='./imdb_data/train/', key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path='./imdb_data/test/', key_prefix=testdata_s3_prefix)

In [None]:
!mkdir code

In [None]:
%%writefile code/tensorflow_sentiment_with_checkpoint.py
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import argparse
import codecs
import json
import numpy as np
import os
import re
import tensorflow as tf

# Fixed model dimensions used by get_model().
# First argument of the Embedding layer.
max_features = 20000
# Length of the model input and `input_length` of the Embedding layer.
maxlen = 400
# Second argument of the Embedding layer.
embedding_dims = 300
# Conv1D filter count and kernel size.
filters = 256
kernel_size = 3
# Units of the hidden Dense layer.
hidden_dims = 256

def parse_args():
    """Parse the command-line arguments of the training script.

    Hyperparameters have literal defaults, the data/model locations default to
    the SM_CHANNEL_TRAIN, SM_CHANNEL_TEST and SM_MODEL_DIR environment
    variables, and the checkpoint directory defaults to '/opt/ml/checkpoints'.

    Returns:
        The (namespace, unknown_args) pair produced by `parse_known_args()`.
    """
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    hyperparameter_specs = (
        ('--epochs', int, 1),
        ('--batch_size', int, 64),
        ('--learning_rate', float, 0.01),
        ('--drop_out_rate', float, 0.2),
    )
    for flag, value_type, default_value in hyperparameter_specs:
        parser.add_argument(flag, type=value_type, default=default_value)

    # locations whose defaults are read from the environment
    location_specs = (
        ('--train', 'SM_CHANNEL_TRAIN'),
        ('--test', 'SM_CHANNEL_TEST'),
        ('--model_dir', 'SM_MODEL_DIR'),
    )
    for flag, env_name in location_specs:
        parser.add_argument(flag, type=str, default=os.environ.get(env_name))

    parser.add_argument('--checkpoint_dir', type=str, default='/opt/ml/checkpoints',
                        help='Path where checkpoints will be saved.')

    return parser.parse_known_args()


def save_history(path, history):
    """Write the per-epoch metrics of a training run to `path` as JSON.

    Args:
        path: destination file path.
        history: object exposing a `history` dict that maps a metric name to its
            per-epoch values (such as the object returned by `model.fit`).

    Numpy arrays are converted with `tolist()`; lists/tuples of numbers (Python or
    numpy scalars) are converted to lists of Python floats. Entries of any other
    shape are skipped because they are not known to be JSON-serializable.
    """
    numeric_types = (int, float, np.integer, np.floating)

    history_for_json = {}
    # transform values that aren't json-serializable
    for key, values in history.history.items():
        if isinstance(values, np.ndarray):
            history_for_json[key] = values.tolist()
        elif isinstance(values, (list, tuple)):
            # all() is True for an empty sequence, which is stored as [].
            if all(isinstance(value, numeric_types) for value in values):
                history_for_json[key] = [float(value) for value in values]

    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(history_for_json, f, separators=(',', ':'), sort_keys=True, indent=4)


def get_train_data(train_dir):
    """Load the training arrays stored in `train_dir`.

    Returns:
        Tuple of the arrays read from 'x_train.npy' and 'y_train.npy', in that order.
    """
    features, labels = (np.load(os.path.join(train_dir, file_name))
                        for file_name in ('x_train.npy', 'y_train.npy'))
    print(f'x train {features.shape} y train {labels.shape}')

    return features, labels


def get_test_data(test_dir):
    """Load the evaluation arrays stored in `test_dir`.

    Returns:
        Tuple of the arrays read from 'x_test.npy' and 'y_test.npy', in that order.
    """
    features, labels = (np.load(os.path.join(test_dir, file_name))
                        for file_name in ('x_test.npy', 'y_test.npy'))
    print(f'x test {features.shape} y test {labels.shape}')

    return features, labels


def get_model(args):
    """Build and compile the convolutional sentiment classifier.

    Args:
        args: parsed arguments; `drop_out_rate` and `learning_rate` are read.

    Returns:
        A compiled `tf.keras.Model` with a single sigmoid output, using binary
        cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    layers = tf.keras.layers

    # Layers are instantiated in the same order as before so that the
    # auto-generated layer names stay unchanged.
    embedding = layers.Embedding(max_features, embedding_dims, input_length=maxlen)
    inputs = tf.keras.Input(shape=(maxlen,), dtype='int32')

    hidden = embedding(inputs)
    hidden = layers.Dropout(args.drop_out_rate)(hidden)
    hidden = layers.Conv1D(filters,
                           kernel_size,
                           padding='valid',
                           activation='relu',
                           strides=1)(hidden)
    hidden = layers.MaxPooling1D()(hidden)
    hidden = layers.GlobalMaxPooling1D()(hidden)
    hidden = layers.Dense(hidden_dims, activation='relu')(hidden)
    hidden = layers.Dropout(args.drop_out_rate)(hidden)
    outputs = layers.Dense(1, activation='sigmoid')(hidden)

    classifier = tf.keras.Model(inputs, outputs)
    adam = tf.keras.optimizers.Adam(args.learning_rate)
    classifier.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return classifier


def load_model_from_checkpoints(checkpoint_dir):
    """Restore the model saved by the most recent epoch checkpoint.

    Only files ending in '.h5' are considered. The epoch number is the first run
    of digits that is directly followed by a dot, e.g. 10 for 'checkpoint-10.h5'.

    Args:
        checkpoint_dir: directory that holds the checkpoint files.

    Returns:
        Tuple (model, epoch): the model loaded from the checkpoint with the
        highest epoch number, and that epoch number as an int.

    Raises:
        ValueError: if no '.h5' file carrying an epoch number is found.
    """
    checkpoint_files = [file for file in os.listdir(checkpoint_dir) if file.endswith('.' + 'h5')]
    print('------------------------------------------------------')
    print(f'Available checkpoint files: {checkpoint_files}')

    epoch_pattern = re.compile(r'(\d+)(?=\.)')
    # (epoch, file name) pairs; files without an epoch number are ignored.
    epoch_files = []
    for file in checkpoint_files:
        match = epoch_pattern.search(file)
        if match:
            epoch_files.append((int(match.group(1)), file))

    if not epoch_files:
        raise ValueError(f'No epoch checkpoint (.h5) found in {checkpoint_dir}')

    # Compare epochs as integers: as strings, '9' would sort after '10'.
    max_epoch_number, max_epoch_filename = max(epoch_files, key=lambda pair: pair[0])

    print(f'Latest epoch checkpoint file name: {max_epoch_filename}')
    print('Resuming training from epoch: {}'.format(max_epoch_number + 1))
    print('------------------------------------------------------')

    resumed_model_from_checkpoints = tf.keras.models.load_model(f'{checkpoint_dir}/{max_epoch_filename}')
    return resumed_model_from_checkpoints, max_epoch_number


if __name__ == '__main__':
    # Script entry point: parse arguments, build or resume the model, train it,
    # then write the training history and the exported model to `model_dir`.

    args, _ = parse_args()
    print(args)

    if os.path.isdir(args.checkpoint_dir):
        print(f'Checkpointing directory {args.checkpoint_dir} exists.')
    else:
        print(f'Creating Checkpointing directory {args.checkpoint_dir}.')
        # makedirs also creates missing parent directories.
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    # Load model: resume only when an '.h5' checkpoint is actually present. A
    # directory that merely contains unrelated files must not trigger a resume.
    saved_checkpoints = [name for name in os.listdir(args.checkpoint_dir) if name.endswith('.h5')]
    if saved_checkpoints:
        model, initial_epoch_number = load_model_from_checkpoints(args.checkpoint_dir)
    else:
        model = get_model(args)
        initial_epoch_number = 0

    # One checkpoint file per epoch; the epoch number in the name is what
    # load_model_from_checkpoints() parses when resuming.
    callbacks = [tf.keras.callbacks.ModelCheckpoint(args.checkpoint_dir + '/checkpoint-{epoch}.h5')]

    history = model.fit(x_train, y_train,
                        batch_size=args.batch_size,
                        epochs=args.epochs,
                        initial_epoch=initial_epoch_number,
                        validation_data=(x_test, y_test),
                        callbacks=callbacks)

    save_history(args.model_dir + '/history.p', history)

    # create a TensorFlow SavedModel for deployment to a SageMaker endpoint with TensorFlow Serving
    model.save(args.model_dir + '/1')

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError

experiment_name = 'imdb-sentiment-analysis'

try:
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description='Training a sentiment classification model using imdb dataset.')
except ClientError:
    # Creation is expected to fail when the name is already taken. Loading the
    # existing experiment keeps `experiment` defined; if the load fails as well,
    # the underlying problem was something else and the error is raised here
    # instead of being silently swallowed.
    experiment = Experiment.load(experiment_name=experiment_name)
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')
    

In [None]:
from sagemaker.tensorflow import TensorFlow

# Timestamped job name; also reused as the trial name in the next cell.
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'imdb-tf-spot-{exp_datetime}'

s3_output_location = f's3://{bucket}/{prefix}'
code_dir = f's3://{bucket}/{prefix}'

train_instance_type = 'ml.p2.xlarge'
hyperparameters = {'epochs': 20, 'batch_size': 256, 'learning_rate': 0.01, 'drop_out_rate': 0.2}

# Managed spot training settings.
use_spot_instances = True
max_run = 3600
max_wait = 3600

# Checkpoint location. Keeping a fixed suffix makes every job share one S3
# checkpoint folder, so a new job resumes from the checkpoints stored there.
# Set `resume_checkpoint_suffix = None` to train from scratch in a fresh folder.
resume_checkpoint_suffix = '02fa28a1'
checkpoint_suffix = resume_checkpoint_suffix or str(uuid.uuid4())[:8]
checkpoint_s3_uri = f's3://{bucket}/{prefix}/checkpoint-{checkpoint_suffix}'
checkpoint_local_path = '/opt/ml/checkpoints/'
model_local_path = '/opt/ml/model'

estimator = TensorFlow(source_dir='code',
                       entry_point='tensorflow_sentiment_with_checkpoint.py',
                       output_path=s3_output_location,
                       model_dir=model_local_path,
                       code_location=code_dir,
                       instance_type=train_instance_type,
                       instance_count=1,
                       enable_sagemaker_metrics=True,
                       hyperparameters=hyperparameters,
                       role=role,
                       framework_version='2.1',
                       py_version='py3',
                       use_spot_instances=use_spot_instances,
                       checkpoint_s3_uri=checkpoint_s3_uri,
                       # Same directory the training script writes its checkpoints to by default.
                       checkpoint_local_path=checkpoint_local_path,
                       max_run=max_run,
                       max_wait=max_wait,
                       debugger_hook_config=False)

data_channels = {'train':train_s3, 'test': test_s3}
print(data_channels)

In [None]:
# Creating a new trial for the experiment
# The trial reuses the training job name (`jobname`).
exp_trial = Trial.create(experiment_name=experiment_name, 
                         trial_name=jobname)

# Ties the training job to the experiment and to the trial created above.
experiment_config={'ExperimentName': experiment_name,
                   'TrialName': exp_trial.trial_name,
                   'TrialComponentDisplayName': 'Training'}

# Start the training job on the 'train' and 'test' channels held in `data_channels`.
estimator.fit(inputs=data_channels,
              job_name=jobname,
              experiment_config=experiment_config,
              wait=True)