In [None]:
%%sh
# Upgrade pip itself first, then install or upgrade the listed packages.
pip install -q pip --upgrade
# -q keeps pip's output quiet; --user installs into the per-user site-packages.
pip install -q matplotlib sagemaker smdebug awscli --upgrade --user

## Download the Fashion-MNIST dataset

In [None]:
import os
import numpy as np
from tensorflow.keras.datasets import fashion_mnist

# Fetch Fashion-MNIST through Keras; it arrives as two (images, labels) splits,
# used in this notebook as the training and validation sets.
(x_train, y_train), (x_val, y_val) = fashion_mnist.load_data()

# Save each split under ./data as an archive with an 'image' and a 'label' array.
os.makedirs("./data", exist_ok=True)
archives = {
    './data/training': (x_train, y_train),
    './data/validation': (x_val, y_val),
}
for archive_path, (split_images, split_labels) in archives.items():
    np.savez(archive_path, image=split_images, label=split_labels)

In [None]:
# Print the training script (syntax-highlighted by pygmentize) that is used as the estimator's entry point.
!pygmentize fmnist-5.py

## Upload Fashion-MNIST data to S3

In [None]:
import sagemaker, smdebug

# Show which SageMaker SDK version is installed.
print(sagemaker.__version__)

# Session, execution role and default bucket reused by the cells that follow.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = 'keras2-fashion-mnist'

# Upload both archives; the returned locations are later passed to fit().
training_input_path = sess.upload_data(
    'data/training.npz',
    key_prefix=prefix + '/training',
)
validation_input_path = sess.upload_data(
    'data/validation.npz',
    key_prefix=prefix + '/validation',
)

# S3 locations for training output and checkpoints under the same prefix.
output_path = 's3://{}/{}/output/'.format(bucket, prefix)
chk_path = 's3://{}/{}/checkpoints/'.format(bucket, prefix)

for s3_location in (training_input_path, validation_input_path, output_path, chk_path):
    print(s3_location)

## Train with TensorFlow

In [None]:
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig

# Interval shared by every Debugger collection below; passed as a string.
save_interval = '100'

tf_estimator = TensorFlow(
    entry_point='fmnist-5.py',
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    framework_version='2.1.0',
    py_version='py3',
    hyperparameters={'epochs': 20},
    output_path=output_path,
    # Spot training is left disabled; uncomment the three settings below to enable it.
    #use_spot_instances=True,
    #max_run=3600,
    #max_wait=7200,

    # Save the same five tensor collections, all at the same interval.
    debugger_hook_config=DebuggerHookConfig(
        s3_output_path='s3://{}/{}/debug'.format(bucket, prefix),
        collection_configs=[
            CollectionConfig(name=collection_name, parameters={"save_interval": save_interval})
            for collection_name in ('metrics', 'losses', 'outputs', 'weights', 'gradients')
        ],
    ),

    # Built-in Debugger rules evaluated against the training job.
    rules=[
        Rule.sagemaker(rule_configs.poor_weight_initialization()),
        Rule.sagemaker(rule_configs.dead_relu()),
        # channel='3' presumably points at the last axis of NHWC input -- confirm against the training script.
        Rule.sagemaker(
            rule_configs.check_input_images(),
            rule_parameters={"channel": '3'},
        ),
    ],
)

In [None]:
# Launch training; each key names an input channel fed from the S3 location returned by the uploads above.
tf_estimator.fit({'training': training_input_path, 'validation': validation_input_path})

In [None]:
# Fetch the status of every Debugger rule attached to the latest training job.
description = tf_estimator.latest_training_job.rule_job_summary()

# Fields left out of the printed summary to keep it short.
hidden_fields = ('LastModifiedTime', 'RuleEvaluationJobArn')

for rule in description:
    # Filter into a fresh dict instead of pop(): a rule status may lack these
    # optional keys (pop() without a default would raise KeyError), and the
    # entries of `description` are left unmodified for later inspection.
    print({key: value for key, value in rule.items() if key not in hidden_fields})

In [None]:
from smdebug.trials import create_trial

# Ask the estimator where Debugger stored the latest job's artifacts, then open that location as an smdebug trial.
s3_output_path = tf_estimator.latest_job_debugger_artifacts_path()
trial = create_trial(s3_output_path)

In [None]:
# List the names of the tensors available in the trial.
trial.tensor_names()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# NOTE: despite its name, `loss` holds the 'val_f1_score' tensor; the name is
# kept so any later cell referring to `loss` keeps working.
loss = trial.tensor('val_f1_score')

# Query the recorded steps once and reuse them for both the x values and the lookups.
f1_steps = loss.steps()
values = [loss.value(s) for s in f1_steps]

plt.autoscale()
plt.plot(f1_steps, values)
plt.xlabel('step')
plt.ylabel('val_f1_score')
# Render explicitly so the cell does not echo the list of Line2D objects.
plt.show()

In [None]:
# Kernel tensor of the `conv2d` layer; print the shape of its value at step 0.
w = trial.tensor('conv2d/weights/conv2d/kernel:0')
print(w.value(0).shape)
# Tensor named after that layer's Conv2DBackpropFilter gradient; print the shape of its value at step 0.
g = trial.tensor('training/Adam/gradients/gradients/conv2d/Conv2D_grad/Conv2DBackpropFilter:0')
print(g.value(0).shape)

In [None]:
def plot_conv_filter(tensor_name, filter_num, min_step=0, channel=0):
    """Plot how every coefficient of one convolution filter evolves over steps.

    Reads `tensor_name` from the notebook-level `trial` and draws one curve per
    (row, column) position of the selected 2-D filter slice, then shows the
    figure.

    Args:
        tensor_name: Name of a rank-4 tensor in `trial`. Assumed to be laid out
            as (height, width, input channel, filter), like a Keras Conv2D
            kernel -- confirm before using it on other tensors.
        filter_num: Index along the last axis selecting the filter to plot.
        min_step: Only steps greater than or equal to this value are plotted.
        channel: Index along the third axis to plot. Defaults to 0.

    Returns:
        None. If no step satisfies `min_step`, a message is printed and
        nothing is plotted.
    """
    tensor = trial.tensor(tensor_name)
    steps = [s for s in tensor.steps() if s >= min_step]
    if not steps:
        print('No steps >= {} recorded for {}'.format(min_step, tensor_name))
        return

    # Read each step's value once; every per-coefficient curve is sliced from
    # these instead of re-reading the tensor for each (row, column) position.
    slices = [tensor.value(s)[:, :, channel, filter_num] for s in steps]
    # Take the grid size from the data rather than assuming a 3x3 kernel.
    rows, cols = slices[0].shape

    plt.autoscale()
    for i in range(rows):
        for j in range(cols):
            values = [kernel[i][j] for kernel in slices]
            label = '({},{})'.format(i, j)
            plt.plot(steps, values, label=label)
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Plot filter 63 of the `conv2d` kernel tensor across all recorded steps.
plot_conv_filter('conv2d/weights/conv2d/kernel:0', 63)

In [None]:
# Plot filter 63 of the `conv2d` kernel-gradient tensor, restricted to steps >= 15000.
plot_conv_filter('training/Adam/gradients/gradients/conv2d/Conv2D_grad/Conv2DBackpropFilter:0', 63, min_step=15000)

## Deploy

In [None]:
import time

# Suffix the endpoint name with the current UTC timestamp so repeated deployments get distinct names.
tf_endpoint_name = 'keras-tf-fmnist-'+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Deploy the trained estimator behind an endpoint served by a single ml.m5.large instance.
tf_predictor = tf_estimator.deploy(
                 initial_instance_count=1, 
                 instance_type='ml.m5.large',
                 endpoint_name=tf_endpoint_name)

## Predict 

In [None]:
%matplotlib inline
import random
import matplotlib.pyplot as plt

num_samples = 5
# Draw distinct indices from the whole validation set. range() already excludes
# its stop value, so the stop must be the set size itself for the last image
# to be eligible.
indices = random.sample(range(x_val.shape[0]), num_samples)
# Scale pixel values by 1/255 -- assumed to match the preprocessing in the training script; TODO confirm.
images = x_val[indices]/255
labels = y_val[indices]

# Show the sampled images side by side, each titled with its label from y_val.
for i in range(num_samples):
    plt.subplot(1, num_samples, i+1)
    plt.imshow(images[i].reshape(28, 28), cmap='gray')
    plt.title(labels[i])
    plt.axis('off')

# Add a trailing axis of size 1 so the request has shape (num_samples, 28, 28, 1).
payload = images.reshape(num_samples, 28, 28, 1)

In [None]:
# Send the sampled batch to the endpoint and collect its 'predictions' entry as an array.
response = tf_predictor.predict(payload)
prediction = np.array(response['predictions'])

# For each row, the predicted label is the index of its largest value.
predicted_label = np.argmax(prediction, axis=1)
print('Predicted labels are: {}'.format(predicted_label))

## Clean up

In [None]:
# Tear down the endpoint created in the Deploy section.
tf_predictor.delete_endpoint()