In [None]:
!git commit -m "Initial commit"

In [23]:
%%capture

!pip install tensorflow==2.3.0
!pip install sagemaker-experiments

#### Imports 

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sagemaker.tensorflow.serving import TensorFlowModel
from sagemaker.multidatamodel import MultiDataModel
from tensorflow.keras.datasets import cifar10
from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
from tensorflow.keras import utils
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from datetime import datetime
import tensorflow as tf
import numpy as np
import sagemaker
import logging
import boto3
import time
import os

#### Setup Logger

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())
!python --version

Python 3.6.13


In [4]:
logger.info(f'[Using TensorFlow version: {tf.__version__}]')
logger.info(f'[Using SageMaker version: {sagemaker.__version__}]')

[Using TensorFlow version: 2.3.0]
[Using SageMaker version: 2.86.2]


#### Seed for Reproducability

In [5]:
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

#### Create Roles, Sessions and Data Locations

In [6]:
role = get_execution_role()
session = boto3.Session()
sagemaker_session = sagemaker.Session()

s3 = session.resource('s3')
TF_FRAMEWORK_VERSION = '2.3.0'
BUCKET = sagemaker.Session().default_bucket()
PREFIX = 'cv-models'

### Train - CIFAR-10 Image Classification

<p align="justify">First, we will train a Convolutional Neural Network (CNN) model to classify images from the CIFAR-10 dataset. Image classification is the task of assigning a label to an image, from a predefined set of categories. CIFAR-10 is an established CV dataset used for object recognition. It is a subset of the 80 Million Tiny Images dataset and consists of 60,000 (32x32) color images containing 1 of 10 object classes, with 6,000 images per class.</p>

#### a) Load Data

The first step is to load the pre-shuffled CIFAR-10 dataset into our train and test objects. Luckily, Keras provides the CIFAR dataset for us to load using the `load_data()` method. All we have to do is import keras.datasets and then load the data.

In [7]:
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [8]:
logger.info(f'X_train Shape: {X_train.shape}')
logger.info(f'y_train Shape: {y_train.shape}')
logger.info(f'X_test Shape : {X_test.shape}')
logger.info(f'y_test Shape : {y_test.shape}')

X_train Shape: (50000, 32, 32, 3)
y_train Shape: (50000, 1)
X_test Shape : (10000, 32, 32, 3)
y_test Shape : (10000, 1)


#### c) Data Preparation

##### Rescale 
Rescales the images by dividing the pixel values by 255: [0,255] ⇒ [0,1]

In [9]:
X_train = X_train.astype('float32')/255
X_test = X_test.astype('float32')/255

##### One Hot Encode Target Labels
One-hot encoding is a process by which categorical variables are converted into a numeric form. One-hot encoding converts the (1 × n) label vector to a label matrix of dimensions (10 × n), where n is the number of sample images. So, if we have 1,000 images in our dataset, the label vector will have the dimensions (1 × 1000). After one-hot encoding, the label matrix dimensions will be (1000 × 10). That’s why, when we define our network architecture in the next step, we will make the output softmax layer contain 10 nodes, where each node represents the probability of each class we have.

In [10]:
num_classes = len(np.unique(y_train))
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

##### Split Data
Break original train set further into train and validation sets.

In [11]:
X_train, X_validation = X_train[500:], X_train[:500]
y_train, y_validation = y_train[500:], y_train[:500]

##### Save to Local

Create a local `data/cifar_10` directory to save the datasets.

In [12]:
DATASET_PATH = './data/cifar_10'

In [13]:
os.makedirs(DATASET_PATH, exist_ok=True)

Save train, validation and test sets to local `data` directory

In [14]:
np.save(f'{DATASET_PATH}/X_train.npy', X_train)
np.save(f'{DATASET_PATH}/y_train.npy', y_train)
np.save(f'{DATASET_PATH}/X_validation.npy', X_validation)
np.save(f'{DATASET_PATH}/y_validation.npy', y_validation)
np.save(f'{DATASET_PATH}/X_test.npy', X_test)
np.save(f'{DATASET_PATH}/y_test.npy', y_test)

##### Copy Datasets to S3
Copy train, validation and test sets from the local dir to S3, since SageMaker expects datasets to be in S3 for training.

In [15]:
!aws s3 cp ./{DATASET_PATH}/X_train.npy s3://{BUCKET}/{PREFIX}/cifar_10/train/
!aws s3 cp ./{DATASET_PATH}/y_train.npy s3://{BUCKET}/{PREFIX}/cifar_10/train/
!aws s3 cp ./{DATASET_PATH}/X_validation.npy s3://{BUCKET}/{PREFIX}/cifar_10/validation/
!aws s3 cp ./{DATASET_PATH}/y_validation.npy s3://{BUCKET}/{PREFIX}/cifar_10/validation/
!aws s3 cp ./{DATASET_PATH}/X_test.npy s3://{BUCKET}/{PREFIX}/cifar_10/test/
!aws s3 cp ./{DATASET_PATH}/y_test.npy s3://{BUCKET}/{PREFIX}/cifar_10/test/

upload: data/cifar_10/X_train.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/train/X_train.npy
upload: data/cifar_10/y_train.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/train/y_train.npy
upload: data/cifar_10/X_validation.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/validation/X_validation.npy
upload: data/cifar_10/y_validation.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/validation/y_validation.npy
upload: data/cifar_10/X_test.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/test/X_test.npy
upload: data/cifar_10/y_test.npy to s3://sagemaker-us-east-1-949263681218/cv-models/cifar_10/test/y_test.npy


# Create Training Inputs

In [16]:
train_input = TrainingInput(s3_data=f's3://{BUCKET}/{PREFIX}/cifar_10/train', 
                            distribution='FullyReplicated', 
                            content_type='npy')
validation_input = TrainingInput(s3_data=f's3://{BUCKET}/{PREFIX}/cifar_10/validation', 
                                 distribution='FullyReplicated', 
                                 content_type='npy')
test_input = TrainingInput(s3_data=f's3://{BUCKET}/{PREFIX}/cifar_10/test', 
                           distribution='FullyReplicated', 
                           content_type='npy')

In [17]:
inputs = {'train': train_input, 'val': validation_input, 'test': test_input}

#### e) Define Model Architecture & create Training Script

We will build a small CNN consisting of three convolutional layers and two dense layers.<br>
<b>Note:</b> We will use the ReLU activation function for all the hidden layers. In the last dense layer, we will use a softmax activation function with 10 nodes to return an array of 10 probability scores (summing to 1). Each score will be the probability that the current image belongs to our 10 image classes.

# Prepare a Experiment Tracker

In [21]:
sm = boto3.client('sagemaker')



In [24]:
from smexperiments.experiment import Experiment
cifar_experiment = Experiment.create(
    experiment_name="cifar-10-dataset-experiment", 
    description="objects", 
    sagemaker_boto_client=sm)

In [27]:
# from smexperiments.tracker import Tracker
# with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
#      tracker.log_input(name="cifar-10-dataset-log", media_type="s3/uri", value='random')
#      tracker.log_parameters({
#         "normalization_mean": 0.1307,
#         "normalization_std": 0.3081,
#     })

In [30]:
from smexperiments.trial import Trial
for num_hidden_channel in [2, 5, 10, 20, 32]:
    trial_name = f"cnn-training-job-{num_hidden_channel}-hidden-channels-{int(time.time())}"
    cnn_trial = Trial.create(
        trial_name=trial_name, 
        experiment_name=cifar_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )
    cnn_trial.add_trial_component(tracker.trial_component)

#### f) Create a TensorFlow Estimator & fit the Model

In [82]:
model_name = 'cifar-10'
hyperparameters = {'epochs': 3}
estimator_parameters = {'entry_point':'cifar_train.py',
                        'instance_type': 'ml.m5.2xlarge',
                        'instance_count': 1,
                        'model_dir': f'/opt/ml/model',
                        'role': role,
                        'hyperparameters': hyperparameters,
                        'output_path': f's3://{BUCKET}/{PREFIX}/cifar_10/out',
                        'base_job_name': f'mme-cv-{model_name}',
                        'framework_version': TF_FRAMEWORK_VERSION,
                        'py_version': 'py37',
                        'script_mode': True}
estimator_1 = TensorFlow(**estimator_parameters)

In [83]:
cnn_training_job_name = "cnn-training-job-{}".format(int(time.time()))
estimator_1.fit(inputs, job_name=cnn_training_job_name,
        experiment_config={
            "ExperimentName": cifar_experiment.experiment_name, 
            "TrialName": cnn_trial.trial_name,
            "TrialComponentDisplayName": "Training",
        })

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: cnn-training-job-1655879605


2022-06-22 06:33:26 Starting - Starting the training job...
2022-06-22 06:33:49 Starting - Preparing the instances for trainingProfilerReport-1655879605: InProgress
......
2022-06-22 06:34:49 Downloading - Downloading input data...
2022-06-22 06:35:17 Training - Downloading the training image...
2022-06-22 06:35:50 Training - Training image download completed. Training in progress.[34m2022-06-22 06:35:49,104 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2022-06-22 06:35:49,112 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-22 06:35:49,570 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-22 06:35:49,587 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-22 06:35:49,602 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-22 06:35:49

[34m1546/1546 - 93s - loss: 1.7540 - accuracy: 0.3730 - val_loss: 1.9726 - val_accuracy: 0.3460[0m
[34mEpoch 2/3[0m
[34m1546/1546 - 88s - loss: 1.5483 - accuracy: 0.4377 - val_loss: 1.4266 - val_accuracy: 0.4800[0m
[34mEpoch 3/3[0m

2022-06-22 06:40:34 Uploading - Uploading generated training model[34m1546/1546 - 88s - loss: 1.4907 - accuracy: 0.4611 - val_loss: 1.6230 - val_accuracy: 0.4540[0m
[34mTest Accuracy: 0.4302999973297119[0m
[34m2022-06-22 06:40:32,091 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m



2022-06-22 06:40:51 Completed - Training job completed
ProfilerReport-1655879605: NoIssuesFound
Training seconds: 359
Billable seconds: 359


# Contents of the recorded experiment

In [85]:
trial_component_analytics = ExperimentAnalytics(
    experiment_name=cifar_experiment.experiment_name,
    sort_by="metrics.test:accuracy.max",
    sort_order="Descending",
    metric_names=['test:accuracy'],
   # parameter_names=['hidden_channels', 'epochs', 'dropout', 'optimizer']
)
analytic_table = trial_component_analytics.dataframe()
analytic_table.head()

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,SageMaker.VolumeSizeInGB,epochs,model_dir,sagemaker_container_log_level,...,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value,Trials,Experiments,normalization_mean,normalization_std,cifar-10-dataset-log - MediaType,cifar-10-dataset-log - Value
0,cnn-training-job-1655879605-aws-training-job,Training,arn:aws:sagemaker:us-east-1:949263681218:train...,763104351884.dkr.ecr.us-east-1.amazonaws.com/t...,1.0,ml.m5.4xlarge,30.0,3.0,"""/opt/ml/model""",20.0,...,,s3://sagemaker-us-east-1-949263681218/cv-model...,,s3://sagemaker-us-east-1-949263681218/cv-model...,[cnn-training-job-32-hidden-channels-1655871250],[cifar-10-dataset-experiment],,,,
1,TrialComponent-2022-06-22-041210-guap,Preprocessing,,,,,,,,,...,,,,,[cnn-training-job-10-hidden-channels-165587124...,"[cifar-10-dataset-experiment, cifar-10-dataset...",0.1307,0.3081,s3/uri,random
2,cnn-training-job-1655871262-aws-training-job,Training,arn:aws:sagemaker:us-east-1:949263681218:train...,763104351884.dkr.ecr.us-east-1.amazonaws.com/t...,1.0,ml.m5.2xlarge,30.0,1.0,"""/opt/ml/model""",20.0,...,,s3://sagemaker-us-east-1-949263681218/cv-model...,,s3://sagemaker-us-east-1-949263681218/cv-model...,[cnn-training-job-32-hidden-channels-1655871250],[cifar-10-dataset-experiment],,,,


In [54]:
from sagemaker.analytics import TrainingJobAnalytics

In [60]:
analytics = TrainingJobAnalytics(training_job_name = 'cnn-training-job-1655871262', metric_names=['test:accuracy'])

In [75]:
analytics.__dict__ #['_cloudwatch'].list_metrics()

{'_sage_client': <botocore.client.SageMaker at 0x7f6cb0c3f8d0>,
 '_cloudwatch': <botocore.client.CloudWatch at 0x7f6cb13a02e8>,
 '_training_job_name': 'cnn-training-job-1655871262',
 '_start_time': None,
 '_end_time': None,
 '_period': 60,
 '_metric_names': ['test:accuracy'],
 '_dataframe': None,
 '_data': defaultdict(list, {}),
 '_time_interval': {'start_time': datetime.datetime(2022, 6, 22, 4, 15, 56, 247000, tzinfo=tzlocal()),
  'end_time': datetime.datetime(2022, 6, 22, 4, 20, 6, 584000, tzinfo=tzlocal())}}

# Experiment Cleanup

In [94]:
# def cleanup_sme_sdk(experiment):
#     for trial_summary in experiment.list_trials():
#         trial = Trial.load(trial_name=trial_summary.trial_name)
#         for trial_component_summary in trial.list_trial_components():
#             tc = TrialComponent.load(
#                 trial_component_name=trial_component_summary.trial_component_name)
#             trial.remove_trial_component(tc)
#             try:
#                 # comment out to keep trial components
#                 tc.delete()
#             except:
#                 # tc is associated with another trial
#                 continue
#             # to prevent throttling
#             time.sleep(.5)
#         trial.delete()
#         experiment_name = experiment.experiment_name
#     experiment.delete()
#     print(f"\nExperiment {experiment_name} deleted")

In [None]:
# cleanup_sme_sdk('cifar-10-dataset-experiment')

# Deploy

In [24]:
predictor = estimator_1.deploy(initial_instance_count=1,
                       instance_type='ml.t2.medium',
                       endpoint_name=f'tensorflow-cv')
print("\nSuccessfully deployed...")

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


----!

In [31]:
%matplotlib inline
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing import image
from IPython.display import Image
import matplotlib.image as mpimg 
import matplotlib.pyplot as plt
import numpy as np
CIFAR10_LABELS = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [32]:
img = load_img('./data/cifar_10/raw_images/airplane.png', target_size=(32, 32))
data = img_to_array(img)
data = data.astype('float32')
data = data / 255.0
data = data.reshape(1, 32, 32, 3)

In [33]:
payload = {'instances': data}

In [36]:
resp = predictor.predict(payload)
predicted_label = CIFAR10_LABELS[np.argmax(resp)]
print(f'Predicted Label: [{predicted_label}]')

Predicted Label: [airplane]
