# Custom Training with TensorFlow in SageMaker

We will use TensorFlow in SageMaker to build a custom training model. I learned this project from a "Guided Project" on Coursera.

First, create an AWS account and go to SageMaker; then create a notebook instance and upload this notebook.

In [1]:
pip install sagemaker==1.0.0

Collecting sagemaker==1.0.0
  Downloading sagemaker-1.0.0.tar.gz (120 kB)
[K     |████████████████████████████████| 120 kB 3.0 MB/s 
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sagemaker: filename=sagemaker-1.0.0-py2.py3-none-any.whl size=107061 sha256=c0329abc06bf367c442e9d64570436e9e6002d44deb04951cdfcd9368ca0da96
  Stored in directory: /root/.cache/pip/wheels/ea/36/5e/8f37bfa88bcebb122be905a71515c1a46b0683dff24858a111
Successfully built sagemaker
Installing collected packages: sagemaker
Successfully installed sagemaker-1.0.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
%matplotlib inline

import os
import tarfile
import urllib
import shutil
import json
import random
import numpy as np
import tensorflow as tf
import sagemaker

from PIL import Image
from matplotlib import pyplot as plt

# Archives fetched by download_and_extract(): the pet images and their annotation files.
urls = ['http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz',
        'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz']

print('Libraries imported')

Libraries imported


In [3]:
def download_and_extract(data_dir, download_dir):
    """Download every archive listed in the module-level ``urls`` and unpack it.

    Args:
        data_dir: Directory the archive contents are extracted into.
        download_dir: Directory where the downloaded archives are stored; an
            archive whose file name is already present there is skipped
            (neither downloaded nor extracted again).
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    for url in urls:
        target_file = url.split('/')[-1]
        if target_file not in os.listdir(download_dir):
            print('Downloading', url)
            archive_path = os.path.join(download_dir, target_file)
            urllib.request.urlretrieve(url, archive_path)
            # Open the archive where it was saved (not relative to the current
            # working directory) and close the handle once extraction is done.
            with tarfile.open(archive_path) as archive:
                archive.extractall(data_dir)
        else:
            print('Already downloaded', url)

def get_annotations(file_path, annotations=None):
    """Parse an annotation list file into an ``{image file name: label}`` mapping.

    Each non-blank row must hold four space-separated fields; only the first
    (the image name without extension) is used. The label is ``'cat'`` when the
    name starts with an upper-case letter and ``'dog'`` otherwise.

    Args:
        file_path: Path of the annotation text file to read.
        annotations: Optional dict to add the parsed entries to. When omitted,
            a new dict is created for this call.

    Returns:
        The dict that was filled in (the same object as ``annotations`` when
        one was passed).

    Raises:
        ValueError: If a non-blank row does not have exactly four fields.
    """
    # A `{}` default would be shared between calls and silently accumulate
    # entries from earlier files, so create the dict per call instead.
    if annotations is None:
        annotations = {}

    with open(file_path, 'r') as f:
        rows = f.read().splitlines()

    for row in rows:
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        image_name, _, _, _ = row.split(' ')
        # Everything before the trailing "_<number>" is the class (breed) name.
        class_name = '_'.join(image_name.split('_')[:-1])
        image_name = image_name + '.jpg'

        annotations[image_name] = 'cat' if class_name[0] != class_name[0].lower() else 'dog'

    return annotations

In [4]:
# Make sure the extraction target exists, then fetch and unpack the archives
# (downloads are stored in the current directory).
os.makedirs('data', exist_ok=True)

download_and_extract(data_dir='data', download_dir='.')

Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz


# Dataset for Training

In [5]:
# Merge the train/validation and test lists into one mapping: the second call
# adds its entries to the dict returned by the first.
trainval_annotations = get_annotations('data/annotations/trainval.txt')
annotations = get_annotations('data/annotations/test.txt', trainval_annotations)

total_count = len(annotations)
print('Total examples', total_count)

Total examples 7349


In [6]:
# Peek at one (image file name, label) pair to sanity-check the parsed annotations.
next(iter(annotations.items()))

('Abyssinian_100.jpg', 'cat')

In [7]:
classes = ['cat', 'dog']
sets = ['train', 'validation']
root_dir = 'custom_data'

# Build the <root_dir>/<set>/<class> folder tree. makedirs creates any missing
# parent folders, and exist_ok keeps the cell safe to re-run.
for set_name in sets:
    for class_name in classes:
        os.makedirs(os.path.join(root_dir, set_name, class_name), exist_ok=True)

Copy the files to the correct set/class folders.

In [8]:
# Use a dedicated, seeded generator so the ~80/20 train/validation split is the
# same on every run of the notebook.
split_rng = random.Random(42)

for image, class_name in annotations.items():
    target_set = 'validation' if split_rng.randint(0, 99) < 20 else 'train'
    other_set = 'train' if target_set == 'validation' else 'validation'

    # Remove a copy left in the other set by an earlier run of this cell, so an
    # image can never end up in both train and validation.
    stale_path = os.path.join(root_dir, other_set, class_name, image)
    if os.path.exists(stale_path):
        os.remove(stale_path)

    target_path = os.path.join(root_dir, target_set, class_name, image)
    shutil.copy(os.path.join('data/images/', image), target_path)

In [9]:
# Count the images copied into every <set>/<class> folder and total them per set.
sets_counts = dict.fromkeys(('train', 'validation'), 0)

for set_name in sets:
    for class_name in classes:
        path = os.path.join(root_dir, set_name, class_name)
        count = len(os.listdir(path))
        print(path, 'has', count, 'images')
        sets_counts[set_name] = sets_counts[set_name] + count

print(sets_counts)

custom_data/train/cat has 1911 images
custom_data/train/dog has 3942 images
custom_data/validation/cat has 460 images
custom_data/validation/dog has 1036 images
{'train': 5853, 'validation': 1496}


# Training Script - Create Model

In [10]:
%%writefile train.py

import tensorflow as tf
import argparse
import os
import json

def create_model():
    """Build and compile the binary image classifier.

    The model is a MobileNetV2 backbone (ImageNet weights, no top, average
    pooling, 128x128x3 input) whose weights are frozen, followed by dropout
    and a single sigmoid output unit.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    backbone = tf.keras.applications.mobilenet_v2.MobileNetV2(
        include_top=False,
        weights='imagenet',
        pooling='avg',
        input_shape=(128, 128, 3),
    )
    # Freeze the backbone so only the new output layer is trained.
    backbone.trainable = False

    head = [
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    classifier = tf.keras.models.Sequential([backbone] + head)

    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


Writing train.py


# Training Script - Data Generators

In [11]:
%%writefile -a train.py

def create_data_generators(root_dir, batch_size):
    """Create the training and validation image iterators.

    Both iterators read 128x128 images with binary labels and apply the
    MobileNetV2 ``preprocess_input`` function; only the training iterator also
    applies augmentation (horizontal flip, zoom, rotation).

    Args:
        root_dir: Directory containing the ``train`` and ``validation`` folders.
        batch_size: Number of images per batch for both iterators.

    Returns:
        A ``(train_data_generator, val_data_generator)`` tuple.
    """
    keras_image = tf.keras.preprocessing.image
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    # Options shared by both directory iterators.
    flow_options = dict(
        target_size=(128, 128),
        batch_size=batch_size,
        class_mode='binary',
    )

    augmenting_loader = keras_image.ImageDataGenerator(
        preprocessing_function=preprocess,
        horizontal_flip=True,
        zoom_range=[0.8, 1.2],
        rotation_range=20,
    )
    plain_loader = keras_image.ImageDataGenerator(preprocessing_function=preprocess)

    train_data_generator = augmenting_loader.flow_from_directory(
        os.path.join(root_dir, 'train'), **flow_options
    )
    val_data_generator = plain_loader.flow_from_directory(
        os.path.join(root_dir, 'validation'), **flow_options
    )

    return train_data_generator, val_data_generator


Appending to train.py


# Training Script - Putting it Together

In [12]:
%%writefile -a train.py

if __name__ =='__main__':
    # Script entry point: parse the arguments, train the model, save it.

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--batch_size', type=int, default=16)
    # Step counts are optional: when omitted they are derived below from the
    # images actually found and the chosen batch size. Hard-coded counts would
    # go stale because the dataset split is random, and a non-string default
    # such as 1476/16 is not converted by type=int (it would stay a float).
    parser.add_argument('--steps', type=int, default=None)
    parser.add_argument('--val_steps', type=int, default=None)

    # input data and model directories
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))

    # Unknown extra arguments are ignored rather than treated as an error.
    args, _ = parser.parse_known_args()

    local_output_dir = args.sm_model_dir
    local_root_dir = args.train
    batch_size = args.batch_size

    model = create_model()
    train_gen, val_gen = create_data_generators(local_root_dir, batch_size)

    # Whole batches available per epoch (at least one step).
    steps = args.steps if args.steps is not None else max(1, train_gen.samples // batch_size)
    val_steps = args.val_steps if args.val_steps is not None else max(1, val_gen.samples // batch_size)

    _ = model.fit(
        train_gen,
        epochs=args.epochs,
        steps_per_epoch=steps,
        validation_data=val_gen,
        validation_steps=val_steps
    )

    # Saved under <output dir>/model/1.
    model.save(os.path.join(local_output_dir, 'model', '1'))
    

Appending to train.py


# Upload Dataset to S3

**Now set up a bucket in S3 (for example, `petscustom`, the name used in the code below) and upload your dataset to the cloud.** Then execute the rest of the code.

In [13]:
# Create the SageMaker session and look up the execution role used by later cells.
# NOTE: the recorded run of this cell failed with "ValueError: Must setup local AWS
# configuration with a region supported by SageMaker."
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
# Bucket that receives the dataset; the estimator cell also writes to 'petscustom'.
bucket_name = 'petscustom'

print('Uploading to S3..')
# Upload the whole custom_data folder tree (root_dir) under the 'data' key prefix.
s3_data_path = sagemaker_session.upload_data(path=root_dir, bucket=bucket_name, key_prefix='data')

print('Uploaded to', s3_data_path)

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

# Train with TensorFlow Estimator

In [14]:
from sagemaker.tensorflow import TensorFlow

# Estimator that runs train.py on a single ml.p2.xlarge instance.
pets_estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    framework_version='2.1.0',
    py_version='py3',
    # Reuse bucket_name so the output lands in the same bucket the dataset was
    # uploaded to, instead of repeating the bucket name as a literal.
    output_path='s3://{}/'.format(bucket_name)
)

NameError: name 'role' is not defined

In [15]:
# Start training on the uploaded dataset; train.py takes its data directory
# from the SM_CHANNEL_TRAINING environment variable by default.
pets_estimator.fit(s3_data_path)

NameError: name 'pets_estimator' is not defined

# Deploy TensorFlow Model

In [16]:
# Deploy the trained model to one ml.m4.xlarge instance; the returned predictor
# is used by get_pred() and by the final clean-up cell.
pets_predictor = pets_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
print('\nModel Deployed!')

NameError: name 'pets_estimator' is not defined

# Final Predictions

In [17]:
# Collect the paths of the validation images for each class and show the first of each.
cat_dir = 'custom_data/validation/cat/'
cat_images = []
for file_name in os.listdir(cat_dir):
    cat_images.append(os.path.join(cat_dir, file_name))
print(cat_images[0])

dog_dir = 'custom_data/validation/dog/'
dog_images = []
for file_name in os.listdir(dog_dir):
    dog_images.append(os.path.join(dog_dir, file_name))
print(dog_images[0])

custom_data/validation/cat/Bombay_110.jpg
custom_data/validation/dog/american_bulldog_93.jpg


In [18]:
def get_pred(image_path):
    """Send one image to the deployed predictor and return its response.

    The image is loaded at 128x128, converted to an array, passed through the
    MobileNetV2 ``preprocess_input`` function and given a leading batch axis
    before being sent to ``pets_predictor``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        Whatever ``pets_predictor.predict`` returns for the one-image batch.
    """
    keras_image = tf.keras.preprocessing.image

    pixels = keras_image.img_to_array(
        keras_image.load_img(image_path, target_size=(128, 128))
    )
    scaled = tf.keras.applications.mobilenet_v2.preprocess_input(pixels)
    batch = np.expand_dims(scaled, axis=0)

    return pets_predictor.predict(batch)

In [19]:
# Run the first validation cat image through the deployed endpoint.
image_path = cat_images[0]
results = get_pred(image_path)

print(results)

NameError: name 'pets_predictor' is not defined

In [20]:
# Threshold the single model output at 0.5 and use the result as an index
# into `classes` (0 -> 'cat', 1 -> 'dog').
score = np.squeeze(results['predictions'])
class_id = int(score > 0.5)
print('Predicted class_id:', class_id, 'with class_name:', classes[class_id])

NameError: name 'results' is not defined

# Delete Model Endpoint

In [21]:
# Delete the endpoint created by the deploy cell once predictions are done.
sagemaker_session.delete_endpoint(pets_predictor.endpoint)

NameError: name 'sagemaker_session' is not defined