# Custom Training with TensorFlow in Sagemaker

# Download Data

In [5]:
%matplotlib inline

import os
import tarfile
import urllib
import shutil
import json
import random
import numpy as np
import tensorflow as tf
import sagemaker

from PIL import Image
from matplotlib import pyplot as plt

urls = ['http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz',
        'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz']

print('Libraries imported')

print(tf.version)

Libraries imported
<module 'tensorflow._api.v2.version' from '/usr/local/lib/python3.8/site-packages/tensorflow/_api/v2/version/__init__.py'>


In [6]:
def download_and_extract(data_dir, download_dir):
    for url in urls:
        target_file = url.split('/')[-1]
        if target_file not in os.listdir(download_dir):
            print('Downloading', url)
            urllib.request.urlretrieve(url, os.path.join(download_dir, target_file))
            tf = tarfile.open(url.split('/')[-1])
            tf.extractall(data_dir)
        else:
            print('Already downloaded', url)

def get_annotations(file_path, annotations={}):
    
    with open(file_path, 'r') as f:
        rows = f.read().splitlines()

    for i, row in enumerate(rows):
        image_name, _, _, _ = row.split(' ')
        class_name = image_name.split('_')[:-1]
        class_name = '_'.join(class_name)
        image_name = image_name + '.jpg'
        
        annotations[image_name] = 'cat' if class_name[0] != class_name[0].lower() else 'dog'
    
    return annotations

In [7]:
if not os.path.isdir('data'):
    os.mkdir('data')

download_and_extract('data', '.')

Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz


# Dataset for Training

In [8]:
annotations = get_annotations('data/annotations/trainval.txt')
annotations = get_annotations('data/annotations/test.txt', annotations)

total_count = len(annotations.keys())
print('Total examples', total_count)

Total examples 7349


In [9]:
next(iter(annotations.items()))

('Abyssinian_100.jpg', 'cat')

In [10]:
classes = ['cat', 'dog']
sets = ['train', 'validation']
root_dir = 'custom_data'

if not os.path.isdir(root_dir):
    os.mkdir(root_dir)
    
for set_name in sets:
    if not os.path.isdir(os.path.join(root_dir, set_name)):
        os.mkdir(os.path.join(root_dir, set_name))
    for class_name in classes:
        folder = os.path.join(root_dir, set_name, class_name)
        if not os.path.isdir(folder):
            os.mkdir(folder)

Copy the files to correct set/ class folders

In [11]:
for image, class_name in annotations.items():
    target_set = 'validation' if random.randint(0, 99) < 20 else 'train'
    target_path = os.path.join(root_dir, target_set, class_name, image)
    shutil.copy(os.path.join('data/images/', image), target_path)

In [12]:
sets_counts = {
    'train': 0,
    'validation': 0
}

for set_name in sets:
    for class_name in classes:
        path = os.path.join(root_dir, set_name, class_name)
        count = len(os.listdir(path))
        print(path, 'has', count, 'images')
        sets_counts[set_name] += count

print(sets_counts)

custom_data/train/cat has 1890 images
custom_data/train/dog has 3965 images
custom_data/validation/cat has 481 images
custom_data/validation/dog has 1013 images
{'train': 5855, 'validation': 1494}


# Training Script - Create Model

In [13]:
%%writefile train.py

import tensorflow as tf
import argparse
import os
import json

def create_model():
    model = tf.keras.Sequential([
        tf.keras.applications.mobilenet_v2.MobileNetV2(include_top = False,
                                                      pooling = 'average',
                                                      input_shape = (128, 128, 3)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])
    model.layers[0].trainable = False
    model.compile(
    loss = 'binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
    return model

Writing train.py


# Training Script - Data Generators

In [14]:
%%writefile -a train.py
def create_data_generators(root_dir, batch_size):
    train_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function = tf.keras.applications.mobilenet_v2.preprocess_input,
        horizontal_flip = True,
        zoom_range=[0.8,1.2],
        rotation_range = 20
    ).flow_from_directory(
        os.path.join(root_dir, 'train')
        target_size=(128,128),
        batch_size = batch_size,
        class_mode='binary'
    )
    val_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function = tf.keras.applications.mobilenet_v2.preprocess_input,

    ).flow_from_directory(
        os.path.join(root_dir, 'validation')
        target_size=(128,128),
        batch_size = batch_size,
        class_mode='binary'
    )
    return train_data_generator, val_data_generator

Appending to train.py


# Training Script - Putting it Together

In [15]:
%%writefile -a train.py
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--epochs', type = int, default=3)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--steps', type=int, default=int(np.ceil(train_gen.n / float(batch_size)))
    parser.add_argument('--val_steps', type=int, default=int(np.ceil(val_gen.n / float(batch_size)))
    
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm_model_dir', type=str, default = os.environ.get('SM_MODEL_DIR'))
    
    parser.add_argument('--train', type=str, default = os.environ.get('SM_CHANNEL_TRAINING'))
    
    args, _ = parser.parse_known_args()
    local_output_dur = args.sm_model_dir
    local_root_dir = args.train
    batch_size = args.batch_size
    
    model = create_model()
    train_gen, val_gen = create_data_generators(local_root_dir, batch_size)
    
    _ = model.fit(
        train_gen,
        epochs=args.epochs,
        steps_per_epoch=args.steps,
        validation_data=val_gen,
        validation_steps=args.val_steps
    )
    
    model.save(os.path.join(local_output_dir, 'coursera_proj_model', '1'))

Appending to train.py


# Upload Dataset to S3

In [None]:
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket_name = "swifty-datasets"

print("Uploading to S3")
s3_data_path = sess.upload_data(path=root_dir, bucket=bucket_name, key_prefix='coursera_tf_data')
print("Uploaded to", s3_data_path)

Uploading to S3


### Train with TensorFlow Estimator

In [38]:
from sagemaker.tensorflow import TensorFlow

pets_estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    py_version='py38',
    framework_version = '2.6.0',
    output_path='s3://swifty-ai-models/other_models/coursera_tf_sm/'
    
)

In [None]:
pets_estimator.fit(s3_data_path)

2022-04-07 18:36:21 Starting - Starting the training job...
2022-04-07 18:36:50 Starting - Preparing the instances for trainingProfilerReport-1649356581: InProgress
.........
2022-04-07 18:38:13 Downloading - Downloading input data...

# Deploy TensorFlow Model
We are going to create a new instance for inference. This takes time because a new instance has to be served.

In [None]:
pets_predictor = pets_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
print('\nModel is deployed')

# Final Predictions

In [None]:
cat_dir = 'custom_data/validation/cat/'
cat_images = [os.path.join(cat_dir, x) for x in os.listdir(cat_dir)]
print(cat_images[0])

dog_dir = 'custom_data/validation/dog/'
dog_images = [os.path.join(dog_dir, x) for x in os.listdir(dog_dir)]
print(dog_images[0])

In [None]:
def get_pred(image_path):
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(128, 128))
    img = tf.keras.preprocessing.image.img_to_array(img)
    img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    results = pets_predictor.predict(img)
    return results

In [None]:
image_path = cat_images[0]
results = get_pred(image_path)

print(results)

In [None]:
class_id = int(np.squeeze(results['predictions']) > 0.5)
print('Predicted class_id:', class_id, 'with class_name:', classes[class_id])

# Delete Model Endpoint

In [None]:
sagemaker_session.delete_endpoint(pets_predictor.endpoint)