# Model Serving and Optimization

Adapted from [tensorflow estimator tutorials - model optimization](https://github.com/GoogleCloudPlatform/tf-estimator-tutorials/blob/master/00_Miscellaneous/model_optimisation/Tutorial%20-%20TensorFlow%20Model%20Optimisation%20for%20Serving%20-%20MNIST%20with%20Keras.ipynb).

This notebook shows how to optimize the TensorFlow exported SavedModel by **shrinking** its size (to have smaller memory and disk footprints), and **improving** prediction latency. This can be accomplished by applying the following:
* **Freezing**: That is, converting the variables stored in a checkpoint file of the SavedModel into constants stored directly in the model graph.
* **Pruning**: That is, stripping unused nodes during the prediction path of the graph, merging duplicate nodes, as well as removing other node ops like summary, identity, etc.
* **Quantization**:  That is, converting any large float Const op into an eight-bit equivalent, followed by a float conversion op so that the result is usable by subsequent nodes.
* **Other refinements**: That includes constant folding, batch_norm folding, fusing convolution, etc.

The optimization operations we apply in this example are from the TensorFlow [Graph Transform Tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md#fold_constants), which is a C++ command-line tool. We use the Python APIs to call the C++ libraries.

The Graph Transform Tool is designed to work on models that are saved as GraphDef files, usually in a binary protobuf format. However, the model exported after training an estimator is in SavedModel format (saved_model.pb file + variables folder with variables.data-* and variables.index files).

We need to optimize the model and keep it in the SavedModel format. Thus, the optimisation steps will be:
1. Freeze the SavedModel: SavedModel -> GraphDef
2. Optimize the frozen model: GraphDef -> GraphDef
3. Convert the optimised frozen model to SavedModel: GraphDef -> SavedModel

# Gathering and Visualizing the Data

Before we can build our model let's gather the data and spot check the first few examples in the training set.

In [None]:
%matplotlib inline
import os
import numpy as np
from datetime import datetime
from lib.utils import show_images
import tensorflow as tf

# Load MNIST through the Keras datasets helper. It returns two (images, labels)
# pairs; the second pair is used as the evaluation set throughout this notebook.
(train_data, train_labels), (eval_data, eval_labels) = tf.keras.datasets.mnist.load_data()
# Number of digit classes; sizes the final `logits` Dense layer of the model.
NUM_CLASSES = 10
 
# Spot check the first 12 training images. `show_images` comes from lib.utils;
# presumably `cols` is the number of grid columns -- confirm against the helper.
show_images(train_data[:12], cols=3)

# Define the Model

The model for this vision task of classifying digits will be a [convolutional neural network](https://en.wikipedia.org/wiki/Convolutional_neural_network). Since we are more concerned with optimizing the model for performance, we won't worry too much about the accuracy and loss metrics at the end of training.

In [None]:
def keras_model_fn(params):
    """Build the (uncompiled) Keras CNN used to classify MNIST digits.

    Args:
        params: hyper-parameter object exposing ``num_conv_layers``,
            ``init_filters``, ``hidden_units`` (an iterable of dense layer
            widths) and ``dropout``.

    Returns:
        A ``tf.keras.models.Model`` mapping the ``input_image`` input
        (shape 28x28) to the ``softmax`` activation over ``NUM_CLASSES``
        units.
    """
    inputs = tf.keras.layers.Input(shape=(28, 28), name='input_image')
    # Add a trailing single-channel axis before the convolutional layers.
    net = tf.keras.layers.Reshape(target_shape=(28, 28, 1), name='reshape')(inputs)

    # Convolutional stack: conv -> max-pool -> batch-norm per block, with the
    # filter count doubling at every block (init_filters * 2**block).
    for block in range(params.num_conv_layers):
        net = tf.keras.layers.Conv2D(
            kernel_size=3,
            filters=params.init_filters * (2 ** block),
            strides=1,
            padding='SAME',
            activation='relu',
        )(net)
        net = tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='SAME')(net)
        net = tf.keras.layers.BatchNormalization()(net)

    net = tf.keras.layers.Flatten(name='flatten')(net)

    # Fully-connected stack: iterate over the widths directly instead of
    # indexing through range(len(...)).
    for units in params.hidden_units:
        net = tf.keras.layers.Dense(units=units, activation='relu')(net)
        net = tf.keras.layers.Dropout(params.dropout)(net)

    # Softmax classifier. The layer names are relied upon elsewhere in the
    # notebook (the frozen graph's output node is "softmax/Softmax").
    logits = tf.keras.layers.Dense(units=NUM_CLASSES, name='logits')(net)
    softmax = tf.keras.layers.Activation('softmax', name='softmax')(logits)

    return tf.keras.models.Model(inputs, softmax)

def create_estimator(params, run_config):
    """Compile the Keras MNIST model and wrap it in a TensorFlow Estimator.

    Args:
        params: hyper-parameter object; ``learning_rate`` is read here and
            the whole object is forwarded to ``keras_model_fn``.
        run_config: ``tf.estimator.RunConfig`` handed to the estimator.

    Returns:
        The estimator returned by ``tf.keras.estimator.model_to_estimator``.
    """
    keras_model = keras_model_fn(params)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    keras_model.summary()

    optimizer = tf.keras.optimizers.Adam(lr=params.learning_rate)
    # Pass the configured optimizer instance. The string 'adam' would create a
    # default-configured Adam and silently ignore params.learning_rate.
    keras_model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    mnist_classifier = tf.keras.estimator.model_to_estimator(
        keras_model=keras_model,
        config=run_config
    )

    return mnist_classifier

# Setup Training and Evaluation Configuration

In [None]:
def run_experiment(params, run_config):
    """Train and evaluate the MNIST estimator and report the elapsed time.

    The module-level ``train_data``/``train_labels`` and
    ``eval_data``/``eval_labels`` arrays are fed through numpy input
    functions; labels get an extra trailing axis via ``np.newaxis``.

    Args:
        params: hyper-parameter object providing ``batch_size``,
            ``max_training_steps`` and ``eval_throttle_secs`` (plus whatever
            ``create_estimator`` reads).
        run_config: ``tf.estimator.RunConfig`` forwarded to
            ``create_estimator``.

    Returns:
        The estimator after ``tf.estimator.train_and_evaluate`` has run.
    """
    # Training input: shuffled, repeated indefinitely (num_epochs=None);
    # the stopping point is max_steps on the TrainSpec.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"input_image": train_data},
        y=train_labels[:, np.newaxis],
        batch_size=params.batch_size,
        num_epochs=None,
        shuffle=True,
    )
    # Evaluation input: a single ordered pass over the evaluation arrays.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"input_image": eval_data},
        y=eval_labels[:, np.newaxis],
        batch_size=params.batch_size,
        num_epochs=1,
        shuffle=False,
    )

    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=params.max_training_steps,
    )
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=None,
        throttle_secs=params.eval_throttle_secs,
    )

    tf.logging.set_verbosity(tf.logging.INFO)

    started_at = datetime.utcnow()
    print("Experiment started at {}".format(started_at.strftime("%H:%M:%S")))
    print(".......................................")

    estimator = create_estimator(params, run_config)
    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec,
    )

    finished_at = datetime.utcnow()
    print(".......................................")
    print("Experiment finished at {}".format(finished_at.strftime("%H:%M:%S")))
    print("")
    elapsed_seconds = (finished_at - started_at).total_seconds()
    print("Experiment elapsed time: {} seconds".format(elapsed_seconds))

    return estimator

# Define Hyper Parameters

In [None]:
# Root directory for training artifacts. The %%bash cells in this notebook
# hard-code the resulting /saved_models/mnist path, so keep them in sync.
MODELS_LOCATION = '/saved_models/'
MODEL_NAME = 'mnist'
model_dir = os.path.join(MODELS_LOCATION, MODEL_NAME)

# Hyper-parameters read by keras_model_fn, create_estimator and run_experiment.
params  = tf.contrib.training.HParams(
    # Batch size of both the training and evaluation numpy input functions.
    batch_size=100,
    # One Dense(relu) + Dropout pair is created per entry.
    hidden_units=[512, 512],
    # Number of conv/max-pool/batch-norm blocks.
    num_conv_layers=3, 
    # Filters in the first conv block; doubled for each following block.
    init_filters=64,
    # Rate passed to every Dropout layer.
    dropout=0.2,
    # max_steps of the TrainSpec.
    max_training_steps=50,
    # throttle_secs of the EvalSpec.
    eval_throttle_secs=10,
    # Learning rate handed to the Adam optimizer built in create_estimator.
    learning_rate=1e-3,
    # NOTE(review): not read by any code visible in this notebook.
    debug=True
)

# Estimator run configuration; model_dir is the directory defined above.
run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=3,
    model_dir=model_dir
)

# Train and Evaluate the Model

In [None]:
# Start from a clean model directory: delete whatever a previous run left in
# model_dir, then recreate it empty before training.
if tf.gfile.Exists(model_dir):
    print("Removing previous artifacts...")
    tf.gfile.DeleteRecursively(model_dir)

os.makedirs(model_dir)

# Train + evaluate; the returned estimator is exported in a later cell.
estimator = run_experiment(params, run_config)

# Export the Model

In [None]:
def make_serving_input_receiver_fn():
    """Build the serving input receiver function used when exporting.

    Declares a single float32 ``input_image`` feature of shape
    ``[None, 28, 28]`` (the key matches the Keras model's input layer name)
    and wraps it with ``build_raw_serving_input_receiver_fn``.

    Returns:
        The callable produced by
        ``tf.estimator.export.build_raw_serving_input_receiver_fn``.
    """
    image_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=[None, 28, 28],
        name='serving_input_image',
    )
    features = {'input_image': image_placeholder}
    return tf.estimator.export.build_raw_serving_input_receiver_fn(features)

# Exports live in an `export` sub-directory of the model directory.
export_dir = os.path.join(model_dir, 'export')

# Remove earlier exports so the directory only holds the model exported below.
if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
        
# Note that the factory is *called* here: export_savedmodel receives the
# receiver function it returns, not the factory itself.
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)

# Inspect the Exported SavedModel

In [None]:
%%bash

# Directory the estimator export was written to (model_dir/export in the Python cells).
saved_models_base=/saved_models/mnist/export/
# Pick the last entry that `ls` lists under the export base.
# NOTE(review): presumably export sub-directories are timestamp-named so this is the newest -- confirm.
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
# Print everything saved_model_cli knows about the SavedModel (--all).
saved_model_cli show --dir=${saved_model_dir} --all

# Copy SavedModel to TensorFlow Serving Path

TensorFlow Serving reads models from a specified directory when the TensorFlow Serving process is launched. Below we copy the saved model to a directory we can serve models from. The folder hierarchy is as follows:
```
top level dir/
    model_name/
       version_number/
            saved_model.pb
```

In [None]:
%%bash
# Locate the exported SavedModel: the last entry `ls` lists under the export base.
saved_models_base=/saved_models/mnist/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)

# Publish it as version 1 of the `mnist` model: /serving/<model_name>/<version>/.
mkdir -p /serving/mnist/1
cp -r ${saved_model_dir}/* /serving/mnist/1

# Serve MNIST Model
## Run TensorFlow Serving

To create a TensorFlow Serving instance:
1. Open a new terminal tab here in the Jupyter Lab UI.
2. Make sure to switch the shell to bash by running `bash`.
3. Use the following command to start the server.

```sh
tensorflow_model_server \
    --rest_api_port=8501 \
    --model_name=mnist \
    --model_base_path=/serving/mnist/ \
    --enable_batching
```

Now let's verify that the server is up and running via the curl command in the next cell.

In [None]:
# Ask the server started above (REST port 8501) about the `mnist` model to confirm it is up.
!curl http://localhost:8501/v1/models/mnist

# Describe the SavedModel Graph

Before we optimize our model for performance we should get a sense for the size and composition of the TensorFlow graph. Keep these numbers in mind as we take steps to decrease the size of our model and increase its performance.

In [None]:
from lib.utils import describe_graph, get_graph_def_from_saved_model, get_size

# Resolve the exported SavedModel directory. Export sub-directories have
# all-digit names; os.listdir returns entries in arbitrary order, so take the
# numerically largest name explicitly rather than whichever comes first. This
# also agrees with the `ls | tail -n 1` selection used in the %%bash cells.
saved_model_dir = os.path.join(
    export_dir,
    max((f for f in os.listdir(export_dir) if f.isdigit()), key=int))

# Summarise the graph stored in the SavedModel (helpers come from lib.utils).
describe_graph(get_graph_def_from_saved_model(saved_model_dir))

# Size of the SavedModel directory, as the baseline for the later comparisons.
get_size(saved_model_dir)

# Run Inference via REST Client

We are ready to start sending images to our model for inference. To begin let's send a single image to make sure everything is working as expected.

In [None]:
import requests

# Base URL of the TensorFlow Serving REST endpoint; the port matches the
# --rest_api_port=8501 flag used when starting the server.
SERVER_URL = "http://localhost:8501"

# Display the image we will classify as a test
show_images(eval_data[:1])

def predict(instances, version=1):
    """Send ``instances`` to the served ``mnist`` model and return class ids.

    Args:
        instances: JSON-serialisable list of inputs, sent as the
            ``instances`` field of the request body.
        version: model version to query under ``SERVER_URL``.

    Returns:
        A list with one ``np.argmax`` result per entry of the response's
        ``predictions`` field.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (via ``raise_for_status``).
    """
    url = "{}/v1/models/mnist/versions/{}:predict".format(SERVER_URL, version)
    reply = requests.post(url, json={'instances': instances})
    reply.raise_for_status()

    scores_per_instance = reply.json()["predictions"]
    # The predicted class is the index of the highest score.
    return list(map(np.argmax, scores_per_instance))

# Smoke test: classify the first evaluation image (converted to nested lists so it is JSON-serialisable).
print("Class Prediction: {}".format(predict([eval_data[0].tolist()])))

In [None]:
def benchmark_inference(version=1, batch=100, repeat=10):
    """Time repeated batch predictions against one served model version.

    Sends the first ``batch`` images of ``eval_data`` to ``predict``
    ``repeat`` times (after one untimed warm-up request) and prints the total
    elapsed time and the average latency per batch.

    Args:
        version: model version forwarded to ``predict``.
        batch: number of evaluation images per request; must be >= 1.
        repeat: number of timed requests; must be >= 1.

    Raises:
        ValueError: if ``batch`` or ``repeat`` is smaller than 1.
    """
    # Fail early with a clear message: repeat=0 would otherwise end in
    # len(None) / division by zero after the requests were already sent.
    if batch < 1 or repeat < 1:
        raise ValueError(
            "batch and repeat must be >= 1 (got batch={}, repeat={})".format(batch, repeat))

    instances = [eval_data[i].tolist() for i in range(batch)]

    # Warm-up request, kept outside the timed section.
    predict(instances[:1], version=version)
    print('Warm up request performed!')
    print('Timer started...')

    time_start = datetime.utcnow()
    output = None

    for _ in range(repeat):
        output = predict(instances, version=version)

    time_end = datetime.utcnow()

    time_elapsed_sec = (time_end - time_start).total_seconds()

    print("Inference elapsed time: {} seconds\n".format(time_elapsed_sec))

    print("Prediction produced for {} instances batch, repeated {} times".format(len(output), repeat))
    print("Average latency per batch: {} seconds".format(time_elapsed_sec/repeat))

    # The message announces the *last* instance, so index from the end
    # (the original printed output[0], i.e. the first instance).
    print("Prediction output for the last instance: {}".format(output[-1]))

# Baseline: benchmark with the defaults (version=1, i.e. the SavedModel copied to /serving/mnist/1).
benchmark_inference()

# Freeze SavedModel

This function converts the SavedModel into a GraphDef file (freezed_model.pb), storing the variables as constants inside freezed_model.pb.

You need to define the graph output nodes for freezing. We are only interested in the output of **softmax/Softmax** node

In [None]:
from lib.utils import get_graph_def_from_file

def freeze_graph(saved_model_dir):
    """Freeze the SavedModel in ``saved_model_dir`` into ``freezed_model.pb``.

    The frozen GraphDef is written next to the SavedModel files, inside
    ``saved_model_dir`` itself, and keeps ``softmax/Softmax`` as its only
    requested output node.

    Args:
        saved_model_dir: directory of the exported SavedModel (serving tag).
    """
    from tensorflow.python.saved_model import tag_constants
    # Alias the module so it is not confused with this function's own name.
    from tensorflow.python.tools import freeze_graph as freeze_graph_tool

    freeze_graph_tool.freeze_graph(
        input_saved_model_dir=saved_model_dir,
        output_graph=os.path.join(saved_model_dir, "freezed_model.pb"),
        saved_model_tags=tag_constants.SERVING,
        output_node_names="softmax/Softmax",
        initializer_nodes="",
        # Remaining arguments are passed as empty/disabled values;
        # presumably they only matter when freezing from a GraphDef plus
        # checkpoint rather than from a SavedModel -- confirm in the tool docs.
        input_graph=None,
        input_saver=False,
        input_binary=False,
        input_checkpoint=None,
        restore_op_name=None,
        filename_tensor_name=None,
        clear_devices=False,
        input_meta_graph=False,
    )

    print("SavedModel graph freezed!")
    
# Freeze the exported SavedModel; this writes freezed_model.pb into saved_model_dir.
freeze_graph(saved_model_dir)

# Path of the frozen GraphDef (reused by later cells), then summarise its graph.
freezed_filepath=os.path.join(saved_model_dir,'freezed_model.pb')
describe_graph(get_graph_def_from_file(freezed_filepath))

# Convert Freeze Graph to SavedModel

In order to serve the frozen graph we created, we need to convert it back into a SavedModel. You should notice the variable size go to 0 when we run get_size at the end of the cell.

In [None]:
def convert_graph_def_to_saved_model(graph_filepath, export_dir):
    """Re-package a GraphDef file as a SavedModel under ``export_dir``.

    Every ``Placeholder`` node of the graph becomes a named input of the
    serving signature (keyed by node name) and ``softmax/Softmax:0`` becomes
    the ``softmax`` output.

    Args:
        graph_filepath: path of the GraphDef file, read with
            ``get_graph_def_from_file``.
        export_dir: destination directory; any existing content is deleted
            first.
    """
    # Remove any previous export so the save starts from an empty location.
    if tf.gfile.Exists(export_dir):
        tf.gfile.DeleteRecursively(export_dir)

    graph_def = get_graph_def_from_file(graph_filepath)

    # Import into a fresh graph so nothing from the notebook's default graph
    # leaks into the SavedModel.
    with tf.Session(graph=tf.Graph()) as session:
        # name="" keeps the original node names (no import prefix), which the
        # tensor lookups below depend on.
        tf.import_graph_def(graph_def, name="")

        inputs = {
            node.name: session.graph.get_tensor_by_name("{}:0".format(node.name))
            for node in graph_def.node
            if node.op == 'Placeholder'
        }
        outputs = {
            "softmax": session.graph.get_tensor_by_name("softmax/Softmax:0"),
        }
        tf.saved_model.simple_save(session, export_dir, inputs=inputs, outputs=outputs)

        print("Optimised graph converted to SavedModel!")
        
# Publish the frozen graph as version 2 of the model, next to version 1 under /serving/mnist.
convert_graph_def_to_saved_model(freezed_filepath, "/serving/mnist/2")

# Size of the new SavedModel directory, for comparison with the original export.
get_size("/serving/mnist/2")

In [None]:
# Query the `mnist` model on the server again, now that /serving/mnist/2 exists.
# NOTE(review): presumably the running server picks up new version directories on its own -- confirm.
!curl http://localhost:8501/v1/models/mnist

# Benchmark the Frozen Model

Be sure to compare the performance here to the performance of the first version of the model.

In [None]:
# Same benchmark, now against version 2 (the frozen graph written to /serving/mnist/2).
benchmark_inference(version=2)

# Optimize the freezed_model.pb

In [None]:
def optimize_graph(model_dir, graph_filename, transforms):
    """Apply Graph Transform Tool transforms and save ``optimised_model.pb``.

    Args:
        model_dir: directory holding the input graph; the optimised graph is
            written to the same directory.
        graph_filename: file name of the GraphDef inside ``model_dir``.
        transforms: list of transform specifications passed through to
            ``TransformGraph``.
    """
    from tensorflow.tools.graph_transforms import TransformGraph

    source_path = os.path.join(model_dir, graph_filename)
    frozen_graph_def = get_graph_def_from_file(source_path)

    # No input names are given; the only output of interest is the softmax node.
    optimised_graph_def = TransformGraph(
        frozen_graph_def,
        [],
        ['softmax/Softmax'],
        transforms,
    )

    # Binary (non-text) protobuf, written alongside the input graph.
    tf.train.write_graph(
        optimised_graph_def,
        logdir=model_dir,
        name='optimised_model.pb',
        as_text=False,
    )

    print("Freezed graph optimised!")

In [None]:
# Transform specifications handed to TransformGraph (names, optionally with
# arguments in parentheses). They roughly correspond to the pruning,
# quantization and folding/fusing steps listed at the top of this notebook;
# see the Graph Transform Tool README for the exact semantics of each.
transforms = [
    'remove_nodes(op=Identity)', 
    'fold_constants(ignore_errors=true)',
    'fold_batch_norms',
    'fuse_resize_pad_and_conv',
    'quantize_weights',
    'quantize_nodes',
    'merge_duplicate_nodes',
    'strip_unused_nodes', 
    'sort_by_execution_order'
]

# Optimise the frozen graph; the result is written to optimised_model.pb in saved_model_dir.
optimize_graph(saved_model_dir, 'freezed_model.pb', transforms)

# Describe the Optimised Graph

In [None]:
# Summarise the optimised GraphDef written by optimize_graph.
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
describe_graph(get_graph_def_from_file(optimised_filepath))

# Convert Optimised graph (GraphDef) to SavedModel

In [None]:
# Recompute the path and describe the graph again so this cell can run without the previous one.
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
describe_graph(get_graph_def_from_file(optimised_filepath))

# Publish the optimised graph as version 3 of the served model and report its size.
convert_graph_def_to_saved_model(optimised_filepath, "/serving/mnist/3")
get_size("/serving/mnist/3")

# Benchmark the Optimized Model

In [None]:
# Same benchmark against version 3 (the optimised graph written to /serving/mnist/3).
benchmark_inference(version=3)