### TensorFlow Serving Warmup

The warmup data is read by TensorFlow Serving on startup to prime the model before it accepts user requests. This keeps the first user request as fast as subsequent ones. Use a set of realistic inputs for the warmup.

In [None]:
# The model storage is mounted in this Pod as well, so we can write files there directly from Jupyter
!ls /models

In [None]:
!pip3 install tensorflow-serving-api==1.14.*

In [None]:
from __future__ import print_function

import os
import sys
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_log_pb2

In [None]:
# Directory the warmup assets are written under (joined with 'assets.extra'
# when the warmup file is created).
# NOTE(review): an earlier comment said this writes to a temp location rather
# than the production folder directly -- confirm this path is the intended
# target before running.
model_dir = "/models/1558M/1597347193/"

# Copied into request.model_spec.name of every warmup request.
# NOTE(review): the original author was unsure whether this value matters for
# warmup -- confirm against the TensorFlow Serving documentation.
model_name = "gpt-pvc"

# Example warmup inputs, one integer sequence per request (presumably token
# ids -- TODO confirm).
# NOTE(review): earlier notes describe "two examples grabbed from production
# requests" with a 14s warmup; there are three entries here, so the timing
# should be re-measured.
warmup_contexts = [
    [200, 201, 202],
    [300, 301, 302],
    [400, 401, 402],
]

In [None]:
# The warmup records live in an 'assets.extra' directory inside the model
# directory; create that directory on first use.
assets_dir = os.path.join(model_dir, 'assets.extra')
if not os.path.exists(assets_dir):
    os.mkdir(assets_dir)

warmup_file = os.path.join(assets_dir, 'tf_serving_warmup_requests')

# Write one serialized PredictionLog record per warmup context.
with tf.io.TFRecordWriter(warmup_file) as writer:
    for context in warmup_contexts:
        # Shape the sequence as a single-row batch: [1, len(context)].
        context_tensor = tf.make_tensor_proto(
            context, shape=[1, len(context)])

        # Build the inference request against the 'predict' signature.
        request = predict_pb2.PredictRequest()
        request.model_spec.name = model_name
        request.model_spec.signature_name = 'predict'

        # TODO: add some variability for the request options as well.
        request.inputs['context'].CopyFrom(context_tensor)

        # Wrap the request in the PredictLog / PredictionLog envelope before
        # serializing it into the TFRecord file.
        predict_log = prediction_log_pb2.PredictLog(request=request)
        prediction_log = prediction_log_pb2.PredictionLog(
            predict_log=predict_log)
        writer.write(prediction_log.SerializeToString())

print("Created the file '%s'" % warmup_file)