In [None]:
# Install the notebook's dependencies. `%pip` (instead of `!pip`) installs into the
# environment of the running kernel, so the import cell below sees these packages.
%pip install -Uq grpcio==1.26.0 transformers tensorflow_serving_api

In [None]:
import os
import requests
import tempfile
import json
import numpy as np
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc
import grpc
from transformers import TFBertForSequenceClassification, BertTokenizerFast, BertConfig

In [None]:
# Export the IMDB sentiment classifier as a TensorFlow SavedModel inside the temp directory.
MODEL_DIR = tempfile.gettempdir()
saved_model_dir = os.path.join(MODEL_DIR, "saved_model")
# Published through the environment so the shell cell that launches the server can read it.
os.environ["MODEL_DIR"] = saved_model_dir

model = TFBertForSequenceClassification.from_pretrained(
    "nateraw/bert-base-uncased-imdb",
    from_pt=True,  # the checkpoint holds PyTorch weights; convert them while loading
)
# saved_model=True makes save_pretrained write a SavedModel version of the model
# at the same time as the h5 weights.
model.save_pretrained(MODEL_DIR, saved_model=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


INFO:tensorflow:Assets written to: /tmp/saved_model/1/assets


INFO:tensorflow:Assets written to: /tmp/saved_model/1/assets


In [None]:
# Check that the SavedModel is properly formatted: print the `serving_default`
# signature (tag set `serve`) of version 1 to see its input and output tensors.
!saved_model_cli show --dir {MODEL_DIR}/saved_model/1 --tag_set serve --signature_def serving_default

The given SavedModel SignatureDef contains the following input(s):
  inputs['attention_mask'] tensor_info:
      dtype: DT_INT32
      shape: (-1, -1)
      name: serving_default_attention_mask:0
  inputs['input_ids'] tensor_info:
      dtype: DT_INT32
      shape: (-1, -1)
      name: serving_default_input_ids:0
  inputs['token_type_ids'] tensor_info:
      dtype: DT_INT32
      shape: (-1, -1)
      name: serving_default_token_type_ids:0
The given SavedModel SignatureDef contains the following output(s):
  outputs['logits'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 2)
      name: StatefulPartitionedCall:0
Method name is: tensorflow/serving/predict


In [None]:
# Install TensorFlow Serving from its apt repository:
#   1. register the repository and import its signing key,
#   2. refresh the package index,
#   3. install the server binary.
# `apt-get` is used throughout because it is the interface meant for scripted use, and
# `-y` pre-answers the confirmation prompt so the cell never blocks waiting for input.
!echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
!apt-get update
!apt-get install -y tensorflow-model-server

In [None]:
%%bash --bg
# NOTE: the `%%bash` cell magic has to stay on the very first line of the cell;
# with anything above it (even a comment) IPython does not treat it as a cell magic.
#
# Run TensorFlow Serving in the background, serving the SavedModel found under
# $MODEL_DIR as model "bert": gRPC on port 8500 (`--port` is the documented gRPC
# flag), REST on port 8501. Server stdout/stderr are redirected to server.log.
nohup tensorflow_model_server \
  --rest_api_port=8501 \
  --port=8500 \
  --model_name=bert \
  --model_base_path="${MODEL_DIR}" >server.log 2>&1

Starting job # 5 in a separate thread.


In [None]:
# Check that the server runs properly: show the last lines of server.log, the file
# the server cell redirects the server's stdout/stderr to.
!tail server.log

In [None]:
# Shared fixtures for the inference examples below: the text to classify, plus the
# tokenizer and the config loaded from the same checkpoint as the exported model.
checkpoint = "nateraw/bert-base-uncased-imdb"
sentence = "I love the new TensorFlow update in transformers."
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
config = BertConfig.from_pretrained(checkpoint)

In [None]:
# Run inference over the REST API.
# The tokenizer output is turned into a plain dict so it can be JSON-serialised. The
# row format of the predict API expects the list of examples under the key "instances".
encoded = dict(tokenizer(sentence))
input_data = {"instances": [encoded]}

# The server is started without any TLS configuration, so the endpoint is plain http://.
r = requests.post("http://localhost:8501/v1/models/bert:predict", data=json.dumps(input_data))
# Fail with the HTTP error itself rather than a KeyError on 'predictions' further down.
r.raise_for_status()

# 'predictions' holds one list of logits per instance. The predicted class is the index
# of the largest logit; taking the absolute value first would select a strongly
# *negative* logit, i.e. the wrong class.
result = r.json()['predictions'][0]
label_id = int(np.argmax(result))
print(config.id2label[label_id])

In [None]:
# Run an inference over the gRPC API.
# Tokenize the sentence, this time with TensorFlow tensors as output, already batched to size 1. Ex:
# {
#    'input_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[  101, 19082,   102]])>,
#    'token_type_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[0, 0, 0]])>,
#    'attention_mask': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 1, 1]])>
# }
batch = tokenizer(sentence, return_tensors='tf')

# Build the request: model name and signature must match what the server exposes, and
# every input of the `serving_default` signature is sent as a TensorProto.
request = predict_pb2.PredictRequest()
request.model_spec.name = 'bert'
request.model_spec.signature_name = 'serving_default'
for input_name in ('input_ids', 'attention_mask', 'token_type_ids'):
    request.inputs[input_name].CopyFrom(tf.make_tensor_proto(batch[input_name]))

# 8500 is the port the server cell configures for gRPC (8501 is the REST port).
# The context manager closes the channel once the call has returned.
with grpc.insecure_channel('localhost:8500') as channel:
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    result = stub.Predict(request)

# float_val is the flattened logits tensor; with a batch of one it holds one logit per
# class. The predicted class is the index of the largest logit (no absolute value: a
# strongly negative logit must not win).
output = result.outputs['logits'].float_val
print(config.id2label[int(np.argmax(output))])