In [1]:
# Copyright 2019 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# ! CUDA_VISIBLE_DEVICES='1' python /workspace/TensorRT/demo/BERT/python/bert_builder.py -m /workspace/TensorRT/demo/BERT/eval_ckpts/model.ckpt-315171 -o tmp.engine -b 32 -s 8 -c /workspace/TensorRT/demo/BERT/eval_ckpts -t cola

In [3]:
# ! ls /workspace/TensorRT/demo/BERT/build

## Data Preprocessing
Let's convert each input sentence to BERT input features with the help of the tokenizer:

In [4]:
import data_processing as dp
import tokenization

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
# Load the labelled evaluation sentences into `data` as [label, sentence] pairs.
# The file is read as tab-separated text: the first line is skipped (treated as
# a header) and every following non-blank line must look like
# "<label>\t<sentence>".
data = []
with open('/workspace/TensorRT/demo/BERT/eval_ckpts/data.txt', 'r', encoding='utf-8') as fin:
    for line_no, raw_line in enumerate(fin, start=1):
        if line_no == 1:
            continue  # header row
        record = raw_line.rstrip('\r\n')  # keep the newline out of the sentence
        if not record.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line)
        # Split on the FIRST tab only so a tab inside the sentence cannot break
        # the two-value unpacking.
        label, sep, sent = record.partition('\t')
        if not sep:
            raise ValueError(
                "data.txt line {}: expected '<label>\\t<sentence>', got {!r}".format(line_no, record)
            )
        data.append([label, sent])

In [6]:
# Tokenizer built from the fine-tuned model's vocabulary file.
tokenizer = tokenization.FullTokenizer(
    vocab_file="/workspace/models/fine-tuned/bert_tf_v2_base_fp16_128_v2/vocab.txt",
    do_lower_case=True,
)

# Upper bound on the number of question tokens; longer questions are truncated
# to this length.
max_query_length = 8

# Stride taken between consecutive chunks when a long document is split up.
doc_stride = 8

# Maximum total input sequence length after WordPiece tokenization.
# Longer sequences are truncated; shorter ones are presumably padded up to this
# length by the feature-conversion helper (TODO confirm).
max_seq_length = 8

# Number of sequences sent through the engine per inference call.
batch_size = 1

## TensorRT Inference

In [7]:
import tensorrt as trt
# TensorRT logger created with INFO severity; it is handed to trt.Runtime when
# the serialized engine is loaded in the inference cell.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

In [8]:
import ctypes
import os
# Expose only GPU index 1 to this process.
# NOTE(review): this presumably has to happen before CUDA is initialised
# (pycuda.autoinit is imported in a later cell) — confirm before reordering cells.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Load the TensorRT plugin library and the demo's shared libraries. RTLD_GLOBAL
# makes their symbols visible to libraries loaded afterwards, so the load order
# below is kept as-is.
ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL)
# Last expression of the cell: its CDLL handle is what the notebook displays.
ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL)

<CDLL '/workspace/TensorRT/demo/BERT/build/libbert_plugins.so', handle 47fa0c0 at 0x7f5b80643be0>

In [9]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

# Benchmark cell: deserialize the prebuilt engine, push every full batch of
# `data` through it and report throughput / latency.
#
# Reads (defined in earlier cells): data, dp, tokenizer, batch_size,
# max_seq_length, doc_stride, max_query_length, trt, TRT_LOGGER.
# Leaves behind (read by the following cell): eval_start_time, buffer_time.

# Wall-clock reference for the "cold start" figure (engine load + first batch).
cold_start = time.time()

# Load the serialized engine and create an execution context.
with open("bert_shopee_8_batch32.engine", "rb") as f, \
    trt.Runtime(TRT_LOGGER) as runtime, \
    runtime.deserialize_cuda_engine(f.read()) as engine, \
    engine.create_execution_context() as context:

    # The context's current optimization profile is used unchanged; to switch,
    # assign context.active_optimization_profile before setting binding shapes.
    print(context.active_optimization_profile)

    # Any batch size accepted by the active profile can be used here.
    input_shape = (batch_size, max_seq_length)
    input_nbytes = trt.volume(input_shape) * trt.int32.itemsize

    # Page-locked int32 host staging buffers, allocated once and reused for
    # every step. Filling int32 buffers directly avoids creating a temporary
    # pageable copy (`astype`) for each asynchronous host-to-device transfer.
    input_ids = cuda.pagelocked_empty(input_shape, dtype=np.int32)
    segment_ids = cuda.pagelocked_empty(input_shape, dtype=np.int32)
    input_mask = cuda.pagelocked_empty(input_shape, dtype=np.int32)
    h_inputs = (input_ids, segment_ids, input_mask)

    # Allocate device memory for inputs.
    d_inputs = [cuda.mem_alloc(input_nbytes) for _ in h_inputs]
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # Specify input shapes. These must be within the min/max bounds of the
    # active profile. Only a single shape is used, so this is done once.
    for binding in range(len(d_inputs)):
        context.set_binding_shape(binding, input_shape)
    assert context.all_binding_shapes_specified

    # Allocate the output buffer by querying its shape from the context; it may
    # differ for different input shapes.
    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Device pointers never change between steps, so build the list once.
    bindings = [int(d_inp) for d_inp in d_inputs] + [int(d_output)]

    print("\nRunning Inference...")
    ttl_time = 0
    correct = 0
    # Only complete batches are evaluated; a trailing partial batch is dropped.
    num_steps = len(data) // batch_size

    for step in range(num_steps):
        eval_start_time = time.time()

        # Start every step from all-zero inputs so that a sample that fails
        # conversion is sent as zeros instead of stale or random values.
        for h_inp in h_inputs:
            h_inp.fill(0)
        label = np.zeros(batch_size)

        for i in range(batch_size):
            sample = data[step * batch_size + i]
            short_paragraph_text = sample[1]
            doc_tokens = dp.convert_doc_tokens(short_paragraph_text)
            try:
                features = dp.convert_examples_to_features(
                    doc_tokens, '', tokenizer, max_seq_length, doc_stride, max_query_length
                )
                input_ids[i] = features['input_ids']
                segment_ids[i] = features['segment_ids']
                input_mask[i] = features['input_mask']
                label[i] = int(sample[0])
            except Exception as exc:
                # Best effort: report the sample and keep going. The row is
                # reset so a partially written sample never reaches the GPU.
                print("Skipping sample {}: {!r}; tokens={}".format(
                    step * batch_size + i, exc, doc_tokens
                ))
                for h_inp in h_inputs:
                    h_inp[i] = 0
                label[i] = 0
        # End of host-side preprocessing for this batch.
        buffer_time = time.time()

        # Asynchronous execution on `stream`:
        # copy inputs to the device, run the engine, copy predictions back.
        for d_inp, h_inp in zip(d_inputs, h_inputs):
            cuda.memcpy_htod_async(d_inp, h_inp, stream)
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Wait for everything queued on the stream; after this the host buffers
        # may safely be overwritten by the next step.
        stream.synchronize()
        # NOTE: a synchronous variant (blocking memcpy + context.execute) was
        # tried earlier and produced all-zero outputs, so it is not used.

        eval_time_elapsed = time.time() - eval_start_time
        if step == 0:
            cold_start_time = time.time() - cold_start
        ttl_time += eval_time_elapsed
        # Accuracy bookkeeping (disabled):
        # correct += (h_output.reshape(batch_size,22).argmax(axis=1) == label).astype(int).sum()

    if num_steps == 0:
        # Without this guard the statistics below would fail on undefined
        # `step` / `cold_start_time`.
        print("No complete batch available: len(data)={} < batch_size={}".format(
            len(data), batch_size
        ))
    else:
        eval_time_elapsed = ttl_time / num_steps
        print("-----------------------------")
        print("Running Inference in {:.3f} Batches/Sec".format(
            1.0/eval_time_elapsed
        ))
        # `{:.3f}` (precision 3), consistent with the other timing lines.
        print("Time using for one batch is {:.3f} ms".format(eval_time_elapsed*1000))
        print("Average time using for one inference is {:.3f} ms".format(eval_time_elapsed*1000/batch_size))
        print("Time using for one batch cold start is {:.3f} s".format(cold_start_time))
        print("-----------------------------")

0

Running Inference...
[]
[]
[]
[]
-----------------------------
Running Inference in 413.291 Batches/Sec
Time using for one batch is 2.419603 ms
Average time using for one inference is 2.419603 ms
Time using for one batch cold start is 7.354 s
-----------------------------


In [10]:
# Seconds spent on host-side feature conversion for the last batch processed by
# the inference cell (eval_start_time is taken at the start of a step,
# buffer_time right after its features have been built).
buffer_time - eval_start_time

0.0004367828369140625

# Alternatively, the `bert_inference.py` script can be used

In [11]:
# ! python bert_inference.py -e bert_shopee_8_batch32.engine -pf /workspace/TensorRT/demo/BERT/eval_ckpts/data.txt -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_128_v2/vocab.txt -b 32 -s 8