## Inference with Triton

In [None]:
import matplotlib.pyplot as plt
from skimage.transform import resize
import numpy as np
import time
import tritonclient
import tritonclient.http as httpclient
from tritonclient.utils import triton_to_np_dtype

### Check the model statistics on Triton

In [None]:
# host:port of the Triton server; used by the HTTP client, the curl cells and perf_analyzer.
URL = "10.176.0.221:8000"

In [None]:
MODEL1 = 'cls_torch'
!curl http://$URL/v2/models/$MODEL1/versions/1/stats

In [None]:
MODEL2 = 'cls_tensorrt'
!curl http://$URL/v2/models/$MODEL1/versions/1/stats

### Run inference against the Triton server

- Inference function

In [None]:
# `client` aliases the HTTP client module so `infer` below can build its
# request objects through one name.
client = httpclient
# One shared, non-verbose client for the server at URL; every call to `infer` reuses it.
client_inferer = httpclient.InferenceServerClient(url=URL, verbose=0)

def infer(data, model_name, input_name, output_name, priority=0):
    """Send one inference request through the shared `client_inferer`.

    Args:
        data: NumPy array to send. The input is declared with datatype 'FP32'
            and shape `data.shape`.
        model_name: Name of the model to query; version '1' is always requested.
        input_name: Name of the model's input tensor.
        output_name: Name of the output tensor to request.
        priority: Forwarded unchanged as the request's `priority` (default 0).

    Returns:
        The object returned by `client_inferer.infer` for this request.
    """
    tensor_in = client.InferInput(input_name, data.shape, 'FP32')
    tensor_in.set_data_from_numpy(data)
    tensor_out = client.InferRequestedOutput(output_name, class_count=0)
    return client_inferer.infer(
        model_name=model_name,
        inputs=[tensor_in],
        outputs=[tensor_out],
        request_id='1',
        model_version='1',
        priority=priority,
    )

- Prepare data

In [None]:
batch_size = 32
# Bring your own testing image
# The dog image here is from https://www.google.com/search?q=dog+image&rlz=1C1GCEA_enTW1019TW1019&source=lnms&tbm=isch&sa=X&ved=2ahUKEwi4pvXmzr__AhVwmmoFHWciB0QQ_AUoAXoECAIQAw&biw=1536&bih=752&dpr=2.5#imgrc=PpmCvrB3OtU3hM

# Stage 1: load the image and resample it to an output shape of [256, 256, 3].
image_hwc = resize(plt.imread('./dog.jpg'), [256, 256, 3])
plt.imshow(image_hwc)
plt.show()

# Stage 2: move the last axis to the front and cast to float32, matching the
# 'FP32' datatype that `infer` declares for its input.
image_chw = image_hwc.transpose([2, 0, 1]).astype('float32')

# Stage 3: replicate the image along a new leading batch axis. np.repeat keeps
# the (batch_size, 3, 256, 256) float32 layout even when batch_size is 0, where
# a list comprehension wrapped in np.array would collapse to an empty float64 array.
inputs = np.repeat(image_chw[np.newaxis, ...], batch_size, axis=0)
inputs.shape

- TorchScript Inference

In [None]:
# Tensor names used for the TorchScript model's input and output.
input_name, output_name = "INPUT__0", "OUTPUT__0"
result1 = infer(
    data=inputs,
    model_name=MODEL1,
    input_name=input_name,
    output_name=output_name,
)

In [None]:
# Show the output array's shape and the index of the largest value along axis 1.
(
    result1.as_numpy(output_name).shape,
    result1.as_numpy(output_name).argmax(axis=1),
)

- TensorRT Inference

In [None]:
# Tensor names used for the TensorRT model's input and output.
input_name, output_name = "input0", "output0"
result2 = infer(
    data=inputs,
    model_name=MODEL2,
    input_name=input_name,
    output_name=output_name,
)

In [None]:
# Show the output array's shape and the index of the largest value along axis 1.
(
    result2.as_numpy(output_name).shape,
    result2.as_numpy(output_name).argmax(axis=1),
)

## Performance Analyzer

- TorchScript

In [None]:
!perf_analyzer -m $MODEL1 \
               -u $URL \
               -b 32 \
               --percentile 95 \
               --concurrency-range 2:11:2 \
               --measurement-interval 5000

- TensorRT

In [None]:
!perf_analyzer -m $MODEL2 \
               -u $URL \
               -b 32 \
               --percentile 95 \
               --concurrency-range 2:11:2 \
               --measurement-interval 5000