In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pycuda.driver as cuda
import tensorrt as trt
import pycuda.autoinit

# Load the trained model
model = tf.keras.models.load_model('my_model.h5')

# Create a TensorRT builder, network definition and ONNX parser.
trt_logger = trt.Logger(trt.Logger.WARNING)
trt_builder = trt.Builder(trt_logger)
# The ONNX parser only works on explicit-batch networks, so the flag has to be
# supplied when the network definition is created.
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
trt_network = trt_builder.create_network(explicit_batch_flag)
trt_parser = trt.OnnxParser(trt_network, trt_logger)

# Convert the Keras model and write it to `onnx_model_path`.
# NOTE(review): TFLiteConverter.convert() produces a TFLite flatbuffer, not an
# ONNX graph, so the file written here is not valid ONNX and the parse check
# below is expected to fail on it. A real ONNX exporter (e.g. tf2onnx) is
# needed; it is not imported by this file, so the conversion is left as-is.
onnx_model_path = 'my_model.onnx'
tf.saved_model.save(model, 'temp')
converter = tf.compat.v1.lite.TFLiteConverter.from_saved_model('temp')
converter.allow_custom_ops = True
tflite_model = converter.convert()
with open(onnx_model_path, 'wb') as f:
    f.write(tflite_model)

# Parse the ONNX model and build the TensorRT engine
with open(onnx_model_path, 'rb') as f:
    onnx_model = f.read()
# parse() reports failure through its return value; surface the parser's own
# diagnostics instead of continuing with an empty network.
if not trt_parser.parse(onnx_model):
    parse_errors = '\n'.join(
        str(trt_parser.get_error(index))
        for index in range(trt_parser.num_errors)
    )
    raise RuntimeError(
        'Failed to parse {} as ONNX:\n{}'.format(onnx_model_path, parse_errors)
    )
trt_builder.max_batch_size = 1
trt_builder.max_workspace_size = 1 << 30  # 1GB
trt_engine = trt_builder.build_cuda_engine(trt_network)
# build_cuda_engine() returns None when the build fails; stop here with a
# clear message rather than failing on the attribute access below.
# NOTE(review): a model with a dynamic batch dimension presumably also needs
# an optimization profile before it can be built - confirm against the model.
if trt_engine is None:
    raise RuntimeError(
        'TensorRT failed to build an engine from {}'.format(onnx_model_path)
    )

# Create an execution context and allocate one page-locked host buffer plus
# one device buffer per engine binding.
trt_context = trt_engine.create_execution_context()
inputs, outputs, bindings = [], [], []
for binding in trt_engine:
    size = trt.volume(trt_engine.get_binding_shape(binding)) * trt_engine.max_batch_size
    dtype = trt.nptype(trt_engine.get_binding_dtype(binding))
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)
    # `bindings` holds raw device addresses in engine binding order.
    bindings.append(int(device_mem))
    if trt_engine.binding_is_input(binding):
        inputs.append((host_mem, device_mem))
    else:
        outputs.append((host_mem, device_mem))
stream = cuda.Stream()

# Define a function to run inference on the TensorRT engine
def run_inference(inputs, outputs, bindings, stream):
    """Run one inference pass on the module-level ``trt_context``.

    Args:
        inputs: Sequence of NumPy arrays, one per engine input binding, in
            binding order.
        outputs: Sequence of ``(host_mem, device_mem)`` buffer pairs, one per
            engine output binding.
        bindings: Device addresses (ints) for every engine binding, in engine
            binding order.
        stream: ``pycuda.driver.Stream`` on which the copies and the
            execution are enqueued.

    Returns:
        A list of NumPy arrays, one per entry in ``outputs``, each a copy of
        the corresponding host buffer after the device-to-host transfer.

    Raises:
        ValueError: If the number of arrays in ``inputs`` does not match the
            number of input bindings of the engine.
    """
    # `bindings` only carries device addresses, so ask the engine which
    # positions are inputs instead of unpacking buffer pairs from it.
    engine = trt_context.engine
    input_ptrs = [
        device_ptr
        for index, device_ptr in enumerate(bindings)
        if engine.binding_is_input(index)
    ]
    if len(inputs) != len(input_ptrs):
        raise ValueError(
            'expected {} input array(s), got {}'.format(len(input_ptrs), len(inputs))
        )

    # Transfer input data to device memory. Each array is staged in a
    # page-locked buffer; the buffers are kept referenced until the stream has
    # been synchronized so the asynchronous copies read valid memory.
    # NOTE(review): assumes each array's dtype and size match its binding.
    staging_buffers = []
    for inp, device_ptr in zip(inputs, input_ptrs):
        host_buf = cuda.pagelocked_empty(inp.size, inp.dtype)
        np.copyto(host_buf, inp.reshape(-1))
        cuda.memcpy_htod_async(device_ptr, host_buf, stream)
        staging_buffers.append(host_buf)

    # Run inference
    trt_context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Transfer output data to host memory
    for host_mem, device_mem in outputs:
        cuda.memcpy_dtoh_async(host_mem, device_mem, stream)

    # Wait for all enqueued work before reading the host buffers.
    stream.synchronize()

    # Return copies so the results are not overwritten by the next run that
    # reuses the same host buffers.
    return [np.copy(host_mem) for host_mem, _ in outputs]

# Run inference and measure device memory usage before and after.
# mem_get_info() returns (free, total) in bytes, so usage is total - free.
free_before, total_mem = cuda.mem_get_info()
prev_mem_usage = total_mem - free_before

# Keras reports a single shape tuple for single-input models and a list of
# tuples for multi-input models; normalize to a list of shapes.
input_shapes = model.input_shape
if isinstance(input_shapes, tuple):
    input_shapes = [input_shapes]
# Unknown (None) dimensions such as the batch axis are filled with 1, which
# matches the max batch size of 1 the engine is built with.
inputs = [
    np.random.randn(
        *[1 if dim is None else dim for dim in input_shape]
    ).astype(np.float32)
    for input_shape in input_shapes
]
outputs = run_inference(inputs, outputs, bindings, stream)

free_after, total_mem = cuda.mem_get_info()
latest_mem_usage = total_mem - free_after

# Plot memory usage graphs
fig, ax = plt.subplots()
ax.plot([0, 1], [prev_mem_usage, latest_mem_usage])
ax.set_xlabel('Inference run')
ax.set_ylabel('Memory usage (bytes)')
plt.show()


ModuleNotFoundError: No module named 'pycuda'