## Ohm Muruga Thunai

In [None]:
import os
import tensorrt as trt
import pycuda.driver as cuda
import numpy as np

In [None]:
def load_engine(engine_file_path):
    """Deserialize a TensorRT engine from a serialized engine file.

    Args:
        engine_file_path: Path of the serialized engine on disk.

    Returns:
        The result of ``runtime.deserialize_cuda_engine`` applied to the
        file's bytes.

    Raises:
        AssertionError: If ``engine_file_path`` does not exist (only while
            assertions are enabled).
    """
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as plan_file:
        with trt.Runtime(TRT_LOGGER) as runtime:
            serialized_plan = plan_file.read()
            return runtime.deserialize_cuda_engine(serialized_plan)
    

In [48]:
def pre_process_single_image(image_path):
    """Load an image and turn it into a single-item float32 batch.

    The image is opened with PIL, converted to RGB, resized to 32×32, and
    rearranged from H×W×C to 1×C×H×W. Pixel values are cast to float32 but
    are not scaled or normalized.

    Args:
        image_path: Image location, passed straight to ``Image.open``.

    Returns:
        A float32 NumPy array of shape (1, 3, 32, 32).
    """
    rgb_image = Image.open(image_path).convert("RGB")
    resized = rgb_image.resize((32, 32))
    hwc = np.array(resized, dtype=np.float32)  # H×W×C
    chw = np.transpose(hwc, (2, 0, 1))  # C×H×W
    return np.expand_dims(chw, axis=0)  # 1×C×H×W


In [49]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes CUDA
import numpy as np
from PIL import Image

# Module-level TensorRT logger, constructed with WARNING severity. The engine
# loading code in this file passes it to trt.Runtime.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def infer(engine_path, image_path):
    """Run one image through a serialized TensorRT engine and print the prediction.

    The engine is deserialized from ``engine_path``, the image is preprocessed
    with ``pre_process_single_image``, and a host/device buffer pair is
    allocated for every I/O tensor. The input is uploaded to the device buffer
    that is actually bound to the input tensor, the engine is executed on a
    CUDA stream, and the first output tensor is copied back and arg-maxed.

    Args:
        engine_path: Path of the serialized TensorRT engine file.
        image_path: Path of the image to classify.

    Returns:
        int: Index of the largest logit of the first output tensor (first
        batch item). This value is also printed.
    """
    # Load engine
    runtime = trt.Runtime(TRT_LOGGER)
    with open(engine_path, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    # Load & preprocess image
    img_arr = pre_process_single_image(image_path)
    print(img_arr.shape)  # (1, 3, 32, 32) (batch, 3, height, width)

    # Set dynamic input shape (tensor index 0 is treated as the input, as in
    # the rest of this file) so the output shapes below are fully resolved.
    input_name = engine.get_tensor_name(0)
    context.set_input_shape(input_name, img_arr.shape)

    assert context.all_binding_shapes_specified

    stream = cuda.Stream()

    # Keep every DeviceAllocation object referenced until the function ends:
    # PyCUDA releases the device memory when the object is garbage-collected,
    # so storing only int(dev_mem) would leave the engine with freed pointers.
    device_buffers = []
    inputs = []   # (host_mem, dev_mem) pairs
    outputs = []  # (name, shape, dtype, host_mem, dev_mem) tuples

    for idx in range(engine.num_io_tensors):
        name = engine.get_tensor_name(idx)
        shape = context.get_tensor_shape(name)  # resolved dims
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        size = int(np.prod(shape))

        # Allocate page-locked host memory and matching device memory
        host_mem = cuda.pagelocked_empty(size, dtype)
        dev_mem = cuda.mem_alloc(host_mem.nbytes)
        device_buffers.append(dev_mem)
        context.set_tensor_address(name, int(dev_mem))

        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            np.copyto(host_mem, img_arr.ravel())
            inputs.append((host_mem, dev_mem))
        else:
            outputs.append((name, shape, dtype, host_mem, dev_mem))

    # Upload the input into the buffer that is bound to the input tensor,
    # run the engine, then fetch every output; all on the same stream.
    for host_mem, dev_mem in inputs:
        cuda.memcpy_htod_async(dev_mem, host_mem, stream)
    context.execute_async_v3(stream_handle=stream.handle)
    for _name, _shape, _dtype, host_mem, dev_mem in outputs:
        cuda.memcpy_dtoh_async(host_mem, dev_mem, stream)
    stream.synchronize()

    # Postprocess logits → predicted class
    out_name, shape, dtype, host_out, _dev_out = outputs[0]
    logits = np.array(host_out).reshape(tuple(shape))
    pred_class = int(np.argmax(logits, axis=1)[0])
    print(f"Predicted class index: {pred_class}")
    return pred_class

# Example call:
# infer(engine_path, image_path)


In [None]:
def infer_with_timimng_Muruga(engine_path, image_path):
    """Prepare a TensorRT execution context for one image (setup only).

    Deserializes the engine at ``engine_path``, creates an execution context,
    preprocesses the image at ``image_path`` and sets the shape of tensor 0 on
    the context. No inference or timing is performed and nothing is returned.

    Args:
        engine_path: Path of the serialized TensorRT engine file.
        image_path: Path of the image, passed to ``pre_process_single_image``.
    """
    # Deserialize the engine and create its execution context
    runtime = trt.Runtime(TRT_LOGGER)
    with open(engine_path, "rb") as engine_file:
        serialized_engine = engine_file.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()

    # Turn the image into a batch array
    image_batch = pre_process_single_image(image_path)

    # Tensor 0 is used as the input; the name of tensor 1 is looked up but
    # not used yet.
    input_name, output_name = (engine.get_tensor_name(i) for i in (0, 1))
    context.set_input_shape(input_name, image_batch.shape)

    assert context.all_binding_shapes_specified

    

    
            

In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
import tensorrt as trt

def timed_infer(engine, input_image):
    """Run one image through a TensorRT engine and time each stage with CUDA events.

    Args:
        engine: Path of the serialized TensorRT engine file. (The name is
            rebound to the deserialized engine object inside the function.)
        input_image: Path of the image to run; it is converted to a float32
            batch by ``pre_process_single_image``.

    Returns:
        tuple: ``(host_output, h2d_time, exec_time, d2h_time)`` where
        ``host_output`` is a NumPy array with the output tensor's shape and
        dtype, and the three times are the event-measured durations of the
        host-to-device copy, the engine execution and the device-to-host copy
        (printed as milliseconds).
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(TRT_LOGGER)

    with open(engine, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    # preprocess the image
    input_image = pre_process_single_image(input_image)

    # Tensor 0 is treated as the input and tensor 1 as the output.
    input_name = engine.get_tensor_name(0)
    output_name = engine.get_tensor_name(1)
    context.set_input_shape(input_name, input_image.shape)

    output_shape = tuple(context.get_tensor_shape(output_name))
    output_dtype = trt.nptype(engine.get_tensor_dtype(output_name))

    # The *_async copies need page-locked host memory to be asynchronous, so
    # stage both directions through pinned buffers instead of pageable arrays.
    host_input = cuda.pagelocked_empty(input_image.shape, np.float32)
    np.copyto(host_input, input_image, casting="unsafe")
    host_output = cuda.pagelocked_empty(output_shape, output_dtype)

    device_input = cuda.mem_alloc(host_input.nbytes)
    device_output = cuda.mem_alloc(host_output.nbytes)

    context.set_tensor_address(input_name, int(device_input))
    context.set_tensor_address(output_name, int(device_output))

    stream = cuda.Stream()

    # CUDA Events for timing
    start_h2d = cuda.Event()
    end_h2d = cuda.Event()

    start_exec = cuda.Event()
    end_exec = cuda.Event()

    start_d2h = cuda.Event()
    end_d2h = cuda.Event()

    # Host to Device
    start_h2d.record(stream)
    cuda.memcpy_htod_async(device_input, host_input, stream)
    end_h2d.record(stream)

    # Inference
    start_exec.record(stream)
    context.execute_async_v3(stream.handle)
    end_exec.record(stream)

    # Device to Host
    start_d2h.record(stream)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    end_d2h.record(stream)

    # Wait for all queued work before reading the events or the output
    stream.synchronize()

    # Measure times (ms)
    h2d_time = start_h2d.time_till(end_h2d)
    exec_time = start_exec.time_till(end_exec)
    d2h_time = start_d2h.time_till(end_d2h)

    total_time = h2d_time + exec_time + d2h_time

    print(f"H2D Time     : {h2d_time:.3f} ms")
    print(f"Infer Time   : {exec_time:.3f} ms")
    print(f"D2H Time     : {d2h_time:.3f} ms")
    print(f"Total Time   : {total_time:.3f} ms")

    # Return an ordinary array so the caller does not hold on to pinned memory.
    return host_output.copy(), h2d_time, exec_time, d2h_time


In [61]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
import tensorrt as trt

def timed_infer_batches(engine, test_loader):
    """Run every batch of a loader through a TensorRT engine and report average timings.

    For each batch the host-to-device copy, the engine execution and the
    device-to-host copy are timed with CUDA events; the per-batch averages are
    printed at the end.

    Args:
        engine: Path of the serialized TensorRT engine file. (The name is
            rebound to the deserialized engine object inside the function.)
        test_loader: Iterable yielding ``(input_data, label)`` pairs where
            ``input_data`` provides ``.numpy()``; labels are ignored.

    Returns:
        list: One entry per batch, holding the first row of that batch's
        output. Empty if the loader yielded no batches.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(TRT_LOGGER)

    with open(engine, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    # Loop-invariant lookups and CUDA objects are created once and reused.
    # Tensor 0 is treated as the input and tensor 1 as the output.
    input_name = engine.get_tensor_name(0)
    output_name = engine.get_tensor_name(1)
    output_dtype = trt.nptype(engine.get_tensor_dtype(output_name))

    stream = cuda.Stream()

    # CUDA Events for timing
    start_h2d = cuda.Event()
    end_h2d = cuda.Event()

    start_exec = cuda.Event()
    end_exec = cuda.Event()

    start_d2h = cuda.Event()
    end_d2h = cuda.Event()

    outputs = []
    total_inference_time = 0
    total_host_to_device_time = 0
    total_device_to_host_time = 0
    total_time = 0

    batches_seen = 0

    # Buffers are (re)allocated only when a batch changes the tensor shapes.
    host_input = device_input = None
    host_output = device_output = None

    for input_data, _label in test_loader:
        batch = input_data.numpy()

        context.set_input_shape(input_name, batch.shape)
        output_shape = tuple(context.get_tensor_shape(output_name))

        # The *_async copies need page-locked host memory to be asynchronous,
        # so both directions are staged through pinned buffers.
        if host_input is None or host_input.shape != batch.shape:
            host_input = cuda.pagelocked_empty(batch.shape, np.float32)
            device_input = cuda.mem_alloc(host_input.nbytes)
        if host_output is None or host_output.shape != output_shape:
            host_output = cuda.pagelocked_empty(output_shape, output_dtype)
            device_output = cuda.mem_alloc(host_output.nbytes)

        np.copyto(host_input, batch, casting="unsafe")

        context.set_tensor_address(input_name, int(device_input))
        context.set_tensor_address(output_name, int(device_output))

        # Host to Device
        start_h2d.record(stream)
        cuda.memcpy_htod_async(device_input, host_input, stream)
        end_h2d.record(stream)

        # Inference
        start_exec.record(stream)
        context.execute_async_v3(stream.handle)
        end_exec.record(stream)

        # Device to Host
        start_d2h.record(stream)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        end_d2h.record(stream)

        # Wait for all queued work before reading the events or the output
        stream.synchronize()

        # Measure times (ms)
        h2d_time = start_h2d.time_till(end_h2d)
        exec_time = start_exec.time_till(end_exec)
        d2h_time = start_d2h.time_till(end_d2h)

        total_inference_time += exec_time
        total_host_to_device_time += h2d_time
        total_device_to_host_time += d2h_time

        total_time += h2d_time + exec_time + d2h_time

        # Copy, because host_output is reused (overwritten) by later batches.
        # NOTE(review): only the first row of each batch is kept, which drops
        # results when the batch size is greater than 1 — confirm intent.
        outputs.append(host_output[0].copy())
        batches_seen += 1

    # An empty loader would otherwise divide by zero below.
    if batches_seen == 0:
        print("No batches were processed; nothing to average.")
        return outputs

    # Calculate the average times
    avg_inference_time_trt = total_inference_time / batches_seen
    avg_host_to_device_time = total_host_to_device_time / batches_seen
    avg_device_to_host_time = total_device_to_host_time / batches_seen
    avg_time_per_batch = total_time / batches_seen

    print(f"H2D Time per batch    : {avg_host_to_device_time:.3f} ms")
    print(f"Infer Time per batch  : {avg_inference_time_trt:.3f} ms")
    print(f"D2H Time per batch    : {avg_device_to_host_time:.3f} ms")
    print(f"Total Time per batch : {avg_time_per_batch:.3f} ms")

    return outputs


In [None]:
# Serialized TensorRT engine file (relative path, so it is resolved against
# the current working directory).
engine_path = r"resnet50_new.engine"
# Machine-specific absolute path of a sample image; only the commented-out
# single-image calls below use it.
image_path = r"C:\Tejeswar\Fusion\sampleImages\cifar10_image_2_label_9_truck.jpg"

# Single-image entry points, currently disabled.
# NOTE(review): no function named `inferMuruga` is defined in the visible
# cells — confirm which function was intended.
# timed_infer(engine_path, image_path)
# inferMuruga(engine_path, image_path)

# NOTE(review): `test_loader` is not defined in the visible cells; presumably
# a data loader yielding (input, label) batches — confirm it is created first.
timed_infer_batches(engine_path, test_loader)

[  5074.078    -2192.8315    1501.8422   -5104.639   -10178.127
 -21113.158    -2229.9111   -5574.8438      50.58682  15115.169  ]
H2D Time     : 0.051 ms
Infer Time   : 2.814 ms
D2H Time     : 0.167 ms
Total Time   : 3.032 ms


(array([[  5074.078  ,  -2192.8315 ,   1501.8422 ,  -5104.639  ,
         -10178.127  , -21113.158  ,  -2229.9111 ,  -5574.8438 ,
             50.58682,  15115.169  ]], dtype=float32),
 0.05119999870657921,
 2.813983917236328,
 0.1666879951953888)