# Practical Example: From .pt File to ONNXRuntime Utilization

## 01.Convert `.pt` to `.onnx`

### Model Import with Ultralytics

In [None]:
from pathlib import Path
from ultralytics import YOLO

working_dir = Path()
model_weight = working_dir / 'models/yolov10n.pt'

print(model_weight.resolve())

model = YOLO(model_weight)

### Model Export as `.onnx`

In [None]:
model.export(format="onnx", dynamic=False, batch=6)

## 02.Inference Samples

### Inference Session

In [8]:
import onnxruntime as ort

onnx_weight = working_dir / 'models/yolov10x.onnx'
EP_LIST = [
    ('CUDAExecutionProvider', {'device_id': 0}), # To utilze GPU, onnxruntime-gpu is required
    'CPUExecutionProvider',
]

# Create an inference session using the ONNX model and specify execution providers
session = ort.InferenceSession(onnx_weight, providers=['CPUExecutionProvider'])

# Get the model inputs
model_inputs = session.get_inputs()

# Store the shape of the input for later use
input_shape = model_inputs[0].shape
input_width = input_shape[2]
input_height = input_shape[3]

print(f"INPUT_SHAPE: {input_shape}")

INPUT_SHAPE: [6, 3, 640, 640]


### Sample Video

In [29]:
import cv2

vid_path = working_dir / 'samples/sample1.mp4'
cap = cv2.VideoCapture(vid_path)

ret, im = cap.read()
print(im.shape)

(1080, 1920, 3)


### Preprocess

##### Slicing Image

In [30]:
INPUT_SZ = (640,640)
IMAGE_HEIGHT, IMAGE_WIDTH = im.shape[:2]

NUM_SLICES = (3,2) # horizontal, width
SLICE_LENGTH = int(IMAGE_HEIGHT * 11 / 18)
RATIO_INPUT2SLICE = float(SLICE_LENGTH/INPUT_SZ[0])
OFFSETS = (-int(SLICE_LENGTH / 22), -int(SLICE_LENGTH * 4 / 11)) # horizontal, width

# Calculate slice start positions
SLICE_POSITIONS = []
for i in range(NUM_SLICES[1]):
    for j in range(NUM_SLICES[0]):
        x = j * (SLICE_LENGTH+OFFSETS[0])
        y = i * (SLICE_LENGTH+OFFSETS[1])
        SLICE_POSITIONS.append((x, y))

In [31]:
import numpy as np
from typing import List, Optional, Tuple
from PIL import Image
from pathlib import Path

def slice_image(
    image: np.ndarray,
    output_file_name: Optional[str] = None,
    output_dir: Optional[str] = None
) -> Tuple[List[np.ndarray], Optional[str]]:
    """
    Slice a large image (numpy array) into a fixed number of windows: 3 horizontal slices and 2 vertical slices.
    Each slice size is determined as 11/18 of the image's height.

    Args:
        image (np.ndarray): Image to be sliced as a numpy array (height x width x channels).
        output_file_name (str, optional): Root name of output files.
        output_dir (str, optional): Output directory.

    Returns:
        Tuple: A list of sliced images as numpy arrays and the directory of the exported images if applicable.
            sliced_images[idx]: slice at (idx//NUM_SLICES[1], idx%NUM_SLICES[1])-th order
    """

    sliced_images = []
    for (x, y) in SLICE_POSITIONS:
        slice_image = image[y:y + SLICE_LENGTH, x:x + SLICE_LENGTH]
        sliced_images.append(slice_image)

        # Save image if output_dir and output_file_name are provided
        if output_file_name and output_dir:
            if not Path(output_dir).exists():
                Path(output_dir).mkdir(parents=True)
            slice_file_name = f"{output_file_name}_{x}_{y}.png"
            slice_file_path = Path(output_dir) / slice_file_name
            Image.fromarray(slice_image).save(slice_file_path)

    return sliced_images, output_dir if output_file_name and output_dir else None

In [32]:
from typing import List
import numpy as np

def preprocess(im: np.ndarray) -> np.ndarray:
    """
    Preprocesses the input image before performing inference.

    Returns:
        image_data: Preprocessed image data ready for inference.
    """

    # Convert the image color space from BGR to RGB
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

    slices, _ = slice_image(im)

    # Resize the image to match the input shape
    # Normalize the image data by dividing it by 255.0
    slices_resized = np.array([cv2.resize(slc, INPUT_SZ, interpolation=cv2.INTER_LINEAR) for slc in slices]) / 255.0

    # Transpose the image to have the channel dimension as the first dimension
    image_data = np.transpose(slices_resized, (0, 3, 1, 2)).astype(np.float32)  # Batch, Channel, H, W

    # Return the preprocessed image data
    return np.ascontiguousarray(image_data)

# Preprocess the image data
im_input = preprocess(im)
print(im_input.shape)

(6, 3, 640, 640)


### Run

In [33]:
inname = [i.name for i in session.get_inputs()]
inname

['images']

In [34]:
outname = [i.name for i in session.get_outputs()]
outname

['output0']

In [35]:
# Run inference using the preprocessed image data
outputs = session.run(None, {'images': im_input})

print(outputs)
print(outputs[0].shape)

[array([[[     15.056,    -0.94189,      631.69,      157.59,   0.0078498,           5],
        [     539.25,      230.53,      639.83,      523.82,    0.006619,           2],
        [     128.57,      372.87,      608.01,      536.34,   0.0056322,          10],
        ...,
        [     584.71,    -0.68367,      640.27,      109.94,  3.5822e-05,           1],
        [    0.35823,      120.02,        43.7,       309.2,  3.5793e-05,           5],
        [     487.37,      1.6805,      638.92,      188.28,  3.5703e-05,           7]],

       [[     93.698,      111.04,      526.99,      510.33,    0.034215,           2],
        [    -1.9896,    -0.70264,      271.52,      163.56,    0.012508,           5],
        [     337.09,      586.71,      373.96,      625.69,   0.0054928,          10],
        ...,
        [     598.41,     0.17943,      643.93,      11.817,  2.9445e-05,           5],
        [   0.034458,      256.91,       80.25,      449.54,  2.9117e-05,           5],
   

### Postprocess

#### Object Classes

In [36]:
from typing import Optional, List

classes: List[str] = [
    'black_smoke',
    'gray_smoke',
    'white_smoke',
    'flame',
    'cloud',
    'fog',
    'lamp_light',
    'sun_light',
    'shaky_object',
    'wind-swayed_leaves',
    'irrelevant',
]

classes_aligned: List[Optional[str]] = [
    'Smoke','Flame',None
]

classes_map: List[int] = [
    0,0,0,1,2,2,2,2,2,2,2
] # 0:smoke, 1:flame, 2:none

color_palette: List[tuple] = [ # BGR
    (255,0,0), # Smoke
    (0,0,255), # Flame
    (0,0,0)
]

#### Functions

- Reference: `Ultralytics/examples/YOLOv8-ONNXRuntime`

In [42]:
im_height, im_width = im.shape[:2]

def draw_detections(img:np.ndarray, box, score, class_id) -> None:
    """
    Draws bounding boxes and labels on the input image based on the detected objects.

    Args:
        img: The input image to draw detections on.
        box: Detected bounding box.
        score: Corresponding detection score.
        class_id: Class ID for the detected object.

    Returns:
        None
    """

    if class_name := classes_aligned[class_id]:
        pass
    else:
        return

    # Extract the coordinates of the bounding box
    x1, y1, w, h = box

    # Retrieve the color for the class ID
    color = color_palette[class_id]

    # Draw the bounding box on the image
    cv2.rectangle(img, (int(x1), int(y1)), (int(x1 + w), int(y1 + h)), color, 2)

    # Create the label text with class name and score
    label = f"{class_name}: {score:.2f}"

    # Calculate the dimensions of the label text
    (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

    # Calculate the position of the label text
    label_x = x1
    label_y = y1 - 10 if y1 - 10 > label_height else y1 + 10

    # Draw a filled rectangle as the background for the label text
    cv2.rectangle(
        img, (label_x, label_y - label_height), (label_x + label_width, label_y + label_height), color, cv2.FILLED
    )

    # Draw the label text on the image
    cv2.putText(img, label, (label_x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)



def postprocess(input_image:np.ndarray, outputs:List[np.ndarray], confidence_thres:float=0.05, iou_thres:float=0.5) -> np.ndarray:
    """
    Performs post-processing on the model's output to extract bounding boxes, scores, and class IDs.

    Args:
        input_image (numpy.ndarray): The input image.
        outputs[0] (numpy.ndarray): The outputs of the model.

    Returns:
        numpy.ndarray: The input image with detections drawn on it.
    """

    # Lists to store the bounding boxes, scores, and class IDs of the detections
    boxes = []
    scores = []
    class_ids = []

    for outputs_slice, pos_slice in zip(outputs[0], SLICE_POSITIONS):
        # Get the number of rows in the outputs array
        rows = outputs_slice.shape[0]
        
        # Iterate over each row in the outputs array
        for i in range(rows):
            # Extract the score from the current row
            best_score = outputs_slice[i][4]

            # If the maximum score is above the confidence threshold
            if best_score >= confidence_thres:
                # Get the class ID with the highest score
                class_id = int(outputs_slice[i][-1]) # detected class
                class_id_aligned = classes_map[class_id]
                
                # Neglact `None` class
                if classes_aligned[class_id_aligned]:
                    pass
                else:
                    continue

                # Extract the bounding box coordinates from the current row
                x1, y1, x2, y2 = outputs_slice[i][0], outputs_slice[i][1], outputs_slice[i][2], outputs_slice[i][3]

                # Calculate the scaled coordinates of the bounding box
                left = int(y1 * RATIO_INPUT2SLICE) + pos_slice[0]
                top = int(x1* RATIO_INPUT2SLICE) + pos_slice[1]
                width = int((y2-y1) * RATIO_INPUT2SLICE)
                height = int((x2-x1) * RATIO_INPUT2SLICE)

                # Add the class ID, score, and box coordinates to the respective lists
                class_ids.append(class_id_aligned)
                scores.append(best_score)
                boxes.append([left, top, width, height])

    print(class_ids)
    # Apply non-maximum suppression to filter out overlapping bounding boxes
    indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_thres, iou_thres)
    print(class_ids)

    # Iterate over the selected indices after non-maximum suppression
    for i in indices:
        # Get the box, score, and class ID corresponding to the index
        box = boxes[i]
        score = scores[i]
        class_id = class_ids[i]

        # Draw the detection on the input image
        draw_detections(input_image, box, score, class_id)

    # Return the modified input image
    return input_image

In [43]:
final_output = postprocess(im, outputs)

[1]
[1]


In [44]:
cv2.imwrite('sample_result.jpg', final_output)

True