In [1]:
#Import packages
import numpy as np
import cv2
from ultralytics import YOLO
from src.byte_tracker import BYTETracker

In [2]:
# Tracker arguments; refer to the original ByteTrack paper for the meaning of each parameter.
class BYTETrackerArgs:
    """Plain settings holder whose instance is handed to ``BYTETracker``.

    The short notes below restate the author's original remarks; consult the
    tracker's paper/implementation for the authoritative meaning of each value.
    """

    # Confidence threshold (original note: "yolo min Confidence degree").
    track_thresh: float = 0.25

    # Maximum number of frames an object may stay vanished (per the original note).
    track_buffer: int = 60

    # Matching threshold.
    match_thresh: float = 0.5

    # Length-to-width ratio threshold.
    aspect_ratio_thresh: float = 3.0

    # Minimum detection box area.
    min_box_area: float = 1.0

    # Original note: "Validation set, invalid" -- presumably not relevant to this
    # notebook; TODO confirm against the tracker implementation.
    mot20: bool = False

In [3]:
def box_label(image, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255),
              thickness=3, font_scale=1.0):
    """Draw a bounding box, and optionally a text label, onto ``image`` in place.

    Args:
        image: Image that OpenCV draws on; it is modified in place.
        box: Indexable whose first four items are x1, y1, x2, y2 (top-left and
            bottom-right corners). Items beyond the fourth are ignored, so a
            full detection row can be passed directly.
        label: Text drawn next to the box. When empty, only the box is drawn.
        color: Colour of the box outline and of the label background.
        txt_color: Colour of the label text.
        thickness: Line thickness used for the box outline and the text strokes.
        font_scale: Scale factor of the label font.

    Returns:
        None.
    """
    # Same font the original selected with the bare literal 0.
    font = cv2.FONT_HERSHEY_SIMPLEX

    # OpenCV needs integer pixel coordinates.
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    cv2.rectangle(image, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    text_w, text_h = cv2.getTextSize(label, font, fontScale=font_scale, thickness=thickness)[0]

    # Put the label above the box when there is room for it (3 px margin);
    # otherwise draw it just inside the top edge of the box.
    fits_above = top_left[1] - text_h >= 3
    if fits_above:
        background_y = top_left[1] - text_h - 3
        baseline_y = top_left[1] - 2
    else:
        background_y = top_left[1] + text_h + 3
        baseline_y = top_left[1] + text_h + 2

    # Filled rectangle (thickness -1) behind the text so it stays readable.
    cv2.rectangle(image, top_left, (top_left[0] + text_w, background_y), color, -1, cv2.LINE_AA)
    cv2.putText(image,
                label,
                (top_left[0], baseline_y),
                font,
                font_scale,
                txt_color,
                thickness=thickness,
                lineType=cv2.LINE_AA)
        
def iou(box: np.ndarray, boxes: np.ndarray):
    """Compute the Intersection over Union of one box against many boxes.

    Args:
        box: 1-D array-like whose first four items are x1, y1, x2, y2.
        boxes: 2-D array-like of shape (N, >=4); the first four columns of each
            row are x1, y1, x2, y2. Extra trailing columns are ignored.

    Returns:
        1-D array of N IoU values. Pairs whose union area is not positive
        (e.g. two zero-area boxes) yield 0.0 instead of NaN, so a following
        ``np.argmax`` is never hijacked by a NaN entry. An empty ``boxes``
        yields an empty array.
    """
    box = np.asarray(box)
    boxes = np.asarray(boxes)
    if boxes.size == 0:
        return np.zeros(0, dtype=float)

    # Corners of the overlap rectangle: the larger of the two top-left corners
    # and the smaller of the two bottom-right corners.
    overlap_min = np.maximum(boxes[:, :2], box[:2])
    overlap_max = np.minimum(boxes[:, 2:4], box[2:4])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_wh = np.maximum(overlap_max - overlap_min, 0)
    inter = overlap_wh[:, 0] * overlap_wh[:, 1]

    area_boxes = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    area_box = (box[2] - box[0]) * (box[3] - box[1])
    union = area_box + area_boxes - inter

    # Divide only where the union is positive; substitute a harmless divisor
    # elsewhere so no division-by-zero warning or NaN is produced.
    has_area = union > 0
    return np.where(has_area, inter / np.where(has_area, union, 1), 0.0)

In [None]:
# Load the detector weights.
model = YOLO('./model/yolo11n.pt')

# Open the input video and fail fast if it cannot be read. Without this check a
# bad path yields fps 0 and size (0, 0), the processing loop silently does
# nothing, and the previous results file has already been truncated.
cap = cv2.VideoCapture("./video/1.mp4")
if not cap.isOpened():
    cap.release()
    raise RuntimeError('Could not open input video "./video/1.mp4"')

# Properties of the input, reused for the output video and the tracker.
fps = cap.get(cv2.CAP_PROP_FPS)
size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
fNUMS = cap.get(cv2.CAP_PROP_FRAME_COUNT)

# Annotated output video: same frame rate and frame size as the input.
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
videoWriter = cv2.VideoWriter("./video/1_t.mp4", fourcc, fps, size)

# Text file receiving one line per track per frame; it stays open across cells
# and is closed by the processing cell.
output_file = open('./video/tracking_results1.txt', 'w')

byte_tracker = BYTETracker(BYTETrackerArgs(), frame_rate=fps)
frame_count = 0  # Index of the frame currently being processed

In [5]:
# Per-frame pipeline: detect -> keep persons -> update tracker -> log and draw
# each track -> append the annotated frame to the output video.
# The try/finally guarantees that the capture, the writer and the results file
# are released even if a frame raises part-way through the video.
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            # End of the stream (or a read error): stop processing.
            break

        # Run the model on the current frame with a confidence threshold.
        results = model(frame, conf=0.5, verbose=False)

        # Detections as a NumPy array; the indexing below treats each row as
        # x1, y1, x2, y2, confidence, class id.
        outputs = results[0].boxes.data.cpu().numpy()

        # Keep only the person class (class ID 0). Boolean indexing returns a
        # copy, so the in-place edit below does not touch the model's result.
        outputs = outputs[outputs[:, 5] == 0]

        # NOTE(review): every score is replaced by a constant before tracking,
        # so the confidence column written to the results file is always 0.950
        # rather than the detector's score -- confirm this is intended.
        outputs[:, 4] = 0.95

        # Always update the tracker, even with zero detections, so that it
        # sees every frame.
        tracks = byte_tracker.update(outputs[:, :5], img_info=frame.shape, img_size=frame.shape)

        # With no detections there is nothing to associate a track with, and
        # np.argmax would raise on an empty array.
        if len(outputs) > 0:
            for track in tracks:
                # Pick the detection that overlaps the tracked box the most.
                box_iou = iou(track.tlbr, outputs[:, :4])
                maxindex = np.argmax(box_iou)

                box = outputs[maxindex]
                x1, y1, x2, y2 = map(int, box[:4])
                confidence = box[4]

                # Format: Frame number, Track ID, x1, y1, x2, y2, Confidence
                output_file.write(f"{frame_count},{track.track_id},{x1},{y1},{x2},{y2},{confidence:.3f}\n")

                # All remaining detections are persons, so label unconditionally.
                box_label(frame, box, '#' + str(track.track_id) + ' person', (167, 146, 11))

        # Display the frame (disabled):
        # cv2.imshow("ByteTrack", frame)
        videoWriter.write(frame)
        frame_count += 1

        # Exit the loop if 'q' is pressed. Presumably this only has an effect
        # when the imshow line above is re-enabled, since no window is open.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release resources. The results file is closed before the GUI teardown so
    # that a failure there cannot leave it open.
    cap.release()
    videoWriter.release()
    output_file.close()
    cv2.destroyAllWindows()