In [1]:

def compute_iou(box_a, box_b):
    """Compute the intersection-over-union (IoU) of two axis-aligned boxes.

    Each box is indexable as ``[x1, y1, x2, y2]`` (top-left and bottom-right
    corners). Widths and heights are measured inclusively (``x2 - x1 + 1``),
    so a box whose two corners coincide has an area of 1.

    Args:
        box_a: First box as ``[x1, y1, x2, y2]``.
        box_b: Second box as ``[x1, y1, x2, y2]``.

    Returns:
        float: intersection area divided by union area. ``0.0`` is returned
        when the union area is not positive (degenerate or inverted boxes)
        instead of raising ``ZeroDivisionError``.
    """
    # corners of the intersection rectangle
    x_a = max(box_a[0], box_b[0])
    y_a = max(box_a[1], box_b[1])
    x_b = min(box_a[2], box_b[2])
    y_b = min(box_a[3], box_b[3])

    # clamp each side at 0 so boxes that do not overlap give an empty intersection
    intersection_area = max(0, x_b - x_a + 1) * max(0, y_b - y_a + 1)
    box_a_area = (box_a[2] - box_a[0] + 1) * (box_a[3] - box_a[1] + 1)
    box_b_area = (box_b[2] - box_b[0] + 1) * (box_b[3] - box_b[1] + 1)

    union_area = box_a_area + box_b_area - intersection_area
    if union_area <= 0:
        # degenerate input: there is no meaningful overlap ratio to report
        return 0.0

    return intersection_area / float(union_area)

In [26]:
import cv2
import numpy as np
from ultralytics import YOLO

# Settings for the detector call and the frame-to-frame matching below.
VEHICLE_CLASS_IDS = [2, 7]     # class ids requested from the detector; id 2 is labelled "Car" below, anything else "Truck"
DETECTION_CONFIDENCE = 0.2     # value passed as `conf` to model.predict
MIN_MATCH_IOU = 0.2            # a detection must overlap the previous box by more than this to be accepted

model = YOLO('yolov9e.pt')

# here we load the initial bounding box
initial_bounding_box_file = '/home/radu/Facultate/Computer_Vision/Project2/train/Task3/14.txt'
with open(initial_bounding_box_file, 'r') as f:
    lines = f.readlines()
# first token of the first line: processing stops once the frame index exceeds it
stop_frame = int(lines[0].strip().split()[0])
# second line: leading value followed by the four box coordinates
initial_coords = list(map(int, lines[1].strip().split()))
bounding_box = initial_coords[1:]  # slicing off the leading value, keeping x1, y1, x2, y2

# here we load the video
video_path = '/home/radu/Facultate/Computer_Vision/Project2/train/Task3/14.mp4'
video = cv2.VideoCapture(video_path)
if not video.isOpened():
    raise IOError(f"Could not open video: {video_path}")

current_frame_index = 0
output_coordinates = []

try:
    all_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

    # first output row: reported frame count minus one, followed by four -1 placeholders
    output_coordinates.append([all_frames - 1, -1, -1, -1, -1])

    while True:
        ret, frame = video.read()

        if not ret or current_frame_index > stop_frame:
            break

        if frame is None:
            print(f"Frame {current_frame_index} is None")
            break

        results = model.predict(
            frame,
            classes=VEHICLE_CLASS_IDS,
            verbose=False,
            conf=DETECTION_CONFIDENCE,
        )

        best_iou = 0
        best_bounding_box = None
        best_confidence = 0
        best_class_id = None

        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # convert the coordinates to integers
                cls_id = int(box.cls[0])                # class id
                confidence = float(box.conf[0])         # confidence score as a plain float

                # compare the detection with the most recently accepted bounding box
                iou = compute_iou([x1, y1, x2, y2], bounding_box)

                # keep the detection that overlaps the tracked box the most
                if iou > best_iou:
                    best_iou = iou
                    best_bounding_box = [x1, y1, x2, y2]
                    best_confidence = confidence
                    # must be the same name that the label below reads
                    best_class_id = cls_id

        # if the best iou is higher than MIN_MATCH_IOU, draw and record the bounding box
        if best_bounding_box is not None and best_iou > MIN_MATCH_IOU:
            x1, y1, x2, y2 = best_bounding_box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
            label = f'{"Car" if best_class_id == 2 else "Truck"}: {best_confidence:.2f}'
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
            output_coordinates.append([current_frame_index, x1, y1, x2, y2])  # append the coordinates to the output
            bounding_box = best_bounding_box  # the next frame is matched against this box

            # print the frame and the best iou
            print(f'Frame {current_frame_index}, Best IOU: {best_iou:.4f}')
        else:
            # no acceptable detection: record the frame with -1 placeholders
            output_coordinates.append([current_frame_index, -1, -1, -1, -1])

        cv2.imshow('Tracking', frame)

        # pressing 'q' in the preview window stops the loop early
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # increment the frame index
        current_frame_index += 1
finally:
    # release the capture and close the preview window even if the loop raised
    video.release()
    cv2.destroyAllWindows()


output_file = '/home/radu/Facultate/Computer_Vision/Project2/train/Task3/output2.txt'
with open(output_file, 'w') as f:
    for coords in output_coordinates:
        f.write(' '.join(map(str, coords)) + '\n')

print(f"Output saved to {output_file}")

Initial Coordinates: [0, 392, 229, 449, 283]
Initial bounding box: [392, 229, 449, 283]
Frame 68, Best IOU: 0.2208
Frame 69, Best IOU: 1.0000
Frame 74, Best IOU: 0.8340
Frame 75, Best IOU: 1.0000
Frame 76, Best IOU: 1.0000
Frame 77, Best IOU: 1.0000
Frame 78, Best IOU: 0.9181
Frame 79, Best IOU: 1.0000
Frame 80, Best IOU: 0.9227
Frame 81, Best IOU: 1.0000
Frame 82, Best IOU: 0.9234
Frame 83, Best IOU: 1.0000
Frame 84, Best IOU: 0.9369
Frame 85, Best IOU: 1.0000
Frame 86, Best IOU: 0.9667
Frame 87, Best IOU: 1.0000
Frame 88, Best IOU: 1.0000
Frame 89, Best IOU: 1.0000
Frame 90, Best IOU: 0.9228
Frame 91, Best IOU: 1.0000
Frame 92, Best IOU: 0.9234
Frame 93, Best IOU: 1.0000
Frame 94, Best IOU: 0.9354
Frame 95, Best IOU: 1.0000
Frame 96, Best IOU: 0.9528
Frame 97, Best IOU: 1.0000
Frame 98, Best IOU: 0.9672
Frame 99, Best IOU: 0.9836
Frame 100, Best IOU: 1.0000
Frame 101, Best IOU: 1.0000
Frame 102, Best IOU: 0.8805
Frame 103, Best IOU: 1.0000
Frame 104, Best IOU: 0.9531
Frame 105, Best 

In [11]:
def load_bounding_boxes(file_path):
    """Read per-frame rows of integers from a whitespace-separated text file.

    The first line of the file is skipped. Every following non-blank line is
    split on whitespace and each token is converted with ``int``. Blank lines
    (for example a trailing empty line) are ignored rather than producing an
    empty row.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[list[int]]: one list of integers per data line, in file order.
        An empty or header-only file yields ``[]``.

    Raises:
        ValueError: if a token on a data line cannot be parsed as an integer.
    """
    boxes = []
    with open(file_path, 'r') as f:
        next(f, None)  # skip the first line; the default avoids StopIteration on an empty file
        for line in f:
            fields = line.split()
            if not fields:
                # blank line: nothing to parse
                continue
            boxes.append([int(token) for token in fields])
    return boxes

def evaluate_tracking(ground_truth_file, predicted_file, iou_threshold=0.3, accuracy_threshold=0.8):
    """Score a predicted track against a ground-truth track and print a summary.

    Both files are read with ``load_bounding_boxes``. Rows are paired by their
    position in the file (not by the frame number stored in column 0). A frame
    counts as correct when the IoU of the two boxes is strictly greater than
    ``iou_threshold``. Frames whose prediction is ``-1 -1 -1 -1`` are never
    counted as correct but still count towards the total.

    Args:
        ground_truth_file: Path of the ground-truth annotation file.
        predicted_file: Path of the tracker output file.
        iou_threshold: Minimum IoU (exclusive) for a frame to be correct.
        accuracy_threshold: Minimum accuracy (inclusive) reported as
            satisfactory.

    Returns:
        float: ``correct_frames / total_frames``, or ``0.0`` when the files
        contain no data rows.

    Raises:
        AssertionError: if the two files contain a different number of rows.
    """
    ground_truth_boxes = load_bounding_boxes(ground_truth_file)
    predicted_boxes = load_bounding_boxes(predicted_file)

    assert len(ground_truth_boxes) == len(predicted_boxes), "Mismatch in number of frames"

    total_frames = len(ground_truth_boxes)
    missing_prediction = [-1, -1, -1, -1]

    correct_frames = 0
    for gt_box, pred_box in zip(ground_truth_boxes, predicted_boxes):
        # column 0 is the frame number; the remaining four values are the box
        if pred_box[1:] == missing_prediction:
            continue  # no prediction for this frame, so it cannot be correct
        if compute_iou(gt_box[1:], pred_box[1:]) > iou_threshold:
            correct_frames += 1

    # guard against files without any data rows (would otherwise divide by zero)
    accuracy = correct_frames / total_frames if total_frames else 0.0
    print(f"Correct frames: {correct_frames}/{total_frames}, Accuracy: {accuracy:.2f}")

    if accuracy >= accuracy_threshold:
        print("Tracking performance is satisfactory.")
    else:
        print("Tracking performance is unsatisfactory.")

    return accuracy

# Score a saved tracker output file against its ground-truth annotations.
predicted_file = '/home/radu/Facultate/Computer_Vision/Project2/train/Task3/output1.txt'
ground_truth_file = '/home/radu/Facultate/Computer_Vision/Project2/train/Task3/ground-truth/01_gt.txt'

# Kept as the last expression of the cell so the returned accuracy is displayed.
evaluate_tracking(ground_truth_file=ground_truth_file, predicted_file=predicted_file)


Correct frames: 1771/1771, Accuracy: 1.00
Tracking performance is satisfactory.


1.0