# Object Tracking
## Computer Vision


### Object Detection

Use a pre-trained YOLOv5 model to detect people in each frame of the video. The model should identify the bounding boxes around each person along with the confidence scores.

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Clone the YOLOv5 repository
!git clone https://github.com/ultralytics/yolov5
%cd yolov5

# Install dependencies for YOLOv5
!pip install -r requirements.txt

# Clone the DeepSORT repository
!git clone https://github.com/mikel-brostrom/Yolov5_DeepSort_Pytorch.git
%cd Yolov5_DeepSort_Pytorch

# Install dependencies for DeepSORT
!pip install -r requirements.txt

# Go back to the main directory
%cd ..

fatal: destination path 'yolov5' already exists and is not an empty directory.
/content/yolov5
fatal: destination path 'Yolov5_DeepSort_Pytorch' already exists and is not an empty directory.
/content/yolov5/Yolov5_DeepSort_Pytorch
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m/content/yolov5


In [3]:
!pip install deep-sort-realtime



### Object Tracking

Implement DeepSORT to track each detected person across frames. Each person should be assigned a unique ID that remains consistent throughout their appearance in the video.



### Tracking Output

Display the video with bounding boxes around each tracked person. Each bounding box should be labeled with the corresponding track ID.

In [5]:
import torch
import cv2

# Load YOLOv5 model (small version for speed)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Video path and output setup
video_path = "person.mp4"  # Ensure this file is uploaded
output_video_path = "output_detected_people.mp4"

# Open the video file
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# VideoWriter to save the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# Function to draw bounding boxes
def draw_bboxes(img, bbox, confidences):
    for i, box in enumerate(bbox):
        x1, y1, x2, y2 = [int(coord) for coord in box]
        conf = confidences[i]
        color = (255, 0, 0)
        label = f'Person {conf:.2f}'
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        cv2.putText(img, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, color, 2)
    return img

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Apply YOLOv5 to detect people in the frame
    results = yolo_model(frame)

    # Filter only 'person' detections (class 0 in YOLOv5)
    bbox_xyxy = []
    confidences = []
    for detection in results.xyxy[0]:
        if int(detection[-1]) == 0:  # class 0 corresponds to 'person'
            x1, y1, x2, y2, conf, _ = detection
            bbox_xyxy.append([x1.item(), y1.item(), x2.item(), y2.item()])
            confidences.append(conf.item())

    # Draw bounding boxes with confidence scores
    if bbox_xyxy:
        frame = draw_bboxes(frame, bbox_xyxy, confidences)

    # Write the frame with bounding boxes to the output video
    out.write(frame)

# Release everything
cap.release()
out.release()

# After processing, download the video file from Colab
from google.colab import files
files.download(output_video_path)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-9-13 Python-3.10.12 torch-2.4.0+cu121 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import torch
import cv2
from deep_sort_realtime.deepsort_tracker import DeepSort

# Load YOLOv5 model (small version for speed)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Initialize DeepSORT
deepsort = DeepSort(max_age=30, nn_budget=70, max_iou_distance=0.7)

# Video path and output setup
video_path = "person.mp4"  # Ensure this file is uploaded
output_video_path = "output_tracked_people.mp4"

# Open the video file
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# VideoWriter to save the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# Function to draw bounding boxes with track IDs
def draw_bboxes(img, bbox, identities=None, offset=(0, 0)):
    for i, box in enumerate(bbox):
        x1, y1, x2, y2 = [int(coord) for coord in box]
        id = int(identities[i]) if identities is not None else 0
        color = (255, 0, 0)
        label = '{}{:d}'.format("ID ", id)
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        cv2.putText(img, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, color, 2)
    return img

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Apply YOLOv5 to detect people in the frame
    results = yolo_model(frame)

    # Filter only 'person' detections (class 0 in YOLOv5)
    detections = []
    for detection in results.xyxy[0]:
        if int(detection[-1]) == 0:  # class 0 corresponds to 'person'
            x1, y1, x2, y2, conf, _ = detection
            detections.append(([x1.item(), y1.item(), x2.item(), y2.item()], conf.item()))  # bbox and confidence

    # If there are detections, pass them to DeepSORT for tracking
    if detections:
        tracks = deepsort.update_tracks(detections, frame=frame)  # Call DeepSORT

        # Collect bounding boxes and identities from the tracks
        bbox_xyxy = []
        identities = []
        for track in tracks:
            if not track.is_confirmed():
                continue
            bbox = track.to_ltrb()  # Get bbox coordinates (left, top, right, bottom)
            id = track.track_id  # Get track ID
            bbox_xyxy.append(bbox)
            identities.append(id)

        # Draw bounding boxes with track IDs
        frame = draw_bboxes(frame, bbox_xyxy, identities)

    # Write the frame with bounding boxes to the output video
    out.write(frame)

# Release everything
cap.release()
out.release()

# After processing, download the video file from Colab
from google.colab import files
files.download(output_video_path)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-9-13 Python-3.10.12 torch-2.4.0+cu121 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>