<a href="https://colab.research.google.com/github/VanshJain4/bytetracker/blob/main/bytetracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =======================
# 1. Install YOLOv8 (Ultralytics)
# =======================
!pip install ultralytics --upgrade -q
from IPython.display import clear_output
clear_output()

# =======================
# 2. Import libraries
# =======================
import cv2
import os
from ultralytics import YOLO
from IPython.display import Video
from tqdm import tqdm

# =======================
# 3. Upload your input video
# =======================
from google.colab import files
# Open the Colab upload dialog; the keys of the returned mapping are used below
# as the path of the uploaded file.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a video.")

# Get uploaded video path (first uploaded file)
video_path = next(iter(uploaded))

# =======================
# 4. Load YOLOv8 model (pretrained on COCO)
# =======================
model = YOLO("yolov8n.pt")  # Options: yolov8n.pt, yolov8s.pt, yolov8m.pt, etc.

# =======================
# 5. Open input video
# =======================
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail early with a clear message instead of silently writing an empty video.
    raise RuntimeError(f"Could not open input video: {video_path}")

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps    = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers report 0 (or NaN) FPS; the writer needs a positive rate.
    fps = 30.0

# Output video writer
out_path = "output_people_detected.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))

# =======================
# 6. Process video frame by frame
# =======================
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break  # end of stream (or read error)

        # Run YOLOv8 inference on the frame exactly as OpenCV decoded it
        results = model(frame, verbose=False)[0]

        # Draw bounding boxes for 'person' class (class_id = 0 in COCO)
        for box in results.boxes:
            if int(box.cls[0]) != 0:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            label = f'Person {conf:.2f}'
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the label inside the image when the box touches the top edge.
            label_y = max(y1 - 10, 15)
            cv2.putText(frame, label, (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        out.write(frame)
finally:
    # Always release the capture and finalize the output file, even if
    # inference raises part-way through the video.
    cap.release()
    out.release()

# =======================
# 7. Display the output video
# =======================
Video(out_path, embed=True)


In [None]:
import os

# Folder that receives the plain-text detection dump
output_txt_dir = "yolo_dets"
os.makedirs(output_txt_dir, exist_ok=True)

frame_id = 0  # Frame index written with every row; bump it once per frame

# Intended to run inside the video loop, right after `results` is produced:
# build one "frame,x,y,w,h,conf" row per detected person (COCO class 0).
detections = []
for det in results.boxes:
    if int(det.cls[0]) != 0:
        continue  # keep the person class only
    x1, y1, x2, y2 = (float(v) for v in det.xyxy[0])
    w, h = x2 - x1, y2 - y1
    conf = float(det.conf[0])
    detections.append(f"{frame_id},{x1:.2f},{y1:.2f},{w:.2f},{h:.2f},{conf:.4f}")

# Append this frame's rows to the shared results file
results_file = os.path.join(output_txt_dir, "results.txt")
with open(results_file, "a") as f:
    f.writelines(row + "\n" for row in detections)

frame_id += 1


In [None]:
import os

# Make sure the detections folder exists before anything is written into it
# (exist_ok=True makes re-running the cell harmless).
os.makedirs("yolo_dets", exist_ok=True)

def save_detections_for_bytetrack(results, video_name="video", conf_threshold=0.3,
                                  *, output_dir="yolo_dets", class_id=0):
    """Write per-frame detections of one class to a comma-separated text file.

    Each kept detection becomes one row:
    ``frame,x,y,w,h,conf,-1,-1,-1`` where ``frame`` is 1-based, ``x``/``y`` is
    the top-left corner and ``w``/``h`` the box size derived from ``xyxy``.

    Args:
        results: Iterable with one entry per frame; each entry exposes
            ``.boxes``, an iterable of boxes with tensor-like ``cls``, ``conf``
            and ``xyxy`` attributes (``.cpu().numpy()`` is called on each).
        video_name: Base name of the output file (``<video_name>.txt``).
        conf_threshold: Detections with a confidence below this are dropped.
        output_dir: Directory for the output file; created when missing.
        class_id: Class index to keep (0 is "person" in COCO).

    Returns:
        None. The file is overwritten on every call.
    """
    import os  # local import so the function does not depend on cell order

    # Create the target folder here instead of relying on the caller.
    os.makedirs(output_dir, exist_ok=True)
    out_file = os.path.join(output_dir, f"{video_name}.txt")

    with open(out_file, 'w') as f:
        # Frame ids are written 1-based.
        for frame_id, result in enumerate(results, start=1):
            for box in result.boxes:
                if int(box.cls.cpu().numpy()[0]) != class_id:
                    continue
                conf = float(box.conf.cpu().numpy()[0])
                if conf < conf_threshold:
                    continue
                x1, y1, x2, y2 = box.xyxy.cpu().numpy()[0]
                w, h = x2 - x1, y2 - y1
                f.write(f"{frame_id},{x1},{y1},{w},{h},{conf},-1,-1,-1\n")

# Pass your YOLOv8 results here
# NOTE(review): the function enumerates `results` as one entry per frame. If
# `results` is still the single per-frame object left behind by the detection
# loop, the frame ids written will not correspond to video frames — collect a
# list with one result per frame and pass that instead. TODO confirm.
save_detections_for_bytetrack(results)


In [None]:
import sys
import os
import torch
import cv2
import numpy as np

# Add yolox repo path to Python
# NOTE(review): the imports below use the `yolox.` package prefix, so the
# directory put on sys.path normally has to be the PARENT of the `yolox`
# folder rather than the folder itself — verify this path against the checkout.
sys.path.append('/content/ByteTrack-cpp/build/ByteTrack/yolox')  # adjust if needed

from yolox.exp import get_exp
from yolox.utils import postprocess


In [None]:
import cv2
from ultralytics import YOLO
import numpy as np

# Load the detector weights from the Colab working directory.
model = YOLO('/content/yolov8n.pt')

# Color palette: one random 3-channel color per class id, indexed by class id
# when drawing. Seeding makes the palette identical on every run.
np.random.seed(42)  # fixed seed -> reproducible colors
num_classes = 80  # number of rows in the palette (COCO class count); adjust if needed
# randint's upper bound is exclusive, so channel values fall in 0..254.
colors = np.random.randint(0, 255, size=(num_classes, 3), dtype=np.uint8)

# Class-id -> display-name table used for box labels; the list position is the
# class id (COCO ordering, "person" at index 0).
coco_classes = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
    "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
    "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "TV",
    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
    "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush"
]


# Annotate every detected object (all classes) and save the result as a video.
annotated_path = '/content/output_annotated_better.mp4'

cap = cv2.VideoCapture("/content/output_people_detected.mp4")
if not cap.isOpened():
    raise RuntimeError("Could not open /content/output_people_detected.mp4")

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
# Keep the source frame rate so playback speed is unchanged; fall back to 30
# when the container reports no usable value (0 or NaN).
source_fps = cap.get(cv2.CAP_PROP_FPS)
if not source_fps > 0:
    source_fps = 30
out = None  # created lazily once the first frame's size is known

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Ultralytics treats numpy frames as BGR (OpenCV's native order), so the
        # decoded frame is passed as-is; converting to RGB first would swap the
        # red and blue channels seen by the model.
        results = model(frame)
        result = results[0]

        boxes = result.boxes.xyxy.cpu().numpy()
        scores = result.boxes.conf.cpu().numpy()
        classes = result.boxes.cls.cpu().numpy().astype(int)

        for box, score, cls in zip(boxes, scores, classes):
            x1, y1, x2, y2 = box.astype(int)
            # Wrap the palette index so an unexpected class id cannot raise.
            color = tuple(int(c) for c in colors[cls % len(colors)])
            class_name = coco_classes[cls] if cls < len(coco_classes) else f"Class {cls}"
            label = f"{class_name} {score:.2f}"

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        if out is None:
            height, width = frame.shape[:2]
            out = cv2.VideoWriter(annotated_path, fourcc, source_fps, (width, height))

        out.write(frame)
finally:
    cap.release()
    # `out` stays None when no frame could be read; releasing None would raise.
    if out is not None:
        out.release()

if out is None:
    print("No frames could be read; no video was written")
else:
    print("Video saved as /content/output_annotated_better.mp4")


In [None]:
from yolox.tracker.byte_tracker import BYTETracker
from yolox.tracker.byte_tracker import STrack
import numpy as np
# Compatibility shim: re-create the `np.float` alias as the builtin float so
# that tracker code still referring to `np.float` keeps working on NumPy
# releases that no longer provide it.
np.float = float  # Temporary fix for deprecated np.float usage


class Args:
    """Mock of the basic ByteTrack configuration handed to BYTETracker.

    Plain class attributes stand in for the parsed command-line arguments the
    tracker would normally receive. The per-field notes are assumptions drawn
    from the names — confirm against the ByteTrack sources.
    """

    # Dataset switch; left disabled here.
    mot20 = False
    # Presumably the smallest box area worth keeping — TODO confirm.
    min_box_area = 10
    # Presumably a width/height ratio limit for boxes — TODO confirm.
    aspect_ratio_thresh = 1.6
    # Presumably the association (matching) threshold — TODO confirm.
    match_thresh = 0.8
    # Presumably how many frames a lost track is buffered — TODO confirm.
    track_buffer = 30
    # Presumably the detection-score threshold for tracking — TODO confirm.
    track_thresh = 0.5

args = Args()
tracker = BYTETracker(args, frame_rate=30)

# Load detections saved to file (rows: frame,x,y,w,h,score,...)
detections_file = "yolo_dets/video.txt"

# First pass: group detections by frame id, converting x,y,w,h into the
# [x1, y1, x2, y2, score] rows handed to the tracker.
dets_by_frame = {}
with open(detections_file, 'r') as f:
    for line in f:
        items = line.strip().split(',')
        if len(items) < 6:
            continue  # skip blank or truncated rows instead of crashing
        fid = int(items[0])
        x1, y1, w, h, score = map(float, items[1:6])
        dets_by_frame.setdefault(fid, []).append([x1, y1, x1 + w, y1 + h, score])

# Second pass: feed the tracker one update per frame, in order. Frames without
# detections get an empty (0, 5) array so the tracker still advances, and the
# final frame is processed like every other one.
# NOTE: `height` and `width` are expected to be defined by an earlier cell.
track_results = []
if dets_by_frame:
    for frame_id in range(min(dets_by_frame), max(dets_by_frame) + 1):
        track_dets = dets_by_frame.get(frame_id, [])
        det_array = np.array(track_dets, dtype=float) if track_dets else np.empty((0, 5))
        online_targets = tracker.update(det_array, (height, width), (height, width))
        for t in online_targets:
            tlwh = t.tlwh
            tid = t.track_id
            track_results.append(f"{frame_id},{int(tid)},{tlwh[0]:.2f},{tlwh[1]:.2f},{tlwh[2]:.2f},{tlwh[3]:.2f},1,-1,-1,-1")

# Optional: save results
with open("bytetrack_results.txt", "w") as f:
    f.writelines(row + "\n" for row in track_results)


In [None]:
from google.colab import files
# Open the Colab upload dialog; the keys of the returned mapping are used below
# as the path of the uploaded file.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a video.")

# Get uploaded video path (first uploaded file)
video_path = next(iter(uploaded))

In [None]:
# Install/upgrade Ultralytics (the YOLO object-detection package); -q keeps pip quiet
!pip install ultralytics --upgrade -q
# Install yolox dependencies
# cython-bbox: fast bounding-box utility library
!pip install cython-bbox
!git clone https://github.com/ifzhang/ByteTrack.git
%cd ByteTrack
# Install ByteTrack's dependencies, which are listed in requirements.txt
!pip install -r requirements.txt
# Install ByteTrack as an editable (development) package so it can be imported as a Python package
!python setup.py develop


In [None]:
!grep -rnw '/content/ByteTrack' -e 'np.float'
!find /content/ByteTrack -type f -name "*.py" -exec sed -i 's/np.float/float/g' {} +
%cd /content/ByteTrack
!python setup.py develop

In [None]:
#for logging
!pip install loguru
#needed for assignment problem in tracking algorithm
!pip install lap
!sed -i 's/float32/np.float32/g' /content/ByteTrack/yolox/utils/visualize.py


In [None]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
import sys

# Make sure the output folder for YOLO detections exists;
# the folder "yolo_dets" is created if it is missing.
os.makedirs("yolo_dets", exist_ok=True)

# Workaround for outdated NumPy usage: re-create the `np.float` alias as the
# builtin float for code that still references it.
np.float = float

# Add the ByteTrack checkout to the Python path so its tracker class can be imported
sys.path.append('/content/ByteTrack')
from yolox.tracker.byte_tracker import BYTETracker

class Args:
    """ByteTrack configuration passed to BYTETracker.

    About ``track_thresh``: lowering it jeopardizes accuracy but makes sure
    that more people are picked up by the tracker. Author's observation on the
    current video: with 0.3, where YOLO detects 13 people, 8 are shown as
    tracked by ByteTrack.

    The remaining per-field notes are assumptions drawn from the names —
    confirm against the ByteTrack sources.
    """

    # Dataset switch; left disabled here.
    mot20 = False
    # Presumably the smallest box area worth keeping — TODO confirm.
    min_box_area = 1
    # Presumably a width/height ratio limit for boxes — TODO confirm.
    aspect_ratio_thresh = 1.6
    # Presumably the association (matching) threshold — TODO confirm.
    match_thresh = 0.8
    # Presumably how many frames a lost track is buffered — TODO confirm.
    track_buffer = 30
    # See the class docstring for the accuracy/recall trade-off.
    track_thresh = 0.3

args = Args()
tracker = BYTETracker(args, frame_rate=30)

# Load YOLOv8 model, a pretrained model
model = YOLO('/content/yolov8n.pt')  # Adjust path as needed

# Video paths
input_video = "/content/palace.mp4"  # Change as needed
output_video = "/content/output_tracked.mp4"
output_txt = "/content/bytetrack_results.txt"
yolo_dets_txt = "/content/yolo_detections.txt"

# Only person detections (COCO class 0) above this confidence reach the tracker
person_class_id = 0
min_det_conf = 0.3

# Open video
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    # Fail early instead of silently producing an empty video and result file.
    raise RuntimeError(f"Could not open input video: {input_video}")
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Video writer for output; it needs a positive frame rate, so fall back to 30
# when the container reports 0 or NaN.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer_fps = fps if fps > 0 else 30.0
out = cv2.VideoWriter(output_video, fourcc, writer_fps, (width, height))

# Prepare colors for drawing (one per track id, reused modulo the palette size)
np.random.seed(42)
colors = np.random.randint(0, 255, size=(1000, 3), dtype=np.uint8)

frame_id = 0
track_results = []

try:
    # The detections file is opened once and truncated, so re-running the cell
    # does not append duplicate rows to a previous run's output.
    with open(yolo_dets_txt, "w") as f_dets:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_id += 1

            # Run YOLOv8 on the frame as decoded by OpenCV. Ultralytics treats
            # numpy frames as BGR, so no BGR->RGB conversion is applied here.
            results = model(frame)[0]

            # Filter detections: person class, confidence above the threshold.
            xyxy = results.boxes.xyxy.cpu().numpy()
            conf = results.boxes.conf.cpu().numpy()
            cls = results.boxes.cls.cpu().numpy().astype(int)
            keep = (cls == person_class_id) & (conf > min_det_conf)
            # Rows of [x1, y1, x2, y2, score]; shape (0, 5) when nothing is kept.
            dets = np.column_stack((xyxy[keep], conf[keep]))

            for x1, y1, x2, y2, score in dets:
                f_dets.write(f"{frame_id},{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f},{score:.3f}\n")

            # The tracker is updated on every frame, including empty ones.
            online_targets = tracker.update(dets, (height, width), (height, width))

            # Draw boxes and IDs
            for t in online_targets:
                tid = t.track_id
                x, y, w, h = map(int, t.tlwh)
                color = tuple(int(c) for c in colors[tid % len(colors)])

                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                cv2.putText(frame, f'ID:{tid}', (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

                track_results.append(f"{frame_id},{tid},{x},{y},{w},{h},1,-1,-1,-1")

            # Write frame to output video
            out.write(frame)

            # Display live (optional, uncomment if running locally)
            # cv2.imshow("Tracking", frame)
            # if cv2.waitKey(1) & 0xFF == ord('q'):
            #     break
finally:
    # Release the capture and finalize the output file even if a frame fails.
    cap.release()
    out.release()
    # cv2.destroyAllWindows()  # Uncomment if using imshow

# Save tracking results
with open(output_txt, "w") as f:
    f.writelines(row + "\n" for row in track_results)

print(f"Input video FPS: {fps}")
print(f"Video resolution: {width}x{height}")
print(f"Total frames processed: {frame_id}")
print(f"Tracking done. Video saved to {output_video}")
print(f"Tracking results saved to {output_txt}")
