In [1]:
import torch
from ultralytics import YOLO
import cv2
import numpy as np
from torchvision import models, transforms
from scipy.spatial.distance import cosine
from tqdm import tqdm
import os

In [2]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [3]:
# Load YOLOv11 model (pretrained and fine-tuned)
model = YOLO(r'D:\Liat_ai\best.pt')  # Raw string path

# Load ResNet18 feature extractor
resnet = models.resnet18(pretrained=True)
resnet.fc = torch.nn.Identity()
resnet = resnet.to(device).eval()

# Transform function for crops
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((128, 64)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])



In [4]:
def detect_players(frame):
    results = model(frame)[0]
    detections = []
    for box in results.boxes:
        cls_id = int(box.cls.item())
        conf = float(box.conf.item())
        if conf > 0.5:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            detections.append((x1, y1, x2, y2, conf))
    return detections


In [5]:
# Init re-id tracking
player_embeddings = {}
next_player_id = 0
SIMILARITY_THRESHOLD = 0.6

# Input video
cap = cv2.VideoCapture(r'D:\Liat_ai\15sec_input_720p.mp4')
if not cap.isOpened():
    raise ValueError("Error: video not opened!")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Output video
out = cv2.VideoWriter(r'D:\Liat_ai\output.mp4',
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      fps, (width, height))


In [6]:
frame_num = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    detections = detect_players(frame)
    current_frame_ids = []

    for (x1, y1, x2, y2, conf) in detections:
        crop = frame[y1:y2, x1:x2]
        if crop.size == 0:
            continue

        input_tensor = transform(crop).unsqueeze(0).to(device)
        with torch.no_grad():
            feature = resnet(input_tensor).squeeze().cpu().numpy()

        # Match with existing players
        best_pid = None
        best_similarity = 1

        for pid, prev_feat in player_embeddings.items():
            similarity = cosine(prev_feat, feature)
            if similarity < best_similarity:
                best_similarity = similarity
                best_pid = pid

        if best_similarity < SIMILARITY_THRESHOLD:
            current_frame_ids.append((x1, y1, x2, y2, best_pid))
            player_embeddings[best_pid] = feature
        else:
            player_embeddings[next_player_id] = feature
            current_frame_ids.append((x1, y1, x2, y2, next_player_id))
            next_player_id += 1

    # Draw results
    for (x1, y1, x2, y2, pid) in current_frame_ids:
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
        cv2.putText(frame, f'Player {pid}', (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

    out.write(frame)
    frame_num += 1
    if frame_num % 10 == 0:
        print(f"Processed {frame_num} frames...")

cap.release()
out.release()
print("✅ Done! Video saved.")



0: 384x640 1 ball, 16 players, 2 referees, 813.1ms
Speed: 6.5ms preprocess, 813.1ms inference, 9.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 players, 2 referees, 721.9ms
Speed: 2.5ms preprocess, 721.9ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 724.4ms
Speed: 2.0ms preprocess, 724.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 741.4ms
Speed: 2.2ms preprocess, 741.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 1207.1ms
Speed: 3.4ms preprocess, 1207.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 1255.6ms
Speed: 2.5ms preprocess, 1255.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 1125.3ms
Speed: 2.2ms preprocess, 1125.3ms inference, 1.1ms postproc

In [15]:
import cv2
import matplotlib.pyplot as plt

cap = cv2.VideoCapture(r'D:\Liat_ai\output.mp4')
ret, frame = cap.read()

if ret:
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    plt.imshow(frame_rgb)
    plt.axis('off')
    plt.title("First Frame from Output Video")
    plt.show()
else:
    print("❌ Could not read frame.")
    
cap.release()



error: OpenCV(4.11.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1301: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvShowImage'
