In [None]:
!pip install deep-sort-realtime
!pip install opencv-python-headless
!pip install ultralytics
!pip install torchreid

!git clone https://github.com/KaiyangZhou/deep-person-reid.git
%cd deep-person-reid
!pip install -e .

!pip install torchvision



Collecting deep-sort-realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Downloading deep_sort_realtime-1.3.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-sort-realtime
Successfully installed deep-sort-realtime-1.3.2


best.pt model inference

In [None]:
from ultralytics import YOLO
import cv2
import os
from IPython.display import Image, display
from torchvision.models import resnet50
from scipy.optimize import linear_sum_assignment
import torch
import numpy as np
from deep_sort_realtime.deepsort_tracker import DeepSort
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms

In [None]:
# Make sure to change the path to where the best.pt model is stored
yolo = YOLO("best.pt")

# Read only the first frame of the video for a quick single-image sanity check
video_path = "Videos/broadcast.mp4"
cap = cv2.VideoCapture(video_path)
ret, frame = cap.read()
cap.release()

if not ret:
    raise ValueError("Could not read a frame from the video.")

# Run inference
results = yolo(frame)

# Annotate image with bounding boxes
annotated_frame = results[0].plot()

# OPTIONAL: save the annotated image to disk
# cv2.imwrite("annotated_output.jpg", annotated_frame)

# Show it inline. The frame is JPEG-encoded in memory so the cell does not
# depend on a file that was written by an earlier (now commented-out) step.
encoded_ok, jpeg_buffer = cv2.imencode(".jpg", annotated_frame)
if not encoded_ok:
    raise ValueError("Could not encode the annotated frame as JPEG.")
display(Image(data=jpeg_buffer.tobytes(), format="jpeg"))


In [None]:

# --- Config ---
VIDEO_PATH = "Videos/broadcast.mp4"
OUTPUT_PATH = "Videos/broadcast_yolo_only.mp4"
PLAYER_CLASS_ID = 2  # index of 'player' in yolo.names

class_names = yolo.names  # e.g., {0: 'ball', 1: 'goalkeeper', 2: 'player', 3: 'referee'}

# --- Open Video ---
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    cap.release()
    raise ValueError(f"Could not open video: {VIDEO_PATH}")

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would change
# the playback speed of the written video.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

# --- Frame-by-frame detection ---
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Run YOLO inference (verbose=False avoids one log line per frame)
        results = yolo(frame, verbose=False)[0]

        # Draw detections
        for box in results.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = box.conf[0].item()
            cls = int(box.cls[0])
            label = f"{class_names[cls]} {conf:.2f}"
            # green for player, blue for others (colors are BGR tuples)
            color = (0, 255, 0) if cls == PLAYER_CLASS_ID else (255, 0, 0)

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        out.write(frame)
finally:
    # --- Cleanup --- (runs even if inference or drawing raises)
    cap.release()
    out.release()

print("Done! YOLO-only tracked video saved to:", OUTPUT_PATH)



0: 384x640 3 players, 2689.2ms
Speed: 3.4ms preprocess, 2689.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 3118.8ms
Speed: 4.0ms preprocess, 3118.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 players, 3634.1ms
Speed: 5.6ms preprocess, 3634.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 players, 2679.6ms
Speed: 4.9ms preprocess, 2679.6ms inference, 5.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 players, 2518.2ms
Speed: 4.6ms preprocess, 2518.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 players, 1 referee, 2734.2ms
Speed: 3.5ms preprocess, 2734.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 goalkeeper, 12 players, 1 referee, 3973.3ms
Speed: 3.6ms preprocess, 3973.3ms inference, 3.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 1 goalkeeper, 14 players,

In [None]:
# Show the class-id -> class-name mapping of the loaded YOLO model
print(yolo.names)

{0: 'ball', 1: 'goalkeeper', 2: 'player', 3: 'referee'}


In [None]:
# --- Cross-Camera Player Re-Identification Pipeline using Visual, Spatial, and Temporal Features ---

# Folders for the input videos and for the rendered results
VIDEO_DIR = "Videos"
OUTPUT_DIR = "Results"

# Minimum detection confidence for a box to be kept, and minimum combined
# similarity score for two tracks to be treated as the same player
CONF_THRESHOLD = 0.7
SIMILARITY_THRESHOLD = 0.90

# ------------------------ SETUP --------------------------

# Run the embedding network on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module-level DeepSORT tracker
tracker = DeepSort(n_init=2, max_age=30)

# Pretrained ResNet-50 with its last child module removed, so the forward
# pass returns features instead of the output of that final module
resnet = torch.nn.Sequential(
    *list(resnet50(weights=models.ResNet50_Weights.DEFAULT).children())[:-1]
)
resnet = resnet.to(device)
resnet.eval()

# Image crop (array) -> PIL image -> 224x224 -> tensor -> per-channel normalization
preprocess = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# ------------------------ FUNCTIONS --------------------------
def extract_embedding(crop):
    """Compute an L2-normalized appearance embedding for an image crop.

    Args:
        crop: Image array of shape (height, width, channels), or None. It is
            expected to be cut from an OpenCV frame, i.e. in BGR channel order.

    Returns:
        A 1-D NumPy array holding the normalized feature vector, or None when
        the crop is missing, smaller than 10 pixels in either dimension, or
        feature extraction fails.
    """
    if crop is None or crop.shape[0] < 10 or crop.shape[1] < 10:
        return None
    try:
        # OpenCV frames are BGR, while the preprocessing pipeline
        # (ToPILImage + the normalization constants) assumes RGB.
        rgb_crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        img_tensor = preprocess(rgb_crop).unsqueeze(0).to(device)
        with torch.no_grad():
            # Collapse the network output to a flat feature vector
            feat = resnet(img_tensor).flatten()
            emb = F.normalize(feat, dim=0).cpu().numpy()
        return emb
    except Exception:
        # Best effort: a crop that cannot be embedded is skipped by the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return None

def extract_track_features(video_path):
    """Track players through a video and summarize each track.

    Every frame is run through the YOLO model; boxes of class 2 whose
    confidence exceeds CONF_THRESHOLD are passed to a DeepSORT tracker. For
    each confirmed track the appearance embedding of its crop and its
    normalized box centre are collected per frame and then aggregated.

    Args:
        video_path: Path of the video file to process.

    Returns:
        Dict mapping track id -> (avg_feat, avg_pos, motion_flat) where
        avg_feat is the re-normalized mean embedding, avg_pos the mean
        (cx, cy) centre in frame-relative coordinates, and motion_flat a
        zero-padded vector of length 20 holding up to the last 10 frame-to-
        frame centre displacements.

    Raises:
        IOError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    # Use a fresh tracker per video. Sharing one tracker across calls would
    # carry tracks and the id counter over from the previous video, and the
    # ids would no longer line up with remap_and_save, which also starts from
    # a fresh tracker with these settings.
    video_tracker = DeepSort(max_age=30, n_init=2)

    track_data = {}
    frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            results = yolo(frame, verbose=False)[0]
            detections = []
            for box in results.boxes:
                cls = int(box.cls[0])
                conf = float(box.conf[0])
                if cls == 2 and conf > CONF_THRESHOLD:
                    x1, y1, x2, y2 = box.xyxy[0]
                    # Detection format: ([left, top, width, height], confidence, label)
                    detections.append(([float(x1), float(y1), float(x2 - x1), float(y2 - y1)], conf, "player"))

            tracks = video_tracker.update_tracks(detections, frame=frame)

            img_h, img_w = frame.shape[:2]
            for track in tracks:
                if not track.is_confirmed():
                    continue
                tid = track.track_id
                l, t, r, b = map(int, track.to_ltrb())
                cx = (l + r) / 2 / frame_w
                cy = (t + b) / 2 / frame_h
                # Clamp the box to the frame: a negative coordinate would be
                # interpreted by NumPy as an index from the end and produce a
                # wrong (or empty) crop.
                crop = frame[max(t, 0):min(b, img_h), max(l, 0):min(r, img_w)]
                feature = extract_embedding(crop)
                if feature is None:
                    continue
                if tid not in track_data:
                    track_data[tid] = {"features": [], "positions": []}
                track_data[tid]["features"].append(feature)
                track_data[tid]["positions"].append([cx, cy])
    finally:
        # Release the capture even if detection or tracking raises
        cap.release()

    # Aggregate features
    final_tracks = {}
    for tid, data in track_data.items():
        feats = np.array(data["features"])
        pos = np.array(data["positions"])

        # Visual: average embedding, re-normalized to unit length
        avg_feat = feats.mean(axis=0)
        norm = np.linalg.norm(avg_feat)
        if norm > 0:  # avoid NaNs from dividing by a zero norm
            avg_feat = avg_feat / norm

        # Spatial: average position
        avg_pos = pos.mean(axis=0)

        # Temporal: last (up to) 10 displacement vectors, flattened and
        # zero-padded to a fixed length of 20
        motion = pos[1:] - pos[:-1]
        max_len = 10
        motion_flat = np.zeros(max_len * 2)
        valid_motion = motion[-max_len:]
        motion_flat[:valid_motion.size] = valid_motion.flatten()

        final_tracks[tid] = (avg_feat, avg_pos, motion_flat)

    return final_tracks




def match_tracks_hungarian(source_tracks, target_tracks):
    """Assign target tracks to source tracks with the Hungarian algorithm.

    Each track is an (embedding, mean position, motion vector) tuple. The
    pairwise score is 0.6 * cosine similarity of the embeddings
    + 0.25 * (1 - position distance) + 0.15 * (1 - motion distance), and the
    assignment minimizes the total of (1 - score).

    Args:
        source_tracks: Dict of source track id -> feature tuple.
        target_tracks: Dict of target track id -> feature tuple.

    Returns:
        Dict mapping each assigned target id (as int) to the matched source
        id (as int) when the pair's score reaches SIMILARITY_THRESHOLD,
        otherwise to the target id plus 1000.
    """
    source_ids = list(source_tracks)
    target_ids = list(target_tracks)

    # Rows are targets, columns are sources
    costs = np.zeros((len(target_ids), len(source_ids)))
    for row, target_key in enumerate(target_ids):
        tgt_emb, tgt_pos, tgt_motion = target_tracks[target_key]
        for col, source_key in enumerate(source_ids):
            src_emb, src_pos, src_motion = source_tracks[source_key]

            visual = cosine_similarity([tgt_emb], [src_emb])[0][0]
            spatial = 1 - np.linalg.norm(tgt_pos - src_pos)
            temporal = 1 - np.linalg.norm(tgt_motion - src_motion)

            # Weighted blend of the three cues, stored as a cost
            combined = 0.6 * visual + 0.25 * spatial + 0.15 * temporal
            costs[row, col] = 1 - combined

    assigned_rows, assigned_cols = linear_sum_assignment(costs)

    mapping = {}
    for row, col in zip(assigned_rows, assigned_cols):
        similarity = 1 - costs[row, col]
        target_id = int(target_ids[row])
        source_id = int(source_ids[col])
        # Below the threshold the target gets an offset id instead of a match
        mapping[target_id] = (
            source_id if similarity >= SIMILARITY_THRESHOLD else target_id + 1000
        )

    return mapping


def remap_and_save(video_path, id_map, output_path):
    """Re-track a video and write a copy annotated with remapped player ids.

    The video is run through the YOLO model and a fresh DeepSORT tracker.
    Each confirmed track is drawn with the id found in ``id_map`` for
    ``int(track.track_id)``; ids missing from the map are shown as the
    tracker id plus 1000.

    Args:
        video_path: Path of the input video.
        id_map: Dict mapping integer tracker ids to the ids to display.
        output_path: Path of the annotated video to write (mp4v codec).

    Raises:
        IOError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    # Make sure the destination folder exists before the writer is created;
    # otherwise no output file would be produced.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Keep the frame rate as a float so e.g. 29.97 fps is not truncated to 29
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                  int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    tracker = DeepSort(max_age=30, n_init=2)

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            results = yolo(frame, verbose=False)[0]
            detections = []
            for box in results.boxes:
                cls = int(box.cls[0])
                conf = float(box.conf[0])
                if cls == 2 and conf > CONF_THRESHOLD:
                    x1, y1, x2, y2 = box.xyxy[0]
                    # Detection format: ([left, top, width, height], confidence, label)
                    detections.append(([float(x1), float(y1), float(x2 - x1), float(y2 - y1)], conf, "player"))

            tracks = tracker.update_tracks(detections, frame=frame)

            for track in tracks:
                if not track.is_confirmed():
                    continue

                tid_int = int(track.track_id)
                # Tracks without an entry in the map get an offset id
                mapped_id = id_map.get(tid_int, tid_int + 1000)

                l, t, r, b = map(int, track.to_ltrb())
                cv2.rectangle(frame, (l, t), (r, b), (0, 255, 0), 2)
                cv2.putText(frame, f"ID: {mapped_id}", (l, t - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

            out.write(frame)
    finally:
        # Release both handles even if processing fails part-way through
        cap.release()
        out.release()

# ------------------------ MAIN --------------------------
if __name__ == "__main__":
    broadcast_path = os.path.join(VIDEO_DIR, "broadcast.mp4")
    tacticam_path = os.path.join(VIDEO_DIR, "tacticam.mp4")

    # The result videos are written into OUTPUT_DIR, so make sure it exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Extracting visual + spatial + temporal features from broadcast
    b_tracks = extract_track_features(broadcast_path)

    # Extracting visual + spatial + temporal features from tacticam
    t_tracks = extract_track_features(tacticam_path)

    # Matching players across videos (tacticam ids -> broadcast ids)
    id_map = match_tracks_hungarian(b_tracks, t_tracks)

    # Saving the remapped video
    remap_and_save(tacticam_path, id_map, os.path.join(OUTPUT_DIR, "tacticam_remapped.mp4"))

    # Saving the tracked video with an identity mapping. remap_and_save looks
    # ids up as int(track.track_id), so the keys must be ints as well;
    # otherwise every lookup misses and each id is shown with the +1000 offset.
    broadcast_ids = {int(k): int(k) for k in b_tracks}
    remap_and_save(broadcast_path, broadcast_ids, os.path.join(OUTPUT_DIR, "broadcast_tracked.mp4"))

    