## code to read frames from tacticam file and generate the bounding box for set of frames

In [17]:
from ultralytics import YOLO
import cv2
import os

#defin the model and video
model = YOLO('model/best.pt')

video_path = "dataset/tacticam.mp4"
cap = cv2.VideoCapture(video_path)

In [18]:
if not cap.isOpened():
    print("no capture found")
else:
    print("capture found")

capture found


In [19]:
# creating output dir
output_dir = "saved_output"
os.makedirs(output_dir, exist_ok=True)


In [20]:
#set threshold to process
max_frames = 5
frame_count = 0

In [21]:
## read the frames and process the boxes
while frame_count<max_frames:
    ret, frame = cap.read()

    #checking if we have frame
    if not ret:
        print("End of frames reacherd")
        break

    #run inference on model 
    output = model.predict(source=frame, conf=0.50, save = False, verbose=False)
    output = output[0]

    #get the annotated frame with labels
    annotated_frame = output.plot()
    
    #save the frame
    save_frame_dir = os.path.join(output_dir, f"frame_{frame_count}")
    os.makedirs(save_frame_dir, exist_ok=True)
    save_frame_obj_dir = os.path.join(save_frame_dir, f"frame_{frame_count}.jpg")
    cv2.imwrite(save_frame_obj_dir, annotated_frame)
    print(f"saved frame to : {save_frame_obj_dir}")

    #save each player crop
    boxes = output.boxes
    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":  
                crop = frame[y1:y2, x1:x2]
                player_path = os.path.join(save_frame_dir, f"player_{i+1}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"Saved player crop: {player_path}")

    frame_count += 1

cap.release()


saved frame to : saved_output/frame_0/frame_0.jpg
Saved player crop: saved_output/frame_0/player_1.jpg
Saved player crop: saved_output/frame_0/player_2.jpg
Saved player crop: saved_output/frame_0/player_3.jpg
Saved player crop: saved_output/frame_0/player_4.jpg
Saved player crop: saved_output/frame_0/player_5.jpg
Saved player crop: saved_output/frame_0/player_6.jpg
Saved player crop: saved_output/frame_0/player_7.jpg
Saved player crop: saved_output/frame_0/player_8.jpg
Saved player crop: saved_output/frame_0/player_9.jpg
Saved player crop: saved_output/frame_0/player_10.jpg
Saved player crop: saved_output/frame_0/player_11.jpg
Saved player crop: saved_output/frame_0/player_12.jpg
Saved player crop: saved_output/frame_0/player_13.jpg
Saved player crop: saved_output/frame_0/player_14.jpg
Saved player crop: saved_output/frame_0/player_15.jpg
Saved player crop: saved_output/frame_0/player_16.jpg
Saved player crop: saved_output/frame_0/player_17.jpg
Saved player crop: saved_output/frame_0/p

In [29]:
import os
import cv2
from ultralytics import YOLO

# Load the model
model = YOLO("model/best.pt")

# Open the video
video_path = "dataset/tacticam.mp4"
output_dir = "saved_frames_tacticam"
# video_path = "dataset/broadcast.mp4"
# output_dir = "saved_frames_broadcast"
cap = cv2.VideoCapture(video_path)

os.makedirs(output_dir, exist_ok=True)

frame_count = 0
max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX

while frame_count < max_frames:
    ret, frame = cap.read()
    if not ret:
        print("End of frames reached.")
        break

    # Run YOLO inference
    output = model.predict(source=frame, conf=0.75, save=False, verbose=False)[0]

    # Prepare to manually draw on frame
    annotated_frame = frame.copy()

    boxes = output.boxes
    player_id = 0  # Start from 0 for each frame

    # Prepare directories
    save_frame_dir = os.path.join(output_dir, f"frame_{frame_count}")
    os.makedirs(save_frame_dir, exist_ok=True)

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":
                # Draw bounding box and custom player ID
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                conf_score = float(box.conf[0])
                label_text = f"ID: {player_id}, Conf: {conf_score:.2f}"
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10),
                            font, 0.6, (0, 255, 0), 2)

                # Save player crop
                crop = frame[y1:y2, x1:x2]
                player_path = os.path.join(save_frame_dir, f"player_{player_id}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"Saved player crop: {player_path}")

                player_id += 1

    # Save annotated frame with custom IDs
    save_frame_path = os.path.join(save_frame_dir, f"frame_{frame_count}.jpg")
    cv2.imwrite(save_frame_path, annotated_frame)
    print(f"Saved frame with player IDs: {save_frame_path}")

    frame_count += 1

cap.release()

Saved player crop: saved_frames_tacticam/frame_0/player_0.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_1.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_2.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_3.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_4.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_5.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_6.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_7.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_8.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_9.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_10.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_11.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_12.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_13.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_14.jpg
Saved player crop: saved_frames_tacticam/frame_0/player_15.jpg
Sa

In [None]:
import os
import cv2
import torch
import numpy as np
import torch.nn.functional as F
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor

# Load YOLO model
model = YOLO("model/best.pt")

# Load ReID model
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'  # Change to 'cuda' if using GPU
)

# Config
video_path = "dataset/tacticam.mp4"
output_dir = "saved_frames_tacticam_1"
cap = cv2.VideoCapture(video_path)

os.makedirs(output_dir, exist_ok=True)

# Variables
frame_count = 0
max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX
threshold = 0.8  # similarity threshold

# Store known player embeddings and their unique IDs
embed_array = []  # list of (embedding tensor, player_id)
global_player_id = 0

while frame_count < max_frames:
    ret, frame = cap.read()
    if not ret:
        print("End of frames reached.")
        break

    output = model.predict(source=frame, conf=0.75, save=False, verbose=False)[0]
    annotated_frame = frame.copy()
    boxes = output.boxes

    # Frame directory
    save_frame_dir = os.path.join(output_dir, f"frame_{frame_count}")
    os.makedirs(save_frame_dir, exist_ok=True)

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":
                # Crop player
                crop = frame[y1:y2, x1:x2]
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                emb = extractor(crop_rgb)
                emb = F.normalize(emb, p=2, dim=1)  # normalize embedding

                matched_id = None
                max_sim = 0

                for saved_emb, pid in embed_array:
                    sim = torch.mm(emb, saved_emb.t()).item()
                    if sim > max_sim and sim > threshold:
                        max_sim = sim
                        matched_id = pid

                if matched_id is not None:
                    # Update embedding (simple average for now)
                    new_emb = (emb + [e for e, id in embed_array if id == matched_id][0]) / 2
                    new_emb = F.normalize(new_emb, p=2, dim=1)
                    for idx, (e, id) in enumerate(embed_array):
                        if id == matched_id:
                            embed_array[idx] = (new_emb, matched_id)
                            break
                    player_id = matched_id
                else:
                    # New player
                    player_id = global_player_id
                    embed_array.append((emb, player_id))
                    global_player_id += 1

                # Annotate
                conf_score = float(box.conf[0])
                label_text = f"ID: {player_id}, Conf: {conf_score:.2f}"
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10),
                            font, 0.6, (0, 255, 0), 2)

                # Save player crop
                player_path = os.path.join(save_frame_dir, f"player_{player_id}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"Saved player {player_id} crop: {player_path}")

    # Save annotated frame
    save_frame_path = os.path.join(save_frame_dir, f"frame_{frame_count}.jpg")
    cv2.imwrite(save_frame_path, annotated_frame)
    print(f"Saved annotated frame: {save_frame_path}")

    frame_count += 1

cap.release()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
🧍 Saved player 0 crop: saved_frames_tacticam_1/frame_0/player_0.jpg
🧍 Saved player 1 crop: saved_frames_tacticam_1/frame_0/player_1.jpg
🧍 Saved player 2 crop: saved_frames_tacticam_1/frame_0/player_2.jpg
🧍 Saved player 3 crop: saved_frames_tacticam_1/frame_0/player_3.jpg
🧍 Saved player 0 crop: saved_frames_tacticam_1/frame_0/player_0.jpg
🧍 Saved player 4 crop: saved_frames_tacticam_1/frame_0/player_4.jpg
🧍 Saved player 1 crop: saved_frames_tacticam_1/frame_0/player_1.jpg
🧍 Saved player 1 crop: saved_frames_tacticam_1/frame_0/player_1.jpg
🧍 Saved player 1 crop: saved_frames_tacticam_1/frame_0/player_1.jpg
🧍 Saved player 5 crop: saved_frames_tacticam_1/frame_0/player_5.jpg
🧍 Saved player 6 crop: saved

## incorparting spatial distance in corrosponsig bouding box at i frame and i-1 frame for similiary search

In [9]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Load the model
model = YOLO("model/best.pt")

# Load the ReID extractor
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'
)

# Video settings
video_path = "dataset/tacticam.mp4"
output_dir = "saved_frames_tacticam_2"
cap = cv2.VideoCapture(video_path)
os.makedirs(output_dir, exist_ok=True)

frame_count = 0
max_frames = 20
font = cv2.FONT_HERSHEY_SIMPLEX

# Tracking database
embed_array = []  # stores (embedding, player_id, (cx, cy))
global_player_id = 0

while frame_count < max_frames:
    ret, frame = cap.read()
    if not ret:
        print("End of frames reached.")
        break

    # Run YOLO inference
    output = model.predict(source=frame, conf=0.75, save=False, verbose=False)[0]

    # Prepare to manually draw on frame
    annotated_frame = frame.copy()

    boxes = output.boxes

    # Prepare directories
    save_frame_dir = os.path.join(output_dir, f"frame_{frame_count}")
    os.makedirs(save_frame_dir, exist_ok=True)

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":
                # get the bounding box for each player
                crop = frame[y1:y2, x1:x2]
                #convert to rgb
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

                # Extract embedding
                emb = extractor(crop_rgb)
                emb = F.normalize(emb, p=2, dim=1)

                # Compute center of bounding box
                cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

                matched_id = None
                best_score = 0

                for saved_emb, pid, (prev_cx, prev_cy) in embed_array:
                    ## getting the similiaty score and spatial distance
                    sim = torch.mm(emb, saved_emb.t()).item()
                    spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
                    
                    box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    norm_dist = spatial_dist / (box_diag + 1e-6)
                    final_score = (sim * 6 + (1 - norm_dist)) / 7

                    if final_score > best_score and final_score > 0.7:
                        best_score = final_score
                        matched_id = pid

                if matched_id is not None:
                    for idx, (e, pid, pos) in enumerate(embed_array):
                        if pid == matched_id:
                            new_emb = F.normalize(e * 0.6 + emb * 0.4, p=2, dim=1)
                            embed_array[idx] = (new_emb, matched_id, (cx, cy))
                            break
                    player_id = matched_id
                else:
                    player_id = global_player_id
                    embed_array.append((emb, player_id, (cx, cy)))
                    global_player_id += 1

                # Draw box and ID
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                conf_score = float(box.conf[0])
                label_text = f"ID: {player_id}, Conf: {conf_score:.2f}"
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

                # Save crop
                player_path = os.path.join(save_frame_dir, f"player_{player_id}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"Saved player crop: {player_path}")

    # Save annotated frame
    save_frame_path = os.path.join(save_frame_dir, f"frame_{frame_count}.jpg")
    cv2.imwrite(save_frame_path, annotated_frame)
    print(f"Saved frame with player IDs: {save_frame_path}")

    frame_count += 1

cap.release()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
Saved player crop: saved_frames_tacticam_2/frame_0/player_0.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_1.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_2.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_3.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_4.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_5.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_6.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_7.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_8.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_9.jpg
Saved player crop: saved_frames_tacticam_2/frame_0/player_10.jpg
Sav

## doing the above for braodcast cam

In [10]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Load the model
model = YOLO("model/best.pt")

# Load the ReID extractor
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'
)

# Video settings
video_path = "dataset/broadcast.mp4"
output_dir = "saved_frames_broadcast"
cap = cv2.VideoCapture(video_path)
os.makedirs(output_dir, exist_ok=True)

frame_count = 0
max_frames = 20
font = cv2.FONT_HERSHEY_SIMPLEX

# Tracking database
embed_array = []  # stores (embedding, player_id, (cx, cy))
global_player_id = 0

while frame_count < max_frames:
    ret, frame = cap.read()
    if not ret:
        print("End of frames reached.")
        break

    # Run YOLO inference
    output = model.predict(source=frame, conf=0.4, save=False, verbose=False)[0]

    # Prepare to manually draw on frame
    annotated_frame = frame.copy()

    boxes = output.boxes

    # Prepare directories
    save_frame_dir = os.path.join(output_dir, f"frame_{frame_count}")
    os.makedirs(save_frame_dir, exist_ok=True)

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":
                # get the bounding box for each player
                crop = frame[y1:y2, x1:x2]
                #convert to rgb
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

                # Extract embedding
                emb = extractor(crop_rgb)
                emb = F.normalize(emb, p=2, dim=1)

                # Compute center of bounding box
                cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

                matched_id = None
                best_score = 0

                for saved_emb, pid, (prev_cx, prev_cy) in embed_array:
                    ## getting the similiaty score and spatial distance
                    sim = torch.mm(emb, saved_emb.t()).item()
                    spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
                    
                    box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    norm_dist = spatial_dist / (box_diag + 1e-6)
                    final_score = (sim * 6 + (1 - norm_dist)) / 7

                    if final_score > best_score and final_score > 0.7:
                        best_score = final_score
                        matched_id = pid

                if matched_id is not None:
                    for idx, (e, pid, pos) in enumerate(embed_array):
                        if pid == matched_id:
                            new_emb = F.normalize(e * 0.6 + emb * 0.4, p=2, dim=1)
                            embed_array[idx] = (new_emb, matched_id, (cx, cy))
                            break
                    player_id = matched_id
                else:
                    player_id = global_player_id
                    embed_array.append((emb, player_id, (cx, cy)))
                    global_player_id += 1

                # Draw box and ID
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                conf_score = float(box.conf[0])
                label_text = f"ID: {player_id}, Conf: {conf_score:.2f}"
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

                # Save crop
                player_path = os.path.join(save_frame_dir, f"player_{player_id}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"Saved player crop: {player_path}")

    # Save annotated frame
    save_frame_path = os.path.join(save_frame_dir, f"frame_{frame_count}.jpg")
    cv2.imwrite(save_frame_path, annotated_frame)
    print(f"Saved frame with player IDs: {save_frame_path}")

    frame_count += 1

cap.release()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
Saved player crop: saved_frames_broadcast/frame_0/player_0.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_1.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_2.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_3.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_4.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_5.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_6.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_7.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_8.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_9.jpg
Saved player crop: saved_frames_broadcast/frame_0/player_8.jpg
Saved frame wit

### mixing tacticam and broadcast

In [19]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Load YOLO and ReID models
model = YOLO("model/best.pt")
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'
)

# Video paths and output dirs
video_paths = {
    "tacticam": "dataset/tacticam.mp4",
    "broadcast": "dataset/broadcast.mp4"
}
output_dirs = {
    "tacticam": "saved_frames_tacticam",
    "broadcast": "saved_frames_broadcast"
}

# Create output directories
for view in output_dirs:
    os.makedirs(output_dirs[view], exist_ok=True)

# Open video captures
cap_tacticam = cv2.VideoCapture(video_paths["tacticam"])
cap_broadcast = cv2.VideoCapture(video_paths["broadcast"])

# Parameters
max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX

# Shared ReID database
embed_array = []  # stores (embedding, player_id, (cx, cy))
global_player_id = 0


def process_frame(view_name, frame, frame_index):
    global embed_array, global_player_id

    annotated_frame = frame.copy()
    save_frame_dir = os.path.join(output_dirs[view_name], f"frame_{frame_index}")
    os.makedirs(save_frame_dir, exist_ok=True)

    # Run YOLO detection
    output = model.predict(source=frame, conf=0.4, save=False, verbose=False)[0]
    boxes = output.boxes

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() == "player":
                crop = frame[y1:y2, x1:x2]
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

                # Extract and normalize embedding
                emb = extractor(crop_rgb)
                emb = F.normalize(emb, p=2, dim=1)
                cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

                matched_id = None
                best_score = 0

                for saved_emb, pid, (prev_cx, prev_cy) in embed_array:
                    sim = torch.mm(emb, saved_emb.t()).item()
                    spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
                    box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    norm_dist = spatial_dist / (box_diag + 1e-6)
                    final_score = (sim * 6 + (1 - norm_dist)) / 7

                    if final_score > best_score and final_score > 0.7:
                        best_score = final_score
                        matched_id = pid

                # Update or assign new player ID
                if matched_id is not None:
                    for idx, (e, pid, pos) in enumerate(embed_array):
                        if pid == matched_id:
                            new_emb = F.normalize(e * 0.6 + emb * 0.4, p=2, dim=1)
                            embed_array[idx] = (new_emb, pid, (cx, cy))
                            break
                    player_id = matched_id
                else:
                    player_id = global_player_id
                    embed_array.append((emb, player_id, (cx, cy)))
                    global_player_id += 1

                # Annotate frame
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                conf_score = float(box.conf[0])
                label_text = f"ID: {player_id}, Conf: {conf_score:.2f}"
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

                # Save cropped player image
                player_path = os.path.join(save_frame_dir, f"{view_name}_player_{player_id}.jpg")
                cv2.imwrite(player_path, crop)
                print(f"[{view_name}] Saved: {player_path}")

    # Save annotated frame
    annotated_path = os.path.join(save_frame_dir, f"{view_name}_frame_{frame_index}.jpg")
    cv2.imwrite(annotated_path, annotated_frame)
    print(f"[{view_name}] Frame saved: {annotated_path}")


# Main loop: synchronized frame-by-frame processing
for frame_idx in range(max_frames):
    ret1, frame_tacticam = cap_tacticam.read()
    ret2, frame_broadcast = cap_broadcast.read()

    if not ret1 or not ret2:
        print("One or both videos have ended.")
        break

    print(f"\n🔁 Processing Frame {frame_idx} from both views...")
    process_frame("tacticam", frame_tacticam, frame_idx)
    process_frame("broadcast", frame_broadcast, frame_idx)

# Release video resources
cap_tacticam.release()
cap_broadcast.release()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352

🔁 Processing Frame 0 from both views...
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_0.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_1.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_2.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_3.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_4.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_5.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_6.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_7.jpg
[tacticam] Saved: saved_frames_tacticam/frame_0/tacticam_player_8.jpg
[tacticam] Saved: saved_frames_tactic

## resolving bad cross mapping

In [18]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Load YOLO and ReID models
model = YOLO("model/best.pt")
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'
)

video_paths = {
    "tacticam": "dataset/tacticam.mp4",
    "broadcast": "dataset/broadcast.mp4"
}
output_dirs = {
    "tacticam": "saved_frames_tacticam",
    "broadcast": "saved_frames_broadcast"
}
for view in output_dirs:
    os.makedirs(output_dirs[view], exist_ok=True)

cap_tacticam = cv2.VideoCapture(video_paths["tacticam"])
cap_broadcast = cv2.VideoCapture(video_paths["broadcast"])

max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX
global_player_id = 0
embed_array = []  # permanent global DB


def get_player_info(frame, frame_index, view_name, save_path, allow_id_assignment=True, reference_db=None):
    global global_player_id

    player_infos = []  # list of (emb, player_id, (cx, cy))

    annotated_frame = frame.copy()
    output = model.predict(source=frame, conf=0.4, save=False, verbose=False)[0]
    boxes = output.boxes

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() != "player":
                continue

            crop = frame[y1:y2, x1:x2]
            crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

            emb = extractor(crop_rgb)
            emb = F.normalize(emb, p=2, dim=1)
            cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

            matched_id = None
            best_score = 0

            if reference_db:
                for saved_emb, pid, (prev_cx, prev_cy) in reference_db:
                    sim = torch.mm(emb, saved_emb.t()).item()
                    spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
                    box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    norm_dist = spatial_dist / (box_diag + 1e-6)
                    final_score = (sim * 6 + (1 - norm_dist)) / 7
                    if final_score > best_score and final_score > 0.7:
                        best_score = final_score
                        matched_id = pid

            if matched_id is not None:
                player_id = matched_id
            elif allow_id_assignment:
                player_id = global_player_id
                global_player_id += 1
            else:
                player_id = -1  # placeholder if ID assignment is disabled

            if player_id != -1:
                player_infos.append((emb, player_id, (cx, cy)))

            # Annotate and save
            label = f"ID: {player_id}" if player_id != -1 else "ID: ?"
            cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated_frame, label, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

            crop_path = os.path.join(save_path, f"{view_name}_player_{player_id}.jpg")
            cv2.imwrite(crop_path, crop)

    annotated_path = os.path.join(save_path, f"{view_name}_frame_{frame_index}.jpg")
    cv2.imwrite(annotated_path, annotated_frame)
    return player_infos


# Main loop: synchronize tacticam and broadcast
for frame_idx in range(max_frames):
    ret1, frame_tacticam = cap_tacticam.read()
    ret2, frame_broadcast = cap_broadcast.read()
    if not ret1 or not ret2:
        break

    print(f"\n📸 Processing frame {frame_idx}...")

    tacticam_dir = os.path.join(output_dirs["tacticam"], f"frame_{frame_idx}")
    os.makedirs(tacticam_dir, exist_ok=True)
    broadcast_dir = os.path.join(output_dirs["broadcast"], f"frame_{frame_idx}")
    os.makedirs(broadcast_dir, exist_ok=True)

    # Step 1: Process Tacticam frame (assign new IDs)
    temp_embed_array = get_player_info(
        frame=frame_tacticam,
        frame_index=frame_idx,
        view_name="tacticam",
        save_path=tacticam_dir,
        allow_id_assignment=True
    )
    embed_array.extend(temp_embed_array)

    # Step 2: Process Broadcast frame using only tacticam frame 0 info
    _ = get_player_info(
        frame=frame_broadcast,
        frame_index=frame_idx,
        view_name="broadcast",
        save_path=broadcast_dir,
        allow_id_assignment=False,
        reference_db=temp_embed_array  # only compare against current tacticam frame
    )

cap_tacticam.release()
cap_broadcast.release()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352

📸 Processing frame 0...

📸 Processing frame 1...

📸 Processing frame 2...

📸 Processing frame 3...

📸 Processing frame 4...


## cross camera mapping failed -  using TransReID for reid

In [3]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Initialize YOLO
model = YOLO("model/best.pt")

# Initialize OSNet feature extractor (best available in torchreid for cross-camera)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
extractor = FeatureExtractor(
    model_name='osnet_ain_x1_0',  # Using Appearance-Invariant OSNet
    model_path='osnet_ain_x1_0_market1501.pth',
    device=device
)

# Video configurations
video_paths = {
    "tacticam": "dataset/tacticam.mp4",
    "broadcast": "dataset/broadcast.mp4"
}
output_dirs = {
    "tacticam": "saved_frames_tacticam",
    "broadcast": "saved_frames_broadcast"
}

# Create output directories
for view in output_dirs:
    os.makedirs(output_dirs[view], exist_ok=True)

# Initialize video captures
cap_tacticam = cv2.VideoCapture(video_paths["tacticam"])
cap_broadcast = cv2.VideoCapture(video_paths["broadcast"])

# Parameters
max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX
global_player_id = 0
embed_array = []  # permanent global DB

def get_player_info(frame, frame_index, view_name, save_path, allow_id_assignment=True, reference_db=None):
    global global_player_id

    player_infos = []  # list of (emb, player_id, (cx, cy))

    annotated_frame = frame.copy()
    output = model.predict(source=frame, conf=0.4, save=False, verbose=False)[0]
    boxes = output.boxes

    if boxes is not None:
        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls_id = int(box.cls[0].item())
            class_name = model.names[cls_id]

            if class_name.lower() != "player":
                continue

            crop = frame[y1:y2, x1:x2]
            crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

            # Extract features with OSNet
            emb = extractor(crop_rgb)
            emb = F.normalize(emb, p=2, dim=1)
            cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

            matched_id = None
            best_score = 0

            if reference_db:
                for saved_emb, pid, (prev_cx, prev_cy) in reference_db:
                    # Calculate cosine similarity
                    sim = torch.mm(emb, saved_emb.t()).item()
                    
                    # Calculate spatial distance
                    spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
                    box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                    norm_dist = spatial_dist / (box_diag + 1e-6)
                    
                    # Combined score (weighted)
                    final_score = (sim * 6 + (1 - norm_dist)) / 7
                    
                    if final_score > best_score and final_score > 0.7:
                        best_score = final_score
                        matched_id = pid

            if matched_id is not None:
                player_id = matched_id
            elif allow_id_assignment:
                player_id = global_player_id
                global_player_id += 1
            else:
                player_id = -1  # placeholder if ID assignment is disabled

            if player_id != -1:
                player_infos.append((emb, player_id, (cx, cy)))

            # Annotate and save
            label = f"ID: {player_id}" if player_id != -1 else "ID: ?"
            cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated_frame, label, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

            crop_path = os.path.join(save_path, f"{view_name}_player_{player_id}.jpg")
            cv2.imwrite(crop_path, crop)

    annotated_path = os.path.join(save_path, f"{view_name}_frame_{frame_index}.jpg")
    cv2.imwrite(annotated_path, annotated_frame)
    return player_infos

# Main processing loop
for frame_idx in range(max_frames):
    ret1, frame_tacticam = cap_tacticam.read()
    ret2, frame_broadcast = cap_broadcast.read()
    if not ret1 or not ret2:
        break

    print(f"\n📸 Processing frame {frame_idx}...")

    tacticam_dir = os.path.join(output_dirs["tacticam"], f"frame_{frame_idx}")
    os.makedirs(tacticam_dir, exist_ok=True)
    broadcast_dir = os.path.join(output_dirs["broadcast"], f"frame_{frame_idx}")
    os.makedirs(broadcast_dir, exist_ok=True)

    # Process Tacticam frame (assign new IDs)
    temp_embed_array = get_player_info(
        frame=frame_tacticam,
        frame_index=frame_idx,
        view_name="tacticam",
        save_path=tacticam_dir,
        allow_id_assignment=True
    )
    embed_array.extend(temp_embed_array)

    # Process Broadcast frame using only tacticam frame info
    _ = get_player_info(
        frame=frame_broadcast,
        frame_index=frame_idx,
        view_name="broadcast",
        save_path=broadcast_dir,
        allow_id_assignment=False,
        reference_db=temp_embed_array
    )

# Release resources
cap_tacticam.release()
cap_broadcast.release()
print("✅ Processing complete!")

Downloading...
From: https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo
To: /Users/vivek/.cache/torch/checkpoints/osnet_ain_x1_0_imagenet.pth
100%|██████████| 10.9M/10.9M [00:02<00:00, 4.28MB/s]


Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_ain_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_ain_x1_0
- params: 2,193,616
- flops: 978,878,352

📸 Processing frame 0...

📸 Processing frame 1...

📸 Processing frame 2...

📸 Processing frame 3...

📸 Processing frame 4...
✅ Processing complete!


## using other reid

In [4]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F
from collections import defaultdict

# Initialize models
model = YOLO("yolov8x-pose.pt")  # Using pose estimation model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize OSNet-AIN for cross-camera re-id
extractor = FeatureExtractor(
    model_name='osnet_ain_x1_0',
    model_path='osnet_ain_x1_0_market1501.pth',
    device=device
)

# Configuration
class Config:
    MIN_CONFIDENCE = 0.4
    MATCH_THRESHOLD = 0.7
    POSE_WEIGHT = 0.4
    TEMPORAL_WINDOW = 5
    DEBUG = True

class PlayerTracker:
    def __init__(self):
        self.next_id = 0
        self.players = defaultdict(dict)
        self.temporal_embeddings = defaultdict(list)
        
    def _get_combined_features(self, crop, box, pose):
        """Extract combined appearance and pose features"""
        # Appearance features
        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        emb = extractor(crop_rgb)
        emb = F.normalize(emb, p=2, dim=1)
        
        # Pose features (normalized keypoints)
        pose_features = pose.xy.cpu().numpy()[0].flatten()
        pose_features = pose_features / np.linalg.norm(pose_features)
        
        # Combine features
        combined = np.concatenate([
            emb.cpu().numpy().flatten(),
            pose_features
        ])
        return torch.from_numpy(combined).float().to(device)
    
    def update(self, frame, view, frame_idx):
        results = model(frame, verbose=False)[0]
        annotated = frame.copy()
        current_detections = []
        
        for box, pose in zip(results.boxes, results.keypoints):
            if box.cls != 0:  # Only people
                continue
                
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            crop = frame[y1:y2, x1:x2]
            
            # Get combined features
            features = self._get_combined_features(crop, box, pose)
            
            # Find best match across views
            best_match = None
            best_score = Config.MATCH_THRESHOLD
            
            for pid, player in self.players.items():
                # Get most recent embeddings from all views
                recent_embs = []
                for vdata in player['views'].values():
                    if frame_idx - vdata['last_seen'] <= Config.TEMPORAL_WINDOW:
                        recent_embs.append(vdata['embedding'])
                
                if not recent_embs:
                    continue
                    
                # Compare with average of recent embeddings
                avg_emb = torch.mean(torch.stack(recent_embs), dim=0)
                similarity = F.cosine_similarity(
                    features.unsqueeze(0), 
                    avg_emb.unsqueeze(0)
                ).item()
                
                if similarity > best_score:
                    best_score = similarity
                    best_match = pid
            
            # Assign ID
            if best_match is not None:
                player_id = best_match
                # Update with exponential moving average
                self.players[player_id]['views'][view] = {
                    'embedding': features,
                    'last_seen': frame_idx,
                    'box': (x1, y1, x2, y2)
                }
            else:
                player_id = self.next_id
                self.next_id += 1
                self.players[player_id] = {
                    'views': {
                        view: {
                            'embedding': features,
                            'last_seen': frame_idx,
                            'box': (x1, y1, x2, y2)
                        }
                    }
                }
            
            # Draw annotations
            color = (0, 255, 0) if view == "tacticam" else (0, 0, 255)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
            label = f"ID:{player_id} {view[:3]}"
            cv2.putText(annotated, label, (x1, y1-10), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
            
            # Draw pose keypoints
            for x, y, conf in pose.xy.cpu().numpy()[0]:
                if conf > 0.3:
                    cv2.circle(annotated, (int(x), int(y)), 3, color, -1)
            
            current_detections.append({
                'id': player_id,
                'box': (x1, y1, x2, y2),
                'crop': crop
            })
        
        return annotated, current_detections

# Main processing
def process_videos():
    tracker = PlayerTracker()
    caps = {
        "tacticam": cv2.VideoCapture("dataset/tacticam.mp4"),
        "broadcast": cv2.VideoCapture("dataset/broadcast.mp4")
    }
    
    frame_idx = 0
    while True:
        frames = {}
        ret_all = True
        
        # Read synchronized frames
        for view, cap in caps.items():
            ret, frame = cap.read()
            if not ret:
                ret_all = False
                break
            frames[view] = frame
        
        if not ret_all:
            break
        
        # Process each view
        all_detections = {}
        for view, frame in frames.items():
            annotated, detections = tracker.update(frame, view, frame_idx)
            all_detections[view] = detections
            
            # Save output
            os.makedirs(f"output/{view}", exist_ok=True)
            cv2.imwrite(f"output/{view}/frame_{frame_idx:04d}.jpg", annotated)
        
        # Visualize matches
        visualize_matches(all_detections, frame_idx)
        
        frame_idx += 1
        if frame_idx % 10 == 0:
            print(f"Processed frame {frame_idx}")
    
    # Release resources
    for cap in caps.values():
        cap.release()

def visualize_matches(detections, frame_idx):
    """Create side-by-side comparisons of matched players"""
    os.makedirs("output/matches", exist_ok=True)
    
    # Find players seen in both views
    common_players = set(d['id'] for d in detections.get("tacticam", [])) & \
                    set(d['id'] for d in detections.get("broadcast", []))
    
    for pid in common_players:
        # Get crops from both views
        crops = []
        for view in ["tacticam", "broadcast"]:
            for det in detections.get(view, []):
                if det['id'] == pid:
                    crops.append(det['crop'])
                    break
        
        if len(crops) == 2:
            # Create side-by-side comparison
            comparison = np.hstack([
                cv2.resize(crops[0], (200, 400)),
                cv2.resize(crops[1], (200, 400))
            ])
            cv2.imwrite(f"output/matches/match_{pid}_frame_{frame_idx:04d}.jpg", comparison)

if __name__ == "__main__":
    process_videos()

Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_ain_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_ain_x1_0
- params: 2,193,616
- flops: 978,878,352
Processed frame 10
Processed frame 20
Processed frame 30
Processed frame 40
Processed frame 50
Processed frame 60
Processed frame 70
Processed frame 80
Processed frame 90
Processed frame 100
Processed frame 110
Processed frame 120
Processed frame 130
Processed frame 140


ValueError: not enough values to unpack (expected 3, got 2)

## no good result with pose estimation, working to inprove the output

In [5]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F
from collections import defaultdict
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cdist

class EnhancedPlayerTracker:
    def __init__(self):
        # Initialize models
        self.detector = YOLO("yolov8s.pt")  # Smaller, faster model
        self.extractor = FeatureExtractor(
            model_name='osnet_ain_x1_0',
            model_path='osnet_ain_x1_0_market1501.pth',
            device='cuda' if torch.cuda.is_available() else 'cpu'
        )
        
        # Tracking state
        self.next_id = 0
        self.players = defaultdict(dict)
        self.temporal_features = defaultdict(list)
        
        # Configuration
        self.MIN_CONFIDENCE = 0.4
        self.MATCH_THRESHOLD = 0.65  # Lower threshold for small players
        self.TEMPORAL_WINDOW = 10
        self.MAX_FEATURES = 50  # Maximum features to store per player
        self.SPATIAL_WEIGHT = 0.3  # Weight for spatial consistency
        self.APPEARANCE_WEIGHT = 0.7  # Weight for appearance similarity

    def _extract_enhanced_features(self, crop):
        """Extract multiple robust features for small player matching"""
        # 1. Main appearance features (OSNet)
        rgb_crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        appearance_feat = self.extractor(rgb_crop)
        appearance_feat = F.normalize(appearance_feat, p=2, dim=1)
        
        # 2. Color histogram features (HSV space)
        hsv = cv2.cvtColor(crop, cv2.COLOR_BGR2HSV)
        hist_h = cv2.calcHist([hsv], [0], None, [8], [0, 180])
        hist_s = cv2.calcHist([hsv], [1], None, [8], [0, 256])
        hist_v = cv2.calcHist([hsv], [2], None, [8], [0, 256])
        color_feat = np.concatenate([hist_h, hist_s, hist_v]).flatten()
        color_feat = normalize(color_feat.reshape(1, -1))[0]
        
        # 3. LBP texture features
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        lbp = self._local_binary_pattern(gray)
        hist_lbp = np.histogram(lbp, bins=16, range=(0, 256))[0]
        texture_feat = normalize(hist_lbp.reshape(1, -1))[0]
        
        # Combine all features
        combined = np.concatenate([
            appearance_feat.cpu().numpy().flatten(),
            color_feat,
            texture_feat
        ])
        
        return torch.from_numpy(combined).float().to(self.extractor.device)

    def _local_binary_pattern(self, img, radius=1, neighbors=8):
        """Compute LBP texture features"""
        lbp = np.zeros_like(img)
        for i in range(radius, img.shape[0]-radius):
            for j in range(radius, img.shape[1]-radius):
                center = img[i,j]
                binary = []
                for k in range(neighbors):
                    theta = 2*np.pi*k/neighbors
                    x = i + int(radius*np.cos(theta))
                    y = j + int(radius*np.sin(theta))
                    binary.append(img[x,y] >= center)
                lbp[i,j] = sum([2**k for k,b in enumerate(binary) if b])
        return lbp

    def _spatial_consistency(self, box1, box2, frame_size=(1920, 1080)):
        """Calculate spatial consistency score between two detections"""
        # Normalize coordinates
        cx1 = (box1[0] + box1[2]) / 2 / frame_size[0]
        cy1 = (box1[1] + box1[3]) / 2 / frame_size[1]
        cx2 = (box2[0] + box2[2]) / 2 / frame_size[0]
        cy2 = (box2[1] + box2[3]) / 2 / frame_size[1]
        
        # Calculate normalized distance
        dist = np.sqrt((cx1-cx2)**2 + (cy1-cy2)**2)
        return max(0, 1 - dist)  # 1 when same position, 0 when far apart

    def update(self, frame, view, frame_idx):
        """Process a frame and update player tracking"""
        results = self.detector(frame, verbose=False)[0]
        annotated = frame.copy()
        current_detections = []
        
        for box in results.boxes:
            if box.cls != 0 or box.conf < self.MIN_CONFIDENCE:
                continue
                
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            crop = frame[y1:y2, x1:x2]
            
            # Skip very small detections (adjust based on your video resolution)
            if (x2-x1) < 20 or (y2-y1) < 40:
                continue
                
            # Extract enhanced features
            features = self._extract_enhanced_features(crop)
            
            # Find best match across views and time
            best_match = None
            best_score = self.MATCH_THRESHOLD
            
            for pid, player in self.players.items():
                # Skip if recently seen in same view
                if view in player['views'] and \
                   frame_idx - player['views'][view]['last_seen'] < 5:
                    continue
                
                # Get most recent features from all views
                recent_features = []
                for vdata in player['views'].values():
                    if frame_idx - vdata['last_seen'] <= self.TEMPORAL_WINDOW:
                        recent_features.append(vdata['features'])
                
                if not recent_features:
                    continue
                    
                # Calculate appearance similarity (average over recent features)
                avg_features = torch.mean(torch.stack(recent_features), dim=0)
                appearance_sim = F.cosine_similarity(
                    features.unsqueeze(0), 
                    avg_features.unsqueeze(0)
                ).item()
                
                # Calculate spatial consistency with last position in each view
                spatial_scores = []
                for v, vdata in player['views'].items():
                    if frame_idx - vdata['last_seen'] <= self.TEMPORAL_WINDOW:
                        spatial_score = self._spatial_consistency(
                            (x1, y1, x2, y2),
                            vdata['box']
                        )
                        spatial_scores.append(spatial_score)
                
                spatial_sim = max(spatial_scores) if spatial_scores else 0
                
                # Combined score
                combined_score = (self.APPEARANCE_WEIGHT * appearance_sim + 
                                self.SPATIAL_WEIGHT * spatial_sim)
                
                if combined_score > best_score:
                    best_score = combined_score
                    best_match = pid
            
            # Assign ID
            if best_match is not None:
                player_id = best_match
                # Update with exponential moving average
                if view in self.players[player_id]['views']:
                    prev_features = self.players[player_id]['views'][view]['features']
                    new_features = 0.7 * prev_features + 0.3 * features
                else:
                    new_features = features
                
                self.players[player_id]['views'][view] = {
                    'features': new_features,
                    'last_seen': frame_idx,
                    'box': (x1, y1, x2, y2)
                }
                
                # Maintain temporal window of features
                self.temporal_features[player_id].append(new_features)
                if len(self.temporal_features[player_id]) > self.MAX_FEATURES:
                    self.temporal_features[player_id].pop(0)
            else:
                player_id = self.next_id
                self.next_id += 1
                self.players[player_id] = {
                    'views': {
                        view: {
                            'features': features,
                            'last_seen': frame_idx,
                            'box': (x1, y1, x2, y2)
                        }
                    }
                }
                self.temporal_features[player_id] = [features]
            
            # Draw annotations
            color = (0, 255, 0) if view == "tacticam" else (0, 0, 255)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
            label = f"ID:{player_id} {view[:3]}"
            cv2.putText(annotated, label, (x1, y1-10), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
            
            current_detections.append({
                'id': player_id,
                'box': (x1, y1, x2, y2),
                'crop': crop
            })
        
        return annotated, current_detections

def process_videos():
    tracker = EnhancedPlayerTracker()
    caps = {
        "tacticam": cv2.VideoCapture("dataset/tacticam.mp4"),
        "broadcast": cv2.VideoCapture("dataset/broadcast.mp4")
    }
    
    frame_idx = 0
    while True:
        frames = {}
        ret_all = True
        
        # Read synchronized frames
        for view, cap in caps.items():
            ret, frame = cap.read()
            if not ret:
                ret_all = False
                break
            frames[view] = frame
        
        if not ret_all:
            break
        
        # Process each view
        all_detections = {}
        for view, frame in frames.items():
            annotated, detections = tracker.update(frame, view, frame_idx)
            all_detections[view] = detections
            
            # Save output
            os.makedirs(f"output/{view}", exist_ok=True)
            cv2.imwrite(f"output/{view}/frame_{frame_idx:04d}.jpg", annotated)
        
        # Visualize matches between views
        visualize_matches(all_detections, frame_idx)
        
        frame_idx += 1
        if frame_idx % 10 == 0:
            print(f"Processed frame {frame_idx}")
    
    # Release resources
    for cap in caps.values():
        cap.release()

def visualize_matches(detections, frame_idx):
    """Create side-by-side comparisons of matched players"""
    os.makedirs("output/matches", exist_ok=True)
    
    # Find players seen in both views
    tacticam_ids = {d['id'] for d in detections.get("tacticam", [])}
    broadcast_ids = {d['id'] for d in detections.get("broadcast", [])}
    common_players = tacticam_ids & broadcast_ids
    
    for pid in common_players:
        # Get crops from both views
        crops = []
        for view in ["tacticam", "broadcast"]:
            for det in detections.get(view, []):
                if det['id'] == pid:
                    crops.append(det['crop'])
                    break
        
        if len(crops) == 2:
            # Resize to common height
            h = min(crops[0].shape[0], crops[1].shape[0])
            resized = [
                cv2.resize(crops[0], (int(crops[0].shape[1]*h/crops[0].shape[0]), h)),
                cv2.resize(crops[1], (int(crops[1].shape[1]*h/crops[1].shape[0]), h))
            ]
            
            # Create side-by-side comparison
            comparison = np.hstack(resized)
            cv2.imwrite(f"output/matches/match_{pid}_frame_{frame_idx:04d}.jpg", comparison)

if __name__ == "__main__":
    process_videos()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:12<00:00, 1.79MB/s]


Successfully loaded imagenet pretrained weights from "/Users/vivek/.cache/torch/checkpoints/osnet_ain_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_ain_x1_0
- params: 2,193,616
- flops: 978,878,352
Processed frame 10
Processed frame 20
Processed frame 30
Processed frame 40
Processed frame 50
Processed frame 60
Processed frame 70
Processed frame 80
Processed frame 90
Processed frame 100
Processed frame 110
Processed frame 120
Processed frame 130
Processed frame 140


### above model works okaish not good