In [1]:
!pip install -q git+https://github.com/openai/CLIP.git
!pip install -q ftfy regex tqdm opencv-python-headless ultralytics matplotlib scikit-learn

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:0

In [2]:
import cv2
import os
import torch
import clip
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [4]:
# Run on the GPU when one is visible to torch; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# CLIP image encoder and its matching preprocessing transform, placed on `device`.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# Detector weights loaded from a hard-coded Colab path.
# NOTE(review): the file must be uploaded to /content before this cell runs.
model_yolo = YOLO("/content/best (5).pt")

In [5]:
def extract_frames(video_path, step=15):
    """Sample every ``step``-th frame of a video file.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        step: Sampling stride. A frame is kept when its zero-based index is a
            multiple of ``step``.

    Returns:
        ``(frames, frame_ids)``: the kept frames as returned by OpenCV, and
        the zero-based index of each kept frame in the source video. Both
        lists are empty when the video cannot be opened; a ``RuntimeWarning``
        is emitted in that case so the failure is not silent.
    """
    import warnings

    cap = cv2.VideoCapture(video_path)
    frames, frame_ids = [], []
    try:
        if not cap.isOpened():
            warnings.warn(
                f"Could not open video: {video_path!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        i = 0
        while cap.isOpened():
            # grab() only advances to the next frame; retrieve() decodes it.
            # Skipped frames are therefore never decoded.
            if not cap.grab():
                break
            if i % step == 0:
                ret, frame = cap.retrieve()
                if not ret:
                    break
                frames.append(frame)
                frame_ids.append(i)
            i += 1
    finally:
        # Always free the capture handle, even if decoding raised or the
        # cell was interrupted.
        cap.release()
    return frames, frame_ids

In [6]:
def detect_players(model, frames, player_class=0, min_conf=0.0):
    """Run the detector on each frame and keep boxes of one class.

    Args:
        model: Callable invoked as ``model(frame, verbose=False)``. The first
            element of its result must expose ``boxes.data.tolist()``.
        frames: Iterable of images handed to ``model`` one at a time.
        player_class: Class id to keep. Defaults to 0, the id this notebook
            has always filtered on.
            NOTE(review): confirm against ``model.names`` that this id is the
            player class of the loaded weights.
        min_conf: Minimum detection score to keep. The default 0.0 keeps every
            detection of the requested class, as before.

    Returns:
        One list per frame of ``(x1, y1, x2, y2)`` integer tuples. Coordinates
        are truncated toward zero with ``int``.
    """
    all_detections = []
    for frame in frames:
        result = model(frame, verbose=False)[0]
        frame_boxes = []
        # Each row is unpacked as x1, y1, x2, y2, conf, cls; extra columns are
        # ignored. The score stays a float so it can be thresholded.
        for x1, y1, x2, y2, conf, cls in (row[:6] for row in result.boxes.data.tolist()):
            if int(cls) == player_class and conf >= min_conf:
                frame_boxes.append((int(x1), int(y1), int(x2), int(y2)))
        all_detections.append(frame_boxes)
    return all_detections

In [7]:
def get_clip_embedding(image, bbox):
    """Embed the region of ``image`` inside ``bbox`` with the CLIP image encoder.

    Args:
        image: Array indexed as ``image[y, x]``. It is converted with
            ``cv2.COLOR_BGR2RGB`` before preprocessing.
        bbox: ``(x1, y1, x2, y2)`` slice bounds of the region.

    Returns:
        A flat NumPy vector from ``clip_model.encode_image``. An empty crop
        yields a 512-element zero vector instead.
        NOTE(review): 512 presumably matches the loaded ViT-B/32 embedding
        size; confirm if the CLIP variant changes.
    """
    left, top, right, bottom = bbox
    patch = image[top:bottom, left:right]
    if patch.size != 0:
        rgb_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2RGB)
        batch = clip_preprocess(Image.fromarray(rgb_patch)).unsqueeze(0).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            encoded = clip_model.encode_image(batch)
        return encoded.cpu().numpy().flatten()
    # Degenerate box: nothing to encode.
    return np.zeros((512,))

def extract_features(frames, detections):
    """Compute one CLIP embedding per detected box, grouped by frame.

    Args:
        frames: Sequence of images.
        detections: Per-frame lists of boxes, paired with ``frames`` by
            position. Pairing stops at the shorter of the two.

    Returns:
        A list with one entry per paired frame. Each entry is the list of
        ``get_clip_embedding`` results for that frame's boxes, in box order.
    """
    return [
        [get_clip_embedding(frame, box) for box in boxes]
        for frame, boxes in zip(frames, detections)
    ]

In [8]:
def _unit_rows(vectors):
    """Scale each row to unit L2 norm; all-zero rows are left as zeros."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return vectors / norms


def match_players(features1, features2, threshold=0.75):
    """Find, for each embedding in ``features1``, its most similar one in ``features2``.

    Similarity is cosine similarity. All embeddings of ``features2`` are
    stacked and normalised once, so each frame of ``features1`` costs one
    matrix product instead of one library call per vector pair.

    Args:
        features1: Per-frame lists of 1-D embedding vectors (the queries).
        features2: Per-frame lists of 1-D embedding vectors (the candidates).
        threshold: A match is reported only when the best similarity is
            strictly greater than this value and strictly greater than 0.

    Returns:
        A list of ``((i1, j1), (i2, j2))`` pairs, where ``i`` is the frame
        position and ``j`` the position within that frame's list. Queries
        without a good enough candidate are omitted. On ties the candidate
        that comes first in iteration order wins.
    """
    # Flatten the candidates, remembering where each row came from.
    gallery_index = []
    gallery_vecs = []
    for i2, f_list2 in enumerate(features2):
        for j2, f2 in enumerate(f_list2):
            gallery_index.append((i2, j2))
            gallery_vecs.append(f2)
    if not gallery_vecs:
        return []
    # float64 avoids precision loss when embeddings arrive as half precision.
    gallery = _unit_rows(np.asarray(gallery_vecs, dtype=np.float64))

    # A score has to beat both the threshold and the initial best score of 0.
    min_sim = max(threshold, 0)

    matches = []
    for i1, f_list1 in enumerate(features1):
        if len(f_list1) == 0:
            continue
        queries = _unit_rows(np.asarray(f_list1, dtype=np.float64))
        sims = queries @ gallery.T
        # argmax returns the first maximum, i.e. the earliest candidate on ties.
        best_cols = sims.argmax(axis=1)
        for j1, col in enumerate(best_cols):
            if sims[j1, col] > min_sim:
                matches.append(((i1, j1), gallery_index[col]))
    return matches

In [9]:
def draw_comparison_grid(frames_list, detections_list, matched_indices, output_dir="clip_output"):
    """Save one side-by-side PNG per match with the matched box drawn in each view.

    Args:
        frames_list: Two frame sequences, one per view (left, right).
        detections_list: Two per-frame box lists, aligned with ``frames_list``.
        matched_indices: Iterable of ``((i1, j1), (i2, j2))`` pairs. ``i`` picks
            the frame and ``j`` the box within it, for the left and right view.
        output_dir: Directory for the PNG files; created if missing.

    Each figure is written to ``player_<n>_clipmatch.png`` where ``n`` is the
    1-based position of the match in ``matched_indices``. The same number is
    used as the on-image label.
    """
    os.makedirs(output_dir, exist_ok=True)
    green = (0, 255, 0)
    for idx, ((i1, j1), (i2, j2)) in enumerate(matched_indices):
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
        # One panel per view: left shows (i1, j1), right shows (i2, j2).
        panels = zip(axs, frames_list, detections_list, (i1, i2), (j1, j2))
        for ax, frames, dets, frame_no, box_no in panels:
            # Draw on a copy so the cached frame stays clean for later matches.
            canvas = frames[frame_no].copy()
            left, top, right, bottom = dets[frame_no][box_no]
            cv2.rectangle(canvas, (left, top), (right, bottom), green, 2)
            cv2.putText(canvas, f'Player {idx+1}', (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, green, 2)
            ax.imshow(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
            ax.axis("off")
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, f'player_{idx+1}_clipmatch.png'))
        # Close each figure so a long match list does not accumulate open figures.
        plt.close(fig)

In [10]:
# Input clips (hard-coded Colab paths).
broadcast_path = "/content/broadcast.mp4"
tacticam_path = "/content/tacticam.mp4"
matchB_path = "/content/15sec_input_720p.mp4"

# Each clip goes through the same three stages:
# sampled frames -> per-frame player boxes -> one CLIP embedding per box.

# Broadcast view
frames_broadcast, _ = extract_frames(broadcast_path)
dets_broadcast = detect_players(model_yolo, frames_broadcast)
feat_broadcast = extract_features(frames_broadcast, dets_broadcast)

# Tacticam view
frames_tacticam, _ = extract_frames(tacticam_path)
dets_tacticam = detect_players(model_yolo, frames_tacticam)
feat_tacticam = extract_features(frames_tacticam, dets_tacticam)

# Second clip used for re-identification
frames_matchB, _ = extract_frames(matchB_path)
dets_matchB = detect_players(model_yolo, frames_matchB)
feat_matchB = extract_features(frames_matchB, dets_matchB)

# Cross-camera match (broadcast ↔ tacticam); figures go to matchA_mapping/
matches_AB = match_players(feat_broadcast, feat_tacticam)
draw_comparison_grid(
    [frames_broadcast, frames_tacticam],
    [dets_broadcast, dets_tacticam],
    matches_AB,
    output_dir="matchA_mapping",
)

# Re-identification (broadcast ↔ matchB); figures go to matchB_reid/
matches_reid = match_players(feat_broadcast, feat_matchB)
draw_comparison_grid(
    [frames_broadcast, frames_matchB],
    [dets_broadcast, dets_matchB],
    matches_reid,
    output_dir="matchB_reid",
)