# **VIDEO ACTION & OBJECT DETECTION -> MULTIMODAL**



---

DATE: 29 NOV 2025


In [2]:
!pip install -U open-clip-torch --no-cache-dir

Collecting open-clip-torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m322.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open-clip-torch
Successfully installed ftfy-6.3.1 open-clip-torch-3.2.0


In [18]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.233-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.233-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.233 ultralytics-thop-2.0.18


In [3]:
import torch
import open_clip

# Smoke test: confirm the CLIP weights download and load on the chosen device.
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The middle return value is discarded; only the model and the last transform
# (bound to `preprocess`) are used.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="openai"
)
# The model is only used for inference, so put it in eval mode right away
# (open_clip returns models in train mode); this matches the later load cell.
model = model.to(device).eval()

print("CLIP model loaded on:", device)


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]



CLIP model loaded on: cuda


In [20]:
import cv2
import torch
import open_clip
from ultralytics import YOLO
from PIL import Image
import numpy as np
import pandas as pd
import os
from datetime import datetime

# =============================
# CONFIG
# =============================
# --- Input / output locations ---
input_video  = "/content/drive/MyDrive/yolo/car.mp4"
output_csv   = "/content/output_yolo_clip_race_car.csv"
output_video = "/content/output_yolo_clip_race_car.mp4"

# --- Tunables ---
conf_yolo = 0.25            # confidence threshold passed to the YOLO detector
smooth_window = 5           # number of most recent frames averaged for the CLIP scores
motion_boost_weight = 0.8   # how strongly normalized motion scales the horse/car prompt scores
frame_skip = 3              # NOTE(review): not read by the visible processing loop

# --- Compute device ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# =============================
# LOAD MODELS (YOLOv8 detector + CLIP scorer)
# =============================
# Detector: the nano checkpoint is the fastest YOLOv8 variant, chosen for Colab's free GPU.
yolo_model = YOLO("yolov8n.pt")

# CLIP: the tokenizer and the model/transform pair are built for the same architecture name.
tokenizer = open_clip.get_tokenizer("ViT-B-32")
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32",
    pretrained="openai",
)

# Move the weights to the selected device, then switch to inference mode.
model = model.to(device)
model.eval()

# =============================
# PROMPTS (HORSE + CAR)
# =============================
# Prompts are grouped by category; `texts` is their concatenation in the order
# horse -> car -> person, so the label indices below follow from the group sizes.
horse_prompts = [
    "a professional horse racing competition",
    "horses sprinting on a racetrack with jockeys",
    "a jockey riding a racing horse",
    "a horse galloping at high speed",
]
car_prompts = [
    "a professional car racing competition",
    "race cars speeding on a racetrack",
    "a formula car racing at high speed",
    "multiple cars competing in a race",
]
person_prompts = [
    "a person walking",
    "a person running",
    "a person standing",
]

texts = horse_prompts + car_prompts + person_prompts

# Positions of the horse and car prompts inside `texts` ([0..3] and [4..7]).
horse_label_indices = list(range(len(horse_prompts)))
car_label_indices = list(
    range(len(horse_prompts), len(horse_prompts) + len(car_prompts))
)

# Encode every prompt once and L2-normalize, so a dot product with a
# normalized image embedding is a cosine similarity.
text_tokens = tokenizer(texts).to(device)
with torch.no_grad():
    text_embeddings = model.encode_text(text_tokens)
    text_features = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

# =============================
# VIDEO SETUP
# =============================
# Open the source video; fail fast if OpenCV cannot read it.
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    raise ValueError("Cannot open input video")

# CAP_PROP_FPS comes back as 0 when the container does not report a frame
# rate, so fall back to 25 fps in that case.
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

if width <= 0 or height <= 0:
    cap.release()
    raise ValueError(f"Input video reports an invalid frame size: {width}x{height}")

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# VideoWriter does not raise when it fails to open (missing codec, unwritable
# path); without this check every out.write() would silently do nothing.
if not out.isOpened():
    cap.release()
    raise ValueError(f"Cannot open output video for writing: {output_video}")

# =============================
# PROCESS VIDEO
# =============================
# For every frame: detect horses/cars with YOLO, score the detected crops
# against the text prompts with CLIP, smooth the scores over the last
# `smooth_window` frames, boost the horse/car prompts by the amount of motion,
# then draw the result on the frame and log one CSV row.
rows = []                # one dict per frame; becomes the CSV log
frame_features = []      # per-frame CLIP probability vector over `texts`
motion_magnitudes = []   # per-frame mean optical-flow magnitude

prev_gray = None
frame_idx = 0
sample_idx = 0           # not read anywhere in this loop; kept so the name stays defined

# Image and text embeddings are both unit-normalized, so their dot product is
# a cosine similarity in [-1, 1]. A softmax over such a narrow range is almost
# uniform, so the similarities are multiplied by CLIP's learned logit scale
# before the softmax to obtain meaningful probabilities.
with torch.no_grad():
    logit_scale = model.logit_scale.exp()

# Prompts whose score is scaled up when there is a lot of motion.
motion_boost_indices = list(horse_label_indices) + list(car_label_indices)

print("Processing video...")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # ----------------------------------
        # YOLO DETECTION
        # ----------------------------------
        results = yolo_model(frame, conf=conf_yolo, verbose=False)[0]

        crops = []
        boxes_draw = []

        for box in results.boxes:
            cls_id = int(box.cls[0])
            conf = float(box.conf[0])
            x1, y1, x2, y2 = map(int, box.xyxy[0])

            class_name = yolo_model.names[cls_id]

            # Only keep horses + cars
            if class_name in ("horse", "car"):
                crop = frame[y1:y2, x1:x2]
                # Skip degenerate boxes that produce an empty crop.
                if crop.size > 0:
                    crops.append(crop)
                    boxes_draw.append((x1, y1, x2, y2, class_name, conf))

        # ----------------------------------
        # MOTION (OPTICAL FLOW)
        # ----------------------------------
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is None:
            # First frame: nothing to compare against yet.
            flow_mag = 0.0
        else:
            flow = cv2.calcOpticalFlowFarneback(
                prev_gray, gray, None,
                0.5, 3, 15, 3, 5, 1.2, 0
            )
            mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            flow_mag = float(np.mean(mag))

        # cvtColor allocates a fresh array every frame, so no copy is needed.
        prev_gray = gray
        motion_magnitudes.append(flow_mag)

        # ----------------------------------
        # CLIP ON CROPS
        # ----------------------------------
        if crops:
            # Preprocess all crops and encode them in a single batch.
            batch = torch.stack([
                preprocess(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
                for crop in crops
            ]).to(device)

            with torch.no_grad():
                img_feats = model.encode_image(batch)
                img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
                probs = (logit_scale * img_feats @ text_features.T).softmax(dim=-1)

            # Average the per-crop distributions into one vector for the frame.
            clip_scores = probs.mean(dim=0).float().cpu().numpy()
        else:
            # No horse/car detected: fall back to a uniform distribution.
            clip_scores = np.ones(len(texts)) / len(texts)

        frame_features.append(clip_scores)

        # ----------------------------------
        # TEMPORAL + MOTION BOOST
        # ----------------------------------
        K = min(len(frame_features), smooth_window)
        avg_clip = np.mean(np.array(frame_features[-K:]), axis=0)

        # Normalize recent motion by the 95th percentile of all motion seen so
        # far, clipped to [0, 1].
        recent_motion = np.array(motion_magnitudes[-K:])
        p95 = np.percentile(motion_magnitudes, 95) if len(motion_magnitudes) > 1 else 1.0
        motion_norm = float(np.clip(np.mean(recent_motion) / (p95 + 1e-6), 0, 1))

        boosted = avg_clip.copy()

        # Horse and car prompts get the same motion-dependent multiplier.
        boost_factor = 1.0 + motion_boost_weight * motion_norm
        for idx in motion_boost_indices:
            boosted[idx] *= boost_factor

        # Renormalize so the boosted scores sum to 1 again.
        boosted = boosted / (np.sum(boosted) + 1e-8)

        pred_idx = int(np.argmax(boosted))
        pred_label = texts[pred_idx]
        pred_conf = float(boosted[pred_idx])

        # ----------------------------------
        # DRAW YOLO BOXES
        # ----------------------------------
        draw_frame = frame.copy()

        for (x1, y1, x2, y2, cls, conf) in boxes_draw:
            cv2.rectangle(draw_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(
                draw_frame, f"{cls} {conf:.2f}",
                (x1, y1 - 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6, (0, 255, 0), 2
            )

        # ----------------------------------
        # DRAW ACTION
        # ----------------------------------
        # Black banner across the top 80 pixels to keep the overlay text readable.
        cv2.rectangle(draw_frame, (0, 0), (width, 80), (0, 0, 0), -1)

        cv2.putText(
            draw_frame,
            f"Action: {pred_label} ({pred_conf:.2f})",
            (20, 40),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.0, (0, 255, 255), 2
        )

        cv2.putText(
            draw_frame,
            f"Motion: {motion_norm:.2f}",
            (20, 70),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7, (200, 200, 0), 2
        )

        out.write(draw_frame)

        # ----------------------------------
        # LOG CSV
        # ----------------------------------
        row = {
            "frame_idx": frame_idx,
            "motion": motion_norm,
            "pred_label": pred_label,
            "confidence": pred_conf
        }

        # One column per prompt, holding the boosted score for that prompt.
        for i in range(len(texts)):
            row[f"clip_{i}"] = float(boosted[i])

        rows.append(row)

        frame_idx += 1
finally:
    # Always close the reader and finalize the output file, even if a frame
    # fails mid-run; otherwise the written MP4 may be left unfinalized.
    cap.release()
    out.release()

# =============================
# SAVE CSV
# =============================
# Build the per-frame log table from the collected row dicts.
df = pd.DataFrame(rows)

# Never overwrite an earlier log: if the target file already exists, insert a
# timestamp between the file stem and its extension.
if os.path.exists(output_csv):
    stem, suffix = os.path.splitext(output_csv)
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_csv = f"{stem}_{stamp}{suffix}"

df.to_csv(output_csv, index=False)

print("\n✅ PROCESS COMPLETED")
print("✅ Output video:", output_video)
print("✅ CSV log:", output_csv)


Device: cuda
Processing video...

✅ PROCESS COMPLETED
✅ Output video: /content/output_yolo_clip_race_car.mp4
✅ CSV log: /content/output_yolo_clip_race_car.csv
