In [1]:
# Cell 1: Imports & Config
import os, csv, math, random, time, warnings
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import numpy as np
from pytorchvideo.models import x3d  # load X3D via PyTorchVideo

from decord import VideoReader, cpu  # for video decoding

import cv2  # for resizing frames

# ---------- Config ----------
@dataclass
class Cfg:
    csv_path: str = "dataset_index.csv"   # video_path,binary_label,type_label
    clip_len: int = 16
    stride: int = 8                       # overlap 50%
    frame_size: int = 224
    batch_size: int = 14               # per-clip
    num_workers: int = 0
    epochs: int = 10
    lr: float = 1e-4
    weight_decay: float = 1e-4
    # imbalance handling
    pos_weight: Optional[float] = None    # if None, auto-compute from CSV
    # 13 crime classes (0..12). Normal videos use type_label = -1
    num_crime_classes: int = 13
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    seed: int = 42

cfg = Cfg()

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


In [2]:
# Cell 2: Dataset Parsing + Clip Loader

def parse_csv(csv_path: str) -> List[Tuple[str, int, int]]:
    items = []
    with open(csv_path, newline="") as f:
        reader = csv.DictReader(f)
        for r in reader:
            vp = r["video_path"]
            b = int(r["binary_label"])
            t = int(r["type_label"])
            if os.path.exists(vp):
                items.append((vp, b, t))
    if not items:
        raise RuntimeError("CSV parsed but no existing files found. Check paths in CSV.")
    return items

def load_clip_decord(video_path: str, start_idx: int, clip_len: int, resize: int) -> torch.Tensor:
    """
    Returns tensor (3, T, H, W) normalized to [0,1] then ImageNet/Kinetics mean/std.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    fnum = len(vr)
    inds = np.arange(start_idx, start_idx + clip_len)
    inds = np.clip(inds, 0, fnum - 1)  # pad last frame if overflow
    frames = vr.get_batch(inds)  # (T, H, W, 3), uint8
    frames = frames.asnumpy()
    # Resize each frame
    frames_resized = np.stack([
        cv2.resize(fr, (resize, resize), interpolation=cv2.INTER_AREA)
        for fr in frames
    ], axis=0)
    frames_resized = frames_resized.astype(np.float32) / 255.0
    # To torch (3, T, H, W)
    frames_t = torch.from_numpy(frames_resized).permute(3, 0, 1, 2)
    # Normalize using Kinetics / ImageNet style means and stds used in PyTorchVideo docs
    mean = torch.tensor([0.45, 0.45, 0.45]).view(3,1,1,1)
    std  = torch.tensor([0.225, 0.225, 0.225]).view(3,1,1,1)
    frames_t = (frames_t - mean) / std
    return frames_t

class ClipDataset(Dataset):
    """
    Lazy version: generates clip start indices on the fly.
    - Faster startup (no scanning all videos).
    - Each __getitem__ samples a random clip from a video.
    """
    def __init__(self, rows, clip_len=16, stride=8, frame_size=224, 
                 max_clips_per_video=None, mode="train"):
        """
        rows: list of (video_path, binary_label, type_label)
        mode: "train" = random clip per call, "val" = deterministic stride
        """
        self.rows = rows
        self.clip_len = clip_len
        self.stride = stride
        self.frame_size = frame_size
        self.max_clips_per_video = max_clips_per_video
        self.mode = mode

        # preload fps + frame counts (cheap metadata)
        self.video_meta = []
        for i, (vp, b, t) in enumerate(self.rows):
            try:
                vr = VideoReader(vp, ctx=cpu(0))
                fnum = len(vr)
            except Exception:
                fnum = 0
            self.video_meta.append((fnum, vp, b, t))
            if i % 100 == 0:
                print(f"Scanned {i}/{len(self.rows)} videos")   
        print(f"Scanned {i}/{len(self.rows)} videos")


    def __len__(self):
        # If max_clips_per_video is set, cap it
        if self.max_clips_per_video:
            return len(self.rows) * self.max_clips_per_video
        return len(self.rows)

    def __getitem__(self, idx):
        # Pick video index
        vidx = idx % len(self.rows)
        fnum, vp, b, t = self.video_meta[vidx]

        if fnum <= 0:
            raise RuntimeError(f"Video {vp} could not be read")

        # Training → random start, Validation → stride-based
        if self.mode == "train":
            max_start = max(1, fnum - self.clip_len)
            start = np.random.randint(0, max_start + 1)
        else:  # validation: pick evenly spaced windows
            stride = max(1, fnum // max(1, self.max_clips_per_video or 8))
            start = (idx // len(self.rows)) * stride
            start = min(start, fnum - self.clip_len)

        clip = load_clip_decord(vp, start, self.clip_len, self.frame_size)

        return {
            "clip": clip,
            "binary_label": torch.tensor(float(b)),
            "type_label": torch.tensor(t, dtype=torch.long),
            "video_idx": torch.tensor(vidx, dtype=torch.long),
            "start_frame": torch.tensor(start, dtype=torch.long),
            "video_path": vp
        }


In [3]:
class X3DMultiTask(nn.Module):
    def __init__(self, backbone=None, num_classes=13):
        super().__init__()
        print("[ModelInit] Starting X3DMultiTask init")

        if backbone is None:
            print("[ModelInit] Loading x3d_m backbone from torch.hub …")
            backbone = torch.hub.load("facebookresearch/pytorchvideo", "x3d_m", pretrained=True)
            print("[ModelInit] Backbone loaded")

        self.backbone = backbone

        # --- Find classification head ---
        if hasattr(self.backbone, "head") and hasattr(self.backbone.head, "proj"):
            head = self.backbone.head
            print("[ModelInit] Found head at self.backbone.head")
        elif hasattr(self.backbone, "blocks") and hasattr(self.backbone.blocks[-1], "proj"):
            head = self.backbone.blocks[-1]
            print("[ModelInit] Found head at self.backbone.blocks[-1]")
        else:
            raise RuntimeError("[ModelInit] Could not locate X3D head")

        # --- Inspect feature dim ---
        feat_dim = head.proj.in_features
        print(f"[ModelInit] Feature dimension detected: {feat_dim}")

        # --- Remove original classifier ---
        head.proj = nn.Identity()
        if hasattr(head, "activation"):
            head.activation = nn.Identity()
        print("[ModelInit] Replaced head.proj and head.activation with Identity()")

        # --- Define new heads ---
        self.head_bin = nn.Linear(feat_dim, 1)
        self.head_type = nn.Linear(feat_dim, num_classes)
        print("[ModelInit] Added binary + type heads")
        print("[ModelInit] Init complete")

    def forward(self, x):
        feats = self.backbone(x)     # (B, feat_dim)
        s = self.head_bin(feats).squeeze(1)
        c = self.head_type(feats)
        return s, c


In [4]:
# Cell 4: Loss, Sampler & Training Helpers

def compute_pos_weight(rows: List[Tuple[str,int,int]]):
    pos = sum(1 for _, b, _ in rows if b == 1)
    neg = sum(1 for _, b, _ in rows if b == 0)
    if pos == 0:
        return 1.0
    return max(1.0, neg / max(1, pos))

def make_sampler(rows: List[Tuple[str,int,int]]):
    # rows[i] = (video_path, binary_label, type_label)
    video_labels = [b for (_, b, _) in rows]
    counts = np.bincount(video_labels, minlength=2)  # [neg, pos]
    w_neg = 0.5 / (counts[0] + 1e-6)
    w_pos = 0.5 / (counts[1] + 1e-6)
    weights = [w_pos if y == 1 else w_neg for y in video_labels]
    return WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)


def train_epoch(model, loader, optim, crit_bin, crit_type, device, log_interval=50):
    model.train()
    total_loss = 0.0
    total_bin_acc = 0.0
    total_type_acc = 0.0
    type_count = 0

    running_loss, running_correct_bin, running_total_bin = 0.0, 0, 0
    running_correct_type, running_total_type = 0, 0

    for i, batch in enumerate(loader):
        clips = batch["clip"].to(device)
        y_bin = batch["binary_label"].to(device)
        y_type = batch["type_label"].to(device)

        s, c = model(clips)
        loss_b = crit_bin(s, y_bin)

        mask = (y_type >= 0)
        if mask.any():
            loss_t = crit_type(c[mask], y_type[mask])
            loss = loss_b + loss_t
            type_acc = (c[mask].argmax(dim=1) == y_type[mask]).float().sum().item()
            total_type_acc += type_acc
            type_count += mask.sum().item()
            running_correct_type += type_acc
            running_total_type += mask.sum().item()
        else:
            loss = loss_b

        # Binary accuracy
        preds_bin = (torch.sigmoid(s) >= 0.5).float()
        bin_acc = (preds_bin == y_bin).float().sum().item()
        total_bin_acc += bin_acc
        running_correct_bin += bin_acc
        running_total_bin += clips.size(0)

        optim.zero_grad()
        loss.backward()
        optim.step()

        total_loss += loss.item() * clips.size(0)
        running_loss += loss.item() * clips.size(0)

        # 👇 Print every N batches
        if (i + 1) % log_interval == 0:
            avg_bin = running_correct_bin / running_total_bin if running_total_bin > 0 else 0
            avg_type = running_correct_type / running_total_type if running_total_type > 0 else 0
            avg_loss = running_loss / running_total_bin if running_total_bin > 0 else 0
            print(f"[Train] Batch {i+1}/{len(loader)} "
                  f"loss={avg_loss:.4f} bin_acc={avg_bin:.3f} type_acc={avg_type:.3f}")
            running_loss, running_correct_bin, running_total_bin = 0.0, 0, 0
            running_correct_type, running_total_type = 0, 0

    avg_loss = total_loss / len(loader.dataset)
    avg_bin_acc = total_bin_acc / len(loader.dataset)
    avg_type_acc = (total_type_acc / type_count) if type_count > 0 else 0.0
    return {"loss": avg_loss, "acc_bin": avg_bin_acc, "acc_type": avg_type_acc}


In [5]:
# Cell 5: Validation & Main Training Loop
def eval_epoch(model, loader, crit_bin, crit_type, device, log_interval=50):
    print("[Eval] Starting validation loop …")
    model.eval()
    total_loss = 0.0
    total_bin_acc = 0.0
    total_type_acc = 0.0
    type_count = 0

    running_loss, running_correct_bin, running_total_bin = 0.0, 0, 0
    running_correct_type, running_total_type = 0, 0

    with torch.no_grad():
        for i, batch in enumerate(loader):
            clips = batch["clip"].to(device)
            y_bin = batch["binary_label"].to(device)
            y_type = batch["type_label"].to(device)

            s, c = model(clips)
            loss_b = crit_bin(s, y_bin)

            mask = (y_type >= 0)
            if mask.any():
                loss_t = crit_type(c[mask], y_type[mask])
                loss = loss_b + loss_t
                type_acc = (c[mask].argmax(dim=1) == y_type[mask]).float().sum().item()
                total_type_acc += type_acc
                type_count += mask.sum().item()
                running_correct_type += type_acc
                running_total_type += mask.sum().item()
            else:
                loss = loss_b

            preds_bin = (torch.sigmoid(s) >= 0.5).float()
            bin_acc = (preds_bin == y_bin).float().sum().item()
            total_bin_acc += bin_acc
            running_correct_bin += bin_acc
            running_total_bin += clips.size(0)

            total_loss += loss.item() * clips.size(0)
            running_loss += loss.item() * clips.size(0)

            # 👇 log every N batches
            if (i + 1) % log_interval == 0:
                avg_bin = running_correct_bin / running_total_bin if running_total_bin > 0 else 0
                avg_type = running_correct_type / running_total_type if running_total_type > 0 else 0
                avg_loss = running_loss / running_total_bin if running_total_bin > 0 else 0
                print(f"[Eval] Batch {i+1}/{len(loader)} "
                      f"loss={avg_loss:.4f} bin_acc={avg_bin:.3f} type_acc={avg_type:.3f}")
                running_loss, running_correct_bin, running_total_bin = 0.0, 0, 0
                running_correct_type, running_total_type = 0, 0

    avg_loss = total_loss / len(loader.dataset)
    avg_bin_acc = total_bin_acc / len(loader.dataset)
    avg_type_acc = (total_type_acc / type_count) if type_count > 0 else 0.0
    return {"loss": avg_loss, "acc_bin": avg_bin_acc, "acc_type": avg_type_acc}


def main():
    set_seed(cfg.seed)
    rows = parse_csv(cfg.csv_path)
    random.shuffle(rows)
    split = int(0.9 * len(rows))
    rows_tr, rows_va = rows[:split], rows[split:]

    # 🔹 filter validation to only crime
    rows_va = [r for r in rows_va if r[1] == 1]
    print(f"[INFO] #Train videos: {len(rows_tr)}, #Val crime videos: {len(rows_va)}")

    cfg_pos = cfg.pos_weight if cfg.pos_weight is not None else compute_pos_weight(rows_tr)
    print(f"[INFO] Pos_weight={cfg_pos:.2f}")

    ds_tr = ClipDataset(rows_tr, clip_len=cfg.clip_len, stride=cfg.stride,
                        frame_size=cfg.frame_size, mode="train")

    # smaller validation set (crime-only)
    rows_va_small = random.sample(rows_va, k=min(50, len(rows_va)))  # still crime-only
    ds_va = ClipDataset(rows_va_small, clip_len=cfg.clip_len, stride=cfg.stride,
                        frame_size=cfg.frame_size, max_clips_per_video=4, mode="val")

    # Build sampler for training
    sampler = make_sampler(rows_tr)

    loader_tr = DataLoader(
        ds_tr,
        batch_size=cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=True
    )
    loader_va = DataLoader(
        ds_va,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True
    )

    model = X3DMultiTask(num_classes=cfg.num_crime_classes).to(cfg.device)
    crit_bin = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([cfg_pos], device=cfg.device))
    crit_type = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    best_val = float("inf")
    for epoch in range(1, cfg.epochs + 1):
        t0 = time.time()
        tr = train_epoch(model, loader_tr, opt, crit_bin, crit_type, cfg.device)
        va = eval_epoch(model, loader_va, crit_bin, crit_type, cfg.device)
        dt = time.time() - t0

        print(f"[{epoch:02d}] "
              f"train loss {tr['loss']:.4f} | bin acc {tr['acc_bin']:.3f} | type acc {tr['acc_type']:.3f} || "
              f"val loss {va['loss']:.4f} | bin acc {va['acc_bin']:.3f} | type acc {va['acc_type']:.3f} | {dt:.1f}s")

        # 🔹 save every epoch
        torch.save(model.state_dict(), f"x3d_multitask_epoch{epoch:02d}.pt")
        print(f"  ↳ saved x3d_multitask_epoch{epoch:02d}.pt")

        # 🔹 also track "best" model
        if va["loss"] < best_val:
            best_val = va["loss"]
            torch.save(model.state_dict(), "x3d_multitask_best.pt")
            print("  ↳ updated best model: x3d_multitask_best.pt")


In [6]:
main()

[INFO] #Train videos: 12843, #Val crime videos: 88
[INFO] Pos_weight=13.90
Scanned 0/12843 videos
Scanned 100/12843 videos
Scanned 200/12843 videos
Scanned 300/12843 videos
Scanned 400/12843 videos
Scanned 500/12843 videos
Scanned 600/12843 videos
Scanned 700/12843 videos
Scanned 800/12843 videos
Scanned 900/12843 videos
Scanned 1000/12843 videos
Scanned 1100/12843 videos
Scanned 1200/12843 videos
Scanned 1300/12843 videos
Scanned 1400/12843 videos
Scanned 1500/12843 videos
Scanned 1600/12843 videos
Scanned 1700/12843 videos
Scanned 1800/12843 videos
Scanned 1900/12843 videos
Scanned 2000/12843 videos
Scanned 2100/12843 videos
Scanned 2200/12843 videos
Scanned 2300/12843 videos
Scanned 2400/12843 videos
Scanned 2500/12843 videos
Scanned 2600/12843 videos
Scanned 2700/12843 videos
Scanned 2800/12843 videos
Scanned 2900/12843 videos
Scanned 3000/12843 videos
Scanned 3100/12843 videos
Scanned 3200/12843 videos
Scanned 3300/12843 videos
Scanned 3400/12843 videos
Scanned 3500/12843 videos
S

Using cache found in C:\Users\shrit/.cache\torch\hub\facebookresearch_pytorchvideo_main


[ModelInit] Backbone loaded
[ModelInit] Found head at self.backbone.blocks[-1]
[ModelInit] Feature dimension detected: 2048
[ModelInit] Replaced head.proj and head.activation with Identity()
[ModelInit] Added binary + type heads
[ModelInit] Init complete
[Train] Batch 50/917 loss=4.2376 bin_acc=0.503 type_acc=0.111
[Train] Batch 100/917 loss=3.1087 bin_acc=0.661 type_acc=0.244
[Train] Batch 150/917 loss=2.7350 bin_acc=0.881 type_acc=0.247
[Train] Batch 200/917 loss=2.5367 bin_acc=0.917 type_acc=0.335
[Train] Batch 250/917 loss=2.3978 bin_acc=0.959 type_acc=0.271
[Train] Batch 300/917 loss=2.3702 bin_acc=0.924 type_acc=0.351
[Train] Batch 350/917 loss=2.1665 bin_acc=0.959 type_acc=0.375
[Train] Batch 400/917 loss=1.9742 bin_acc=0.956 type_acc=0.419
[Train] Batch 450/917 loss=1.9731 bin_acc=0.971 type_acc=0.460
[Train] Batch 500/917 loss=1.9192 bin_acc=0.951 type_acc=0.443
[Train] Batch 550/917 loss=1.8111 bin_acc=0.961 type_acc=0.515
[Train] Batch 600/917 loss=1.7696 bin_acc=0.973 type_

KeyboardInterrupt: 

In [None]:
# Define a helper to load any checkpoint
def load_model(ckpt_path, device="cuda"):
    model = X3DMultiTask(num_classes=cfg.num_crime_classes).to(device)
    state = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(state)
    model.eval()
    print(f"[INFO] Loaded model from {ckpt_path}")
    return model

# Load your three checkpoints
model1 = load_model("x3d_multitask_epoch01.pt")
model2 = load_model("x3d_multitask_epoch02.pt")
model3 = load_model("x3d_multitask_best.pt")


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
import decord

# ========================
# Class Names
# ========================
CLASS_NAMES = [
    "Abuse", "Arrest", "Arson", "Assault", "Burglary",
    "Explosion", "Fighting", "RoadAccidents", "Robbery",
    "Shooting", "Shoplifting", "Stealing", "Vandalism"
]

# ========================
# Model Definition
# ========================
class X3DMultiTask(nn.Module):
    def __init__(self, backbone=None, num_classes=13):
        super().__init__()
        if backbone is None:
            backbone = torch.hub.load("facebookresearch/pytorchvideo", "x3d_m", pretrained=True)
        self.backbone = backbone

        # locate classification head
        if hasattr(self.backbone, "head") and hasattr(self.backbone.head, "proj"):
            head = self.backbone.head
        else:
            head = self.backbone.blocks[-1]

        feat_dim = head.proj.in_features
        head.proj = nn.Identity()
        if hasattr(head, "activation"):
            head.activation = nn.Identity()

        # new heads
        self.head_bin = nn.Linear(feat_dim, 1)
        self.head_type = nn.Linear(feat_dim, num_classes)

    def forward(self, x):
        feats = self.backbone(x)  # (N, feat_dim)
        s = self.head_bin(feats).squeeze(1)  # binary logits
        c = self.head_type(feats)            # type logits
        return s, c

# ========================
# Model Loader
# ========================
def load_model(weight_path, device="cuda"):
    model = X3DMultiTask(num_classes=len(CLASS_NAMES)).to(device)
    state = torch.load(weight_path, map_location=device)
    model.load_state_dict(state)
    model.eval()
    print(f"[INFO] Loaded {weight_path}")
    return model

# ========================
# Preprocess Video (matches training)
# ========================
def preprocess_video(video_path, clip_len=16, stride=8, frame_size=224, device="cuda"):
    vr = decord.VideoReader(video_path)
    fnum = len(vr)

    clips = []
    for start in range(0, fnum - clip_len + 1, stride):
        inds = list(range(start, start + clip_len))
        inds = [min(i, fnum - 1) for i in inds]
        frames = vr.get_batch(inds)
        if hasattr(frames, "asnumpy"):
            frames = frames.asnumpy()  # (T,H,W,3)
        else:
            frames = frames.cpu().numpy()

        frames_resized = np.stack([
            cv2.resize(fr, (frame_size, frame_size), interpolation=cv2.INTER_AREA)
            for fr in frames
        ], axis=0)

        frames_resized = frames_resized.astype(np.float32) / 255.0
        frames_t = torch.from_numpy(frames_resized).permute(3, 0, 1, 2)  # (3,T,H,W)

        # Normalize
        mean = torch.tensor([0.45, 0.45, 0.45]).view(3,1,1,1)
        std  = torch.tensor([0.225, 0.225, 0.225]).view(3,1,1,1)
        frames_t = (frames_t - mean) / std

        clips.append(frames_t)

    clips = torch.stack(clips, dim=0).to(device)  # (N,3,T,H,W)
    print(f"[INFO] Extracted {clips.shape[0]} clips from {video_path}")
    return clips

# ========================
# Inference
# ========================
def run_inference(model, clips, batch_size=16):
    all_prob_bin, all_prob_type = [], []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(clips), batch_size):
            batch = clips[i:i+batch_size]
            s, c = model(batch)
            all_prob_bin.append(torch.sigmoid(s).cpu())
            all_prob_type.append(torch.softmax(c, dim=1).cpu())
    prob_bin = torch.cat(all_prob_bin, dim=0)
    prob_type = torch.cat(all_prob_type, dim=0)
    return prob_bin, prob_type

# ========================
# Segment Extraction
# ========================
def extract_segments(prob_bin, prob_type, clip_len, stride, fps=30, threshold=0.5):
    prob_bin = prob_bin.numpy()
    prob_type = prob_type.numpy()

    segments = []
    in_crime = False
    start_idx = None

    for i, p in enumerate(prob_bin):
        if not in_crime and p >= threshold:
            in_crime = True
            start_idx = i
        elif in_crime and p < threshold:
            end_idx = i - 1
            seg_types = prob_type[start_idx:end_idx+1].mean(0)
            top_class = int(np.argmax(seg_types))
            start_time = (start_idx * stride) / fps
            end_time = ((end_idx + clip_len) * stride) / fps
            segments.append((start_time, end_time, True, CLASS_NAMES[top_class]))
            in_crime = False

    if in_crime:
        end_idx = len(prob_bin) - 1
        seg_types = prob_type[start_idx:end_idx+1].mean(0)
        top_class = int(np.argmax(seg_types))
        start_time = (start_idx * stride) / fps
        end_time = ((end_idx + clip_len) * stride) / fps
        segments.append((start_time, end_time, True, CLASS_NAMES[top_class]))

    return segments

# ========================
# Main
# ========================
if __name__ == "__main__":
    video_path = r"C:\Users\shrit\Desktop\Ml_Projects\HackTheNest\UCF101\UCF-101\ApplyLipstick\v_ApplyLipstick_g01_c02.avi"
    model_path = "x3d_multitask_epoch01.pt"

    model = load_model(model_path)
    clips = preprocess_video(video_path)
    prob_bin, prob_type = run_inference(model, clips)

    segments = extract_segments(prob_bin, prob_type, clip_len=16, stride=8, fps=30, threshold=9.7)

    print("\n=== Crime Segments (epoch01) ===")
    for i, p in enumerate(prob_bin):
        start_time = (i * 8) / 30       # stride=8, fps=30
        end_time   = ((i + 16) * 8) / 30
        top_class = prob_type[i].argmax().item()
        top_score = prob_type[i].max().item()
        print(f"Clip {i:03d}: {start_time:.1f}s–{end_time:.1f}s | "
            f"crime_prob={p.item():.3f} | "
            f"type={CLASS_NAMES[top_class]} ({top_score:.2f})")
    for seg in segments:
        start, end, is_crime, crime_type = seg
        print(f"Start={start:.1f}s, End={end:.1f}s, Crime={is_crime}, Type={crime_type}")
        


Using cache found in C:\Users\shrit/.cache\torch\hub\facebookresearch_pytorchvideo_main


[INFO] Loaded x3d_multitask_epoch01.pt
[INFO] Extracted 19 clips from C:\Users\shrit\Desktop\Ml_Projects\HackTheNest\UCF101\UCF-101\ApplyLipstick\v_ApplyLipstick_g01_c02.avi

=== Crime Segments (epoch01) ===
Clip 000: 0.0s–4.3s | crime_prob=0.031 | type=Abuse (0.38)
Clip 001: 0.3s–4.5s | crime_prob=0.036 | type=Abuse (0.35)
Clip 002: 0.5s–4.8s | crime_prob=0.171 | type=Abuse (0.39)
Clip 003: 0.8s–5.1s | crime_prob=0.270 | type=Abuse (0.39)
Clip 004: 1.1s–5.3s | crime_prob=0.080 | type=Abuse (0.44)
Clip 005: 1.3s–5.6s | crime_prob=0.044 | type=Abuse (0.40)
Clip 006: 1.6s–5.9s | crime_prob=0.055 | type=Abuse (0.39)
Clip 007: 1.9s–6.1s | crime_prob=0.037 | type=Abuse (0.38)
Clip 008: 2.1s–6.4s | crime_prob=0.031 | type=Abuse (0.40)
Clip 009: 2.4s–6.7s | crime_prob=0.036 | type=Abuse (0.40)
Clip 010: 2.7s–6.9s | crime_prob=0.025 | type=Abuse (0.38)
Clip 011: 2.9s–7.2s | crime_prob=0.054 | type=Abuse (0.36)
Clip 012: 3.2s–7.5s | crime_prob=0.120 | type=Abuse (0.36)
Clip 013: 3.5s–7.7s | cri