In [None]:
from google.colab import files

# Bind the result instead of leaving `files.upload()` as the last expression of
# the cell: the returned dict holds the raw bytes of every uploaded file, and
# echoing it writes the Kaggle API key into the saved notebook output.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"anhthuh","key":"<REDACTED>"}'}

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c action-video
!unzip -q action-video.zip -d action-video


Downloading action-video.zip to /content
 99% 3.12G/3.14G [00:35<00:00, 176MB/s]
100% 3.14G/3.14G [00:35<00:00, 94.1MB/s]


In [None]:
from __future__ import annotations
import math
import random
import re
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import amp
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import InterpolationMode
import torchvision.transforms.functional as TF
from tqdm.auto import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import timm

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f'Using device: {DEVICE}')

Using device: cuda


In [None]:
# Dataset locations (extracted competition archive)
PATH_DATA_TRAIN = '/content/action-video/data/data_train'
PATH_DATA_TEST = '/content/action-video/data/test'

# Clip sampling and input resolution
NUM_FRAMES, FRAME_STRIDE, IMG_SIZE = 16, 2, 224

# Optimisation settings.
# NOTE: BATCH_SIZE, EPOCHS and GRAD_ACCUM_STEPS are reassigned by later cells,
# so the values printed here are not the ones the final training run uses.
BATCH_SIZE = 2
EPOCHS = 4
BASE_LR = 1e-5
HEAD_LR = 5e-4
WEIGHT_DECAY = 0.05
GRAD_ACCUM_STEPS = 8

# timm identifier of the image backbone
PRETRAINED_NAME = 'vit_small_patch16_224'

# Echo the active configuration, one setting per line.
print("\n".join([
    f"Train data: {PATH_DATA_TRAIN}",
    f"Test data: {PATH_DATA_TEST}",
    f"Model: {PRETRAINED_NAME}",
    f"Frames per video: {NUM_FRAMES}",
    f"Batch size: {BATCH_SIZE}",
    f"Epochs: {EPOCHS}",
]))

Train data: /content/action-video/data/data_train
Test data: /content/action-video/data/test
Model: vit_small_patch16_224
Frames per video: 16
Batch size: 2
Epochs: 4


In [None]:
class LightweightViTForAction(nn.Module):
    """Per-frame ViT encoder, mean pooling over time, linear classifier."""

    def __init__(self, num_classes=51, pretrained_name='vit_small_patch16_224'):
        super().__init__()

        # Pretrained timm backbone, created with num_classes=0.
        backbone = timm.create_model(pretrained_name, pretrained=True, num_classes=0)
        self.vit = backbone

        # Width of the per-frame feature vector produced by the backbone.
        self.embed_dim = backbone.num_features

        # Single linear layer mapping the pooled clip feature to class logits.
        self.head = nn.Linear(self.embed_dim, num_classes)

    def forward(self, video):
        """Classify a batch of clips.

        Args:
            video: tensor of shape [B, T, C, H, W].

        Returns:
            Logits of shape [B, num_classes].
        """
        batch, n_frames, chans, height, width = video.shape

        # Fold time into the batch axis so every frame is encoded independently.
        flat_frames = video.reshape(batch * n_frames, chans, height, width)
        encoded = self.vit.forward_features(flat_frames)

        # A 3-D result is a token sequence: keep only the first (CLS) token.
        per_frame = encoded[:, 0] if encoded.dim() == 3 else encoded

        # Restore the time axis and average the frame features over it.
        clip_feature = per_frame.reshape(batch, n_frames, self.embed_dim).mean(dim=1)

        return self.head(clip_feature)

print("Lightweight ViT defined")
print(f"  Backbone: {PRETRAINED_NAME}")

Lightweight ViT defined
  Backbone: vit_small_patch16_224


In [None]:
class VideoTransform:
    """Augment (training only) and normalize a clip given as a [T, C, H, W] tensor.

    Every random decision is drawn once per call and applied to the whole
    clip, so all frames of a clip receive the same crop, flip and colour change.

    Args:
        image_size: side length of the square output frames.
        is_train: when True apply the random augmentations; when False only
            resize to ``image_size`` x ``image_size``.
    """

    def __init__(self, image_size: int, is_train: bool = True):
        self.image_size = image_size
        self.is_train = is_train
        # Per-channel statistics used for normalization (ImageNet values).
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]

    def __call__(self, frames: torch.Tensor) -> torch.Tensor:
        """Return the transformed clip, shape [T, C, image_size, image_size]."""
        # frames: [T, C, H, W]

        if self.is_train:
            # Random down-scale of the whole clip (factor in [0.8, 1.0]).
            h, w = frames.shape[-2:]
            scale = random.uniform(0.8, 1.0)
            new_h, new_w = int(h * scale), int(w * scale)
            frames = TF.resize(frames, [new_h, new_w], interpolation=InterpolationMode.BILINEAR)

            # Random crop of at most image_size per side; the max/min guards
            # cover frames that are already smaller than the target.
            i = random.randint(0, max(0, new_h - self.image_size))
            j = random.randint(0, max(0, new_w - self.image_size))
            frames = TF.crop(frames, i, j, min(self.image_size, new_h), min(self.image_size, new_w))
            # Bring the crop to exactly image_size x image_size.
            frames = TF.resize(frames, [self.image_size, self.image_size], interpolation=InterpolationMode.BILINEAR)

            # Horizontal flip
            if random.random() < 0.5:
                frames = TF.hflip(frames)

            # Mild colour jitter: each of brightness, contrast and saturation is
            # applied independently with probability 0.3 and a factor in [0.9, 1.1].
            if random.random() < 0.3:
                brightness_factor = random.uniform(0.9, 1.1)
                frames = TF.adjust_brightness(frames, brightness_factor)

            if random.random() < 0.3:
                contrast_factor = random.uniform(0.9, 1.1)
                frames = TF.adjust_contrast(frames, contrast_factor)

            if random.random() < 0.3:
                saturation_factor = random.uniform(0.9, 1.1)
                frames = TF.adjust_saturation(frames, saturation_factor)
        else:
            # Val/test: deterministic resize straight to the target square.
            # This is not a center crop; the aspect ratio is not preserved.
            frames = TF.resize(frames, [self.image_size, self.image_size], interpolation=InterpolationMode.BILINEAR)

        # TF.normalize accepts a batched (B, C, H, W) tensor, so the whole clip
        # is normalized in one call instead of frame by frame plus a re-stack.
        return TF.normalize(frames, self.mean, self.std)

In [None]:
from pathlib import Path
from typing import List, Tuple, Union
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image


class VideoDataset(Dataset):
    """
    Competition TRAIN dataset.

    Expected structure:
      root/
        class_1/
          video_folder_A/  (contains frames .jpg/.png)
          video_folder_B/
        class_2/
          ...

    Class indices follow the alphabetical order of the class folder names.
    Video folders without any image file are skipped.

    Returns:
      video: (T, C, H, W) float tensor (normalized by the transform, if given)
      label: int
    """

    IMG_EXTS = {".jpg", ".jpeg", ".png"}

    def __init__(
        self,
        root: str,
        num_frames: int = 16,
        frame_stride: int = 2,
        image_size: int = 224,
        is_train: bool = True,
        transform=None,  # pass VideoTransform(image_size, is_train=...)
    ):
        """Scan ``root`` and index every video folder.

        Args:
            root: directory containing one sub-folder per class.
            num_frames: number of frames returned per clip.
            frame_stride: step between sampled frame indices (forced to >= 1).
            image_size: stored for reference; resizing is done by ``transform``.
            is_train: stored for reference; frame sampling is identical in
                both modes, the train/eval difference lives in ``transform``.
            transform: optional callable applied to the (T, C, H, W) clip.

        Raises:
            RuntimeError: if no class folder or no video with frames is found.
        """
        super().__init__()
        self.root = Path(root)
        self.num_frames = int(num_frames)
        self.frame_stride = max(1, int(frame_stride))
        self.image_size = int(image_size)
        self.is_train = bool(is_train)

        self.transform = transform  # expects VideoTransform
        self.to_tensor = transforms.ToTensor()

        # classes
        self.classes = sorted([d.name for d in self.root.iterdir() if d.is_dir()])
        if not self.classes:
            raise RuntimeError(f"No class folders found in: {self.root}")

        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}

        # samples: list of (frame_paths, label)
        # NOTE(review): frames are ordered by plain path sorting, which matches
        # temporal order only if frame numbers are zero-padded; confirm.
        self.samples: List[Tuple[List[Path], int]] = []
        for cls in self.classes:
            cls_dir = self.root / cls
            for video_dir in sorted([d for d in cls_dir.iterdir() if d.is_dir()]):
                frame_paths = sorted(
                    [p for p in video_dir.iterdir() if p.suffix.lower() in self.IMG_EXTS]
                )
                if frame_paths:
                    self.samples.append((frame_paths, self.class_to_idx[cls]))

        if not self.samples:
            raise RuntimeError(f"No videos found under: {self.root}")

    def __len__(self) -> int:
        return len(self.samples)

    def _select_indices(self, total: int, start_offset: int = 0) -> torch.Tensor:
        """Pick ``num_frames`` frame indices spaced ``frame_stride`` apart.

        Args:
            total: number of frames available in the video folder.
            start_offset: index of the first sampled frame; clamped into
                ``[0, total - 1]``.

        Returns:
            Long tensor of length ``num_frames``. When the video is too short,
            indices past the end are clamped, i.e. the last frame is repeated.

        Raises:
            ValueError: if ``total`` is not positive.
        """
        if total <= 0:
            raise ValueError("No frames")

        # Clamp offset so it's always valid
        start_offset = int(max(0, min(start_offset, total - 1)))

        idxs = start_offset + torch.arange(self.num_frames) * self.frame_stride
        # Clamping is a no-op when the whole stride sequence fits in the video
        # and pads with the last frame when it does not, so one code path
        # covers both cases.
        return idxs.clamp(0, total - 1).long()

    def __getitem__(self, idx: int):
        """Load the clip for sample ``idx`` and return ``(video, label)``."""
        frame_paths, label = self.samples[idx]
        total = len(frame_paths)

        idxs = self._select_indices(total).tolist()

        frames = []
        for i in idxs:
            path = frame_paths[i]
            with Image.open(path) as img:
                img = img.convert("RGB")
                frames.append(self.to_tensor(img))  # (C,H,W) float [0,1]

        video = torch.stack(frames, dim=0)  # (T,C,H,W)

        if self.transform is not None:
            video = self.transform(video)

        return video, label


class TestDataset(Dataset):
    """Unlabelled test clips stored as ``root/<numeric id>/<frame images>``.

    Each item is ``(video, video_id)`` where ``video`` is the transformed
    (T, C, H, W) clip and ``video_id`` is the folder name parsed as an int.
    Items are ordered by ascending video id.
    """

    IMG_EXTS = (".jpg", ".jpeg", ".png")

    def __init__(self, root, num_frames=16, frame_stride=2, image_size=224, transform=None, start_offset=0):
        """Index the numeric video folders under ``root``.

        Args:
            root: directory holding one folder per test video.
            num_frames: number of frames returned per clip.
            frame_stride: step between sampled frame indices.
            image_size: only used to build the default transform.
            transform: callable applied to the clip; defaults to
                ``VideoTransform(image_size, is_train=False)``.
            start_offset: index of the first sampled frame (clamped per video).
        """
        self.root = Path(root)
        self.num_frames = num_frames
        self.frame_stride = frame_stride
        self.transform = transform or VideoTransform(image_size=image_size, is_train=False)
        self.to_tensor = transforms.ToTensor()
        self.start_offset = int(start_offset)

        self.video_dirs = self._numeric_video_dirs(self.root)
        self.video_ids = [int(d.name) for d in self.video_dirs]

    @staticmethod
    def _numeric_video_dirs(root):
        """Return the sub-directories of ``root`` whose name parses as an int, sorted by that int."""
        keyed = []
        for entry in Path(root).iterdir():
            if not entry.is_dir():
                continue
            try:
                keyed.append((int(entry.name), entry))
            except ValueError:
                # Not a video folder (e.g. a stray `.ipynb_checkpoints`);
                # skip it instead of aborting the whole dataset scan.
                continue
        keyed.sort(key=lambda pair: pair[0])
        return [entry for _, entry in keyed]

    def _select_indices(self, total: int, start_offset: int = 0) -> torch.Tensor:
        """Pick ``num_frames`` frame indices spaced ``frame_stride`` apart.

        ``start_offset`` is clamped into ``[0, total - 1]``. Indices that would
        run past the end are clamped, i.e. the last frame is repeated.

        Raises:
            ValueError: if ``total`` is not positive.
        """
        if total <= 0:
            raise ValueError("No frames")

        # Clamp offset so it's always valid
        start_offset = int(max(0, min(start_offset, total - 1)))

        idxs = start_offset + torch.arange(self.num_frames) * self.frame_stride
        # Clamping is a no-op when the sequence fits and pads with the last
        # frame when it does not, so a single path handles both cases.
        return idxs.clamp(0, total - 1).long()

    def __len__(self):
        return len(self.video_dirs)

    def __getitem__(self, idx):
        """Load the clip for test video ``idx`` and return ``(video, video_id)``."""
        video_dir = self.video_dirs[idx]
        video_id = self.video_ids[idx]

        frame_paths = sorted([p for p in video_dir.iterdir() if p.suffix.lower() in self.IMG_EXTS])
        total = len(frame_paths)

        idxs = self._select_indices(total, start_offset=self.start_offset).tolist()

        frames = []
        for i in idxs:
            path = frame_paths[i]
            with Image.open(path) as img:
                img = img.convert("RGB")
                frames.append(self.to_tensor(img))

        video = torch.stack(frames)          # [T,C,H,W]
        video = self.transform(video)        # normalize etc
        return video, video_id


def collate_fn(batch):
    """
    Merge a list of (video, second) samples into one batch.

    Videos are stacked along a new leading axis -> (B,T,C,H,W).
    The second elements (train labels or test video ids) are packed into a
    long tensor when the first of them is an int; otherwise they are
    returned unchanged as a Python list.
    """
    clips, extras = [], []
    for sample in batch:
        clips.append(sample[0])
        extras.append(sample[1])

    videos = torch.stack(clips, dim=0)
    if isinstance(extras[0], int):
        extras = torch.tensor(extras, dtype=torch.long)
    return videos, extras


In [None]:
import re
import numpy as np
from collections import defaultdict
from pathlib import Path
from torch.utils.data import DataLoader

# Dataset roots. There is no separate validation folder: the validation set
# is carved out of TRAIN_ROOT further down in this cell.
TRAIN_ROOT = "/content/action-video/data/data_train"
TEST_ROOT  = "/content/action-video/data/test"

# Clip geometry
NUM_FRAMES = 16
FRAME_STRIDE = 2
IMG_SIZE = 224

# DataLoader settings
BATCH_SIZE = 8
NUM_WORKERS = 4

# One augmenting transform for training, one deterministic for val/test.
train_tf = VideoTransform(image_size=IMG_SIZE, is_train=True)
val_tf   = VideoTransform(image_size=IMG_SIZE, is_train=False)

# Full, unsplit training set; later cells read `full_dataset.classes` from it.
full_dataset = VideoDataset(
    root=TRAIN_ROOT,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
    image_size=IMG_SIZE,
    is_train=True,
    transform=train_tf,
)


def base_video_name(name: str) -> str:
    """
    Derive the group id of a video from its (folder) name.

    Only the last path component is considered, and a trailing
    "_<digits>" or "-<digits>" suffix is removed:
      "jump_001" -> "jump"
      "jump-001" -> "jump"
      "jump001"  -> "jump001" (no separator, left as is)
    """
    trailing_index = r'([_-])\d+$'
    leaf = Path(name).name
    return re.sub(trailing_index, '', leaf)

def make_group_ids_from_dataset(ds):
    """
    Return one group id per entry of ds.samples, in the same order.

    The id is (label, base name of the folder holding the first frame);
    including the label keeps equally named videos of different classes apart.
    If the dataset ever stores the video folder or an explicit id, use that
    field here instead of re-deriving it from the frame path.
    """
    return [
        # frame_paths[0] looks like .../class_name/<video_dir>/frame001.jpg
        (label, base_video_name(Path(frame_paths[0]).parent.name))
        for frame_paths, label in ds.samples
    ]

def group_split_indices(ds, val_ratio=0.1, seed=42):
    """
    Split sample indices into train/val so that a group never straddles both.

    Args:
        ds: dataset exposing `.samples` (see make_group_ids_from_dataset).
        val_ratio: fraction of *groups* (not samples) sent to validation;
            at least one group is always held out.
        seed: seed of the RandomState used to shuffle the groups.

    Returns:
        (train_idx, val_idx): two lists of indices into ds.samples.
    """
    # Bucket sample indices by group id, keeping first-seen group order.
    members = defaultdict(list)
    for sample_idx, gid in enumerate(make_group_ids_from_dataset(ds)):
        members[gid].append(sample_idx)

    # Seeded shuffle of the group ids decides which groups are held out.
    shuffled = list(members)
    np.random.RandomState(seed).shuffle(shuffled)

    n_val_groups = max(1, int(len(shuffled) * val_ratio))
    held_out = set(shuffled[:n_val_groups])

    train_idx, val_idx = [], []
    for gid, idxs in members.items():
        target = val_idx if gid in held_out else train_idx
        target.extend(idxs)

    return train_idx, val_idx

# Clip settings shared by every VideoDataset built in this cell.
_clip_kwargs = dict(num_frames=NUM_FRAMES, frame_stride=FRAME_STRIDE, image_size=IMG_SIZE)

# -------------------------
# Scan the training folder once, without a transform, and split by group so
# related clips never end up on both sides of the split (no leakage).
# -------------------------
full_scan = VideoDataset(TRAIN_ROOT, is_train=True, transform=None, **_clip_kwargs)
train_idx, val_idx = group_split_indices(full_scan, val_ratio=0.1, seed=42)

# -------------------------
# Two independent dataset objects so train and val keep their own transform.
# -------------------------
train_dataset = VideoDataset(TRAIN_ROOT, is_train=True, transform=train_tf, **_clip_kwargs)
val_dataset = VideoDataset(TRAIN_ROOT, is_train=False, transform=val_tf, **_clip_kwargs)

# Give both the class mapping of the scan and only their share of the samples.
for _split, _indices in ((train_dataset, train_idx), (val_dataset, val_idx)):
    _split.classes = full_scan.classes
    _split.class_to_idx = full_scan.class_to_idx
    _split.samples = [full_scan.samples[i] for i in _indices]

# -------------------------
# Dataloaders (only the training loader shuffles)
# -------------------------
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=collate_fn,
)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

print(f"Train clips: {len(train_dataset)} | Val clips: {len(val_dataset)}")


Train clips: 5600 | Val clips: 654




In [None]:
# Root folder of the unlabelled test videos.
TEST_ROOT = "/content/action-video/data/test"

# Deterministic (non-augmenting) transform for inference.
test_tf = VideoTransform(image_size=IMG_SIZE, is_train=False)

_test_clip_kwargs = dict(num_frames=NUM_FRAMES, frame_stride=FRAME_STRIDE, image_size=IMG_SIZE)
test_dataset = TestDataset(root=TEST_ROOT, transform=test_tf, **_test_clip_kwargs)

# Keep the dataset order (shuffle=False) so predictions line up with video ids.
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=collate_fn,
)

print(f"Test clips: {len(test_dataset)} | Test batches: {len(test_loader)}")


Test clips: 510 | Test batches: 64


In [None]:
import torch
import torch.nn as nn

# If not installed in your environment:
# !pip -q install transformers accelerate

from transformers import VideoMAEForVideoClassification

class VideoMAEAction(nn.Module):
    """
    Thin wrapper around Hugging Face `VideoMAEForVideoClassification`.

    Takes a clip batch shaped [B, T, C, H, W] and returns only the logits,
    so it can be used like a plain classifier module.
    """
    def __init__(self, num_classes: int, pretrained_id: str = "MCG-NJU/videomae-base"):
        super().__init__()

        load_kwargs = {
            "num_labels": num_classes,
            # The checkpoint head does not match num_classes; this lets the
            # loader build a freshly initialised classifier of the right size.
            "ignore_mismatched_sizes": True,
        }
        self.model = VideoMAEForVideoClassification.from_pretrained(pretrained_id, **load_kwargs)

    def forward(self, video: torch.Tensor) -> torch.Tensor:
        """Return class logits [B, num_classes] for `video` [B, T, C, H, W]."""
        return self.model(pixel_values=video).logits


In [None]:
import math
from transformers import get_cosine_schedule_with_warmup

# Hyper-parameters of the VideoMAE fine-tuning run.
EPOCHS = 20
BASE_LR = 1e-5            # good starting point for pretrained video transformer
WEIGHT_DECAY = 0.05
WARMUP_RATIO = 0.1        # share of the optimizer steps used for LR warm-up
GRAD_ACCUM_STEPS = 4      # micro-batches accumulated per optimizer step

# NOTE: the optimizer, LR scheduler and GradScaler are deliberately NOT built
# in this cell. They need `model`, which is first created in a later cell, so
# building them here raised NameError on a fresh kernel (Restart & Run All).
# The cells that instantiate the model also create optimizer/scheduler/scaler.


In [None]:
import torch.nn.functional as F
from tqdm import tqdm

def train_one_epoch(model, loader, optimizer, scheduler, scaler, device, grad_accum_steps=1):
    """Train `model` for one pass over `loader` with AMP and gradient accumulation.

    Args:
        model: module mapping videos [B,T,C,H,W] to logits [B,num_classes].
        loader: iterable of (videos, labels) batches; must support len().
        optimizer: optimizer stepped once every `grad_accum_steps` batches
            (and once more on the final batch of the epoch).
        scheduler: LR scheduler stepped together with the optimizer.
        scaler: GradScaler used to scale the loss and step the optimizer.
        device: torch.device the batches are moved to; autocast is enabled
            only when its type is "cuda".
        grad_accum_steps: number of micro-batches accumulated per update.

    Returns:
        (mean_loss, accuracy) over all samples seen. The loss is the
        label-smoothed cross entropy (smoothing 0.1) before the
        accumulation division.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    use_amp = device.type == "cuda"

    optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(loader, desc="Train", leave=False)

    for step, (videos, labels) in enumerate(pbar):
        videos = videos.to(device, non_blocking=True)   # [B,T,C,H,W]
        labels = labels.to(device, non_blocking=True)   # [B]

        # torch.amp.autocast replaces the deprecated torch.cuda.amp.autocast,
        # which emitted a FutureWarning on every epoch.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(videos)                      # [B,num_classes]
            loss = F.cross_entropy(logits, labels, label_smoothing=0.1)
            # Divide so the accumulated gradient is an average over micro-batches.
            loss = loss / grad_accum_steps

        scaler.scale(loss).backward()

        # metrics (undo the accumulation division so the logged loss is per batch)
        with torch.no_grad():
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            total_loss += (loss.item() * grad_accum_steps) * labels.size(0)

        # Update on every full accumulation window, and on the last batch so
        # a trailing partial window is not dropped.
        if (step + 1) % grad_accum_steps == 0 or (step + 1) == len(loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        pbar.set_postfix(loss=f"{(total_loss/max(total,1)):.4f}", acc=f"{(correct/max(total,1)):.4f}")

    return total_loss / max(total, 1), correct / max(total, 1)


@torch.no_grad()
def evaluate(model, loader, device):
    """Compute mean cross-entropy loss and accuracy of `model` over `loader`.

    Runs in eval mode without gradients and without label smoothing.

    Returns:
        (mean_loss, accuracy), both averaged over the number of samples;
        (0.0, 0.0) when the loader yields nothing.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    progress = tqdm(loader, desc="Val", leave=False)
    for videos, labels in progress:
        videos = videos.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        batch_size = labels.size(0)

        logits = model(videos)
        batch_loss = F.cross_entropy(logits, labels).item()

        # Accumulate sample-weighted sums so the final averages are exact
        # even when the last batch is smaller.
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size
        loss_sum += batch_loss * batch_size

        seen = max(n_seen, 1)
        progress.set_postfix(loss=f"{(loss_sum/seen):.4f}", acc=f"{(n_correct/seen):.4f}")

    seen = max(n_seen, 1)
    return loss_sum / seen, n_correct / seen


In [None]:
# One output per class folder found in the training set.
num_classes = len(full_dataset.classes)
model = VideoMAEAction(num_classes=num_classes, pretrained_id="MCG-NJU/videomae-base")
model = model.to(DEVICE)

# Plain AdamW: the same weight decay is applied to every parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=BASE_LR,
    weight_decay=WEIGHT_DECAY,
)

# The scheduler counts optimizer updates, not batches, hence the division
# by the accumulation factor.
train_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
total_steps = EPOCHS * train_steps_per_epoch
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Loss scaling is only active when running on CUDA.
scaler = torch.amp.GradScaler('cuda', enabled=(DEVICE.type == "cuda"))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ---- model: fresh VideoMAE with one output per training class ----
num_classes = len(full_dataset.classes)
model = VideoMAEAction(num_classes=num_classes, pretrained_id="MCG-NJU/videomae-base")
model = model.to(DEVICE)

# ---- optimizer: weight decay on weight matrices only ----
# 1-D tensors, biases and LayerNorm parameters go to the no-decay group.
decay, no_decay = [], []
for n, p in model.named_parameters():
    if p.requires_grad:
        lowered = n.lower()
        skip_decay = (
            p.ndim == 1
            or n.endswith(".bias")
            or "layernorm" in lowered
            or "layer_norm" in lowered
        )
        (no_decay if skip_decay else decay).append(p)

param_groups = [
    {"params": decay, "weight_decay": WEIGHT_DECAY},
    {"params": no_decay, "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(param_groups, lr=BASE_LR)

# ---- scheduler: cosine decay with warm-up, counted in optimizer updates ----
train_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
total_steps = EPOCHS * train_steps_per_epoch
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# ---- AMP loss scaler (active on CUDA only) ----
scaler = torch.amp.GradScaler('cuda', enabled=(DEVICE.type == "cuda"))


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from pathlib import Path
import torch

# Best validation result seen so far (accuracy and 0-based epoch index).
best_acc = -1.0
best_epoch = -1

checkpoint_path = Path("best_videomae.pt")

# Early stopping: give up after `patience` consecutive epochs whose
# validation accuracy does not beat the best by more than `min_delta`.
patience = 5
min_delta = 1e-4
wait = 0

# Class names are stored in the checkpoint so inference can rebuild the
# same index -> name mapping that the dataset labels use.
classes = full_dataset.classes

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, scheduler, scaler, DEVICE, grad_accum_steps=GRAD_ACCUM_STEPS
    )
    val_loss, val_acc = evaluate(model, val_loader, DEVICE)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val   Loss: {val_loss:.4f} | Val   Acc: {val_acc:.4f}")

    improved = val_acc > best_acc + min_delta
    if not improved:
        # ---- no progress: count it and maybe stop ----
        wait += 1
        print(f"No improvement. patience {wait}/{patience}")
        if wait >= patience:
            print(f"ðŸ›‘ Early stopping at epoch {epoch+1}. Best val_acc={best_acc:.4f} (epoch {best_epoch+1})")
            break
        continue

    # ---- new best: reset the patience counter and checkpoint the model ----
    best_acc = val_acc
    best_epoch = epoch
    wait = 0

    # To support resuming, optimizer / scheduler / scaler state dicts could
    # be added to this dict as well.
    state = {
        "model": model.state_dict(),
        "classes": classes,
        "best_acc": best_acc,
        "epoch": epoch + 1,
    }
    torch.save(state, checkpoint_path)
    print(f"âœ… Best model saved (val_acc={best_acc:.4f}) -> {checkpoint_path}")

print("\n" + "-" * 40)
print(f"Training completed! Best val accuracy: {best_acc:.4f} (epoch {best_epoch+1})")
print(f"Best model saved to: {checkpoint_path}")




Epoch 1/20


  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


Train Loss: 3.9426 | Train Acc: 0.0371
Val   Loss: 3.8761 | Val   Acc: 0.0795
âœ… Best model saved (val_acc=0.0795) -> best_videomae.pt

Epoch 2/20




Train Loss: 3.6839 | Train Acc: 0.1030
Val   Loss: 3.5423 | Val   Acc: 0.1208
âœ… Best model saved (val_acc=0.1208) -> best_videomae.pt

Epoch 3/20




Train Loss: 3.1695 | Train Acc: 0.2409
Val   Loss: 2.9443 | Val   Acc: 0.2584
âœ… Best model saved (val_acc=0.2584) -> best_videomae.pt

Epoch 4/20




Train Loss: 2.6346 | Train Acc: 0.4443
Val   Loss: 2.5201 | Val   Acc: 0.3807
âœ… Best model saved (val_acc=0.3807) -> best_videomae.pt

Epoch 5/20




Train Loss: 2.2279 | Train Acc: 0.5682
Val   Loss: 2.1545 | Val   Acc: 0.4771
âœ… Best model saved (val_acc=0.4771) -> best_videomae.pt

Epoch 6/20




Train Loss: 1.9251 | Train Acc: 0.6589
Val   Loss: 1.9568 | Val   Acc: 0.5214
âœ… Best model saved (val_acc=0.5214) -> best_videomae.pt

Epoch 7/20




Train Loss: 1.6988 | Train Acc: 0.7221
Val   Loss: 1.7837 | Val   Acc: 0.5627
âœ… Best model saved (val_acc=0.5627) -> best_videomae.pt

Epoch 8/20




Train Loss: 1.5395 | Train Acc: 0.7654
Val   Loss: 1.7681 | Val   Acc: 0.5398
No improvement. patience 1/5

Epoch 9/20




Train Loss: 1.4003 | Train Acc: 0.8159
Val   Loss: 1.6406 | Val   Acc: 0.5642
âœ… Best model saved (val_acc=0.5642) -> best_videomae.pt

Epoch 10/20




Train Loss: 1.2981 | Train Acc: 0.8384
Val   Loss: 1.5347 | Val   Acc: 0.6070
âœ… Best model saved (val_acc=0.6070) -> best_videomae.pt

Epoch 11/20




Train Loss: 1.2165 | Train Acc: 0.8670
Val   Loss: 1.5094 | Val   Acc: 0.5948
No improvement. patience 1/5

Epoch 12/20




Train Loss: 1.1443 | Train Acc: 0.8914
Val   Loss: 1.4833 | Val   Acc: 0.6208
âœ… Best model saved (val_acc=0.6208) -> best_videomae.pt

Epoch 13/20




Train Loss: 1.0923 | Train Acc: 0.9118
Val   Loss: 1.4933 | Val   Acc: 0.6162
No improvement. patience 1/5

Epoch 14/20




Train Loss: 1.0570 | Train Acc: 0.9173
Val   Loss: 1.4854 | Val   Acc: 0.6070
No improvement. patience 2/5

Epoch 15/20




Train Loss: 1.0192 | Train Acc: 0.9268
Val   Loss: 1.4565 | Val   Acc: 0.6070
No improvement. patience 3/5

Epoch 16/20




Train Loss: 0.9929 | Train Acc: 0.9380
Val   Loss: 1.4674 | Val   Acc: 0.6086
No improvement. patience 4/5

Epoch 17/20


                                                                             

Train Loss: 0.9843 | Train Acc: 0.9402
Val   Loss: 1.4797 | Val   Acc: 0.6116
No improvement. patience 5/5
ðŸ›‘ Early stopping at epoch 17. Best val_acc=0.6208 (epoch 12)

----------------------------------------
Training completed! Best val accuracy: 0.6208 (epoch 12)
Best model saved to: best_videomae.pt




In [None]:
# =========================
# FINAL INFERENCE ON TEST SET (VideoMAE)
# =========================
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# ---- MUST EXIST from your previous cells ----
# DEVICE, IMG_SIZE, NUM_FRAMES, FRAME_STRIDE, BATCH_SIZE, NUM_WORKERS, TEST_ROOT
# VideoMAEAction (your model wrapper)
# TestDataset (returns: (video_tensor[T,C,H,W], video_id:int))
# VideoTransform (with is_train=False)
# --------------------------------------------

# âœ… Final collate_fn (works for train OR test)
def collate_fn(batch):
    """Merge a list of ``(video, label_or_id)`` samples into one batch.

    Args:
        batch: sequence of samples; element 0 of each sample is a video
            tensor (all of the same shape) and element 1 is either an int
            (class label or video id) or any other per-sample object.

    Returns:
        ``(videos, second)`` where ``videos`` is the samples' video tensors
        stacked along a new leading batch dimension. ``second`` is a
        ``torch.long`` tensor of shape ``[B]`` when the first sample's second
        element is an ``int``; otherwise it is the plain Python list of the
        second elements, left untouched.
    """
    clip_batch = torch.stack([sample[0] for sample in batch], dim=0)
    extras = [sample[1] for sample in batch]

    # Only the first sample is inspected to decide how to pack the extras.
    if not isinstance(extras[0], int):
        return clip_batch, extras

    return clip_batch, torch.tensor(extras, dtype=torch.long)

# Restore the best checkpoint, rebuild the model around it, and build a test
# dataset/loader so the number of test clips and batches can be reported.
print("INFERENCE ON TEST SET")

# File name of the checkpoint to restore; the training log above reports the
# best model being saved under this same name.
checkpoint_path = Path("best_videomae.pt")   # <-- must match what you saved
print(f"Loading checkpoint from {checkpoint_path}...")

# The checkpoint is read as a dict; this cell uses its "classes", "model" and
# "best_acc" entries. map_location="cpu" keeps the loaded tensors on the CPU;
# the model is moved to DEVICE further down.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
ckpt = torch.load(checkpoint_path, map_location="cpu")
classes = ckpt["classes"]                    # list[str]
# The number of class names is passed to the model as num_classes.
num_classes = len(classes)

# rebuild SAME model as training
model = VideoMAEAction(num_classes=num_classes, pretrained_id="MCG-NJU/videomae-base")
# strict=True: fail loudly if the checkpoint's keys do not match the model's.
model.load_state_dict(ckpt["model"], strict=True)
model = model.to(DEVICE)
# Put the model in evaluation mode before running inference.
model.eval()

print(f"Model loaded (best val acc: {ckpt['best_acc']:.4f})")

# build test loader
print("\nLoading test dataset...")
# Transform built with is_train=False (the non-training variant).
test_tf = VideoTransform(image_size=IMG_SIZE, is_train=False)

# No start_offset is passed here, so TestDataset uses whatever default it
# defines; predict_test_tta below builds its own datasets with explicit offsets.
test_dataset = TestDataset(
    root=TEST_ROOT,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
    image_size=IMG_SIZE,
    transform=test_tf
)

# shuffle=False keeps batches in dataset order.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=collate_fn
)

# Sanity report: dataset length and number of batches the loader will yield.
print(f"Test clips: {len(test_dataset)} | Test batches: {len(test_loader)}")


# run inference

@torch.no_grad()
def predict_test_tta(
    model,
    classes,                 # list[str]
    test_root,
    num_frames,
    frame_stride,
    img_size,
    batch_size,
    num_workers,
    device,
    offsets=(0, 4, 8),       # try (0, 4, 8) or (0, 2, 4, 6)
):
    """Predict one class per test video, averaging logits over start offsets.

    For every value in ``offsets`` a fresh ``TestDataset`` is built with that
    ``start_offset``, the whole test set is run through ``model``, and the
    resulting logits are accumulated. The per-video logits are then averaged
    over all offsets and the arg-max index is mapped to a class name.

    Args:
        model: module called as ``model(videos)``; expected to return logits
            of shape ``[B, num_classes]``.
        classes: list of class names indexed by predicted class index.
        test_root: root passed to ``TestDataset``.
        num_frames: ``num_frames`` passed to ``TestDataset``.
        frame_stride: ``frame_stride`` passed to ``TestDataset``.
        img_size: image size passed to both ``VideoTransform`` and
            ``TestDataset``.
        batch_size: batch size of each ``DataLoader``.
        num_workers: worker count of each ``DataLoader``.
        device: device the video batches are moved to before the forward pass.
        offsets: iterable of ``start_offset`` values, one inference pass each.

    Returns:
        List of ``(video_id, class_name)`` tuples sorted by ``video_id``.

    Raises:
        ValueError: if ``offsets`` is empty.
        RuntimeError: if a pass yields no batches (empty test set), or if the
            video ids come back in a different order for different offsets.
    """
    # Materialize once: lets callers pass any iterable and makes len() safe.
    # Validate before doing any work so an empty sequence does not end in an
    # opaque "None / 0.0" TypeError after the loop.
    offsets = tuple(offsets)
    if not offsets:
        raise ValueError("offsets must contain at least one start offset.")

    model.eval()

    # build ONE transform (val/test), shared by every offset pass
    test_tf = VideoTransform(image_size=img_size, is_train=False)

    logits_sum = None
    video_ids_ref = None

    for off in offsets:
        ds = TestDataset(
            root=test_root,
            num_frames=num_frames,
            frame_stride=frame_stride,
            image_size=img_size,
            transform=test_tf,
            start_offset=off,
        )

        loader = DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=True,
            collate_fn=collate_fn
        )

        batch_logits = []
        batch_ids = []

        for videos, video_ids in tqdm(loader, desc=f"Infer offset={off}", leave=False):
            videos = videos.to(device, non_blocking=True)
            logits = model(videos)  # [B,num_classes]
            # Keep results on the CPU in float32 so accumulation does not
            # hold device memory across passes.
            batch_logits.append(logits.float().cpu())
            batch_ids.append(video_ids.cpu())

        if not batch_logits:
            # torch.cat on an empty list fails with an unhelpful message.
            raise RuntimeError(
                f"No test clips found under {test_root!r}; nothing to predict."
            )

        all_logits = torch.cat(batch_logits, dim=0)  # [N,num_classes]
        all_ids = torch.cat(batch_ids, dim=0)        # [N]

        # ensure consistent ordering across offsets: summing logits row by
        # row is only meaningful if row i is the same video in every pass
        if video_ids_ref is None:
            video_ids_ref = all_ids
        elif not torch.equal(video_ids_ref, all_ids):
            raise RuntimeError("Video id order mismatch across offsets. Check sorting in TestDataset.")

        logits_sum = all_logits if logits_sum is None else (logits_sum + all_logits)

    logits_avg = logits_sum / float(len(offsets))
    pred_idx = logits_avg.argmax(dim=1)  # [N]

    # build list[(video_id, class_name)], ordered by video id
    preds = [
        (vid, classes[pi])
        for vid, pi in zip(video_ids_ref.tolist(), pred_idx.tolist())
    ]
    return sorted(preds, key=lambda pair: pair[0])


INFERENCE ON TEST SET
Loading checkpoint from best_videomae.pt...


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded (best val acc: 0.6208)

Loading test dataset...
Test clips: 510 | Test batches: 64


In [None]:
# Run inference over the test set, one pass per start offset, and keep the
# resulting (video_id, class_name) pairs for the submission cell.
predictions = predict_test_tta(
    model,
    classes,
    test_root=TEST_ROOT,
    num_frames=NUM_FRAMES, frame_stride=FRAME_STRIDE, img_size=IMG_SIZE,
    batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
    device=DEVICE,
    offsets=(0, 4, 8),
)
print(f"Total predictions: {len(predictions)}")


                                                               

Total predictions: 510




In [None]:
# save submission.csv
import csv  # imported here so this cell runs without editing the import cell

# Write the predictions as a two-column CSV: header "id,class", then one row
# per (video_id, class_name) pair in the order stored in `predictions`.
submission_path = Path("submission.csv")
# newline="" hands line-ending control to the csv writer, and
# lineterminator="\n" keeps the rows "\n"-terminated as before.
with open(submission_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["id", "class"])
    for video_id, pred_class in predictions:
        # csv.writer quotes a field that contains a comma, quote or newline,
        # so an unusual class name cannot break the two-column layout.
        writer.writerow([video_id, pred_class])

print(f"Submission saved to: {submission_path.resolve()}")


Submission saved to: /content/submission.csv
