In [1]:
import os
import json
from dataclasses import dataclass
from typing import Dict, List, Any, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset

import av  # pip install av
from transformers import (
    AutoImageProcessor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer,
)

import evaluate  # pip install evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Wildcard-import the project helpers from train.py. The cells below use
# LABEL2ID, ID2LABEL, BoxingDataset, VideoDataCollator and compute_metrics,
# none of which are imported explicitly elsewhere in this notebook, so they are
# presumably supplied by this import.
# NOTE(review): train.py is not visible from here; consider replacing the
# wildcard with an explicit name list once its exports are confirmed.
from train import *
import os
# Set WANDB_DIR before any wandb run is started by the Trainer
# (TrainingArguments below uses report_to="wandb").
# NOTE(review): presumably this redirects wandb's local run files to /tmp/wandb — confirm.
os.environ["WANDB_DIR"] = "/tmp/wandb"

In [3]:
# Root folder of the Olympic Boxing dataset (relative to the notebook's working directory).
DATASET_DIR = "Olympic Boxing Punch Classification Video Dataset"

# Hub checkpoint to fine-tune: pretrained VideoMAE base (self-supervised on K400).
model_name = "MCG-NJU/videomae-base"

# The processor and the classifier are both loaded from the same checkpoint.
image_processor = AutoImageProcessor.from_pretrained(model_name)

# The classification head is sized from the label maps, so it is freshly
# initialised rather than loaded from the checkpoint (see the warning printed
# below this cell).
model = VideoMAEForVideoClassification.from_pretrained(
    model_name, num_labels=len(LABEL2ID), label2id=LABEL2ID, id2label=ID2LABEL
)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Report which device is available; move the model to the GPU when there is one.
if not torch.cuda.is_available():
    print("Using CPU")
else:
    model.to("cuda")
    print("Using CUDA")

Using CUDA


## Data Loading Options

This notebook supports two data loading modes:

### 1. On-demand loading (DEFAULT - slower, CPU-bound)
- Decodes videos at runtime during training
- Memory efficient but CPU intensive
- GPU often waits for data to be decoded

### 2. Preprocessed loading (RECOMMENDED - faster, GPU-bound)
- Pre-extracts all clips to numpy arrays (one-time operation)
- **10-50x faster data loading**
- Eliminates CPU bottleneck, maximizes GPU utilization

**To use preprocessed mode:**
1. Run preprocessing script once: `python preprocess_clips.py`
2. Set `USE_PREPROCESSED = True` in the cell below
3. Re-run the dataset cell that follows it; the datasets will then load the pre-extracted clips instead of decoding videos

**Storage requirements:** ~27 GB for all clips (uint8 format)

In [None]:
# Data-loading mode switches, consumed by the BoxingDataset constructors below.
USE_PREPROCESSED = False  # True -> read pre-extracted clips (much faster!)
PREPROCESSED_DIR = "preprocessed_clips"  # Where the pre-extracted clips live

# Echo the active mode so it is visible in the cell output.
if not USE_PREPROCESSED:
    print("Using ON-DEMAND mode - decoding videos at runtime (slower)")
else:
    print("Using PREPROCESSED mode - fast data loading from numpy arrays")
    print(f"Looking for clips in: {PREPROCESSED_DIR}")

In [None]:
# Arguments shared by every split; only `split` differs between the datasets.
_shared_dataset_kwargs = dict(
    dataset_dir=DATASET_DIR,
    image_processor=image_processor,
    use_preprocessed=USE_PREPROCESSED,
    preprocessed_dir=PREPROCESSED_DIR,
)

# Build the three splits in the order train -> val -> test.
train_dataset, val_dataset, test_dataset = (
    BoxingDataset(split=split_name, **_shared_dataset_kwargs)
    for split_name in ("train", "val", "test")
)

In [6]:

# Reference recipe (FACTS): batch_size=4, grad_accum=2, warmup_ratio=0.1, epochs=10.
# This notebook keeps grad_accum / warmup / epochs but uses a larger per-device
# batch, so the effective batch size differs from the reference (see below).
# Learning rate is not rendered in the HTML; start with 1e-4 and tune around it.
training_args = TrainingArguments(
    output_dir="./facts-boxing-videomae",
    # Evaluate and checkpoint on the same 500-step cadence; keep only the two
    # most recent checkpoints on disk.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # effective train batch size: 16 * 2 = 32 per device
    warmup_ratio=0.1,
    learning_rate=1e-4,
    weight_decay=0.05,
    # Mixed precision only when a GPU is present: the notebook explicitly supports
    # a CPU fallback, and fp16=True is rejected on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    # After training, reload the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="wandb",  # alternatives: "tensorboard" or "none"
    # Data-loader settings: worker processes, pinned host memory, and batches
    # prefetched per worker.
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2,
)

# Collator that assembles individual dataset items into a batch.
# NOTE(review): VideoDataCollator is presumably defined in train.py — not visible here.
data_collator = VideoDataCollator()

# Validation split is used for the periodic evaluations during training;
# the test split is evaluated separately in the last cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:

# Train
# Runs fine-tuning with the `trainer` configured above (training on
# `train_dataset`, periodic evaluation on `val_dataset`). The call's return
# value is the cell's last expression, so it is displayed as the cell output.
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mnkosik11[0m ([33mnkosik11-hobby[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


In [None]:

# Run the evaluation loop once on the test split and print the resulting metrics.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_metrics)
