In [1]:
import os
import json
from dataclasses import dataclass
from typing import Dict, List, Any, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

import av  # pip install av
from transformers import (
    AutoImageProcessor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer,
)

import evaluate  # pip install evaluate

# Load environment variables via python-dotenv before anything else runs.
# NOTE(review): presumably this provides credentials for services used later
# (e.g. the wandb reporting configured for the Trainer) — confirm which keys are required.
from dotenv import load_dotenv
load_dotenv()  # its return value is what the notebook shows as this cell's output (True)


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Star import from the project's `train` module.
# NOTE(review): LABEL2ID, ID2LABEL, BoxingDataset, VideoDataCollator, WeightedLossTrainer
# and compute_metrics are used in later cells but are never defined in this notebook, so
# they presumably come from here; an explicit import list would make that dependency visible.
from train import *
import os  # already imported in the first cell; harmless duplicate


In [3]:
# Root directory of the Olympic Boxing dataset.
# NOTE(review): no cell in this notebook passes DATASET_DIR to BoxingDataset; confirm
# whether the `train` module actually reads it or whether this assignment has no effect.
DATASET_DIR = "Olympic Boxing Punch Classification Video Dataset"

# Pretrained VideoMAE base (self-supervised on K400)
model_name = "MCG-NJU/videomae-base"

# The checkpoint ships without a classification head, so from_pretrained creates
# `classifier.weight` / `classifier.bias` with random values. Seed torch beforehand so
# the head starts from the same values on every Restart & Run All.
# 42 is also the default `seed` of TrainingArguments.
SEED = 42
torch.manual_seed(SEED)

# Frame preprocessor that matches the checkpoint.
image_processor = AutoImageProcessor.from_pretrained(model_name)

# Backbone weights come from the checkpoint; the head is sized and labelled from the
# project's label maps.
model = VideoMAEForVideoClassification.from_pretrained(
    model_name,
    num_labels=len(LABEL2ID),
    label2id=LABEL2ID,
    id2label=ID2LABEL,
)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Move the model to the GPU when torch can see one, and report which device is in use.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    model.to("cuda")
print("Using CUDA" if cuda_ready else "Using CPU")

Using CUDA


In [5]:
# One BoxingDataset per split, constructed in train -> val -> test order.
train_dataset, val_dataset, test_dataset = (
    BoxingDataset(split=split_name) for split_name in ("train", "val", "test")
)

In [6]:

# FACTS used batch_size=4, grad_accum=2, warmup_ratio=0.1, epochs=10.
# The FACTS learning rate is not rendered in the HTML; start with 1e-4 and tune around it.

# An earlier cell allows a CPU fallback, but fp16 mixed precision is rejected by
# TrainingArguments without a supported accelerator, and pinned host memory only helps
# CPU->GPU copies. Gate both on CUDA so this cell also runs on a CPU-only machine.
use_cuda = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./facts-boxing-videomae",
    # Evaluate and checkpoint on the same 500-step cadence so that every evaluated
    # step has a checkpoint load_best_model_at_end can restore.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # effective train batch size 4 * 2 = 8 per device
    warmup_ratio=0.1,
    learning_rate=1e-4,
    weight_decay=0.05,
    fp16=use_cuda,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="wandb",  # alternatively "tensorboard"
    dataloader_num_workers=4,  # decode clips in parallel worker processes
    dataloader_pin_memory=use_cuda,  # faster CPU->GPU transfer
    dataloader_prefetch_factor=2,
)

data_collator = VideoDataCollator()

# The label of a training clip is the name of its parent directory
# (.../<class name>/<clip file>). os.path is used instead of splitting on "/" so the
# lookup does not depend on the path separator or on doubled slashes.
train_labels = [
    LABEL2ID[os.path.basename(os.path.dirname(path))]
    for path in BoxingDataset.train_paths
]

# "balanced" weights are inversely proportional to class frequency in the training
# split; they are handed to the loss through WeightedLossTrainer.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(LABEL2ID)),
    y=np.asarray(train_labels),
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)


In [7]:

# Run fine-tuning with the trainer configured in the previous cell. As the last
# expression of the cell, the returned TrainOutput is displayed under the progress table.
# NOTE(review): in the recorded run the logged training loss (~4.1 for the first 2500
# steps) is roughly double the validation loss (~1.9-2.0), and validation accuracy never
# exceeds 0.34. Worth checking how WeightedLossTrainer scales its loss under
# gradient accumulation — cannot be verified from this notebook.
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mnkosik11[0m ([33mnkosik11-hobby[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Macro F1,F1 Lhhp,Precision Lhhp,Recall Lhhp,F1 Rhhp,Precision Rhhp,Recall Rhhp,F1 Lhmp,Precision Lhmp,Recall Lhmp,F1 Rhmp,Precision Rhmp,Recall Rhmp,F1 Lhblp,Precision Lhblp,Recall Lhblp,F1 Rhblp,Precision Rhblp,Recall Rhblp,F1 Lhbp,Precision Lhbp,Recall Lhbp,F1 Rhbp,Precision Rhbp,Recall Rhbp
500,4.1042,1.887595,0.181619,0.038426,0.0,0.0,0.0,0.307407,0.181619,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,4.0993,1.962222,0.078775,0.018256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146045,0.078775,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1500,4.1419,1.913253,0.181619,0.038426,0.0,0.0,0.0,0.307407,0.181619,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000,4.1045,1.981488,0.33698,0.082309,0.545794,0.39782,0.869048,0.0,0.0,0.0,0.112676,0.115942,0.109589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2500,4.049,1.961707,0.19256,0.08218,0.375,0.419118,0.339286,0.150754,0.12931,0.180723,0.0,0.0,0.0,0.131687,0.080402,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3000,3.9417,2.160443,0.126915,0.053998,0.233216,0.286957,0.196429,0.079365,0.116279,0.060241,0.0,0.0,0.0,0.0,0.0,0.0,0.119403,0.06689,0.555556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3500,3.9293,2.264908,0.070022,0.031385,0.011429,0.142857,0.005952,0.190871,0.14557,0.277108,0.0,0.0,0.0,0.0,0.0,0.0,0.04878,0.027397,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4000,3.8367,2.659745,0.100656,0.05972,0.161616,0.533333,0.095238,0.205714,0.195652,0.216867,0.051948,0.5,0.027397,0.0,0.0,0.0,0.05848,0.03268,0.277778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4500,3.6801,2.736856,0.102845,0.055652,0.165049,0.447368,0.10119,0.220994,0.204082,0.240964,0.0,0.0,0.0,0.0,0.0,0.0,0.059172,0.033113,0.277778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


TrainOutput(global_step=4610, training_loss=4.0098468697769265, metrics={'train_runtime': 571.7787, 'train_samples_per_second': 64.413, 'train_steps_per_second': 8.063, 'total_flos': 4.589501448796766e+19, 'train_loss': 4.0098468697769265, 'epoch': 10.0})

In [8]:

# Held-out check: score the test split with the trainer and print the metrics dict
# that evaluate() returns.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_metrics)


Test metrics: {'eval_loss': 1.9666829109191895, 'eval_accuracy': 0.25910064239828695, 'eval_macro_f1': 0.08485708280790248, 'eval_f1_LHHP': 0.44808743169398907, 'eval_precision_LHHP': 0.41624365482233505, 'eval_recall_LHHP': 0.48520710059171596, 'eval_f1_RHHP': 0.0, 'eval_precision_RHHP': 0.0, 'eval_recall_RHHP': 0.0, 'eval_f1_LHMP': 0.23076923076923078, 'eval_precision_LHMP': 0.14772727272727273, 'eval_recall_LHMP': 0.527027027027027, 'eval_f1_RHMP': 0.0, 'eval_precision_RHMP': 0.0, 'eval_recall_RHMP': 0.0, 'eval_f1_LHBlP': 0.0, 'eval_precision_LHBlP': 0.0, 'eval_recall_LHBlP': 0.0, 'eval_f1_RHBlP': 0.0, 'eval_precision_RHBlP': 0.0, 'eval_recall_RHBlP': 0.0, 'eval_f1_LHBP': 0.0, 'eval_precision_LHBP': 0.0, 'eval_recall_LHBP': 0.0, 'eval_f1_RHBP': 0.0, 'eval_precision_RHBP': 0.0, 'eval_recall_RHBP': 0.0, 'eval_runtime': 4.8455, 'eval_samples_per_second': 96.377, 'eval_steps_per_second': 12.176, 'epoch': 10.0}
