In [None]:
import torch
import os
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
from transformers import VideoMAEForPreTraining


In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()
# Look up the Hugging Face access token; this is None when the variable is unset.
HF_Token = os.environ.get("HuggingFace_TOKEN")
# Authenticate this session against the Hugging Face Hub.
login(token=HF_Token)

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the VideoMAE pretraining checkpoint and place it on the chosen device.
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base").to(device)

# Dump the module tree so the architecture can be inspected in the cell output.
print(model)

In [None]:
# Only the first 500 training samples are loaded to keep the run small.
dataset = load_dataset("MCG-NJU/MultiSports", split="train[:500]")

# Per-frame preprocessing that matches what the VideoMAE checkpoint expects:
#   * 224x224 spatial resolution,
#   * float tensors in [0, 1] (ToTensor),
#   * ImageNet mean/std normalization. VideoMAEForPreTraining un-normalizes its
#     input with these same statistics to build the reconstruction targets, so
#     feeding un-normalized frames would yield wrong targets.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_video(sample):
    """Convert one dataset sample into a VideoMAE-ready clip tensor.

    Args:
        sample: A dataset row whose ``"frame"`` entry is an iterable of frames
            accepted by the module-level ``transform`` (presumably PIL images —
            verify against the dataset schema).

    Returns:
        dict: ``{"video": tensor}`` where the tensor has shape ``(T, C, H, W)``
        (frames, channels, height, width). No batch dimension is added here;
        batching is left to the DataLoader, which then produces the
        ``(B, T, C, H, W)`` layout that VideoMAE's forward pass unpacks.
    """
    frames = [transform(frame) for frame in sample["frame"]]
    # Stacking per-frame (C, H, W) tensors already yields (T, C, H, W), which is
    # the per-sample layout the model expects — no permute or unsqueeze needed.
    return {"video": torch.stack(frames)}

# Apply the per-sample preprocessing. `map` stores its results as Arrow lists,
# so request torch formatting for the "video" column; otherwise the DataLoader
# would collate nested Python lists instead of tensors. Restricting the format
# to "video" also keeps non-tensor columns out of the batches.
dataset = dataset.map(preprocess_video).with_format("torch", columns=["video"])

In [None]:
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Fraction of spatial patches hidden from the encoder in every clip.
mask_ratio = 0.9


def make_tube_mask(batch_size, num_frames, config, mask_ratio, device):
    """Build the boolean patch mask that VideoMAEForPreTraining requires.

    The same random set of spatial patches is masked at every temporal position
    ("tube" masking), and every sample gets exactly the same number of masked
    positions, which the model needs in order to reshape the visible tokens.

    Args:
        batch_size: Number of clips in the batch.
        num_frames: Number of frames per clip.
        config: Model config providing ``image_size``, ``patch_size`` and
            ``tubelet_size``.
        mask_ratio: Fraction of spatial patches to mask, in ``[0, 1]``.
        device: Device on which to create the mask.

    Returns:
        torch.BoolTensor of shape ``(batch_size, seq_length)`` where
        ``seq_length = (num_frames // tubelet_size) * patches_per_frame`` and
        ``True`` marks a masked token.
    """
    patches_per_frame = (config.image_size // config.patch_size) ** 2
    temporal_steps = num_frames // config.tubelet_size
    num_masked = int(mask_ratio * patches_per_frame)
    # Ranking i.i.d. noise gives an independent random permutation per row;
    # keeping ranks below num_masked selects exactly num_masked patches.
    ranks = torch.rand(batch_size, patches_per_frame, device=device).argsort(dim=1)
    spatial_mask = ranks < num_masked
    # Tokens are ordered time-major, so repeat the spatial mask per time step.
    return spatial_mask.unsqueeze(1).expand(-1, temporal_steps, -1).reshape(batch_size, -1)


# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Assumes batch["video"] is a float tensor laid out (B, T, C, H, W),
        # the layout VideoMAE's forward pass unpacks — verify against the
        # preprocessing cell.
        video_frames = batch["video"].to(device)
        bool_masked_pos = make_tube_mask(
            video_frames.shape[0],
            video_frames.shape[1],
            model.config,
            mask_ratio,
            device,
        )

        optimizer.zero_grad()
        # The model has no `mask_ratio` argument; masking is specified through
        # an explicit boolean mask over the patch tokens.
        outputs = model(video_frames, bool_masked_pos=bool_masked_pos)
        loss = outputs.loss  # Masked patch reconstruction loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

# Save pretrained model
torch.save(model.state_dict(), "videoMAE_pretrained.pth")
print("Pretraining complete. Model saved.")