# Dependencies

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
from torchvision.models.video import r3d_18
import cv2
import json
from tqdm import tqdm

# Check GPU Availability

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load UCF-101 Dataset

In [None]:
# JSON file that is parsed into the per-video annotation records.
annotations_file = "ucf-101/annotations.json"
# Directory that each record's "video_file" entry is joined onto.
dataset_dir = "ucf-101/UCF-101"

# Load Annotations

In [None]:
# Parse the whole annotation file up front; the dataset indexes into it later.
with open(annotations_file, mode="r", encoding="utf-8") as fh:
    annotations = json.loads(fh.read())
print(f"Total videos loaded: {len(annotations)}")

# Define Video Dataset Class

In [None]:
class UCF101Dataset(Dataset):
    """Map-style dataset that serves fixed-length frame clips from video files.

    Every record in ``annotations`` is read through the keys ``"video_file"``
    (joined onto ``dataset_dir``) and ``"label"``.

    Args:
        annotations: Indexable collection of per-video records.
        dataset_dir: Directory the ``"video_file"`` entries are relative to.
        transform: Optional callable applied to each frame. It receives a
            ``uint8`` tensor in CHW layout and must return a tensor.
    """

    def __init__(self, annotations, dataset_dir, transform=None):
        self.annotations = annotations
        self.dataset_dir = dataset_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotated videos."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            A dict with ``"frames"`` (tensor laid out as (C, T, H, W)) and
            ``"label"`` (the record's label, passed through unchanged).

        Raises:
            RuntimeError: If no frame of the video could be decoded.
        """
        sample = self.annotations[idx]
        video_path = os.path.join(self.dataset_dir, sample["video_file"])
        frames = self.load_video(video_path)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        # Stacking gives (T, C, H, W); 3D-conv video models such as
        # torchvision's r3d_18 expect (C, T, H, W) per sample, so swap the
        # first two axes before the DataLoader adds the batch dimension.
        clip = torch.stack(frames).permute(1, 0, 2, 3).contiguous()
        return {
            "frames": clip,
            "label": sample["label"],
        }

    @staticmethod
    def _sample_frame_indices(total_frames, frame_count):
        """Return ``frame_count`` evenly spaced indices in ``[0, total_frames)``.

        Indices repeat when the video has fewer frames than requested, so the
        result always has exactly ``frame_count`` entries (or none when either
        argument is not positive).
        """
        if total_frames <= 0 or frame_count <= 0:
            return []
        return torch.linspace(0, total_frames - 1, steps=frame_count).long().tolist()

    def load_video(self, video_path, frame_count=16):
        """Decode ``frame_count`` evenly spaced RGB frames from a video.

        Args:
            video_path: Path handed to ``cv2.VideoCapture``.
            frame_count: Number of frames to return.

        Returns:
            A list of ``frame_count`` ``uint8`` tensors in CHW layout. Frames
            are repeated when the video is shorter than ``frame_count`` or
            when decoding stops before the reported frame count is reached.

        Raises:
            RuntimeError: If not a single frame could be decoded.
        """
        if frame_count <= 0:
            return []

        cap = cv2.VideoCapture(video_path)
        decoded = {}
        indices = []
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            indices = self._sample_frame_indices(total_frames, frame_count)
            wanted = set(indices)  # O(1) membership instead of a tensor scan
            last_wanted = indices[-1] if indices else -1

            for i in range(last_wanted + 1):
                ret, frame = cap.read()
                if not ret:
                    # The container may report more frames than can be read.
                    break
                if i in wanted:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    # HWC -> CHW
                    decoded[i] = torch.from_numpy(frame).permute(2, 0, 1)
        finally:
            # Release the capture even if decoding raised.
            cap.release()

        if not decoded:
            raise RuntimeError(f"Could not decode any frames from {video_path!r}")

        # Indices are non-decreasing and index 0 is always decoded first, so
        # any index that was not reached is padded with the last good frame.
        frames = []
        last_good = None
        for i in indices:
            last_good = decoded.get(i, last_good)
            frames.append(last_good)
        return frames

# Data Transformation

In [None]:
# Per-frame preprocessing. The dataset hands each frame over as a uint8 CHW
# torch.Tensor, so transforms.ToTensor() (which only accepts PIL images or
# numpy arrays) would raise a TypeError here; ConvertImageDtype performs the
# same uint8 -> float [0, 1] rescaling on tensors.
video_transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.ConvertImageDtype(torch.float),
    # Kinetics-400 statistics documented for torchvision's pretrained r3d_18.
    transforms.Normalize(
        mean=[0.43216, 0.394666, 0.37645],
        std=[0.22803, 0.22145, 0.216989],
    ),
])

# Create Dataset

In [None]:
# Wrap the annotations in a dataset and hold out 20% of it for validation.
dataset = UCF101Dataset(
    annotations=annotations,
    dataset_dir=dataset_dir,
    transform=video_transform,
)
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size  # remainder, so the two sizes always sum up
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Dataloaders

In [None]:
batch_size = 4
# Only the training split is reshuffled every epoch; validation keeps its order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)


# Define model

In [None]:
# Load the pretrained backbone and replace its classification head.
model = r3d_18(pretrained=True)
# NOTE(review): assumes labels are integer class ids covering 0..num_classes-1;
# confirm against the annotation file.
num_classes = len({sample["label"] for sample in annotations})
model.fc = nn.Linear(model.fc.in_features, num_classes)
# Move to the device only AFTER swapping the head; otherwise the freshly
# created nn.Linear stays on the CPU and the forward pass fails on GPU.
model = model.to(device)

# Training Setup

In [None]:
# Loss, optimizer, and a step schedule that scales the learning rate by 0.7
# every 3 scheduler steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.7)

# Training Loop

In [None]:
# Fine-tune for a fixed number of epochs, checkpointing after each one.
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        frames = batch["frames"].to(device)
        # The default collate function may already have turned the labels into
        # a tensor; as_tensor avoids the copy-construct warning torch.tensor()
        # emits in that case and still converts a plain list of ints.
        labels = torch.as_tensor(batch["label"], device=device)

        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # The schedule is stepped once per epoch, after all optimizer steps.
    scheduler.step()
    # Guard against an empty loader so the epoch summary cannot divide by zero.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss}")
    # Each epoch overwrites the same checkpoint file.
    torch.save(model.state_dict(), "ucf101_checkpoint.pth")

# Evaluation

In [None]:
# Measure mean batch loss and top-1 accuracy on the validation split.
total_loss, correct, total = 0.0, 0, 0
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Evaluating"):
        frames = batch["frames"].to(device)
        # as_tensor accepts both an already-collated tensor and a plain list
        # without the copy-construct warning torch.tensor() would emit.
        labels = torch.as_tensor(batch["label"], device=device)

        outputs = model(frames)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# An empty validation split would otherwise raise ZeroDivisionError here.
accuracy = correct / total * 100 if total else 0.0
mean_val_loss = total_loss / max(len(val_dataloader), 1)
print(f"Validation Loss: {mean_val_loss:.4f}, Accuracy: {accuracy:.2f}%")