# Step 1: Install Required Libraries

In [None]:
! pip install torch torchvision moviepy opencv-python-headless streamlit scikit-learn


# Step 2: Prepare Your Custom Dataset

Organize your dataset:

Create a root `dataset/` directory that contains one subdirectory per action class (e.g., `running`, `walking`).
Place each class's video files inside its subdirectory; the subdirectory name is used as the class label.


#### example
```
dataset/
├── running/
│   ├── video1.mp4
│   └── video2.mp4
└── walking/
    ├── video1.mp4
    └── video2.mp4
```







Split Dataset into Training and Validation:

Use a ratio of 80% for training and 20% for validation.

# Step 3: Write Dataset Class to Load Videos

Create a custom VideoDataset class that will load video frames, apply transformations, and return the data as tensors for training.

In [1]:
import os
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class VideoDataset(Dataset):
    """Fixed-length video clips stored as ``root_dir/<class_name>/<video file>``.

    Each item is a ``(clip, label)`` pair. ``clip`` is a float32 tensor of shape
    ``(C, T, H, W)`` with ``T == frames_per_clip`` and ``H == W == 112``; pixel
    values are scaled to ``[0, 1]`` before ``transform`` is applied.

    Args:
        root_dir: Directory containing one sub-directory per action class.
        label_map: Mapping from class directory name to integer label. A class
            directory that holds videos but is missing from this mapping
            raises ``KeyError``.
        frames_per_clip: Number of frames returned per video. Only the first
            ``frames_per_clip`` frames are read; shorter videos are padded by
            repeating their last frame so every clip has the same length.
        transform: Optional callable applied to every ``(C, H, W)`` frame.

    NOTE(review): OpenCV decodes frames in BGR channel order and no BGR->RGB
    conversion is done here. Inference must use the same order as training.
    """

    # Spatial size (width, height) every frame is resized to.
    FRAME_SIZE = (112, 112)

    def __init__(self, root_dir, label_map, frames_per_clip=32, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.label_map = label_map
        self.videos = self._get_video_paths()

    def _get_video_paths(self):
        """Return a sorted list of ``(video_path, label)`` pairs found under ``root_dir``."""
        videos = []
        # Sorting makes sample indices (and therefore any index-based split)
        # independent of the filesystem's listing order.
        for class_name in sorted(os.listdir(self.root_dir)):
            class_dir = os.path.join(self.root_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            for file_name in sorted(os.listdir(class_dir)):
                video_path = os.path.join(class_dir, file_name)
                # Skip hidden files (e.g. .DS_Store) and nested directories:
                # they are not videos and would fail at decode time.
                if file_name.startswith(".") or not os.path.isfile(video_path):
                    continue
                videos.append((video_path, self.label_map[class_name]))
        return videos

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        video_path, label = self.videos[idx]
        frames = self._extract_frames(video_path)
        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        # Stack the per-frame (C, H, W) tensors along a new time axis.
        clip = torch.stack(frames, dim=1)  # (C, T, H, W)
        return clip, label

    def _extract_frames(self, video_path):
        """Decode exactly ``frames_per_clip`` frames as ``(C, H, W)`` tensors in ``[0, 1]``.

        Raises:
            RuntimeError: If no frame at all could be decoded from ``video_path``.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while len(frames) < self.frames_per_clip:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.resize(frame, self.FRAME_SIZE)  # Resize frames to match model input
                # (H, W, C) uint8 -> (C, H, W) float32 scaled to [0, 1].
                frames.append(torch.tensor(frame, dtype=torch.float32).permute(2, 0, 1) / 255.0)
        finally:
            # Always free the decoder handle, even if resize/conversion raised.
            cap.release()

        if not frames:
            raise RuntimeError(f"Could not decode any frames from {video_path!r}")

        # Pad short videos with their last frame so clips can be batched together.
        while len(frames) < self.frames_per_clip:
            frames.append(frames[-1].clone())
        return frames


# Step 4: Define Data Loaders
Split the data into training and validation sets and create data loaders.

In [7]:
from sklearn.model_selection import train_test_split

def get_loaders(root_dir, batch_size=4):
    """Build training and validation loaders from a class-per-folder video directory.

    Args:
        root_dir: Directory containing one sub-directory per action class.
        batch_size: Number of clips per batch for both loaders.

    Returns:
        ``(train_loader, val_loader, label_map)`` where ``label_map`` maps each
        class directory name to its integer label.
    """
    # Only directories are classes, and sorting makes the name -> index mapping
    # reproducible across runs and machines (os.listdir order is arbitrary).
    class_names = sorted(
        entry for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    )
    label_map = {class_name: i for i, class_name in enumerate(class_names)}
    dataset = VideoDataset(root_dir, label_map, transform=transforms.Normalize((0.5,), (0.5,)))

    # Split sample *indices* (80/20) rather than the dataset object itself:
    # handing the dataset to train_test_split makes it index every item, i.e.
    # decode every video into memory before training starts. Subset keeps
    # loading lazy, one clip per __getitem__ call.
    train_idx, val_idx = train_test_split(
        list(range(len(dataset))), test_size=0.2, random_state=42
    )
    train_data = torch.utils.data.Subset(dataset, train_idx)
    val_data = torch.utils.data.Subset(dataset, val_idx)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, label_map


# Step 5: Fine-tune the Pre-trained r2plus1d_18 Model
Modify the final layer to match the number of classes in your dataset.

In [8]:
import torch.nn as nn
import torch.optim as optim
from torchvision import models

def create_model(num_classes):
    """Return a pre-trained R(2+1)D-18 video model with a new classification head.

    Args:
        num_classes: Number of output classes for the replaced final layer.

    Returns:
        The model with ``model.fc`` replaced by a freshly initialised
        ``nn.Linear`` producing ``num_classes`` logits.
    """
    # Newer torchvision releases deprecate ``pretrained=True`` in favour of a
    # weights enum; use the enum when it exists and fall back otherwise so the
    # function keeps working on older installs.
    weights_enum = getattr(models.video, "R2Plus1D_18_Weights", None)
    if weights_enum is not None:
        model = models.video.r2plus1d_18(weights=weights_enum.DEFAULT)
    else:
        model = models.video.r2plus1d_18(pretrained=True)

    model.fc = nn.Linear(model.fc.in_features, num_classes)  # Update the last layer
    return model


# Step 6: Define Training Loop
Here’s the training loop where the model learns from the training data.

In [9]:
def train_model(model, train_loader, val_loader, device, epochs=10):
    """Fine-tune ``model`` with Adam and cross-entropy, validating after every epoch.

    Args:
        model: Network to optimise; it is moved to ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` batches passed to ``validate``.
        device: Device on which the model and batches are placed.
        epochs: Number of full passes over ``train_loader``.

    Prints the mean training loss per epoch; returns nothing.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=1e-4)

    model.to(device)

    for epoch_number in range(1, epochs + 1):
        # validate() switches to eval mode, so re-enable train mode each epoch.
        model.train()
        batch_losses = []
        for clips, targets in train_loader:
            clips = clips.to(device)
            targets = targets.to(device)

            adam.zero_grad()
            batch_loss = loss_fn(model(clips), targets)
            batch_loss.backward()
            adam.step()

            batch_losses.append(batch_loss.item())

        mean_loss = sum(batch_losses) / len(train_loader)
        print(f"Epoch {epoch_number}/{epochs}, Training Loss: {mean_loss}")

        validate(model, val_loader, device)

def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and report loss and accuracy.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        val_loader: Iterable of ``(inputs, labels)`` batches.
        device: Device on which the batches are placed.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean per-batch
        cross-entropy and ``accuracy`` is a percentage in ``[0, 100]``. Both
        are ``nan`` when the loader yields no samples.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    val_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            num_batches += 1

            # Index of the largest logit is the predicted class.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # An empty validation split would otherwise divide by zero.
        print("Validation skipped: the validation loader produced no samples.")
        return float("nan"), float("nan")

    avg_loss = val_loss / num_batches
    accuracy = 100 * correct / total
    print(f"Validation Loss: {avg_loss}, Accuracy: {accuracy}%")
    return avg_loss, accuracy


# Step 7: Train the Model
Initialize the model, load the data, and start training.

In [10]:
# Seed the global RNG so the new classifier head's initialisation and the
# training shuffle order are repeatable between runs.
torch.manual_seed(42)

# NOTE(review): machine-specific absolute path; point this at your own dataset root.
DATASET_DIR = '/home/alluvium/Desktop/Video_classification/dataset'
MODEL_PATH = 'action_recognition_model.pth'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader, val_loader, label_map = get_loaders(DATASET_DIR)
model = create_model(num_classes=len(label_map))

train_model(model, train_loader, val_loader, device, epochs=10)
# Save only the weights (state_dict), not the pickled model object.
torch.save(model.state_dict(), MODEL_PATH)


Downloading: "https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth" to /home/alluvium/.cache/torch/hub/checkpoints/r2plus1d_18-91a641e6.pth
100%|██████████| 120M/120M [00:04<00:00, 30.3MB/s] 


Epoch 1/10, Training Loss: 0.4484711244895861
Validation Loss: 0.11833928966079839, Accuracy: 96.15384615384616%
Epoch 2/10, Training Loss: 0.21936393480796318
Validation Loss: 0.06307444193516858, Accuracy: 98.71794871794872%
Epoch 3/10, Training Loss: 0.23227855880055334
Validation Loss: 0.07171267224475741, Accuracy: 98.71794871794872%
Epoch 4/10, Training Loss: 0.23210832130696094
Validation Loss: 0.09051896380260586, Accuracy: 98.71794871794872%
Epoch 5/10, Training Loss: 0.11675865398478005
Validation Loss: 0.070965994335711, Accuracy: 98.71794871794872%
Epoch 6/10, Training Loss: 0.12450954276923236
Validation Loss: 0.1131224851065781, Accuracy: 96.15384615384616%
Epoch 7/10, Training Loss: 0.06309299194253981
Validation Loss: 0.07285855153459124, Accuracy: 98.71794871794872%
Epoch 8/10, Training Loss: 0.021976920212135202
Validation Loss: 0.062271978156059046, Accuracy: 98.71794871794872%
Epoch 9/10, Training Loss: 0.13053429370002168
Validation Loss: 0.06104815102880821, Accur

# Step 8: Test the Model on a Video File
Use the trained model to make predictions on a new video.

In [23]:
import cv2

def load_video_frames(video_path, frames_per_clip=32, mean=0.5, std=0.5):
    """Load the first frames of a video as a normalised model input batch.

    Args:
        video_path: Path of the video file to decode.
        frames_per_clip: Number of frames in the returned clip. Shorter videos
            are padded by repeating their last frame.
        mean: Per-channel mean (scalar or sequence) subtracted after scaling
            to ``[0, 1]``. The default matches the ``Normalize((0.5,), (0.5,))``
            transform used when building the training loaders.
        std: Per-channel standard deviation (scalar or sequence) divided out
            after subtracting ``mean``.

    Returns:
        Float32 tensor of shape ``(1, C, T, H, W)`` with ``H == W == 112``.

    Raises:
        RuntimeError: If no frame could be decoded from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    try:
        while len(frames) < frames_per_clip:
            ret, frame = cap.read()
            if not ret:
                break

            # Resize the frame to match the model's input size (112x112)
            frame = cv2.resize(frame, (112, 112))

            # Convert frame to tensor and scale to [0, 1]
            frame = torch.tensor(frame, dtype=torch.float32).permute(2, 0, 1) / 255.0  # (C, H, W)
            frames.append(frame)
    finally:
        # Always free the decoder handle, even if resize/conversion raised.
        cap.release()

    if not frames:
        # Without this guard the padding below fails with an opaque IndexError.
        raise RuntimeError(f"Could not decode any frames from {video_path!r}")

    # If not enough frames, pad with the last frame
    while len(frames) < frames_per_clip:
        frames.append(frames[-1].clone())

    # Stack frames along the time dimension
    video_tensor = torch.stack(frames, dim=1)  # (C, T, H, W)

    # Apply the same normalisation as training so the model sees the same
    # input range at inference time. Reshape so scalars and per-channel
    # sequences both broadcast over (C, T, H, W).
    mean_t = torch.as_tensor(mean, dtype=torch.float32).reshape(-1, 1, 1, 1)
    std_t = torch.as_tensor(std, dtype=torch.float32).reshape(-1, 1, 1, 1)
    video_tensor = (video_tensor - mean_t) / std_t

    return video_tensor.unsqueeze(0)  # (1, C, T, H, W)


def predict_on_video(model, video_path, device, label_map, threshold=0.5):
    """Predict the action in a single video and print the result.

    Args:
        model: Trained classifier; it is moved to ``device`` and set to eval mode.
        video_path: Path of the video to classify.
        device: Device on which inference runs.
        label_map: Mapping from class name to integer label, as used in training.
        threshold: Minimum softmax confidence required to report a class.

    Returns:
        ``(class_name, confidence)``. ``class_name`` is ``None`` when the
        confidence is below ``threshold`` (printed as "Unknown").

    Raises:
        ValueError: If the predicted index has no entry in ``label_map``.
    """
    # Inputs are moved to ``device`` below, so the model must live there too.
    model.to(device)
    model.eval()

    # Load the video frames
    inputs = load_video_frames(video_path).to(device)

    # Invert name -> index once; setdefault keeps the first name if two share an index.
    index_to_name = {}
    for name, index in label_map.items():
        index_to_name.setdefault(index, name)

    with torch.no_grad():
        outputs = model(inputs)
        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    confidence_value = confidence.item()
    predicted_index = predicted.item()
    if predicted_index not in index_to_name:
        raise ValueError(f"Predicted class index {predicted_index} is not in label_map")
    class_name = index_to_name[predicted_index]

    if confidence_value < threshold:
        print(f"Predicted Action: Unknown, Confidence: {confidence_value:.2f}")
        return None, confidence_value

    print(f"Predicted Action: {class_name}, Confidence: {confidence_value:.2f}")
    return class_name, confidence_value

# Load the trained weights and classify a sample video.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds; it also silences torch.load's FutureWarning.
model.load_state_dict(torch.load('action_recognition_model.pth', map_location=device, weights_only=True))
vid_path = 'dataset/JavelineThrow/v_JavelinThrow_g02_c04.avi'
# The function prints its result; the trailing ';' keeps the notebook from also
# echoing any return value as cell output.
predict_on_video(model, vid_path, device, label_map);


Predicted Action: ThrowDiscus, Confidence: 0.98


  model.load_state_dict(torch.load('action_recognition_model.pth', map_location=device))
