In [None]:
!pip install opencv-python torch torchvision tqdm


Load Dataset

!pip install datasets

from datasets import load_dataset

# Load the RWF-2000 dataset by its Hugging Face Hub identifier. The returned
# object is indexed by split name further down (ds["train"]).
ds = load_dataset("DanJoshua/RWF-2000")

# Print the loaded object for a quick look at what was returned.
print(ds)

Convert raw AVI bytes to frames

!pip install av

import io
import av
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Utility: Convert AVI bytes -> frames
def avi_bytes_to_frames(avi_bytes, max_frames=32, every_n=5):
    """Decode an in-memory video and return a sub-sampled stack of frames.

    Args:
        avi_bytes: Raw bytes of a video file that PyAV can open.
        max_frames: Decoding stops as soon as this many frames have been
            collected.
        every_n: Keep one frame out of every ``every_n`` decoded frames,
            starting with the first one (index 0).

    Returns:
        The kept frames stacked along a new leading axis, i.e. an array of
        shape ``(T, H, W, channels)``. If no frame could be collected, a
        single all-zero ``uint8`` frame of shape ``(1, 112, 112, 3)`` is
        returned so callers always receive at least one frame.
    """
    frames = []
    # Open the container in a ``with`` block so the demuxer/decoder are
    # released even when we stop early or decoding raises; the original
    # code never closed it and leaked one container per clip.
    with av.open(io.BytesIO(avi_bytes)) as container:
        for i, frame in enumerate(container.decode(video=0)):
            if i % every_n == 0:
                frames.append(np.array(frame.to_image()))
            if len(frames) >= max_frames:
                break
    if not frames:
        # Placeholder for empty/undecodable streams.
        return np.zeros((1, 112, 112, 3), dtype=np.uint8)
    return np.stack(frames)

# Utility: Resize frames to fixed size
def resize_frames(frames, size=(112,112)):
    """Resize every frame to ``size`` and return them as one stacked array.

    Args:
        frames: Iterable of image arrays (e.g. the ``(T, H, W, 3)`` array
            produced by ``avi_bytes_to_frames``).
        size: Target size handed straight to ``cv2.resize`` as its ``dsize``
            argument.

    Returns:
        The resized frames stacked along a new leading axis.
    """
    scaled = []
    for frame in frames:
        scaled.append(cv2.resize(frame, size))
    return np.stack(scaled)

# PyTorch Dataset
class RWF2000Dataset(Dataset):
    """Map-style dataset that turns RWF-2000 records into frame tensors.

    Each record is expected to provide the raw video bytes under ``"avi"``
    and its source location under ``"__url__"``.

    NOTE(review): the number of frames per clip is not padded here. Clips
    that decode to different frame counts (including the 1-frame placeholder
    returned for undecodable videos) cannot be stacked by the default
    ``DataLoader`` collate function when ``batch_size > 1``.

    Args:
        hf_dataset: Indexable collection of records (one split of the
            Hugging Face dataset).
        max_frames: Forwarded to ``avi_bytes_to_frames``.
        every_n: Forwarded to ``avi_bytes_to_frames``.
        frame_size: Forwarded to ``resize_frames`` as the target size.
    """

    def __init__(self, hf_dataset, max_frames=32, every_n=5, frame_size=(112,112)):
        self.dataset = hf_dataset
        self.max_frames = max_frames
        self.every_n = every_n
        self.frame_size = frame_size

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    @staticmethod
    def _label_from_url(url):
        """Return 1 for a violent clip and 0 otherwise, judged from ``url``."""
        # "Non-Violence", "NonViolence" and "non_violence" all contain the
        # substring "Violence", so a plain substring test would label every
        # such clip as violent. Rule the negative class out first.
        compact = url.lower().replace("-", "").replace("_", "").replace(" ", "")
        if "nonviolence" in compact:
            return 0
        return 1 if "Violence" in url else 0

    def __getitem__(self, idx):
        """Decode record ``idx`` and return ``(frames, label)``.

        Returns:
            frames: float tensor of shape ``(T, C, H, W)``; pixel values are
                converted to float but not rescaled.
            label: scalar ``int64`` tensor, 0 = Non-Violence, 1 = Violence.
        """
        item = self.dataset[idx]
        frames = avi_bytes_to_frames(item["avi"], self.max_frames, self.every_n)
        frames = resize_frames(frames, self.frame_size)
        # Infer label: 0 = Non-Violence, 1 = Violence (based on the source URL)
        label = self._label_from_url(item["__url__"])

        # Convert to tensor: (T, H, W, C) -> (T, C, H, W)
        frames_tensor = torch.from_numpy(frames).permute(0,3,1,2).float()
        return frames_tensor, torch.tensor(label).long()

# Instantiate Dataset + DataLoader
train_dataset = RWF2000Dataset(
    hf_dataset=ds["train"],
    max_frames=32,
    every_n=5,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)


# Verify batch shapes: pull one batch, report its shapes, then stop.
for X, y in train_loader:
    print("Video batch shape:", X.shape)  # (B, T, C, H, W)
    print("Labels shape:", y.shape)       # (B,)
    break

CNN + LSTM model

import torch.nn as nn
import torch.nn.functional as F

class CNNEncoder(nn.Module):
    """Small per-frame CNN that maps an RGB image to a feature vector.

    Three ``3x3 conv -> ReLU -> 2x2 max-pool`` stages are followed by an
    adaptive average pool to a 14x14 grid and a linear projection. For the
    default 112x112 input the feature map is already 14x14 after the third
    max-pool, so the adaptive pool is an identity there. It only exists so
    that other input resolutions (at least 8 pixels per side, so the three
    max-pools leave a non-empty map) also fit the fixed-size linear layer.
    The adaptive pool has no parameters, so state_dict keys are unchanged.

    Args:
        out_features: Size of the feature vector produced per image.
    """

    def __init__(self, out_features=256):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # Forces a 14x14 map regardless of the input resolution.
        self.adapt = nn.AdaptiveAvgPool2d((14, 14))
        self.fc = nn.Linear(64*14*14, out_features)

    def forward(self, x):
        """Encode a batch of images.

        Args:
            x: Tensor of shape ``(B, 3, H, W)``.

        Returns:
            Tensor of shape ``(B, out_features)``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))  # (B, 64, H/8, W/8)
        x = self.adapt(x)                     # (B, 64, 14, 14)
        x = x.reshape(x.size(0), -1)          # flatten
        return self.fc(x)                     # (B, out_features)

class CNN_LSTM(nn.Module):
    """Video classifier: per-frame CNN features fed through a one-layer LSTM.

    Args:
        cnn_out: Size of the per-frame feature vector (LSTM input size).
        lstm_hidden: Hidden size of the LSTM.
        num_classes: Number of output logits.
    """

    def __init__(self, cnn_out=256, lstm_hidden=128, num_classes=2):
        super().__init__()
        self.cnn = CNNEncoder(out_features=cnn_out)
        self.lstm = nn.LSTM(
            input_size=cnn_out,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
        )
        self.fc = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape ``(B, T, C, H, W)``.

        Returns:
            Logits of shape ``(B, num_classes)``.
        """
        batch, steps, channels, height, width = x.shape
        # Merge batch and time so the CNN sees a plain batch of images.
        per_frame = self.cnn(x.reshape(batch * steps, channels, height, width))
        # Split them again into one feature sequence per clip.
        sequence = per_frame.view(batch, steps, -1)
        # Only the final hidden state of the (single) layer is classified.
        _, (final_hidden, _) = self.lstm(sequence)
        return self.fc(final_hidden[-1])

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Model, loss and optimiser used for training.
model = CNN_LSTM().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}")

# Put the model in evaluation mode before running inference.
model.eval()

# NOTE: this sample comes from the training split, so the prediction below is
# a smoke test of the pipeline, not a measure of generalisation.
video_bytes = ds['train'][0]['avi']

# Preprocess it the same way as during training
def preprocess_video_bytes(avi_bytes, max_frames=32, every_n=5, size=(112, 112)):
    """Turn raw video bytes into a single-clip batch for the model.

    The defaults match the values used to build ``train_dataset``; pass other
    values when the dataset was created with a different configuration, so
    that inference input matches training input.

    Args:
        avi_bytes: Raw bytes of the video file.
        max_frames: Forwarded to ``avi_bytes_to_frames``.
        every_n: Forwarded to ``avi_bytes_to_frames``.
        size: Target frame size forwarded to ``resize_frames``.

    Returns:
        Float tensor of shape ``(1, T, C, H, W)``.
    """
    frames = avi_bytes_to_frames(avi_bytes, max_frames=max_frames, every_n=every_n)
    frames = resize_frames(frames, size=size)
    # (T, H, W, C) -> (T, C, H, W), then add the batch dimension.
    frames_tensor = torch.from_numpy(frames).permute(0,3,1,2).float().unsqueeze(0)
    return frames_tensor

X = preprocess_video_bytes(video_bytes).to(device)

# No gradients are needed for a forward-only prediction.
with torch.no_grad():
    pred = torch.argmax(model(X), dim=1).item()

print("Prediction:", "Violence" if pred==1 else "Non-Violence")