In [2]:
import torch
torch.cuda.is_available()

True

## Using yolov7 to obtain keypoints

In [2]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import torch
import time
from torch.utils.data import Dataset, DataLoader


from torchvision import transforms
from PIL import Image

In [3]:
# Change directory to "/yolov7"
os.chdir("yolov7")
print(os.getcwd()) 

/notebooks/Fighting-detection-in-CCTVs/yolov7


In [4]:
from utils.datasets import letterbox
from utils.general import non_max_suppression_kpt
from utils.plots import output_to_keypoint, plot_skeleton_kpts

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

###############################################################################

# Load yolov7 pose detector model
print("Loading model: ", "yolov7-pose...")
weights_path = "yolov7-w6-pose.pt"
model_yolov7 = torch.load(weights_path, map_location=device, weights_only=False)['model']
print("Model loaded.")

model_yolov7.float().eval()

if torch.cuda.is_available():
    # half() turns predictions into float16 tensors --> significantly lowers inference time
    model_yolov7.half().to(device)

Loading model:  yolov7-pose...
Model loaded.


In [5]:
# Main inference
def infer(image):
    image = letterbox(image, 960, 
                      stride=64,
                      auto=True)[0]  # shape: (567, 960, 3)
    
    image = transforms.ToTensor()(image)  # torch.Size([3, 567, 960])

    if torch.cuda.is_available():
        image = image.half().to(device)

    image = image.unsqueeze(0)  # torch.Size([1, 3, 567, 960])

    with torch.no_grad():
        output, _ = model_yolov7(image)

    return output, image

###############################################################################
# Draw YOLOv7 pose keypoints and optionally return keypoints for saving.
def draw_keypoints(output, image, confidence=0.25, threshold=0.65, return_kpts=False, background_colour=(255, 255, 255)):
    output = non_max_suppression_kpt(
        output,
        confidence,
        threshold,
        nc=model_yolov7.yaml['nc'],
        nkpt=model_yolov7.yaml['nkpt'],
        kpt_label=True)

    with torch.no_grad():
        output = output_to_keypoint(output)  # shape: (num_people, 51)

    nimg = image[0].permute(1, 2, 0) * 255
    nimg = cv2.cvtColor(nimg.cpu().numpy().astype(np.uint8), cv2.COLOR_RGB2BGR)
    
    # Create a blank image with the specified background colour
    # nimg = np.full((image.shape[2], image.shape[3], 3), background_colour, dtype=np.uint8)

    # Correctly loop through 'output' variable instead of 'kpts'
    for idx in range(output.shape[0]):
        plot_skeleton_kpts(nimg, output[idx, 7:].T, 3)

    if return_kpts:
        return nimg, output  # (image with keypoints drawn, raw keypoints)

    return nimg


## Create custom dataloader
- includes pre-processing via yolov7 pose estimator

In [13]:

transform = transforms.Compose([
    # Convert frames to PyTorch tensors
    transforms.ToTensor(), 
    # Resize all video frames to 480, 640 (480p)
    transforms.Resize((480,640)), 
    # Convert to grayscale
    transforms.Grayscale(num_output_channels=1), 
])

class VideoDataset(Dataset):
    def __init__(self, video_files, video_folder, transform=transform):
        self.video_folder = video_folder
        self.video_files = video_files
        self.transform = transform
        self.data = self._load_data()

    
    
    def _load_data(self):
        data = []
        count = 5
        
        for video_file in self.video_files:
            video_path = os.path.join(self.video_folder, video_file)
            # If the video name is "nofixxx.mp4", this means no-fight --> 0
            # If "NV_xx.mp4" means no-fight
            if "nofi" in video_file or "NV" in video_file:
                label = 0
            else:
                label = 1
                
            # Check if we extract the labels correctly
            # if count >= 0:
            #     print(f'video path: {video_path}')
            #     print(f'label: {label}')
            #     count -= 1
                                      
            capture = cv2.VideoCapture(video_path)
            fps = int(capture.get(cv2.CAP_PROP_FPS))
            total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
             
            for idx in range(0, total_frames, fps): # Sample one frame per second
                capture.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = capture.read()
                if ret:
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    keypoints, image = infer(frame_rgb)
                    pose_image = draw_keypoints(keypoints, image, return_kpts=False)
                    
                    # Check the outputs given by yolov7 output
                    # if count >= 0:
                    #     plt.figure(figsize=(30, 7))
                    #     plt.axis("off")
                    #     plt.imshow(pose_image)
                    #     plt.savefig(f"output{count}.jpg") # Save output if needed
                    
                    data.append((pose_image, label))
            capture.release()
        return data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pose_image, label = self.data[idx]
        if self.transform:
            pose_image = self.transform(pose_image)
            
            # Check the outputs given by yolov7 output witih the added transformations
            # if count >= 0:
            #     plt.figure(figsize=(30, 7))
            #     plt.axis("off")
            #     plt.imshow(pose_image)
            #     plt.savefig(f"output{count}.jpg") # Save output if needed
                    
        return pose_image, label


### Prepare video paths and labels

In [14]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

print("Preparing dataloaders...")

video_folder = "../fight-detection-4"
video_files = [f for f in os.listdir(video_folder) if f.endswith((".mp4", ".avi", ".mov", ".mpeg"))]

###############################################################################

print("Splitting data...")
# Split dataset such that 20% of data is used for testing, remaining 80% for training + validation
train_files, test_files = train_test_split(video_files, test_size=0.2, random_state=42)
# Split dataset such that 60% of data is used for training, remaining 20% for validation
train_files, val_files = train_test_split(train_files, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2

###############################################################################

train_dataset = VideoDataset(train_files, video_folder)
print("Train dataset compiled...")
val_dataset = VideoDataset(val_files, video_folder)
print("Validation dataset compiled...")
test_dataset = VideoDataset(test_files, video_folder)
print("Test dataset compiled...")

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
print("Train dataloader compiled...")
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
print("Validation dataloader compiled...")
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
print("Test dataloader compiled...")



Preparing dataloaders...
Splitting data...
Train dataset compiled...
Validation dataset compiled...
Test dataset compiled...
Train dataloader compiled...
Validation dataloader compiled...
Test dataloader compiled...


## Create instance of Feature extractor (Xception) + LSTM

In [4]:

# torch.cuda.empty_cache()

!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128


In [7]:
!nvidia-smi

Sat Apr 19 08:51:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro P6000                   Off |   00000000:00:05.0 Off |                  Off |
| 26%   33C    P0             59W /  250W |    1699MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
import torch
import torch.nn as nn
import timm

class CNN_LSTM(nn.Module):
    def __init__(self, cnn_model, hidden_size, num_layers):
        super(CNN_LSTM, self).__init__()
        self.cnn = cnn_model
        self.lstm = nn.LSTM(input_size=cnn_model.num_features, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        batch_size, seq_length, c, h, w = x.size()
        c_in = x.view(batch_size * seq_length, c, h, w)
        c_out = self.cnn(c_in)
        c_out = c_out.view(batch_size, seq_length, -1)
        lstm_out, _ = self.lstm(c_out)
        out = self.fc(lstm_out[:, -1, :])
        return torch.sigmoid(out)

# Example usage
cnn_model = timm.create_model('xception', pretrained=True)
cnn_model.fc = nn.Identity() # Remove the final classification layer
model_cnn_lstm = CNN_LSTM(cnn_model, hidden_size=128, num_layers=2)


torch.cuda.empty_cache()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_cnn_lstm.to(device)


# Example input for checking if models are loaded correctly
input_tensor = torch.randn(16, 10, 3, 224, 224) # (batch_size, seq_length, channels, height, width)
output = model_cnn_lstm(input_tensor.to(device))
print(f"Predicted result: {output}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 466.00 MiB. GPU 0 has a total capacty of 23.86 GiB of which 334.12 MiB is free. Process 682018 has 23.53 GiB memory in use. Of the allocated memory 23.02 GiB is allocated by PyTorch, and 347.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Train Xception + LSTM on fighting and non-fighting datasets

In [None]:
def validate(model, val_loader):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct, total = 0, 0
    criterion = nn.BCEWithLogitsLoss()  # Ensure the same loss function is used

    with torch.no_grad():  # Disable gradient computation
        for pose_img, labels in val_loader:
            pose_img = pose_img.repeat(1, 3, 1, 1) # Duplicate the grayscale channel to create a 3-channel image
            pose_img, labels = pose_img.to(device), labels.to(device).float()
            outputs = model(pose_img)
            loss = criterion(outputs, labels.unsqueeze(1))
            val_loss += loss.item() * pose_img.size(0)  # Accumulate the loss
            
            # Calculate accuracy
            predicted = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted.cpu() == labels.cpu().unsqueeze(1)).sum().item()

    val_loss /= len(val_loader.dataset)  # Compute the average loss
    val_accuracy = correct / total
    return val_loss, val_accuracy

num_epochs = 100

train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []


# Define loss function and optimiser
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
optimizer = torch.optim.Adam(model_resnet.fc.parameters(), lr=0.0001)

# count = 1


for epoch in range(num_epochs):
    model_cnn_lstm.train()
    
    running_loss = 0.0
    correct = 0
    total = 0

    for pose_img, labels in train_loader:
        pose_img = pose_img.repeat(1, 3, 1, 1) # Duplicate the grayscale channel to create a 3-channel image
        inputs, labels = pose_img.to(device), labels.to(device).float()

        optimizer.zero_grad()
        outputs = model_cnn_lstm(inputs)
        loss = criterion(outputs, labels.unsqueeze(1).float())
        loss.backward()
        
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        predicted = torch.round(torch.sigmoid(outputs))
        total += labels.size(0)
        correct += (predicted.cpu() == labels.cpu().unsqueeze(1)).sum().item()
        
        # if count >= 0:
        #     print(f"Predicted value: {predicted.cpu()}")
        #     print(f"Label: {labels.cpu().unsqueeze(1)}")
        #     count -= 1

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)

    # Validation step (assuming val_loader is defined)
    val_loss, val_accuracy = validate(model_cnn_lstm, val_loader)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    
    # Save model weights for every 5 epochs
    # if epoch % 5 == 0:
    #     torch.save(model_cnn_lstm.state_dict(), f'../model_epoch_2_{epoch+1}.pth')

## Save model weights

In [17]:
torch.save(model_cnn_lstm.state_dict(), f'../model_epoch_recent.pth')

## Show training-val loss and accuracy over epochs

In [None]:
%matplotlib inline

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs+1), train_losses, label='Training Loss')
plt.plot(range(1, num_epochs+1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

# Plot training and validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs+1), train_accuracies, label='Training Accuracy')
plt.plot(range(1, num_epochs+1), val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.show()


## Basic inference and testing - for one video file

In [None]:
# Load weights
model_cnn_lstm.load_state_dict(torch.load('../model_epoch_recent.pth'))
model_cnn_lstm.eval()

# Pre-process input --> extract frames
video_path = '../fight-detection-2/nofi096.mp4'

class Video(Dataset):
    def __init__(self, video_path, transform=transform):
        self.video_path = video_path
        self.transform = transform
        self.data = self._load_data()

    def _load_data(self):
        data = []
                                      
        capture = cv2.VideoCapture(self.video_path)
        fps = int(capture.get(cv2.CAP_PROP_FPS))
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

        # for idx in range(0, total_frames, fps): # use this to sample one frame per second
        for idx in range(0, total_frames, fps): # use this to sample all frames in the video
            capture.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = capture.read()
            if ret:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                keypoints, image = infer(frame_rgb)
                pose_image = draw_keypoints(keypoints, image, return_kpts=False)
                data.append(pose_image)
        capture.release()
        return data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pose_image = self.data[idx]
        if self.transform:
            pose_image = self.transform(pose_image)
        return pose_image

processed_video = Video(video_path)
print("Video compiled...")

video_loader = DataLoader(processed_video, batch_size=32, shuffle=True)     
print("Video loaded...")
        
# inference
for pose_img in video_loader:
    pose_img = pose_img.repeat(1, 3, 1, 1) # Duplicate the grayscale channel to create a 3-channel image
    pose_img = pose_img.to(device)
    outputs = model_cnn_lstm(pose_img)
    predicted = torch.round(torch.sigmoid(outputs))
    print(f"Predicted result for batch: {predicted}")

## Testloader
- can be used for unseen dataset
- or for the 20% split dataset done earlier

In [None]:
# Load weights
model_cnn_lstm.load_state_dict(torch.load('../model_epoch_recent.pth'))
model_cnn_lstm.eval()

def test(model, test_loader=test_loader):
    # model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    correct, total = 0, 0
    criterion = nn.BCEWithLogitsLoss()  # Ensure the same loss function is used

    with torch.no_grad():  # Disable gradient computation
        for pose_img, labels in test_loader:
            pose_img = pose_img.repeat(1, 3, 1, 1) # Duplicate the grayscale channel to create a 3-channel image
            pose_img, labels = pose_img.to(device), labels.to(device).float()
            outputs = model(pose_img)
            loss = criterion(outputs, labels.unsqueeze(1))
            test_loss += loss.item() * pose_img.size(0)  # Accumulate the loss
            
            # Calculate accuracy
            predicted = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted.cpu() == labels.cpu().unsqueeze(1)).sum().item()

    test_loss /= len(test_loader.dataset)  # Compute the average loss
    test_accuracy = correct / total
    return test_loss, test_accuracy

test_loss, test_accuracy = test(model_cnn_lstm)
print(f"Test loss: {test_loss} Test accuracy: {test_accuracy}")