Ensure `nvidia-smi` works.

Try running `!nvidia-smi` to check, else download [CUDA toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_local)

In [1]:
import torch
torch.cuda.is_available()

False

In [2]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
# import urllib.request
import sys
import torch
import time
# import datetime
from torch.utils.data import Dataset, DataLoader


from torchvision import transforms
from PIL import Image
# from moviepy.editor import *

# %matplotlib inlines

In [6]:
# Change directory to "/yolov7"
os.chdir("yolov7")
print(os.getcwd()) 

/Users/limsophie/Documents/GitHub/Fighting-detection-in-CCTVs/yolov7


In [7]:
from utils.datasets import letterbox
from utils.general import non_max_suppression_kpt
from utils.plots import output_to_keypoint, plot_skeleton_kpts

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load models
print("Loading model: ", "yolov7-tiny...")
weights_path = "yolov7-w6-pose.pt"
model = torch.load(weights_path, map_location=device, weights_only=False)['model']
print("Model loaded.")

model.float().eval()

if torch.cuda.is_available():
    # half() turns predictions into float16 tensors --> significantly lowers inference time
    model.half().to(device)


Loading model:  yolov7-tiny...
Model loaded.


In [8]:
from utils.datasets import letterbox

# Main inference
def infer(image):
    image = letterbox(image, 960, 
                      stride=64,
                      auto=True)[0]  # shape: (567, 960, 3)
    
    image = transforms.ToTensor()(image)  # torch.Size([3, 567, 960])

    if torch.cuda.is_available():
        image = image.half().to(device)

    image = image.unsqueeze(0)  # torch.Size([1, 3, 567, 960])

    with torch.no_grad():
        output, _ = model(image)

    return output, image


def draw_keypoints(output, image, confidence=0.25, threshold=0.65, return_kpts=False):
    """
    Draw YOLOv7 pose keypoints and optionally return keypoints for saving.
    """
    output = non_max_suppression_kpt(
        output,
        confidence,
        threshold,
        nc=model.yaml['nc'],
        nkpt=model.yaml['nkpt'],
        kpt_label=True)

    with torch.no_grad():
        output = output_to_keypoint(output)  # shape: (num_people, 51)

    # Convert tensor image back to numpy
    nimg = image[0].permute(1, 2, 0) * 255
    nimg = cv2.cvtColor(nimg.cpu().numpy().astype(np.uint8), cv2.COLOR_RGB2BGR)

    # Correctly loop through 'output' variable instead of 'kpts'
    for idx in range(output.shape[0]):
        plot_skeleton_kpts(nimg, output[idx, 7:].T, 3)

    if return_kpts:
        return nimg, output  # (image with keypoints drawn, raw keypoints)

    return nimg

#code had problem = "NameError: name 'kpts' is not defined"
#above code is the new one but essentially the part that changed was for idx in range...
#old code below
# for idx in range(kpts.shape[0]):
#        plot_skeleton_kpts(nimg, kpts[idx, 7:].T, 3)
#
#   return nimg, kpts

In [None]:
#exit yolov7

import os

# Step 1: Move up one directory level from the current working directory
os.chdir(os.path.dirname(os.getcwd()))  # Exit the current directory = yolov7

#os.chdir("fighting-detection-in-cctvs")  # Change working directory

print(os.getcwd())  # Prints the current working directory

/Users/limsophie/Documents/GitHub/Fighting-detection-in-CCTVs


In [None]:
#extract keypoints from the produced frames already processed by yolov7

import os
import cv2
import torch
import numpy as np
from tqdm import tqdm

# Folder containing your extracted frames (.jpg)
frame_folder = "results/fight_frames"

# Folder to store extracted keypoints (.npy)
keypoint_folder = "results/fight_keypoints"
os.makedirs(keypoint_folder, exist_ok=True)

def extract_keypoints_from_frames(frame_folder, keypoint_folder):
    frame_files = sorted([f for f in os.listdir(frame_folder) if f.endswith(".jpg")])

    for frame_file in tqdm(frame_files):
        basename = os.path.splitext(frame_file)[0]
        npy_path = os.path.join(keypoint_folder, basename + ".npy")

        # Skip if the .npy already exists
        if os.path.exists(npy_path):
            continue

        frame_path = os.path.join(frame_folder, frame_file)
        image_bgr = cv2.imread(frame_path)
        if image_bgr is None:
            print(f"Could not read {frame_file}")
            continue

        # Convert to RGB
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

        try:
            output, image_tensor = infer(image_rgb)
            _, keypoints = draw_keypoints(output, image_tensor, return_kpts=True)

            if keypoints is not None and keypoints.shape[0] > 0:
                first_person_kpts = keypoints[0, 7:]  # Only the 17 keypoints
                keypoints_array = first_person_kpts.reshape(-1, 3)  # Shape: (17, 3)

                # Save as .npy
                np.save(npy_path, keypoints_array)
        except Exception as e:
            print(f"Error processing {frame_file}: {e}")

In [None]:
# (don't run this) draws points on the boxing photo
import matplotlib
matplotlib.use('MacOSX')

imagefile = "../test_images/boxing_grey.jpg"

print("Inferencing image input...")

output, image = infer(cv2.imread(imagefile))
pose_image = draw_keypoints(output, image, confidence=0.25, threshold=0.65)

print("Inference complete.")

plt.figure(figsize=(30, 7))
plt.axis("off")
plt.imshow(pose_image)

In [None]:
#old code don't use
import os
import time
import cv2
import numpy as np

def yoloV7_pose_video(videofile, confidence=0.25, threshold=0.65):
    start = time.time()

    capture = cv2.VideoCapture(videofile)
    fps = capture.get(cv2.CAP_PROP_FPS)
    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)

    print(f"Processing video: {videofile}")
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    outputvideofile = "../results/result_" + os.path.basename(videofile)
    outvideo = cv2.VideoWriter(outputvideofile, fourcc, 30.0,
                               (int(capture.get(3)), int(capture.get(4))))

    # Make sure output directories exist
    os.makedirs("results/fight_frames", exist_ok=True)
    os.makedirs("results/fight_keypoints", exist_ok=True)

    idx = 1
    while capture.isOpened():
        ret, frame = capture.read()
        if not ret:
            break

        if idx % fps == 1:
            print("Processed frames =", f"{idx:06}")

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output, tensor_frame = infer(frame_rgb)
        
        frame_with_kpts, keypoints = draw_keypoints(output, tensor_frame, confidence, threshold, return_kpts=True)

        # Save keypoints (only if keypoints are found)
        if keypoints is not None and keypoints.shape[0] > 0:
            first_person_kpts = keypoints[0, 7:]  # shape: (51,)
            keypoints_array = first_person_kpts.reshape(-1, 3)  # (17, 3)

            npy_filename = os.path.join("results/fight_keypoints", f"frame_{os.path.basename(videofile)}_{idx:06}.npy")
            np.save(npy_filename, keypoints_array)

        # Save visual frame
        frame_with_kpts = cv2.resize(frame_with_kpts,
                                     (int(capture.get(3)), int(capture.get(4))))
        jpg_filename = os.path.join("results/fight_frames", f"videoframe_{os.path.basename(videofile)}_{idx:06}.jpg")
        cv2.imwrite(jpg_filename, frame_with_kpts)

        outvideo.write(frame_with_kpts)
        idx += 1

    capture.release()
    outvideo.release()
    print(f"Done. Output video: {outputvideofile}")
    return outputvideofile

In [None]:
#use this one

import os
import time
import cv2
import numpy as np

def yoloV7_pose_video(videofile, frame_dir, keypoint_dir, confidence=0.25, threshold=0.65):
    start = time.time()

    capture = cv2.VideoCapture(videofile)
    fps = capture.get(cv2.CAP_PROP_FPS)
    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)

    print(f"Processing video: {videofile}")
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    outputvideofile = "../results/result_" + os.path.basename(videofile)
    outvideo = cv2.VideoWriter(outputvideofile, fourcc, 30.0,
                               (int(capture.get(3)), int(capture.get(4))))

    # Create the output folders if they don't exist
    os.makedirs(frame_dir, exist_ok=True)
    os.makedirs(keypoint_dir, exist_ok=True)

    idx = 1
    while capture.isOpened():
        ret, frame = capture.read()
        if not ret:
            break

        if idx % fps == 1:
            print("Processed frames =", f"{idx:06}")

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output, tensor_frame = infer(frame_rgb)
        
        frame_with_kpts, keypoints = draw_keypoints(output, tensor_frame, confidence, threshold, return_kpts=True)

        # Save keypoints
        if keypoints is not None and keypoints.shape[0] > 0:
            first_person_kpts = keypoints[0, 7:]  # shape: (51,)
            keypoints_array = first_person_kpts.reshape(-1, 3)  # (17, 3)

            npy_filename = os.path.join(keypoint_dir, f"frame_{os.path.basename(videofile)}_{idx:06}.npy")
            np.save(npy_filename, keypoints_array)

        # Save visual frame
        frame_with_kpts = cv2.resize(frame_with_kpts,
                                     (int(capture.get(3)), int(capture.get(4))))
        jpg_filename = os.path.join(frame_dir, f"videoframe_{os.path.basename(videofile)}_{idx:06}.jpg")
        cv2.imwrite(jpg_filename, frame_with_kpts)

        outvideo.write(frame_with_kpts)
        idx += 1

    capture.release()
    outvideo.release()
    print(f"Done. Output video: {outputvideofile}")
    return outputvideofile

In [None]:
#run yoloV7_pose_video on a whole folder (be careful)
import os

# Your video folder
video_folder = "train_videos/nf_train_40"

# Output folders
frame_dir = "results/nonfight_frames"
keypoint_dir = "results/nonfight_keypoints"

# Make sure output folders exist
os.makedirs(frame_dir, exist_ok=True)
os.makedirs(keypoint_dir, exist_ok=True)

# Get list of video files (mp4, avi, etc.)
video_files = [f for f in os.listdir(video_folder) if f.endswith((".mp4", ".avi", ".mov"))]

# Loop through and run pose extraction
for video in video_files:
    video_path = os.path.join(video_folder, video)
    print(f"Starting pose extraction on {video_path}...")

    yoloV7_pose_video(
        videofile=video_path,
        frame_dir=frame_dir,
        keypoint_dir=keypoint_dir
    )

print("✅ All non-fight videos processed!")

Starting pose extraction on train_videos/nf_train_40/nofi038.mp4...
Processing video: train_videos/nf_train_40/nofi038.mp4
Processed frames = 000001


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Processed frames = 000031
Done. Output video: ../results/result_nofi038.mp4
Starting pose extraction on train_videos/nf_train_40/nofi010.mp4...
Processing video: train_videos/nf_train_40/nofi010.mp4
Processed frames = 000001
Done. Output video: ../results/result_nofi010.mp4
Starting pose extraction on train_videos/nf_train_40/nofi004.mp4...
Processing video: train_videos/nf_train_40/nofi004.mp4
Processed frames = 000001
Processed frames = 000026
Done. Output video: ../results/result_nofi004.mp4
Starting pose extraction on train_videos/nf_train_40/nofi005.mp4...
Processing video: train_videos/nf_train_40/nofi005.mp4
Processed frames = 000001
Processed frames = 000026
Done. Output video: ../results/result_nofi005.mp4
Starting pose extraction on train_videos/nf_train_40/nofi011.mp4...
Processing video: train_videos/nf_train_40/nofi011.mp4
Processed frames = 000001
Done. Output video: ../results/result_nofi011.mp4
Starting pose extraction on train_videos/nf_train_40/nofi039.mp4...
Processi

In [None]:
#fight_0086 = "train_videos/train_40/fight_0086.mpeg"
#video_52 = yoloV7_pose_video(fight_0086)

#fight_0075 = "train_videos/train_40/fight_0075.mpeg"
#video_27 = yoloV7_pose_video(fight_0075)

#fight_0064 = "train_videos/train_40/fight_0064.mpeg"
#video_16 = yoloV7_pose_video(fight_0064)

#fight_0021 (13 sec), fight_0022 (12 sec), fight_0062 (edit the clip)

#fight_0063 = "train_videos/train_40/fight_0063.mpeg"
#video_15 = yoloV7_pose_video(fight_0063)

#fight_0009 = "train_videos/train_40/fight_0009.mpeg"
#video_34 = yoloV7_pose_video(fight_0009)

#fight_0015 = "train_videos/train_40/fight_0015.mpeg"
#video_30 = yoloV7_pose_video(fight_0015)

#fight_0019 = "train_videos/train_40/fight_0019.mpeg"
#video_05 = yoloV7_pose_video(fight_0019)

Processing video: train_videos/train_40/fight_0015.mpeg
Processed frames = 000001
Done. Output video: ../results/result_fight_0015.mpeg


In [None]:
#old code
#extract_keypoints_from_frames(frame_folder, keypoint_folder)

100%|██████████| 2015/2015 [1:46:18<00:00,  3.17s/it]  


In [None]:
#hide one to run the other if you want, I usually run them one by one
extract_keypoints_from_frames("results/nonfight_frames", "results/nonfight_keypoints")
extract_keypoints_from_frames("results/fight_frames", "results/nonfight_keypoints")

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 2351/2351 [1:48:51<00:00,  2.78s/it]  


In [None]:
#string together the related keyframes into a sequence for learning
from torch.utils.data import Dataset
import os
import numpy as np
import torch

class PoseSequenceDataset(Dataset):
    def __init__(self, frame_dir, sequence_len=30, label=1):  # <-- This line is key
        self.frame_dir = frame_dir
        self.sequence_len = sequence_len
        self.label = label

        # Sort all keypoint files
        self.keypoint_files = sorted([
            os.path.join(frame_dir, f) for f in os.listdir(frame_dir) if f.endswith('.npy')
        ])

        # Create sequences
        self.sequences = [
            self.keypoint_files[i:i+sequence_len]
            for i in range(0, len(self.keypoint_files) - sequence_len + 1, sequence_len)
        ]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence_paths = self.sequences[idx]
        sequence = np.array([np.load(p)[:, :2] for p in sequence_paths])  # (seq_len, 17, 2)
        sequence = sequence.reshape(self.sequence_len, -1)  # (seq_len, 34)
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(self.label, dtype=torch.long)


In [15]:
import torch.nn as nn

class FightClassifierLSTM(nn.Module):
    def __init__(self, input_size=34, hidden_size=64, num_layers=2, num_classes=2):
        super(FightClassifierLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)  # x: (batch, seq_len, input_size)
        out = out[:, -1, :]    # use last time step
        out = self.fc(out)     # output logits
        return out

In [16]:
import torch.optim as optim

def train_model(model, train_loader, num_epochs=10, lr=1e-3, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model = model.to(device)
    model.train()  # Set to training mode

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_loss = 0.0
        correct = 0
        total = 0

        for sequences, labels in train_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)

            outputs = model(sequences)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        avg_loss = total_loss / len(train_loader)
        accuracy = 100 * correct / total
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%")

In [None]:
fight_dataset = PoseSequenceDataset(frame_dir='results/fight_keypoints', sequence_len=30, label=1)
nonfight_dataset = PoseSequenceDataset(frame_dir='results/nonfight_keypoints', sequence_len=30, label=0)

from torch.utils.data import ConcatDataset, DataLoader

combined_dataset = ConcatDataset([fight_dataset, nonfight_dataset])
train_loader = DataLoader(combined_dataset, batch_size=8, shuffle=True)

model = FightClassifierLSTM()
train_model(model, train_loader, num_epochs=10)

Epoch [1/10] - Loss: 0.6690 | Accuracy: 55.70%
Epoch [2/10] - Loss: 0.5547 | Accuracy: 76.79%
Epoch [3/10] - Loss: 0.5849 | Accuracy: 69.20%
Epoch [4/10] - Loss: 0.5483 | Accuracy: 72.57%
Epoch [5/10] - Loss: 0.4711 | Accuracy: 78.06%
Epoch [6/10] - Loss: 0.6046 | Accuracy: 67.51%
Epoch [7/10] - Loss: 0.6518 | Accuracy: 62.03%
Epoch [8/10] - Loss: 0.5641 | Accuracy: 74.26%
Epoch [9/10] - Loss: 0.4843 | Accuracy: 77.64%
Epoch [10/10] - Loss: 0.5779 | Accuracy: 71.31%
