In [1]:
from ultralytics import YOLO
import pandas as pd
import os
import cv2
import torch


# GENERATE keypoints here
model = YOLO("yolo11n-pose.pt")  # load an official model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Predict with the model
results = model("bus.jpg")  # predict on an image

for i in range(len(results[0].boxes)):
    print(results[0].boxes[i].id)

len(results[0])


image 1/1 C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\bus.jpg: 640x480 4 persons, 99.4ms
Speed: 4.4ms preprocess, 99.4ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 480)
tensor([1.])
tensor([2.])
tensor([3.])
tensor([4.])


In [2]:
import os
import cv2
import torch
from utils import is_none_or_empty

def process_video_with_model(yolo_model, dl_model, device, input_path="./fall_videos/processed/fall-01-cam0.mp4", output_path="./output/output.mp4", using_conf=True):

    cap = cv2.VideoCapture(input_path)

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Set up the output writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    print(f"Processing {input_path}")
    results = yolo_model.track(source=input_path)

    max_pred = 0.0
    fall_cnt = 0
    for frame_number, result in enumerate(results):
        frame = result.orig_img.copy()
        boxes = result.boxes
        keypoints = result.keypoints

        if is_none_or_empty(boxes) or is_none_or_empty(keypoints):
            continue

        # test mode so predict all
        for index in range(len(boxes)):

            conf = float(boxes.conf[index])
            cls = int(boxes.cls[index])

            if cls != 0 or conf < 0.2:
                continue

            keypoints_tensor = keypoints.data[index]
            keypoints_np = keypoints_tensor.cpu().detach().numpy()
            flat = keypoints_np.flatten().tolist()

            for i in range(0, 51, 3):
                flat[i] = float(flat[i]) / float(width)
                flat[i + 1] = float(flat[i + 1]) / float(height)

            if not using_conf:
                flat = [flat[i] for i in range(len(flat)) if i % 3 != 2]

            input_tensor = torch.tensor(flat, dtype=torch.float32).unsqueeze(0).to(device)
            print(input_tensor)

            with torch.no_grad():
                output = dl_model(input_tensor)

            prediction_value = output.item()
            max_pred = max(max_pred, prediction_value)
            prediction_label = "FALL" if prediction_value > 0.5 else "SAFE"
            print("Prediction value: ", prediction_value)

            if prediction_label == "FALL":
                fall_cnt+= 1

            x, y = int(keypoints_np[0][0]), int(keypoints_np[0][1])
            color = (0, 0, 255) if prediction_label == "FALL" else (0, 255, 0)
            cv2.putText(frame, prediction_label, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)
        out.write(frame)

    out.release()
    print(f"Saved annotated video to: {output_path}")
    # print("max_pred", max_pred)
    print("fall_cnt", fall_cnt)


In [32]:
from nn_model import NN_Model

nn_model = NN_Model()
nn_model.load_state_dict(torch.load('./model/nn_model.pth'))

# Set the model to evaluation mode (important for inference)
nn_model.eval()
nn_model.to(device)

NN_Model(
  (classifier): Sequential(
    (0): Linear(in_features=51, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [33]:
process_video_with_model(yolo_model=model, dl_model=nn_model, device=device,
                         input_path='./fall_videos/mc_videos/Coffee_room_01/Coffee_room_01/Videos/video (1).avi',
                         output_path='./output/nn_annotated_output.mp4')

Processing ./fall_videos/mc_videos/Coffee_room_01/Coffee_room_01/Videos/video (1).avi


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\mc_videos\Coffee_room_01\Coffee_room_01\Videos\video (1).avi: 480x640 1 person, 35.4ms
video 1/1 (frame 2/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\mc_videos\Coffee_room_01\Coffee_room_01\Videos\video (1).avi: 480x640 1 person, 35.6ms
video 1/1 (frame 3/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\

In [3]:
from nn_model import NN_Model_NO_CONF

nn_model_no_conf = NN_Model_NO_CONF()
nn_model_no_conf.load_state_dict(torch.load('./model/nn_model_no_conf.pth'))

# Set the model to evaluation mode (important for inference)
nn_model_no_conf.eval()
nn_model_no_conf.to(device)

NN_Model_NO_CONF(
  (classifier): Sequential(
    (0): Linear(in_features=34, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [4]:
process_video_with_model(yolo_model=model, dl_model=nn_model_no_conf, device=device,
                         input_path='./fall_videos/test/video_1.mp4',
                         output_path='./output/nn_no_conf_annotated_output.mp4', using_conf=False)

Processing ./fall_videos/test/video_1.mp4


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 89.9ms
video 1/1 (frame 2/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 29.0ms
video 1/1 (frame 3/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 29.2ms
video 1/1 (frame 4/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester

In [5]:
import cv2
import os
from rule_model import Rule_Model
from utils import is_none_or_empty

# cv2.putText() is another OpenCV function used to add text to an image.
# image: The image to which the text will be added.
# 'Person Fell down': The text message that will be displayed on the image.
# (11, 100): The position of the text (top-left corner of the text box) in (x, y) coordinates. This is the point where the text starts to be drawn on the image.
# 0: The font type (0 corresponds to the default font in OpenCV).
# 1: The font scale, i.e., the size of the text. A scale of 1 means the text will be the default size.
# [0, 0, 2550]: The color of the text. It uses the BGR format (Blue, Green, Red). This is a red color with a high intensity (2550 is likely an error and should be something like [0, 0, 255]).
# thickness=3: The thickness of the text.
# lineType=cv2.LINE_AA: This ensures that the text is drawn with antialiased lines for smoother edges.
def falling_alarm(image, bbox):
    x_min, y_min, x_max, y_max = bbox
    cv2.rectangle(image, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color=(0, 0, 255),
                  thickness=5, lineType=cv2.LINE_AA)
    cv2.putText(image, 'Person Fell down', (11, 100), 0, 1, [0, 0, 2550], thickness=3, lineType=cv2.LINE_AA)

def process_video_rule(video_path, model, output_path='./output/rule_annotated_output.mp4'):
    cap = cv2.VideoCapture(video_path)

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Set up the output writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    print(f"Processing {video_path}")
    results = model.track(source=video_path)

    for frame_number, result in enumerate(results):
        is_fall, bbox = Rule_Model(result)
        frame = result.orig_img.copy()

        if is_fall:
            falling_alarm(frame, bbox)

        out.write(frame)
        print(f"Frame {frame_number}: pred = {is_fall}")

    out.release()
    print("Saved annotated video to:", output_path)


# === Main ===
video_path = './fall_videos/test/video_1.mp4'

process_video_rule(video_path, model, output_path='./output/rule_annotated_output_unseen.mp4')


Processing ./fall_videos/test/video_1.mp4


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 31.9ms
video 1/1 (frame 2/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 28.6ms
video 1/1 (frame 3/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 28.8ms
video 1/1 (frame 4/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester

NameError: name 'falling_alarm' is not defined

In [6]:
from collections import defaultdict, deque
from utils import is_none_or_empty

def process_video_with_lstm(yolo_model, lstm_model, device, input_path, output_path, sequence_length=20, using_conf=True):
    cap = cv2.VideoCapture(input_path)

    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    print(f"Processing {input_path}")
    results = yolo_model.track(source=input_path)

    person_sequences = defaultdict(lambda: deque(maxlen=sequence_length))  # person_id → keypoint buffer

    for result in results:
        frame = result.orig_img.copy()
        boxes = result.boxes
        keypoints = result.keypoints

        if is_none_or_empty(boxes) or is_none_or_empty(keypoints):
            out.write(frame)
            continue

        for i in range(len(boxes)):
            box = boxes[i]
            if int(box.cls) != 0 or float(box.conf) < 0.2:
                continue

            box_id = boxes[i].id
            if box_id is not None:
                person_id = int(box_id.item())  # works for tensor([1.])
            else:
                person_id = i  # fallback to index

            keypoint = keypoints.data[i].cpu().detach().numpy()
            flat = keypoint.flatten().tolist()

            for i in range(0, 51, 3):
                flat[i] = float(flat[i]) / float(width)
                flat[i + 1] = float(flat[i + 1]) / float(height)

            if not using_conf:
                flat = [flat[i] for i in range(len(flat)) if i % 3 != 2]

            person_sequences[person_id].append(flat)

            if len(person_sequences[person_id]) == sequence_length:
                input_tensor = torch.tensor([person_sequences[person_id]], dtype=torch.float32).to(device)
                with torch.no_grad():
                    output = lstm_model(input_tensor)
                pred_score = torch.sigmoid(output).item() if output.shape[-1] == 1 else torch.softmax(output, dim=-1)[0,1].item()
                prediction_label = "FALL" if pred_score > 0.5 else "SAFE LSTM"
                print(prediction_label)
                color = (0, 0, 255) if prediction_label == "FALL" else (0, 255, 0)

                x, y = int(keypoint[0][0]), int(keypoint[0][1])
                cv2.putText(frame, f"{prediction_label}", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

        out.write(frame)

    cap.release()
    out.release()
    print(f"Saved annotated video to: {output_path}")


In [None]:
from sequence_model import LSTM_Model

lstm_model_path = "./model/lstm_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lstm_model = LSTM_Model()
lstm_model.load_state_dict(torch.load(lstm_model_path, map_location=device))
lstm_model.to(device)
lstm_model.eval()

In [15]:
process_video_with_lstm(yolo_model=model, lstm_model=lstm_model, device=device,
                        input_path="./fall_videos/mc_videos_test/coffee1/coffee1/Videos/video (1).avi", output_path='./output/lstm_unseen_test.mp4', sequence_length=10)

Processing ./fall_videos/mc_videos_test/coffee1/coffee1/Videos/video (1).avi


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\mc_videos_test\coffee1\coffee1\Videos\video (1).avi: 480x640 1 person, 37.3ms
video 1/1 (frame 2/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\mc_videos_test\coffee1\coffee1\Videos\video (1).avi: 480x640 1 person, 34.9ms
video 1/1 (frame 3/157) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\mc_videos

In [33]:
import os
import cv2
import torch
from collections import defaultdict, deque

def process_video_ensemble_model(yolo_model, lstm_model, nn_model, rule_model, device, sequence_length=20, input_path="./fall_videos/processed/fall-01-cam0.mp4", output_path='lstm_annotated_output.mp4', using_conf=True):

    cap = cv2.VideoCapture(input_path)

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Set up the output writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    print(f"Processing {input_path}")
    results = yolo_model.track(source=input_path, stream=True)

    person_sequences = defaultdict(lambda: deque(maxlen=sequence_length))  # Store sequences for each person
    person_cnn_votes = defaultdict(lambda: deque(maxlen=sequence_length))  # Store CNN votes
    person_rule_votes = defaultdict(lambda: deque(maxlen=sequence_length))  # Store Rule model votes

    for frame_number, result in enumerate(results):
        frame = result.orig_img.copy()
        boxes = result.boxes
        keypoints = result.keypoints

        if is_none_or_empty(boxes) or is_none_or_empty(keypoints):
            continue

        for index in range(len(boxes)):
            # Track each person's keypoints and store predictions
            keypoints_tensor = keypoints.data[index]
            keypoints_np = keypoints_tensor.cpu().detach().numpy()
            flat = keypoints_np.flatten().tolist()

            for i in range(0, 51, 3):
                flat[i] = float(flat[i]) / float(width)
                flat[i + 1] = float(flat[i + 1]) / float(height)

            if not using_conf:
                flat = [flat[i] for i in range(len(flat)) if i % 3 != 2]

            box_id = boxes[index].id
            if box_id is not None:
                person_id = int(box_id.item())  # works for tensor([1.])
            else:
                person_id = index  # fallback to index

            input_tensor = torch.tensor(flat, dtype=torch.float32).view(1, 1, 51).to(device)

            # Predict using CNN model
            with torch.no_grad():
                cnn_pred = nn_model(input_tensor).item()

            # Predict using Rule Model
            xmin, ymin, xmax, ymax = boxes[index].xyxy.squeeze(0).tolist()
            rule_pred_bool = rule_model(flat, xmin, ymin, xmax, ymax)

            # Record CNN and Rule predictions
            person_cnn_votes[person_id].append(int(cnn_pred > 0.5))
            person_rule_votes[person_id].append(int(rule_pred_bool))

            # Collect the sequence of keypoints for LSTM model (if enough frames are collected)
            person_sequences[person_id].append(flat)

            if len(person_sequences[person_id]) == sequence_length:
                # LSTM model prediction
                sequence_tensor = torch.tensor(person_sequences[person_id], dtype=torch.float32).view(1, sequence_length, 51).to(device)
                with torch.no_grad():
                    lstm_output = lstm_model(sequence_tensor)  # Output shape: (1, 2) for binary classification
                    lstm_pred = torch.argmax(lstm_output, dim=1).item()

                # Perform majority voting
                cnn_label = int(cnn_pred > 0.5)
                rule_label = int(rule_pred_bool)
                lstm_label = int(lstm_pred > 0.5)

                # Voting mechanism: majority rule for combining CNN, Rule, and LSTM predictions
                votes = [cnn_label, rule_label, lstm_label]
                final_prediction = int(sum(votes) >= 2)

                prediction_label = "FALL" if final_prediction == 1 else "SAFE"
                x, y = int(keypoints_np[0][0]), int(keypoints_np[0][1])
                color = (0, 0, 255) if prediction_label == "FALL" else (0, 255, 0)
                cv2.putText(frame, prediction_label, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

        out.write(frame)

    out.release()
    print(f"Saved annotated video to: {output_path}")



In [34]:
from sequence_model import LSTM_Model
import cv2
from nn_model import NN_Model
from rule_model import rule_fall_detection

lstm_model_path = "./model/lstm_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lstm_model = LSTM_Model()
lstm_model.load_state_dict(torch.load(lstm_model_path, map_location=device))
lstm_model.to(device)
lstm_model.eval()

nn_model = NN_Model()
nn_model.load_state_dict(torch.load('./model/nn_model.pth'))

# Set the model to evaluation mode (important for inference)
nn_model.eval()
nn_model.to(device)

# Example to call the process_video_ensemble_model function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# input_video = "./fall_videos/processed/fall-01-cam0.mp4"
# output_video = './output/ensemble_output.mp4'

input_video = "./fall_videos/test/video_1.mp4"
output_video = './output/ensemble_nrl_output.mp4'

# Assuming lstm_model, cnn_model, rule_model are already loaded
process_video_ensemble_model(yolo_model=model, lstm_model=lstm_model, nn_model=nn_model, rule_model=rule_fall_detection, device=device, input_path=input_video, output_path=output_video, sequence_length=10)


Processing ./fall_videos/test/video_1.mp4

video 1/1 (frame 1/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 29.1ms
video 1/1 (frame 2/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 29.1ms
video 1/1 (frame 3/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 29.7ms
video 1/1 (frame 4/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 27.0ms
video 1/1 (frame 5/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 27.4ms
video 1/1 (frame 6/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 27.4ms
video 1/1 (frame 7/85) C:\Users\sonnpm\OneDri

In [9]:
# process lstm no conf

from sequence_model import LSTM_Model

lstm_model_no_conf_path = "./model/lstm_model_no_conf.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lstm_model_no_conf = LSTM_Model(input_dim=34)
lstm_model_no_conf.load_state_dict(torch.load(lstm_model_no_conf_path, map_location=device))
lstm_model_no_conf.to(device)
lstm_model_no_conf.eval()

LSTM_Model(
  (lstm): LSTM(34, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [10]:
process_video_with_lstm(yolo_model=model, lstm_model=lstm_model_no_conf, device=device,
                        input_path="./fall_videos/test/video_1.mp4", output_path='./output/lstm_no_conf_unseen_test.mp4', sequence_length=10, using_conf=False)

Processing ./fall_videos/test/video_1.mp4


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 36.5ms
video 1/1 (frame 2/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 28.6ms
video 1/1 (frame 3/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester 3\DL\Assignment\ass 3\fall_videos\test\video_1.mp4: 384x640 1 person, 28.5ms
video 1/1 (frame 4/85) C:\Users\sonnpm\OneDrive - UTS\Desktop\UTS\Semester