# Dependencies for tracking people and objects

In [1]:
! pip install ultralytics
! pip install supervision

Collecting ultralytics
  Downloading ultralytics-8.3.187-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.16-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.187-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.16-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.187 ultralytics-thop-2.0.16
Collecting supervision
  Downloading supervision-0.26.1-py3-none-any.whl.metadata (13 kB)
Downloading supervision-0.26.1-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.2/207.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: supervision
Successfully installed supervision-0.26.1


# Model for detecting weapons

In [25]:
from huggingface_hub import hf_hub_download


# Fetch the weapon-detection checkpoint from the Hugging Face Hub; the returned
# value is later handed to YOLO() as the weights file to load.
weights_path = hf_hub_download(
    repo_id="Accurateinfosolution/Suspicious_activity_detection_Yolov11_Custom",
    filename="Suspicious_Activities_nano.pt",
)

# This tracks multiple objects, with each YOLO model fine-tuned to detect specific objects, making it highly accurate without losing out on speed

This works very well as a first-layer filter.

In [26]:
import numpy as np
import supervision as sv
from ultralytics import YOLO
import cv2
import os

# --- Model and Annotator Initialization ---
# Folder that receives the weapon crops written by the video callback.
os.makedirs("crops", exist_ok=True)

# Lower-case substrings; a class name containing any of them counts as a weapon.
WEAPON_KEYWORDS = {
    "gun", "knife", "pistol", "rifle",
    "revolver", "bomb", "time bomb",
}

# Weapon detector. 'weights_path' must already point at the weapon model weights;
# swap in another checkpoint (e.g. "yolov8n.pt") to try a stock pre-trained model.
gun_model = YOLO(weights_path)
gun_model.fuse()

# Second detector; only its 'person' class is used downstream.
people_model = YOLO("/content/best.pt")

# One tracker shared by both detectors, plus the drawing helpers.
tracker = sv.ByteTrack()
label_annotator = sv.LabelAnnotator(text_color=sv.Color.WHITE)
box_annotator = sv.BoxAnnotator()

# --- Utility Function ---
def is_weapon_label(label: str) -> bool:
    """Return True if any entry of WEAPON_KEYWORDS occurs inside ``label``.

    The label is lower-cased first, so the substring match ignores its case.
    """
    lowered = label.lower()
    for keyword in WEAPON_KEYWORDS:
        if keyword in lowered:
            return True
    return False

# --- Video Processing Callback ---
def callback(frame: np.ndarray, index: int) -> np.ndarray:
    """Detect weapons and people in one frame, track them, save weapon crops, annotate.

    Args:
        frame: Image array for the current video frame (indexed rows-then-columns
            when cropping).
        index: Frame number; only used to build the crop file names.

    Returns:
        A copy of ``frame`` with a box and a ``#<tracker_id> <class name>`` label
        drawn for every tracked detection.
    """
    # Run both detectors on the same frame.
    gun_detections = sv.Detections.from_ultralytics(gun_model(frame)[0])
    people_detections = sv.Detections.from_ultralytics(people_model(frame)[0])

    # Keep only the 'person' class (class_id 0) from the people model.
    person_class_id = 0
    people_detections = people_detections[people_detections.class_id == person_class_id]

    # Merge both sets and let the shared tracker assign ids.
    all_detections = sv.Detections.merge([gun_detections, people_detections])
    all_detections = tracker.update_with_detections(all_detections)

    # Both models number their classes from 0, so a class_id is ambiguous once the
    # detections are merged. `from_ultralytics` stores each detection's own class
    # name under data["class_name"]; use that instead of looking ids up in either
    # model's `names`. The key is absent when nothing was detected at all.
    class_names = all_detections.data.get("class_name")
    if class_names is None:
        class_names = ["unknown"] * len(all_detections)

    labels = [
        f"#{tracker_id} {class_name}"
        for tracker_id, class_name in zip(all_detections.tracker_id, class_names)
    ]

    # Crop and save images for detected weapons.
    frame_height, frame_width = frame.shape[:2]
    for box, tracker_id, class_name in zip(
        all_detections.xyxy, all_detections.tracker_id, class_names
    ):
        if not is_weapon_label(str(class_name)):
            continue
        x_min, y_min, x_max, y_max = map(int, box)
        # Clamp to the frame: a negative coordinate would be read as a from-the-end
        # index, and cv2.imwrite rejects empty images.
        x_min, y_min = max(x_min, 0), max(y_min, 0)
        x_max, y_max = min(x_max, frame_width), min(y_max, frame_height)
        if x_max <= x_min or y_max <= y_min:
            continue
        crop = frame[y_min:y_max, x_min:x_max]
        crop_filename = f"crops/frame{index}_id{tracker_id}_{class_name}.jpg"
        cv2.imwrite(crop_filename, crop)

    # Annotate a copy so the caller's frame is left untouched.
    annotated_frame = box_annotator.annotate(frame.copy(), detections=all_detections)
    annotated_frame = label_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections,
        labels=labels
    )

    return annotated_frame

# --- Process Video ---
# Hand `callback` to supervision's video helper together with the input clip and
# the path of the annotated output video ("result.mp4").
# NOTE(review): `tracker` is a module-level object, so re-running this cell
# presumably continues from the previous run's tracker state — confirm, and
# re-create the tracker first if fresh ids are wanted.
sv.process_video(
    source_path="/content/gun_holding (online-video-cutter.com).mp4",
    target_path="result.mp4",
    callback=callback
)

YOLO11n summary (fused): 100 layers, 2,584,102 parameters, 0 gradients, 6.3 GFLOPs

0: 384x640 (no detections), 48.6ms
Speed: 20.2ms preprocess, 48.6ms inference, 29.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 16.6ms
Speed: 1.5ms preprocess, 16.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 9.1ms
Speed: 1.7ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.7ms
Speed: 1.8ms preprocess, 5.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 9.1ms
Speed: 1.9ms preprocess, 9.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.7ms
Speed: 1.8ms preprocess, 5.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 7.7ms
Speed: 1.4ms preprocess, 7.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384,

# Tracking across two camera feeds

You can swap in different input videos to test this out; higher-quality footage will give better tracking.

In [27]:
import numpy as np
import supervision as sv
from ultralytics import YOLO
import cv2
import os
import torch
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image

# --- Initialize models ---
# Folder for saved crops (created up front; harmless if it already exists).
os.makedirs("crops", exist_ok=True)

# Lower-case substrings; a class name containing any of them counts as a weapon.
WEAPON_KEYWORDS = {
    "gun", "knife", "pistol", "rifle",
    "revolver", "bomb", "time bomb",
}

# Weapon detection model (replace with your own weights if needed).
gun_model = YOLO(weights_path)
gun_model.fuse()

# Person detection model (replace with your own weights if needed).
people_model = YOLO("best.pt")

# One tracker per camera so track ids are assigned independently per stream.
tracker1 = sv.ByteTrack()
tracker2 = sv.ByteTrack()

# Drawing helpers shared by both streams.
label_annotator = sv.LabelAnnotator(text_color=sv.Color.WHITE)
box_annotator = sv.BoxAnnotator()

# --- Simple person embedding extractor (ResNet50) ---
# `pretrained=True` is deprecated in torchvision's multi-weight API;
# IMAGENET1K_V1 is the checkpoint that flag used to select.
reid_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
reid_model.fc = torch.nn.Identity()  # Remove classification layer -> model returns pooled features
reid_model.eval()  # inference mode

transform = transforms.Compose([
    # Resize takes (height, width). Person crops are taller than wide, so use a
    # portrait 256x128 input; (128, 256) squashed them into a landscape image.
    transforms.Resize((256, 128)),
    transforms.ToTensor(),
    # ImageNet channel means / standard deviations, matching the pretrained weights.
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

def extract_embedding(crop):
    """Return a unit-length appearance feature for an image crop.

    The crop is converted from BGR to RGB, preprocessed with ``transform``, run
    through ``reid_model`` without gradient tracking, L2-normalised along the
    feature axis, and the first (only) row of the batch is returned as a NumPy array.
    """
    rgb_crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
    batch = transform(Image.fromarray(rgb_crop)).unsqueeze(0)  # add batch axis
    with torch.no_grad():
        features = reid_model(batch)
    unit_features = F.normalize(features, dim=1)
    return unit_features.cpu().numpy()[0]

# --- Utility ---
def is_weapon_label(label: str) -> bool:
    """Return True if ``label`` (lower-cased) contains any WEAPON_KEYWORDS entry."""
    lowered = label.lower()
    for keyword in WEAPON_KEYWORDS:
        if keyword in lowered:
            return True
    return False

# --- Multi-stream processing ---
def _clip_box(box, frame_width, frame_height):
    """Convert an xyxy box to ints clamped to the frame; return None if nothing is left."""
    x1, y1, x2, y2 = map(int, box)
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, frame_width), min(y2, frame_height)
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


def _track_people(frame, tracker):
    """Detect class-0 (person) boxes in ``frame`` and pass them through ``tracker``."""
    detections = sv.Detections.from_ultralytics(people_model(frame)[0])
    # Filter before tracking/annotating so every remaining detection gets exactly
    # one label (the label annotator needs one label per detection).
    detections = detections[detections.class_id == 0]
    return tracker.update_with_detections(detections)


def process_two_streams(stream1_path, stream2_path, output_path="result.mp4"):
    """Track people in two videos and match camera-2 people against camera-1 people.

    Frames are read from both streams in lockstep until either one ends. People in
    stream 1 are tracked and their latest embedding is stored per track id; each
    person in stream 2 is labelled with the most similar stored embedding (cosine
    similarity). The two annotated frames are written side by side.

    Args:
        stream1_path: Path/URL of the first (reference) video.
        stream2_path: Path/URL of the second video.
        output_path: Where the side-by-side video is written.

    Raises:
        IOError: If either stream cannot be opened.
    """
    cap1 = cv2.VideoCapture(stream1_path)
    cap2 = cv2.VideoCapture(stream2_path)
    out = None
    try:
        if not cap1.isOpened() or not cap2.isOpened():
            raise IOError(
                f"Could not open both video streams: {stream1_path!r}, {stream2_path!r}"
            )

        width = int(cap1.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap1.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the source frame rate; fall back to 20 if it is missing (0 or NaN).
        fps = cap1.get(cv2.CAP_PROP_FPS)
        if not fps or fps != fps:
            fps = 20.0

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width * 2, height))  # side-by-side output

        person_embeddings = {}  # {"cam1_<tracker_id>": latest embedding}

        while True:
            ret1, frame1 = cap1.read()
            ret2, frame2 = cap2.read()
            if not ret1 or not ret2:
                break

            # hstack and the writer both need equally sized frames.
            if frame2.shape[:2] != frame1.shape[:2]:
                frame2 = cv2.resize(frame2, (frame1.shape[1], frame1.shape[0]))
            frame_h, frame_w = frame1.shape[:2]

            # --- Run detections + tracking (one tracker per camera) ---
            det1 = _track_people(frame1, tracker1)
            det2 = _track_people(frame2, tracker2)

            # --- Process stream 1: refresh the reference embeddings ---
            labels1 = []
            for box, tid in zip(det1.xyxy, det1.tracker_id):
                clipped = _clip_box(box, frame_w, frame_h)
                if clipped is not None:
                    x1, y1, x2, y2 = clipped
                    person_embeddings[f"cam1_{tid}"] = extract_embedding(frame1[y1:y2, x1:x2])
                labels1.append(f"Cam1_ID{tid}")

            # --- Process stream 2 + re-id match with cam1 ---
            labels2 = []
            for box, tid in zip(det2.xyxy, det2.tracker_id):
                best_match, best_score = None, -1
                clipped = _clip_box(box, frame_w, frame_h)
                if clipped is not None:
                    x1, y1, x2, y2 = clipped
                    emb = torch.tensor(extract_embedding(frame2[y1:y2, x1:x2]))
                    for pid, ref_emb in person_embeddings.items():
                        score = F.cosine_similarity(
                            emb, torch.tensor(ref_emb), dim=0
                        ).item()
                        if score > best_score:
                            best_match, best_score = pid, score
                labels2.append(f"Cam2_ID{tid} -> {best_match} ({best_score:.2f})")

            # --- Annotate frames ---
            annotated1 = box_annotator.annotate(frame1.copy(), det1)
            annotated1 = label_annotator.annotate(annotated1, det1, labels1)

            annotated2 = box_annotator.annotate(frame2.copy(), det2)
            annotated2 = label_annotator.annotate(annotated2, det2, labels2)

            # Combine side-by-side
            out.write(np.hstack((annotated1, annotated2)))
    finally:
        # Always free the capture/writer handles, even if a frame fails mid-run.
        cap1.release()
        cap2.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

# --- Run two-stream processing ---
# Demo run: the same clip is supplied for both camera streams.
process_two_streams(
    "/content/gun_holding (online-video-cutter.com).mp4",
    "/content/gun_holding (online-video-cutter.com).mp4",
    "output.mp4",
)


YOLO11n summary (fused): 100 layers, 2,584,102 parameters, 0 gradients, 6.3 GFLOPs





0: 384x640 (no detections), 9.0ms
Speed: 1.6ms preprocess, 9.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.4ms
Speed: 1.8ms preprocess, 6.4ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.3ms
Speed: 1.7ms preprocess, 6.3ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.9ms
Speed: 1.8ms preprocess, 5.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.5ms
Speed: 2.0ms preprocess, 6.5ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.8ms
Speed: 1.9ms preprocess, 5.8ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.1ms
Speed: 1.7ms preprocess, 6.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 5.7ms
Speed: 1.8ms preprocess, 5.7ms inference, 0.6ms 

# Dependencies for running the VLM locally

In [8]:
! pip install pyvips-binary pyvips

Collecting pyvips-binary
  Downloading pyvips_binary-8.17.1-cp37-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting pyvips
  Downloading pyvips-3.0.0.tar.gz (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading pyvips_binary-8.17.1-cp37-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (7.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyvips
  Building wheel for pyvips (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyvips: filename=pyvips-3.0.0-py3-non

# Below is for getting image descriptions in real time.
Each user question gets a sub-second response. As more and more frames keep arriving, we can create a data sink that stores all of it in Elasticsearch, so that relevant results can be retrieved very quickly, even across a million images.

# To run the model locally

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Load the model

# Load the moondream2 checkpoint, pinned to a fixed revision so later upstream
# changes do not alter the results.
model = AutoModelForCausalLM.from_pretrained(
"vikhyatk/moondream2",
revision="2025-01-09",
# Allows the model code shipped in the Hub repository to be executed.
trust_remote_code=True,
device_map={"": "cuda"}  # GPU acceleration; requires `pip install accelerate`. NOTE(review): hard-coded to CUDA, so this presumably fails on a CPU-only runtime — confirm.
)

# Load your image



# 1. Image Captioning

# print("Short caption:")
# print(model.caption(image, length="short")["caption"])

# print("Detailed caption:")
# for t in model.caption(image, length="normal", stream=True)["caption"]:
#   print(t, end="", flush=True)

#   # 2. Visual Question Answering

#   print("Asking questions about the image:")
#   print(model.query(image, "is there any one holding gun")["answer"])

#   # 3. Object Detection

#   print("Detecting objects:")



#   # 4. Visual Pointing

#   print("Locating objects:")
#   points = model.point(image, "person")["points"]
#   print(f"Found {len(points)} person(s)")

config.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

hf_moondream.py: 0.00B [00:00, ?B/s]

vision.py: 0.00B [00:00, ?B/s]

config.py: 0.00B [00:00, ?B/s]

layers.py: 0.00B [00:00, ?B/s]

image_crops.py: 0.00B [00:00, ?B/s]

moondream.py: 0.00B [00:00, ?B/s]

text.py: 0.00B [00:00, ?B/s]

rope.py: 0.00B [00:00, ?B/s]

weights.py: 0.00B [00:00, ?B/s]

utils.py: 0.00B [00:00, ?B/s]

region.py: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [14]:
from PIL import Image
# Open one crop from the crops/ folder and ask `model` to detect "guns" in it;
# only the number of returned objects is printed.
# NOTE(review): `model` is presumably the transformers model loaded above — the
# name is rebound to a different client further down, so results depend on run order.
image = Image.open("/content/crops/frame129_id12_Terrorist_With_Time_Bomb.jpg")
objects = model.detect(image, "guns")["objects"]
print(f"Found {len(objects)} guns")

Found 1 guns


# Using the fine-tuned version of the same model from Moondream, we see higher accuracy and far fewer false positives

In [9]:
! pip install moondream

Collecting moondream
  Downloading moondream-0.1.1-py3-none-any.whl.metadata (4.7 kB)
Collecting pillow<11.0.0,>=10.4.0 (from moondream)
  Downloading pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading moondream-0.1.1-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow, moondream
  Attempting uninstall: pillow
    Found existing installation: pillow 11.3.0
    Uninstalling pillow-11.3.0:
      Successfully uninstalled pillow-11.3.0
Successfully installed moondream-0.1.1 pillow-10.4.0


# We can run a loop over every frame we receive to quickly filter out unwanted frames. With the accurate descriptions, we can use the same data to fine-tune our YOLO model, which acts as the first filter, with this model being a further filter and a more descriptive source.

In [29]:
import moondream as md
from PIL import Image
from google.colab import userdata


# Initialize the Moondream client with the API key kept in Colab's user secrets.
# NOTE: this rebinds `model`, replacing the transformers model loaded earlier.
model = md.vl(api_key=userdata.get('moondream'))

# Load one saved crop and ask a question about it; only the "answer" field is printed.
image = Image.open("/content/crops/frame160_id12_Terrorist_With_Time_Bomb.jpg")
print("Asking questions about the image:")
print(model.query(image, "is there any one holding gun")["answer"])



Asking questions about the image:
Yes, one person is holding a gun.


In [22]:
# Open-ended question about the currently loaded `image`; prints the "answer" field.
# NOTE(review): this cell's execution count (22) is lower than the cell above it
# (29), so the output shown may come from an earlier `model`/`image` — re-run top
# to bottom to confirm.
print(model.query(image, "describe the surroundings")["answer"])

The scene takes place in a hallway with light-colored walls. There are two doors in the hallway, one of which is red and the other white. A person is standing near the red door, holding a gun. They appear to be dressed in dark clothing. Another person is standing near the white door, wearing a red hoodie and holding a cell phone to their ear.


In [30]:
# Follow-up question about the same `image`, focused on the person holding the gun;
# prints the "answer" field of the response.
print(model.query(image, "describe the person holding the gun")["answer"])

The person holding the gun appears to be a male, wearing a red hoodie and possibly a black jacket. He is holding a gun and seems to be in a defensive posture, possibly preparing to shoot or react to something.
