In [2]:
!pip install -U torch torchvision torchaudio
!pip install -U ultralytics insightface opencv-python-headless imutils pymongo deep_sort_realtime
!pip install onnxruntime


Collecting ultralytics
  Downloading ultralytics-8.3.203-py3-none-any.whl.metadata (37 kB)
Collecting insightface
  Downloading insightface-0.7.3.tar.gz (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.5/439.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pymongo
  Downloading pymongo-4.15.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting deep_sort_realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Collecting onnx (from insightface)
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo

In [3]:
import os
import cv2
import datetime
import base64
import numpy as np
import uuid
from pymongo import MongoClient
from ultralytics import YOLO
import torch
import insightface
from insightface.app import FaceAnalysis
from deep_sort_realtime.deepsort_tracker import DeepSort


# Initialize YOLOv8 model
yolo_model = YOLO("yolov8n.pt")
if torch.cuda.is_available():
    yolo_model.to('cuda')  # Move model to GPU if CUDA is available

# Initialize InsightFace with GPU support
face_app = FaceAnalysis(name='buffalo_l', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
face_app.prepare(ctx_id=0)  # ctx_id=0 sets GPU for InsightFace

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, n_init=3)


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100% ━━━━━━━━━━━━ 6.2MB 62.6MB/s 0.1s
download_path: /root/.insightface/models/buffalo_l
Downloading /root/.insightface/models/buffalo_l.zip from https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip...


100%|██████████| 281857/281857 [00:04<00:00, 64700.98KB/s]


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (640, 640)


In [None]:
# Your MongoDB Atlas connection string (replace password accordingly)
MONGO_CONNECTION_STRING = "mongodb+srv://CV_TASK:12345@cluster-1.1aklqjp.mongodb.net/?retryWrites=true&w=majority&appName=Cluster-1"

client = MongoClient(MONGO_CONNECTION_STRING)
db = client['face_recognition_db']
face_collection = db['faces']
event_collection = db['events_log']

LOG_BASE_DIR = '/content/drive/MyDrive/Visitor_Task/logs/entries'
LOG_FILE = '/content/drive/MyDrive/Visitor_Task/events.log'


In [None]:
def encode_image_to_base64(image):
    _, buffer = cv2.imencode('.jpg', image)
    return base64.b64encode(buffer).decode()

def save_face_crop(face_id, image, bbox, event_type):
    date_folder = datetime.datetime.now().strftime('%Y-%m-%d')
    out_dir = os.path.join(LOG_BASE_DIR, date_folder)
    os.makedirs(out_dir, exist_ok=True)
    x1, y1, x2, y2 = map(int, bbox)
    face_crop = image[y1:y2, x1:x2]
    timestamp_str = datetime.datetime.now().strftime('%H%M%S')
    filename = f"{face_id}_{event_type}_{timestamp_str}.jpg"
    full_path = os.path.join(out_dir, filename)
    cv2.imwrite(full_path, face_crop)
    return full_path, face_crop

def log_event(event_type, face_id, image_path=None):
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    log_line = f"{timestamp}, {event_type}, {face_id}, {image_path if image_path else ''}"
    with open(LOG_FILE, 'a') as f:
        f.write(log_line + "\n")
    event_collection.insert_one({"timestamp": timestamp, "event_type": event_type, "face_id": face_id, "image_path": image_path or ''})

def register_face_in_db(face_id, embedding, bbox, face_crop):
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    face_doc = {
        "face_id": face_id,
        "embedding": embedding.tolist(),
        "bbox": list(map(int, bbox)),
        "timestamp": timestamp,
        "image_base64": encode_image_to_base64(face_crop),
    }
    face_collection.insert_one(face_doc)


In [None]:
face_db = []

def register_or_identify(face_emb, bbox=None, image=None, event_type='entry'):
    for entry in face_db:
        sim = np.dot(face_emb, entry['emb']) / (np.linalg.norm(face_emb) * np.linalg.norm(entry['emb']))
        if sim > 0.6:
            return entry['id']
    new_id = str(uuid.uuid4())[:8]
    face_db.append({'id': new_id, 'emb': face_emb})
    if bbox is not None and image is not None:
        img_path, face_crop = save_face_crop(new_id, image, bbox, event_type)
        register_face_in_db(new_id, face_emb, bbox, face_crop)
        log_event(event_type, new_id, img_path)
    return new_id

def draw_face(image, bbox, id_num):
    x1, y1, x2, y2 = [int(i) for i in bbox]
    cv2.rectangle(image, (x1, y1), (x2, y2), (50, 255, 50), 2)
    cv2.putText(image, f'ID: {id_num}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (50, 255, 50), 2)

def draw_count(image, count):
    text = f"Unique People Present: {count}"
    cv2.putText(image, text, (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 200, 200), 2)


In [None]:
def main(video_path):
    cap = cv2.VideoCapture(video_path)
    active_face_ids = set()
    all_unique_ids = set()

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "/content/drive/MyDrive/Visitor_Task/Output/processed_output.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (frame_width, frame_height))

    detection_interval = 3
    input_width, input_height = 640, 360

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1

        if frame_count % detection_interval == 0:
            small_frame = cv2.resize(frame, (input_width, input_height))
            results = yolo_model(small_frame)
            result = results[0]

            faces_bboxes = []
            for bbox_tensor, conf, cls in zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls):
                x1, y1, x2, y2 = bbox_tensor.cpu().numpy()
                x1 = int(x1 * frame.shape[1] / input_width)
                x2 = int(x2 * frame.shape[1] / input_width)
                y1 = int(y1 * frame.shape[0] / input_height)
                y2 = int(y2 * frame.shape[0] / input_height)
                conf = conf.cpu().item()
                cls = int(cls.cpu().item())
                if cls == 0 and conf > 0.5:
                    faces_bboxes.append([x1, y1, x2, y2])

            current_face_ids = set()
            for bbox in faces_bboxes:
                x1, y1, x2, y2 = map(int, bbox)
                face_crop = frame[y1:y2, x1:x2]
                insightface_results = face_app.get(face_crop)
                if insightface_results:
                    face_embedding = insightface_results[0].embedding
                    identity_id = register_or_identify(face_embedding, bbox=bbox, image=frame, event_type='entry')
                    current_face_ids.add(identity_id)
                    all_unique_ids.add(identity_id)
                    draw_face(frame, bbox, identity_id)

            exited_ids = active_face_ids - current_face_ids
            for ex_id in exited_ids:
                log_event('exit', ex_id)

            active_face_ids = current_face_ids
        else:
            # Could update tracker or just skip face update during skipped frames
            pass

        draw_count(frame, len(active_face_ids))
        out.write(frame)

    cap.release()
    out.release()

    print("Total unique visitors detected in video:", len(all_unique_ids))
    return output_file


In [None]:
video_path = "/content/drive/MyDrive/Visitor_Task/Input/video_sample1.mp4"
output_path = main(video_path)

from IPython.display import HTML
from base64 import b64encode

def display_video(path):
    mp4 = open(path, 'rb').read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML(f"""
    <video width=600 controls>
        <source src="{data_url}" type="video/mp4">
    </video>
    """)

display_video(output_path)



0: 384x640 24 persons, 1 handbag, 50.6ms
Speed: 2.3ms preprocess, 50.6ms inference, 53.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 29 persons, 4 handbags, 11.6ms
Speed: 1.6ms preprocess, 11.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 29 persons, 2 handbags, 18.2ms
Speed: 2.8ms preprocess, 18.2ms inference, 3.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 29 persons, 3 handbags, 10.8ms
Speed: 1.7ms preprocess, 10.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 31 persons, 3 handbags, 13.9ms
Speed: 2.1ms preprocess, 13.9ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 31 persons, 1 handbag, 14.0ms
Speed: 1.6ms preprocess, 14.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 30 persons, 2 handbags, 13.6ms
Speed: 2.6ms preprocess, 13.6ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 30 person

In [None]:
def process_rtsp_stream(rtsp_url):
    cap = cv2.VideoCapture(rtsp_url)
    active_face_ids = set()
    all_unique_ids = set()

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0:
        fps = 20  # Default FPS if stream does not provide it

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_file = "/content/processed_output_live.mp4"
    out = cv2.VideoWriter(output_file, fourcc, fps, (frame_width, frame_height))

    detection_interval = 3
    input_width, input_height = 640, 360

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Stream ended or no frame received.")
            break
        frame_count += 1

        if frame_count % detection_interval == 0:
            small_frame = cv2.resize(frame, (input_width, input_height))
            results = yolo_model(small_frame)
            result = results[0]

            faces_bboxes = []
            for bbox_tensor, conf, cls in zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls):
                x1, y1, x2, y2 = bbox_tensor.cpu().numpy()
                x1 = int(x1 * frame.shape[1] / input_width)
                x2 = int(x2 * frame.shape[1] / input_width)
                y1 = int(y1 * frame.shape[0] / input_height)
                y2 = int(y2 * frame.shape[0] / input_height)
                conf = conf.cpu().item()
                cls = int(cls.cpu().item())
                if cls == 0 and conf > 0.5:
                    faces_bboxes.append([x1, y1, x2, y2])

            current_face_ids = set()
            for bbox in faces_bboxes:
                x1, y1, x2, y2 = map(int, bbox)
                face_crop = frame[y1:y2, x1:x2]
                insightface_results = face_app.get(face_crop)
                if insightface_results:
                    face_embedding = insightface_results[0].embedding
                    identity_id = register_or_identify(face_embedding, bbox=bbox, image=frame, event_type='entry')
                    current_face_ids.add(identity_id)
                    all_unique_ids.add(identity_id)
                    draw_face(frame, bbox, identity_id)

            exited_ids = active_face_ids - current_face_ids
            for ex_id in exited_ids:
                log_event('exit', ex_id)

            active_face_ids = current_face_ids
        else:
            # Optionally update tracker without detection here if applicable
            pass

        draw_count(frame, len(active_face_ids))
        out.write(frame)

    cap.release()
    out.release()

    print("Total unique visitors detected in live stream:", len(all_unique_ids))
    return output_file


In [None]:
rtsp_url = "rtsp://username:password@ipaddress:port/stream"
output_path = process_rtsp_stream(rtsp_url)
