# Проектная работа: анализ посещаемости магазинов с помощью нейронных сетей в компьютерном зрении

## 1. Импорт зависимостей

In [None]:
import json
import random
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

import cv2 as cv
import torch
from ultralytics import YOLO

## 2. Общие конфигурации и настройки

In [None]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Seed the stdlib, NumPy and PyTorch generators with the same fixed value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.random.manual_seed):
    _seed_rng(SEED)

# Opt in to TF32 for both the CUDA matmul and the cuDNN backends.
for _backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _backend.allow_tf32 = True

In [None]:
# Explicit CPU device, used as the default by code that must not depend on a GPU.
FORCE_CPU = torch.device("cpu")

# Prefer CUDA whenever PyTorch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Используемое устройство: {device}")

## 3. Подготовка данных


### 3.1. Конфигурация и схема данных

In [None]:
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a subdirectory of the project.
CUR_DIR = Path.cwd()
PROJECT_DIR = CUR_DIR.parent

DATA_DIR = PROJECT_DIR / "data"
MODEL_DIR = PROJECT_DIR / "models"

# Create both directories up front. The loop variable deliberately avoids the
# name `dir`, which would shadow the builtin for the rest of the session.
for directory in (DATA_DIR, MODEL_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Expected frame size of the input videos — presumably pixels; TODO confirm
# the (width, height) order against the actual footage.
VIDEOS_SHAPE = (720, 720)
# Frame rate assumed for the input videos; the detection loop derives its
# frame-sampling stride from this value.
VIDEOS_FPS = 29.0

# Size frames are resized to before being passed to the model
# (default `new_size` of preprocess_frame).
MODEL_INPUT_SHAPE = (640, 640)

In [None]:
class Shop:
    """A shop region on the video frame together with its visit counter.

    Args:
        name: Shop name; used only for display in ``__repr__`` here.
        pts: Sequence of ``(x, y)`` polygon vertices outlining the shop.

    Attributes are set as follows: ``name`` is stored as given, ``pts`` is the
    vertex array, and ``visit_count`` starts at zero.
    """

    def __init__(self, name, pts):
        self.name = name
        # OpenCV drawing functions such as cv.polylines only accept 32-bit
        # integer points; a plain np.array() of Python ints yields int64 on
        # most platforms and is rejected at draw time.
        self.pts = np.array(pts, dtype=np.int32)
        self.visit_count = 0

    def __repr__(self):
        return f"Shop {self.name} (pts={self.pts})"


def draw_shop(img, shop):
    """Draw the closed outline of ``shop.pts`` onto ``img`` in place.

    The outline is green and 3 pixels thick.
    """
    outline_color = (0, 255, 0)
    outline_thickness = 3
    cv.polylines(
        img,
        [shop.pts],
        isClosed=True,
        color=outline_color,
        thickness=outline_thickness,
    )

In [None]:
# Paths of the four input videos: input_1.mp4 ... input_4.mp4 inside DATA_DIR.
input_videos_filenames = [
    DATA_DIR / f"input_{video_no}.mp4" for video_no in range(1, 5)
]

In [None]:
# JSON file with the shop definitions, stored next to the input videos.
SHOPS_CONFIG_FILE = DATA_DIR.joinpath("shops_config.json")

### 3.2. Экземпляры данных

In [None]:
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# shop names decode the same way regardless of the platform's locale default.
with open(SHOPS_CONFIG_FILE, "r", encoding="utf-8") as file:
    shops_configs = json.load(file)

# Every config entry is expanded into Shop keyword arguments.
shops = [Shop(**shop_cfg) for shop_cfg in shops_configs]

In [None]:
# Sanity check: open one video and draw the configured shop outlines on its
# first frame to verify that the configuration is usable and correct.

ex_idx = 0

input_video_filename = input_videos_filenames[ex_idx]

cap = cv.VideoCapture(str(input_video_filename))

if not cap.isOpened():
    print("Error opening video file")
else:
    try:
        ret, frame = cap.read()

        if not ret:
            # A failed read yields no usable frame; drawing on it would crash.
            print("Error reading frame from video file")
        else:
            for shop in shops:
                draw_shop(frame, shop)
            cv.imshow("Video Frame", frame)

            # Keep the preview window open until any key is pressed.
            cv.waitKey(0)
    finally:
        # Release the capture and close the window even if drawing fails.
        cap.release()
        cv.destroyAllWindows()

## 4. Нейросетевая модель

In [None]:
# Weights file loaded by default by PeopleDetectionYOLO.
MODEL_FILE = MODEL_DIR.joinpath("yolo11s.pt")

In [None]:
class PeopleDetectionYOLO:
    """Wrapper that loads a YOLO model and runs it without gradient tracking.

    Args:
        model_file: Weights file handed to ``YOLO``.
        device: Device the loaded model is moved to.
    """

    def __init__(self, model_file=MODEL_FILE, device=FORCE_CPU):
        yolo = YOLO(model_file)
        yolo = yolo.to(device)
        # Switch to evaluation mode; the wrapper is used for inference only.
        self.model = yolo.eval()

    def predict(self, img_tensor):
        """Run the model on ``img_tensor`` and return its raw output."""
        with torch.no_grad():
            predictions = self.model(img_tensor)
        return predictions

In [None]:
# Build the detector on the globally selected `device` rather than on the
# constructor's CPU default.
model = PeopleDetectionYOLO(device=device)

## 5. Подсчёт посетителей на видео

### 5.1. Методы и структуры данных для обработки видео и обнаружения объектов

In [None]:
def preprocess_frame(frame, new_size=MODEL_INPUT_SHAPE):
    """Convert a BGR video frame into a model-ready float tensor.

    The frame is resized to ``new_size``, converted from BGR to RGB, scaled
    to the ``[0, 1]`` range and rearranged from height-width-channel layout
    into a single-item batch of shape ``(1, channels, height, width)``.

    Args:
        frame: Image array in BGR channel order, as returned by OpenCV.
        new_size: Target size passed to ``cv.resize``.

    Returns:
        A ``torch.float32`` tensor with a leading batch dimension of 1.
    """
    rgb = cv.cvtColor(cv.resize(frame, new_size), cv.COLOR_BGR2RGB)
    scaled = torch.tensor(rgb, dtype=torch.float32) / 255.0
    # HWC -> CHW, then prepend the batch axis.
    return scaled.permute(2, 0, 1).unsqueeze(0)

In [None]:
class Boxes:
    """Minimal container pairing detection confidences with their boxes.

    Args:
        conf: Confidence values of the detections.
        xywhn: Box coordinates stored alongside ``conf``; the name suggests
            normalized ``x, y, width, height`` — TODO confirm with producer.
    """

    def __init__(self, conf, xywhn):
        self.conf, self.xywhn = conf, xywhn

    def __repr__(self):
        return "Boxes(conf={}, xywhn={})".format(self.conf, self.xywhn)


def filter_boxes(boxes, thresh=0.5):
    """Keep only class-0 detections whose confidence exceeds ``thresh``.

    Args:
        boxes: Detection result exposing ``cls``, ``conf`` and ``xywhn``.
        thresh: Exclusive lower bound on the confidence.

    Returns:
        A ``Boxes`` instance holding the ``conf`` and ``xywhn`` entries that
        passed both checks.
    """
    # Class id 0 is presumably "person" in the model's label set — TODO confirm.
    is_target_class = boxes.cls == 0
    is_confident = boxes.conf > thresh
    keep = is_target_class & is_confident

    return Boxes(conf=boxes.conf[keep], xywhn=boxes.xywhn[keep])

In [None]:
def draw_bboxes(img, boxes):
    """Draw bounding boxes with confidence labels onto ``img`` in place.

    Args:
        img: Image array of shape ``(height, width, channels)``.
        boxes: Object exposing ``xywhn`` (rows of normalized
            ``x_center, y_center, width, height``) and ``conf`` tensors of
            matching length. The tensors may live on any device.
    """
    h, w, _ = img.shape

    # Tensors produced by a GPU model cannot be converted to NumPy directly,
    # so move them to the CPU once, before the loop.
    xywhn = boxes.xywhn.detach().cpu().numpy()
    confs = boxes.conf.detach().cpu().numpy()

    for (x_center, y_center, width, height), conf in zip(xywhn, confs):
        # Convert the normalized centre/size form to an absolute top-left corner.
        x_abs = int((x_center * w) - (width * w) / 2)
        y_abs = int((y_center * h) - (height * h) / 2)
        width_abs = int(width * w)
        height_abs = int(height * h)

        cv.rectangle(
            img, (x_abs, y_abs), (x_abs + width_abs, y_abs + height_abs), (0, 0, 255), 1
        )

        text = f"{conf:.2f}"
        text_size = cv.getTextSize(text, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]

        # Place the label just inside the box's top-left corner.
        text_x, text_y = x_abs + 3, y_abs + 14

        # Filled background behind the label so the white text stays readable.
        cv.rectangle(
            img,
            (text_x - 1, text_y - text_size[1] - 2),
            (text_x + text_size[0] + 1, text_y + 2),
            (0, 0, 255),
            cv.FILLED,
        )

        cv.putText(
            img,
            text,
            (text_x, text_y),
            cv.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 255, 255),
            1,
        )

### 5.2. Запуск процесса

In [None]:
ex_idx = 1
thresh = 0.2

# Only every `frame_stride`-th frame is run through the detector and shown:
# roughly two per second if VIDEOS_FPS matches the real frame rate. Computed
# once as an integer and clamped to 1 to avoid a zero modulus for FPS < 2.
frame_stride = max(1, int(VIDEOS_FPS // 2))

input_video_filename = input_videos_filenames[ex_idx]

cap = cv.VideoCapture(str(input_video_filename))

if not cap.isOpened():
    print("Ошибка при открытии видео файла")
else:
    frame_count = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream or read failure.
                break

            frame_count += 1

            if frame_count % frame_stride != 0:
                continue

            frame_tensor = preprocess_frame(frame).to(device)
            boxes = model.predict(frame_tensor)[0].boxes
            filtered_boxes = filter_boxes(boxes, thresh=thresh)

            for shop in shops:
                draw_shop(frame, shop)

            draw_bboxes(frame, filtered_boxes)
            cv.imshow("YOLO", frame)

            # Pressing "q" stops playback early.
            if cv.waitKey(10) & 0xFF == ord("q"):
                break
    finally:
        # Release the capture and close the window even if processing fails.
        cap.release()
        cv.destroyAllWindows()