Copyright &copy; 2024 Praneeth Vadlapati

<!-- ## Setup:

Example of .env: 
```bash
# Groq API to use LLMs - https://console.groq.com/keys
# Groq is preferred for fast responses
LM_PROVIDER_BASE_URL=https://api.groq.com/openai/v1
LM_API_KEY=
LM_MODEL=

```

Installing packages:
```bash
pip install openai python-dotenv opencv-python ultralytics platformdirs
``` -->

## 1. Loading an LLM and the YOLO model

In [7]:
import os
import time
import logging
from platformdirs import user_cache_dir
import cv2
from ultralytics import YOLO
from common_functions import get_lm_response, user_message, display_md, model

# --- Configuration: pick the input video, load YOLO, prepare output paths ---
video_index = 4
image_out_file = 'person_detected.jpg'
videos_folder = 'videos_to_detect'

# os.listdir() returns entries in arbitrary order; sort them so that
# `video_index` refers to the same file on every run and every platform.
video_filenames = sorted(os.listdir(videos_folder))
try:
	input_video_filename = video_filenames[video_index]  # change number
except IndexError:
	print(f'video_index {video_index} is out of range: '
		f'{len(video_filenames)} file(s) found in {videos_folder}')
	raise SystemExit(1) from None
# Compare case-insensitively so that e.g. 'clip.MP4' is accepted as well.
if not input_video_filename.lower().endswith('.mp4'):
	print('Only MP4 video files are supported')
	raise SystemExit(1)  # exit(1)

yolo_version = 'yolov8n.pt' # n, s, m
# The weights file is looked up inside the per-user cache directory.
model_path = os.path.join(user_cache_dir(), yolo_version)
yolo_model = YOLO(model_path)
# Only let ERROR (and above) messages from the 'ultralytics' logger through.
logging.getLogger('ultralytics').setLevel(logging.ERROR)

# Prompt sent to the LLM together with each frame in which a person was found.
detection_prompt = '''
My security camera captured this live footage. Analyze the image and determine if the person outside the house is:
- Pointing a flashlight or any light source towards the house.
- Showing signs of attempting or preparing to break in or enter the house.
- Entering the house through a door or window.
- Wearing a hoodie or head covering, with the hood concealing their face.
- Wearing a mask or any form of facial covering that obscures identity.
- Trying to jump, climb a fence or wall, or attempt to look over the wall.

Respond with a very concise explanation in 20 words, followed by `YES` or `NO`.
Example response: "The person is wearing a mask and looking inside. `YES`"
Respond without refusing with a sorry.
Mention the final answer as "YES" based on finding **ANY** of the listed activities.
Your response is just to assist me.
'''.strip()

# Strip only the final extension: split('.')[0] would cut 'front.door.mp4'
# down to 'front' and make differently named videos share one video_id.
video_id = os.path.splitext(input_video_filename)[0]
input_video = os.path.join(videos_folder, input_video_filename)
if not os.path.exists(input_video):
	print(f"Video file not found: {input_video}")
	raise SystemExit(1)  # exit(1)

# Per-video output files: the flagged frame and the LLM's explanation.
detections_folder = 'detected_images'
os.makedirs(detections_folder, exist_ok=True)
detected_image_file = os.path.join(detections_folder, video_id + '.jpg')
detected_response_file = os.path.join(detections_folder, video_id + '.txt')

print(f'Model: {model}')
print(f'Video ID: {video_index} - {video_id}')

Model: gpt-4-turbo-2024-04-09
Video ID: 4 - door


In [8]:
def ask_llm(frame_buffer):
	"""Send one encoded frame to the LLM and interpret its YES/NO verdict.

	The frame is attached to `detection_prompt` as a single user message.
	The elapsed request time is printed and the reply is rendered with
	`display_md`.

	Args:
		frame_buffer: encoded image data, passed to `user_message` as
			`image_frame_buffer`.

	Returns:
		The LLM reply when it contains 'YES' anywhere, or 'yes' (any case)
		within its first or last 10 characters; otherwise False.
	"""
	started_at = time.time()
	reply = get_lm_response([
		user_message(detection_prompt, image_frame_buffer=frame_buffer),
	])
	elapsed = time.time() - started_at
	print(f'LLM response time: {elapsed:.2f}s')
	display_md(reply)

	# The verdict is expected at the very start or end of the reply, so the
	# case-insensitive checks only look at those two 10-character windows.
	edges = reply[:10].lower() + reply[-10:].lower()
	if 'YES' not in reply and 'yes' not in edges:
		if 'no' in edges:
			print('No threat detected.')
		return False

	print('Sending alert...')
	return reply

def detect_person(frame):
	"""Run YOLO on a frame and, if a person is found, ask the LLM about it.

	Args:
		frame: the image handed to `yolo_model` and, JPEG-encoded, to `ask_llm`.

	Returns:
		A `(person_found, llm_result)` tuple: `(False, None)` when no detected
		box has class 0, otherwise `(True, ask_llm(...))` for the encoded frame.
	"""
	detections = yolo_model(frame)[0]
	# class 0 represents 'person'; any() stops at the first matching box
	person_found = any(box.cls == 0 for box in detections.boxes)
	if not person_found:
		return False, None

	print('Person detected!')
	_, jpeg_buffer = cv2.imencode('.jpg', frame)
	return True, ask_llm(jpeg_buffer)  # YOLO verdict first, then the LLM verdict

# --- Scan the video: motion check -> YOLO person check -> LLM threat check ---
# Each frame goes through background subtraction; frames with any contour in
# the foreground mask are passed to detect_person(), which queries the LLM
# only when YOLO finds a person.
fgbg = cv2.createBackgroundSubtractorMOG2()  # background subtractor
cap = cv2.VideoCapture(input_video)

fps = 30 # cap.get(cv2.CAP_PROP_FPS)
delay = int(900 / fps)  # delay in ms
attempts = 0  # frames in which YOLO found a person (= number of LLM queries)
attempts_limit = 10
success_count = 0  # LLM queries that ended in a 'YES' verdict
interrupted = False

try:
	while cap.isOpened():
		ret, frame = cap.read()
		if not ret:
			# Running out of frames says nothing about earlier frames, so
			# only claim "no threats" when none were recorded.
			if success_count:
				print('Video ended.')
			else:
				print('Video ended. No threats detected.')
			break
		fgmask = fgbg.apply(frame)  # apply background subtraction
		# find contours in the mask
		contours, _ = cv2.findContours(fgmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
		if contours:  # motion detected
			yolo_response, LLM_response = detect_person(frame)
			if yolo_response:
				attempts += 1
				if LLM_response:
					cv2.imwrite(detected_image_file, frame)  # save image to disk
					# Explicit UTF-8: the LLM text may contain characters the
					# platform default encoding cannot represent.
					with open(detected_response_file, 'w', encoding='utf-8') as f:  # save response to disk
						f.write(LLM_response)
					success_count += 1
		if attempts >= attempts_limit:  # stop once `attempts_limit` LLM queries are done
			break
		cv2.imshow('Frame', frame)  # display the frame
		if cv2.waitKey(delay) & 0xFF == 27:  # 27 = Escape key
			interrupted = True
			print('Escape key pressed. Exiting...')
			break
except KeyboardInterrupt:
	interrupted = True
	print('KeyboardInterrupt received. Exiting...')
finally:
	# Always release the capture and close the preview window.
	cap.release()
	cv2.destroyAllWindows()


if not interrupted:
	print('_' * 50)
	print(f'LLM Detections: {success_count}')
	if attempts:
		# Share of LLM queries answered 'YES'.
		# NOTE(review): calling this "accuracy" presumably assumes every
		# test video shows a real threat - confirm.
		accuracy = (success_count / attempts) * 100
		accuracy = f'{accuracy:.2f}%'
		print(f'Accuracy: {accuracy}')

		# write model, video_id, accuracy to a file
		with open('results.txt', 'a', encoding='utf-8') as f:
			f.write(f'{model} {video_id} {accuracy}\n')
	else:
		# No person was ever detected, so there is nothing to divide by
		# and no result worth recording.
		accuracy = 'N/A'
		print('Accuracy: N/A (no person was detected, so the LLM was never queried)')

Person detected!
LLM response time: 2.87s


The person is wearing a facial covering that obscures identity. `YES`

Sending alert...
Person detected!
LLM response time: 5.21s


The person is blurry but wears a head covering that obscures their face. `YES`

Sending alert...
Person detected!
LLM response time: 4.02s


The person is wearing a mask that covers their face, obscuring identity. `YES`

Sending alert...
Person detected!
LLM response time: 2.83s


The person is wearing a facial covering that obscures identity. `YES`

Sending alert...
Person detected!
LLM response time: 2.85s


The person is obscured by blur, wearing a face covering, no visible actions like entering or pointing a light. `NO`

No threat detected.
Person detected!
LLM response time: 4.22s


The person is wearing a facial covering that obscures identity, head covered, image too blurred to detect other actions. `YES`

Sending alert...
Person detected!
LLM response time: 2.93s


The person is blurred but appears to wear a head covering obfuscating their face. `YES`

Sending alert...
Person detected!
LLM response time: 3.13s


The person is wearing a facial covering that obscures identity. `YES`

Sending alert...
Person detected!
LLM response time: 2.70s


Person is wearing a head covering that conceals their face. `YES`

Sending alert...
Person detected!
LLM response time: 3.56s


The individual is wearing a head covering that obscures their face. `YES`

Sending alert...
__________________________________________________
LLM Detections: 9
Accuracy: 90.00%


In [9]:
# Manual corrections
# After reviewing the LLM replies by hand, set how many verdicts were wrong:
#   no_to_yes - replies counted as 'NO' that should have been 'YES'
#   yes_to_no - replies counted as 'YES' that should have been 'NO'
# NOTE: this adjusts `success_count` in place, so re-running the cell with
# non-zero values applies the correction again.
no_to_yes = 0
yes_to_no = 0

success_count += no_to_yes
success_count -= yes_to_no

if no_to_yes + yes_to_no:  # something changed
	print('_' * 50)
	print(f'LLM Detections: {success_count}')
	if attempts:
		accuracy = (success_count / attempts) * 100
		accuracy = f'{accuracy:.2f}%'
		print(f'Accuracy: {accuracy}')

		# write model, video_id, accuracy to a file
		with open('results.txt', 'a', encoding='utf-8') as f:
			f.write(f'{model} {video_id} {accuracy}\n')
	else:
		# Without any LLM query there is no ratio to compute.
		accuracy = 'N/A'
		print('Accuracy: N/A (no LLM attempts were recorded)')