In [1]:
import cv2
import pyttsx3
import time
import speech_recognition as sr
from ultralytics import YOLO

# Initialize YOLO model
# The 'yolov8n.pt' weights are loaded once here; the detection loop calls
# this same model object on every captured frame.
model = YOLO('yolov8n.pt')

# Initialize camera
# Opens capture device index 0 — presumably the default webcam; confirm on
# machines with more than one camera.
camera = cv2.VideoCapture(0)

# Initialize text-to-speech engine
# Used by the detection loop to speak each detection aloud.
engine = pyttsx3.init()

# Initialize speech recognition
# Shared recognizer instance used by recognize_speech().
recognizer = sr.Recognizer()

# Constants
# Reference measurements handed to find_distance() for every detection.
# NOTE(review): the units are not stated anywhere in this file, while the
# spoken message reports the result in "metres" — confirm which units these
# values were measured in.
Known_distance = 74.2
Known_width = 14.3

# Listen on the microphone and turn one utterance into a lower-case command
def recognize_speech():
    """Record one phrase from the microphone and return it as lower-case text.

    The audio is captured with the module-level ``recognizer`` and sent to
    its ``recognize_google`` method for transcription.

    Returns:
        The recognised phrase converted to lower case, or ``None`` when the
        audio could not be understood or the recognition request failed
        (a message is printed in both failure cases).
    """
    with sr.Microphone() as mic:
        print("Listening...")
        captured = recognizer.listen(mic)

    try:
        text = recognizer.recognize_google(captured).lower()
    except sr.UnknownValueError:
        print("Sorry, could not understand audio.")
        return None
    except sr.RequestError as err:
        print("Error occurred; {0}".format(err))
        return None
    else:
        print("Recognized:", text)
        return text

# Estimate an object's distance from its apparent width in the frame
def find_distance(known_distance, known_width, object_width):
    """Return a distance estimate derived from an object's apparent width.

    The value computed is
    ``known_distance * ((known_distance * known_width) / object_width) / object_width``.

    NOTE(review): the intermediate "focal length" is derived from the same
    ``object_width`` that is being measured rather than from a separate
    reference measurement, so the result scales with ``1 / object_width**2``.
    This looks unintended for a triangle-similarity estimate — confirm the
    intended formula before relying on the absolute values.

    Args:
        known_distance: Reference distance used for calibration.
        known_width: Reference width of the object.
        object_width: Apparent width of the detected object; must be
            non-zero, otherwise the division fails.

    Returns:
        The computed distance estimate.
    """
    reference_product = known_distance * known_width
    apparent_focal = reference_product / object_width
    return known_distance * apparent_focal / object_width

# Main loop: detect objects in each camera frame, announce them aloud, then
# wait for a voice command before moving on to the next frame.
# The loop ends on the spoken command "stop detect" or when "q" is pressed in
# the preview window. The camera and windows are released on every exit path,
# including exceptions and keyboard interrupts.
try:
    while True:
        success, frame = camera.read()
        if not success:
            # A device that never opened (or has gone away) cannot recover;
            # leave the loop instead of spinning forever on failed reads.
            if not camera.isOpened():
                print("Camera is not available.")
                break
            continue

        # save=True makes the model also write its result to disk (see the
        # "Results saved to ..." lines it prints).
        results = model(source=frame, conf=0.4, save=True)
        for result in results:
            boxes = result.boxes.cpu().numpy()
            for box in boxes:
                object_name = result.names[int(box.cls[0])]
                x1, y1, x2, y2 = box.xyxy[0]
                object_width = float(x2 - x1)
                if object_width <= 0:
                    # Degenerate box: find_distance() would divide by zero.
                    continue
                distance = round(
                    find_distance(Known_distance, Known_width, object_width)
                )
                # NOTE(review): the message says "metres" but the units of
                # the calibration constants are not stated — confirm.
                engine.say(f"I see a {object_name} at a distance of {distance} metres")
                engine.runAndWait()
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

        # Show the annotated frame on every iteration. waitKey() is what lets
        # the window repaint, so it must run regardless of the voice command.
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF

        command = recognize_speech()
        if command == "stop detect" or key == ord("q"):
            break
        # "detect", any other phrase, or no phrase at all: process next frame.
finally:
    camera.release()
    cv2.destroyAllWindows()


0: 480x640 1 person, 1 laptop, 167.8ms
Speed: 7.7ms preprocess, 167.8ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)
Results saved to runs\detect\predict50
Listening...
Sorry, could not understand audio.

0: 480x640 2 persons, 1 refrigerator, 137.6ms
Speed: 4.0ms preprocess, 137.6ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)
Results saved to runs\detect\predict50
Listening...
Recognized: detect

0: 480x640 1 person, 1 laptop, 114.5ms
Speed: 2.0ms preprocess, 114.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Results saved to runs\detect\predict50
Listening...
Recognized: stop detect
