In [1]:
# !pip install ultralytics opencv-python pyttsx3

In [2]:
import cv2
from ultralytics import YOLO
import pyttsx3
import threading
import time

In [3]:
print("Loading YOLOv8 model...")
# Load the YOLOv8 'nano' model because it's the fastest
model = YOLO('yolov8n.pt')
print("Model loaded.")

print("Initializing Text-to-Speech engine...")
# Initialize the Text-to-Speech (TTS) engine
tts_engine = pyttsx3.init()
print("TTS engine ready.")

Loading YOLOv8 model...
Model loaded.
Initializing Text-to-Speech engine...
TTS engine ready.


In [4]:
video_path = 'sample.mp4'
cap = cv2.VideoCapture(video_path)

In [None]:
def say_alert(text_to_say):
    """Function to say text and manage speaking state."""
    global is_speaking 
    try:
        tts_engine.say(text_to_say)
        tts_engine.runAndWait()
        tts_engine.setProperty('rate', 500)
        tts_engine.setProperty('volumne', 1.0)

    except Exception as e:
        print(f"Error in TTS: {e}")
    finally:
        is_speaking = False 

In [None]:
last_car_area = 0
alert_cooldown = 1  
last_alert_time = 0
is_speaking = False 

In [None]:
while cap.isOpened():
    success, frame = cap.read()

    if not success:
        print("Video finished or failed to read a frame.")
        break

    results = model(frame)

    annotated_frame = results[0].plot()

    person_boxes = []
    car_boxes = []

    for box in results[0].boxes:
        class_id = int(box.cls)
        if class_id == 0: #Person in COCO dataset
            person_boxes.append(box)
        elif class_id == 2: # Car in COCO dataset
            car_boxes.append(box)
        elif class_id == 7: #Truck in COCO dataset
            car_boxes.append(box)

    is_danger = False

    # Check for danger only if both a person and a car are detected
    if person_boxes and car_boxes:
        # For simplicity, we'll focus on the first detected person and car
        person_box = person_boxes[0].xyxy[0]  # Coordinates [x1, y1, x2, y2]
        car_box = car_boxes[0].xyxy[0]

        person_center_x = (person_box[0] + person_box[2]) / 2
        car_center_x = (car_box[0] + car_box[2]) / 2
        current_car_area = (car_box[2] - car_box[0]) * (car_box[3] - car_box[1])

 
        is_incoming = current_car_area > (last_car_area * 1.10)  #10% buffer for noise

        path_danger = abs(person_center_x - car_center_x) < 200 

        is_close = current_car_area > 35000 

        if is_incoming and path_danger and is_close:
            is_danger = True

        last_car_area = current_car_area
    else:
        last_car_area = 0

    if is_danger:
        cv2.putText(
            annotated_frame, 
            "DANGER! COLLISION IMMINENT!", 
            (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 
            2, (0, 0, 255), 3, cv2.LINE_AA
        )
        
        current_time = time.time()
        
        if (current_time - last_alert_time) > alert_cooldown and not is_speaking:
            
            is_speaking = True
            
            print("DANGER DETECTED! Triggering voice alert.")
            last_alert_time = current_time
            
            threading.Thread(
                target=say_alert, 
                args=("Collision Imminent!",),
                daemon=True
            ).start()

    cv2.imshow("YOLOv8 Danger Detection", annotated_frame)


    if cv2.waitKey(1) & 0xFF == ord('q'):
        print("'q' key pressed. Exiting...")
        break


0: 640x512 2 persons, 3 cars, 2 motorcycles, 1 truck, 1 bed, 121.4ms
Speed: 5.6ms preprocess, 121.4ms inference, 9.6ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 4 cars, 1 motorcycle, 1 truck, 1 bed, 10.9ms
Speed: 5.1ms preprocess, 10.9ms inference, 8.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 3 cars, 1 motorcycle, 1 bed, 11.4ms
Speed: 5.2ms preprocess, 11.4ms inference, 7.5ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 4 cars, 3 motorcycles, 1 bed, 12.7ms
Speed: 4.9ms preprocess, 12.7ms inference, 9.9ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 3 cars, 3 motorcycles, 1 truck, 1 bed, 12.5ms
Speed: 4.2ms preprocess, 12.5ms inference, 9.0ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 3 cars, 3 motorcycles, 1 bed, 12.9ms
Speed: 3.9ms preprocess, 12.9ms inference, 8.9ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 3 cars, 3 mo

In [8]:
print("Cleaning up resources...")
cap.release()
cv2.destroyAllWindows()
print("Script finished.")

Cleaning up resources...
Script finished.


In [9]:
# import pyttsx3

# print("Initializing TTS engine...")
# try:
#     engine = pyttsx3.init()
# except Exception as e:
#     print(f"Error initializing engine: {e}")
#     exit()

# print("Engine initialized.")

# engine.setProperty('rate', 150)  # Speed of speech
# engine.setProperty('volume', 1.0) 

# print("Attempting to speak...")
# engine.say("Hello, this is a sound test. Can you hear me?")

# try:
#     engine.runAndWait()
#     print("Speech finished.")
# except Exception as e:
#     print(f"Error during runAndWait: {e}")

# print("Test complete.")