In [2]:
# !pip install pyttsx3

In [3]:
import cv2
import numpy as np
import torch
import pyttsx3
import time

# ----------------------
# Configuration
# ----------------------
# YOLOv5n model (pretrained on COCO)
model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
model.conf = 0.25  # confidence threshold
model.iou = 0.45   # NMS IoU threshold
model.to('cpu')    # force CPU

# Text-to-speech engine
tts_engine = pyttsx3.init()
tts_engine.setProperty('rate', 150)

# Door detection parameters
MIN_DOOR_AREA = 5000
ASPECT_RATIO_RANGE = (1.8, 2.5)  # height/width


Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to C:\Users\rbris/.cache\torch\hub\master.zip


Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\rbris\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


YOLOv5  2025-4-18 Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n.pt to yolov5n.pt...
100%|██████████| 3.87M/3.87M [00:00<00:00, 11.0MB/s]

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 


In [4]:

# ----------------------
# Detection Functions
# ----------------------

def detect_doors(frame):
    """
    Heuristic: find large rectangular contours with aspect ratio ~2:1.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)
    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    door_boxes = []
    for cnt in contours:
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
        if len(approx) == 4 and cv2.isContourConvex(approx):
            x, y, w, h = cv2.boundingRect(approx)
            area = w * h
            aspect = (h / w) if w > 0 else 0
            if area > MIN_DOOR_AREA and ASPECT_RATIO_RANGE[0] < aspect < ASPECT_RATIO_RANGE[1]:
                door_boxes.append((x, y, w, h))
    return door_boxes


def detect_lanes(frame):
    """
    Canny + Hough transform on a region of interest to detect lane lines.
    """
    h, w = frame.shape[:2]
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)

    # Region of interest mask (bottom half)
    mask = np.zeros_like(edges)
    polygon = np.array([[(0, h), (w, h), (w, int(h * 0.6)), (0, int(h * 0.6))]], dtype=np.int32)
    cv2.fillPoly(mask, polygon, 255)
    masked = cv2.bitwise_and(edges, mask)

    # HoughLinesP parameters
    lines = cv2.HoughLinesP(masked, 1, np.pi / 180, threshold=50,
                             minLineLength=100, maxLineGap=50)
    line_img = np.zeros_like(frame)
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            cv2.line(line_img, (x1, y1), (x2, y2), (0, 255, 0), 3)
    return line_img, lines


def detect_footpath(frame):
    """
    Simple heuristic: detect multiple nearly-horizontal lines in bottom region.
    """
    h, w = frame.shape[:2]
    roi = frame[int(h * 0.8):h, 0:w]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=30,
                             minLineLength=50, maxLineGap=20)
    count = 0
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            slope = abs((y2 - y1) / (x2 - x1 + 1e-6))
            if slope < 0.1:  # nearly horizontal
                count += 1
    if count > 10:
        # return True and bounding box of ROI relative to full frame
        return True, (0, int(h * 0.8), w, int(h * 0.2))
    return False, None


In [6]:

# ----------------------
# Main Loop
# ----------------------

def main():
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

    prev_time = time.time()
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # 1) YOLOv5n detection (draw bounding boxes)
        results = model(frame)
        for *box, conf, cls in results.xyxy[0].cpu().numpy():
            x1, y1, x2, y2 = map(int, box)
            label = results.names[int(cls)]
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 255, 0), 2)

        # 2) Door detection
        door_boxes = detect_doors(frame)
        for x, y, w, h in door_boxes:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, 'Door', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255, 0, 0), 2)
            tts_engine.say('Door ahead')
            tts_engine.runAndWait()

        # 3) Lane detection
        lane_img, _ = detect_lanes(frame)
        frame = cv2.addWeighted(frame, 0.8, lane_img, 1, 1)

        # 4) Footpath detection
        fp_flag, fp_box = detect_footpath(frame)
        if fp_flag:
            x, y, w, h = fp_box
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
            cv2.putText(frame, 'Footpath', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 0, 255), 2)
            tts_engine.say('Footpath')
            tts_engine.runAndWait()

        # 5) FPS display
        curr_time = time.time()
        fps = 1 / (curr_time - prev_time)
        prev_time = curr_time
        cv2.putText(frame, f'FPS: {fps:.1f}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 255), 2)

        cv2.imshow('AI Glasses View', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == '__main__':
    main()


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

In [None]:
import cv2
import numpy as np
import torch
import pyttsx3
import time

# ----------------------
# Configuration
# ----------------------
# YOLOv5n model (pretrained on COCO)
model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
model.conf = 0.25  # confidence threshold
model.iou = 0.45   # NMS IoU threshold
model.to('cpu')    # force CPU

# Text-to-speech engine
tts_engine = pyttsx3.init()
tts_engine.setProperty('rate', 140)

# Door detection heuristics
MIN_DOOR_AREA = 5000
ASPECT_RATIO_RANGE = (1.8, 2.5)  # height/width

# ----------------------
# Detection Functions
# ----------------------

def detect_doors(frame):
    """
    Heuristic: find large rectangular contours with aspect ratio ~2:1.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for cnt in contours:
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
        if len(approx) == 4 and cv2.isContourConvex(approx):
            x, y, w, h = cv2.boundingRect(approx)
            area = w * h
            aspect = h / (w + 1e-6)
            if area > MIN_DOOR_AREA and ASPECT_RATIO_RANGE[0] < aspect < ASPECT_RATIO_RANGE[1]:
                boxes.append((x, y, w, h))
    return boxes


def detect_lanes(frame):
    """
    Canny + Hough on bottom region to detect lane lines.
    """
    h, w = frame.shape[:2]
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    # ROI: lower half
    mask = np.zeros_like(edges)
    roi = np.array([[(0, h), (w, h), (w, h//2), (0, h//2)]], dtype=np.int32)
    cv2.fillPoly(mask, roi, 255)
    lines = cv2.HoughLinesP(mask & edges, 1, np.pi/180, 50, minLineLength=80, maxLineGap=50)
    line_img = np.zeros_like(frame)
    return line_img, lines


def detect_footpath(frame):
    """
    Detect footpath by counting horizontal line segments.
    """
    h, w = frame.shape[:2]
    roi = frame[int(h*0.75):h, :]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 30, minLineLength=50, maxLineGap=20)
    count = 0
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1,4):
            if abs((y2-y1)/(x2-x1+1e-6)) < 0.1:
                count += 1
    if count > 8:
        return True, (0, int(h*0.75), w, int(h*0.25))
    return False, None


def analyze_navigation(frame, door_boxes, yolo_boxes, lanes, fp_flag, fp_box):
    """
    Provide simple audio cues: obstacle, path clear, direction hints.
    """
    h, w = frame.shape[:2]
    cx_min, cx_max = w//3, 2*w//3
    cy_min, cy_max = h//2, h
    obstacle = False
    # YOLO obstacles
    for x1,y1,x2,y2,label in yolo_boxes:
        if label in ['person','car','bicycle','motorcycle','bus','truck']:
            if x2 > cx_min and x1 < cx_max and y2 > cy_min:
                obstacle = True
                break
    # door obstacles
    if not obstacle:
        for x,y,ww,hh in door_boxes:
            if x+ww > cx_min and x < cx_max and y+hh > cy_min:
                obstacle = True
                break
    # give basic cue
    if obstacle:
        tts_engine.say('Obstacle ahead')
    else:
        tts_engine.say('Path clear')
    # footpath cue
    if fp_flag:
        fx,fy,fw,fh = fp_box
        center = fx + fw/2
        if center > w/2:
            tts_engine.say('Footpath on right')
        else:
            tts_engine.say('Footpath on left')
    # lane turn suggestion
    if lanes is not None:
        slopes = []
        for x1,y1,x2,y2 in lanes.reshape(-1,4):
            if abs(x2-x1) > 5:
                slopes.append((y2-y1)/(x2-x1))
        if slopes:
            avg_s = np.mean(slopes)
            if avg_s > 0.2:
                tts_engine.say('Turn right')
            elif avg_s < -0.2:
                tts_engine.say('Turn left')
    tts_engine.runAndWait()

# ----------------------
# Main Loop
# ----------------------

def main():
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    prev = time.time()
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # YOLO detection
        results = model(frame)
        yolo_boxes = []
        for *b, conf, cls in results.xyxy[0].cpu().numpy():
            x1,y1,x2,y2 = map(int, b)
            label = results.names[int(cls)]
            yolo_boxes.append((x1,y1,x2,y2,label))
            cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0),2)
            cv2.putText(frame, label, (x1,y1-5), cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,255,0),1)
        # door
        doors = detect_doors(frame)
        for x,y,ww,hh in doors:
            cv2.rectangle(frame, (x,y), (x+ww,y+hh), (255,0,0),2)
            cv2.putText(frame, 'Door', (x,y-5), cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,0,0),1)
        # lanes
        lane_img, lanes = detect_lanes(frame)
        frame = cv2.addWeighted(frame,0.8,lane_img,1,1)
        # footpath
        fp_flag, fp_box = detect_footpath(frame)
        if fp_flag:
            x,y,ww,hh = fp_box
            cv2.rectangle(frame, (x,y),(x+ww,y+hh),(0,0,255),2)
            cv2.putText(frame,'Footpath',(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1)
        # navigation cues
        analyze_navigation(frame, doors, yolo_boxes, lanes, fp_flag, fp_box)
        # FPS
        now = time.time()
        fps = 1/(now-prev)
        prev = now
        cv2.putText(frame,f'FPS: {fps:.1f}',(10,30),cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,255),2)
        cv2.imshow('AI Glasses Prototype', frame)
        if cv2.waitKey(1)&0xFF==ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

if __name__=='__main__':
    main()


Using cache found in C:\Users\rbris/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-18 Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):


RuntimeError: run loop already started

: 

In [7]:
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt

# ----------------------
# Configuration
# ----------------------
# YOLOv5n model (pretrained on COCO)
model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
model.conf = 0.25  # confidence threshold
model.iou  = 0.45  # NMS IoU threshold
model.to('cpu')    # force CPU

# Heuristic footpath detection parameters
LINE_LENGTH_THRESHOLD = 100
LINE_GAP_THRESHOLD    = 20
HORIZONTAL_SLOPE_MAX  = 0.1
MIN_HORIZONTAL_LINES  = 5

def detect_footpath_refined(frame):
    h, w = frame.shape[:2]
    start_y = int(h * 0.85)
    roi = frame[start_y:h, :]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(
        edges, 
        rho=1, 
        theta=np.pi/180, 
        threshold=30,
        minLineLength=LINE_LENGTH_THRESHOLD, 
        maxLineGap=LINE_GAP_THRESHOLD
    )
    count = 0
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            if abs((y2 - y1) / (x2 - x1 + 1e-6)) < HORIZONTAL_SLOPE_MAX:
                count += 1
    if count >= MIN_HORIZONTAL_LINES:
        return True, (0, start_y, w, h - start_y), count
    return False, None, count

# ----------------------
# Load and process the image
# ----------------------
img_path = '../test_images/bars.png'
frame = cv2.imread(img_path)

# --- Debug check: image load ---
if frame is None:
    print("ERROR: could not load image at", img_path)
    exit(1)
else:
    print("Loaded image with shape:", frame.shape)

overlay = frame.copy()

# ----------------------
# 1) YOLO “sidewalk” detections
# ----------------------
results = model(frame)
sidewalk_boxes = []
for *box, conf, cls in results.xyxy[0].cpu().numpy():
    x1, y1, x2, y2 = map(int, box)
    label = results.names[int(cls)]
    if label == 'sidewalk':  # COCO class for footpath
        sidewalk_boxes.append((x1, y1, x2, y2))
        cv2.rectangle(overlay, (x1, y1), (x2, y2), (255, 0, 255), 3)
        cv2.putText(
            overlay, 
            'YOLO Footpath', 
            (x1, y1 - 10),
            cv2.FONT_HERSHEY_SIMPLEX, 
            0.8, 
            (255, 0, 255), 
            2
        )

print("YOLO sidewalk boxes:", sidewalk_boxes)

# ----------------------
# 2) Heuristic footpath
# ----------------------
fp_flag, fp_box, fp_count = detect_footpath_refined(frame)
print(f"Heuristic footpath detected? {fp_flag} – line count = {fp_count}")

if fp_flag:
    x, y, w_box, h_box = fp_box
    cv2.rectangle(overlay, (x, y), (x + w_box, y + h_box), (0, 0, 255), 3)
    cv2.putText(
        overlay, 
        f'Heuristic Footpath ({fp_count})',
        (x + 5, y - 10), 
        cv2.FONT_HERSHEY_SIMPLEX, 
        0.8, 
        (0, 0, 255), 
        2
    )

# ----------------------
# Save & display the overlay
# ----------------------
# Save for offline inspection
out_path = '../test_images/bars.png'
cv2.imwrite(out_path, overlay)
print("Wrote debug overlay to:", out_path)

# (Optional) display with matplotlib if you have an X server
overlay_rgb = cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(10, 6))
plt.imshow(overlay_rgb)
plt.axis('off')
plt.title('Combined YOLO & Heuristic Footpath Detection')
plt.show()


Using cache found in C:\Users\rbris/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-18 Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 


Loaded image with shape: (740, 932, 3)
YOLO sidewalk boxes: []
Heuristic footpath detected? True – line count = 133
Wrote debug overlay to: ../test_images/bars.png


  with amp.autocast(autocast):


In [2]:
import cv2
import numpy as np
import torch
import pyttsx3
import easyocr
import time

# ----------------------
# Configuration
# ----------------------
# Load YOLOv5n (CPU only)
model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
model.conf = 0.25
model.iou  = 0.45
model.to('cpu')

# Text-to-speech (pyttsx3)
tts = pyttsx3.init()
tts.setProperty('rate', 140)

# EasyOCR reader (English)
reader = easyocr.Reader(['en'], gpu=False)

# ----------------------
# Parameters
# ----------------------
MIN_DOOR_AREA      = 5000
ASPECT_RATIO_RANGE = (1.8, 2.5)
LINE_LENGTH        = 100
LINE_GAP           = 20
SLOPE_MAX          = 0.1
MIN_LINES          = 5

# Crosswalk params
CW_MIN_LINES   = 6
CW_SLOPE_MAX   = 0.2
CW_ROI_Y_RATIO = 0.5

# Stairs/curb params
ST_STEP_HEIGHT = 20

# OCR skip frames
OCR_SKIP_FRAMES = 30

# ----------------------
# Detection Functions
# ----------------------

def detect_doors(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    cnts, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for c in cnts:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02 * peri, True)
        if len(approx) == 4 and cv2.isContourConvex(approx):
            x, y, w, h = cv2.boundingRect(approx)
            area = w * h
            asp = h / (w + 1e-6)
            if area > MIN_DOOR_AREA and ASPECT_RATIO_RANGE[0] < asp < ASPECT_RATIO_RANGE[1]:
                boxes.append((x, y, w, h))
    return boxes


def detect_lanes(frame):
    h, w = frame.shape[:2]
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    mask = np.zeros_like(edges)
    roi = np.array([[(0, h), (w, h), (w, h // 2), (0, h // 2)]], np.int32)
    cv2.fillPoly(mask, roi, 255)
    lines = cv2.HoughLinesP(mask & edges, 1, np.pi / 180, 50,
                             minLineLength=80, maxLineGap=50)
    return lines


def detect_footpath(frame):
    h, w = frame.shape[:2]
    sy = int(h * 0.85)
    roi = frame[sy:h, :]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180,
                             30, minLineLength=LINE_LENGTH, maxLineGap=LINE_GAP)
    cnt = 0
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            if abs((y2 - y1) / (x2 - x1 + 1e-6)) < SLOPE_MAX:
                cnt += 1
    if cnt >= MIN_LINES:
        return True, (0, sy, w, h - sy)
    return False, None


def detect_crosswalk(frame):
    # Group horizontal lines to approximate zebra stripes
    h, w = frame.shape[:2]
    sy = int(h * CW_ROI_Y_RATIO)
    roi = frame[sy:h, :]
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180,
                             40, minLineLength=50, maxLineGap=30)
    ys = []
    if lines is not None:
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            if abs((y2 - y1) / (x2 - x1 + 1e-6)) < CW_SLOPE_MAX:
                ys.append((y1 + y2) / 2)
    # Require a minimum count and unique spacing
    if len(ys) >= CW_MIN_LINES:
        bins = set(int(y // 10) for y in ys)
        if len(bins) >= CW_MIN_LINES // 2:
            return True, (0, sy, w, h - sy)
    return False, None


def detect_stairs(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    sob = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=5)
    _, th = cv2.threshold(np.abs(sob).astype(np.uint8), 50, 255, cv2.THRESH_BINARY)
    cnts, _ = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    steps = []
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h > ST_STEP_HEIGHT and w > 20:
            steps.append((x, y, w, h))
    return (True, steps) if steps else (False, [])


def classify_obstacles(yolo_boxes, frame):
    h, w = frame.shape[:2]
    cx_min, cx_max = w // 3, 2 * w // 3
    cy_min = h // 2
    obs = []
    for x1, y1, x2, y2, label in yolo_boxes:
        if y2 > cy_min and x1 < cx_max and x2 > cx_min and label in [
            'person', 'car', 'bicycle', 'motorcycle', 'bus', 'truck']:
            obs.append(label)
    return set(obs)


def detect_indoor_objects(yolo_boxes):
    indoor = []
    for x1, y1, x2, y2, label in yolo_boxes:
        if label in ['chair', 'couch', 'diningtable', 'bed']:
            indoor.append((label, (x1, y1, x2, y2)))
    return indoor


def recognize_text(frame):
    results = reader.readtext(frame)
    texts = []
    for bbox, text, conf in results:
        if conf > 0.5:
            xs = [pt[0] for pt in bbox]
            ys = [pt[1] for pt in bbox]
            x, y = int(min(xs)), int(min(ys))
            w, h = int(max(xs) - x), int(max(ys) - y)
            texts.append((text, (x, y, w, h)))
    return texts

# ----------------------
# Main Loop
# ----------------------
def main():
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    frame_count = 0
    prev_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Reset audio queue
        tts.stop()

        # YOLO detections
        results = model(frame)
        yolo_boxes = []
        for *b, _, cls in results.xyxy[0].cpu().numpy():
            x1, y1, x2, y2 = map(int, b)
            label = results.names[int(cls)]
            yolo_boxes.append((x1, y1, x2, y2, label))
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 255, 0), 1)

        # Doors
        doors = detect_doors(frame)
        for x, y, w, h in doors:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
            cv2.putText(frame, 'Door', (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        (255, 0, 0), 1)
            tts.say('Door ahead')

        # Lanes
        lines = detect_lanes(frame)
        if lines is not None:
            for x1, y1, x2, y2 in lines.reshape(-1, 4):
                cv2.line(frame, (x1, y1), (x2, y2), (0, 255, 255), 2)

        # Footpath
        fp_flag, fp_box = detect_footpath(frame)
        if fp_flag:
            x, y, w_box, h_box = fp_box
            cv2.rectangle(frame, (x, y), (x + w_box, y + h_box), (0, 0, 255), 2)
            cv2.putText(frame, 'Footpath', (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 0, 255), 1)
            pos = 'right' if (x + w_box / 2) > (frame.shape[1] / 2) else 'left'
            tts.say(f'Footpath on {pos}')

        # Crosswalk
        cw_flag, cw_box = detect_crosswalk(frame)
        if cw_flag:
            x, y, w_box, h_box = cw_box
            cv2.rectangle(frame, (x, y), (x + w_box, y + h_box), (0, 255, 0), 2)
            cv2.putText(frame, 'Crosswalk', (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 255, 0), 1)
            tts.say('Crosswalk ahead')

        # Stairs/Curb
        st_flag, steps = detect_stairs(frame)
        if st_flag:
            for x, y, w_box, h_box in steps:
                cv2.rectangle(frame, (x, y), (x + w_box, y + h_box), (0, 165, 255), 2)
            cv2.putText(frame, 'Stairs/Curb', (steps[0][0], steps[0][1] - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 165, 255), 1)
            tts.say('Stairs or curb ahead')

        # Indoor objects
        indoor_list = detect_indoor_objects(yolo_boxes)
        for label, (x1, y1, x2, y2) in indoor_list:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (255, 255, 0), 1)
            tts.say(f'{label}')

        # OCR every N frames
        if frame_count % OCR_SKIP_FRAMES == 0:
            texts = recognize_text(frame)
            for txt, (x, y, w_box, h_box) in texts:
                cv2.rectangle(frame, (x, y), (x + w_box, y + h_box), (255, 0, 255), 1)
                cv2.putText(frame, txt, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (255, 0, 255), 1)
                tts.say(f'Sign: {txt}')

        # Obstacle classification
        obs = classify_obstacles(yolo_boxes, frame)
        if obs:
            tts.say('Obstacle: ' + ' and '.join(obs))
        else:
            # if no threats and no static warnings
            if not doors and not cw_flag and not st_flag:
                tts.say('Path clear')

        # Speak queued messages
        try:
            tts.runAndWait()
        except RuntimeError:
            tts.stop()

        # Display FPS
        now = time.time()
        fps = 1 / (now - prev_time) if 'prev_time' in locals() else 0
        prev_time = now
        cv2.putText(frame, f'FPS: {fps:.1f}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 255), 2)

        cv2.imshow('AI Glasses', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        frame_count += 1

    cap.release()
    cv2.destroyAllWindows()

if __name__ == '__main__':
    main()


Using cache found in C:\Users\rbris/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-18 Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
Using CPU. Note: This module is much faster with a GPU.
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast)

In [10]:
from ultralytics import YOLO

# Initialize a YOLO-World model
model = YOLO("yolov8s-world.pt")  # or select yolov8m/l-world.pt

# Define custom classes
model.set_classes([
    'person', 'bicycle', 'car', 'motorcycle', 'bus', 'truck',
    'sidewalk', 'crosswalk', 'path', 'door', 'stairs', "barriers", "traffic lights", "traffic sign"
])

# Save the model with the defined offline vocabulary
model.save("custom_yolov8s.pt")

In [11]:
from ultralytics import YOLO

# Load your custom model
model = YOLO("custom_yolov8s.pt")

# Run inference to detect your custom classes
results = model.predict("../test_images/road.png")

# Show results
results[0].show()


image 1/1 c:\Users\rbris\OneDrive\Desktop\Learning\embed\smart_glasses\..\test_images\road.png: 384x640 1 person, 5 cars, 1 traffic sign, 27.4ms
Speed: 2.4ms preprocess, 27.4ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)


In [12]:
results[0]

ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'bus', 5: 'truck', 6: 'sidewalk', 7: 'crosswalk', 8: 'path', 9: 'door', 10: 'stairs', 11: 'barriers', 12: 'traffic lights', 13: 'traffic sign'}
obb: None
orig_img: array([[[221, 146,  34],
        [227, 140,  33],
        [243, 158,  45],
        ...,
        [ 73,  85, 102],
        [ 72,  85, 101],
        [ 72,  87, 103]],

       [[198, 126,   9],
        [221, 146,  20],
        [223, 150,  18],
        ...,
        [ 89, 101, 117],
        [ 88, 101, 117],
        [ 86, 101, 117]],

       [[219, 149,  29],
        [222, 156,  18],
        [231, 167,  24],
        ...,
        [ 96, 108, 125],
        [ 96, 109, 125],
        [ 96, 112, 127]],

       ...,

       [[ 23, 107, 179],
        [ 23, 111, 184],
        [ 32, 117, 190],
        ...,
        [ 84,  82,  97],
        [ 84,  

In [None]:
import cv2
from ultralytics import YOLO
import pyttsx3
import easyocr
import time

# ----------------------
# Configuration
# ----------------------
# Load custom YOLOv8 model (trained with yolov8s-world vocabulary)
model = YOLO("custom_yolov8s.pt")  # or yolov8m-world.pt for more accuracy

# Define only necessary classes for navigation
target_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'bus', 'truck',
    'sidewalk', 'crosswalk', 'path', 'door', 'stairs',
    'barriers', 'traffic light', 'stop sign'
]

# Text-to-speech engine
tts = pyttsx3.init()
tts.setProperty('rate', 140)

# EasyOCR for optional sign reading
gpu_flag = False  # set True if GPU is available
reader = easyocr.Reader(['en'], gpu=gpu_flag)
OCR_SKIP = 30

# ----------------------
# Main Loop
# ----------------------
def main():
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    frame_count = 0
    prev_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # YOLOv8-world inference
        results = model.predict(frame, stream=False)[0]
        detections = results.boxes.data.tolist()  # [x1, y1, x2, y2, conf, cls]

        # Reset audio queue
        tts.stop()

        # Draw detections and queue audio guidance
        for *box, conf, cls in detections:
            label = model.model.names[int(cls)]
            if label not in target_classes:
                continue
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {conf:.2f}", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            # Audio guidance based on label
            if label in ['person', 'car', 'bus', 'truck']:
                tts.say(f"{label} ahead")
            elif label == 'sidewalk':
                tts.say('Footpath detected')
            elif label == 'crosswalk':
                tts.say('Crosswalk ahead')
            elif label == 'door':
                tts.say('Door ahead')
            elif label == 'stairs':
                tts.say('Stairs ahead')
            elif label == 'path':
                tts.say('Path detected')
            elif label == 'barriers':
                tts.say('Barrier ahead')
            elif label in ['traffic light', 'stop sign']:
                tts.say(f"{label} detected")

        # OCR signage every OCR_SKIP frames
        if frame_count % OCR_SKIP == 0:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            ocr_results = reader.readtext(gray)
            for bbox, text, conf in ocr_results:
                if conf > 0.5:
                    xs = [int(pt[0]) for pt in bbox]
                    ys = [int(pt[1]) for pt in bbox]
                    x, y = min(xs), min(ys)
                    w, h = max(xs) - x, max(ys) - y
                    cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 1)
                    cv2.putText(frame, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,
                                0.5, (255, 0, 0), 1)
                    tts.say(f"Sign: {text}")

        # Play queued audio cues
        try:
            tts.runAndWait()
        except RuntimeError:
            tts.stop()

        # Display FPS
        now = time.time()
        fps = 1 / (now - prev_time) if frame_count else 0
        prev_time = now
        cv2.putText(frame, f'FPS: {fps:.1f}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 255), 2)

        # Show frame and handle quit
        cv2.imshow('AI Navigation', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1

    cap.release()
    cv2.destroyAllWindows()

if __name__ == '__main__':
    main()


Using CPU. Note: This module is much faster with a GPU.



0: 480x640 1 person, 22.7ms
Speed: 3.3ms preprocess, 22.7ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 30.3ms
Speed: 4.8ms preprocess, 30.3ms inference, 2.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 18.7ms
Speed: 1.9ms preprocess, 18.7ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 26.0ms
Speed: 1.6ms preprocess, 26.0ms inference, 2.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 21.2ms
Speed: 1.8ms preprocess, 21.2ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 36.5ms
Speed: 2.6ms preprocess, 36.5ms inference, 4.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 47.8ms
Speed: 2.2ms preprocess, 47.8ms inference, 5.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 86.3ms
Speed: 2.8ms preprocess, 86.3ms inference, 9.2ms postprocess per image at shape (1, 3, 48

In [6]:
model.model.names

['person',
 'bicycle',
 'car',
 'motorcycle',
 'bus',
 'truck',
 'sidewalk',
 'crosswalk',
 'path',
 'door',
 'stairs',
 'barriers',
 'traffic lights',
 'traffic sign']

In [2]:
from ultralytics import SAM

# Load the model
model = SAM("mobile_sam.pt")

# Predict a segment based on a point prompt
model.predict("../test_images/road.png", points=[900, 370], labels=[1])


image 1/1 c:\Users\rbris\OneDrive\Desktop\Learning\embed\smart_glasses\..\test_images\road.png: 1024x1024 1 0, 1233.8ms
Speed: 103.2ms preprocess, 1233.8ms inference, 60.4ms postprocess per image at shape (1, 3, 1024, 1024)


[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: ultralytics.engine.results.Masks object
 names: {0: '0'}
 obb: None
 orig_img: array([[[221, 146,  34],
         [227, 140,  33],
         [243, 158,  45],
         ...,
         [ 73,  85, 102],
         [ 72,  85, 101],
         [ 72,  87, 103]],
 
        [[198, 126,   9],
         [221, 146,  20],
         [223, 150,  18],
         ...,
         [ 89, 101, 117],
         [ 88, 101, 117],
         [ 86, 101, 117]],
 
        [[219, 149,  29],
         [222, 156,  18],
         [231, 167,  24],
         ...,
         [ 96, 108, 125],
         [ 96, 109, 125],
         [ 96, 112, 127]],
 
        ...,
 
        [[ 23, 107, 179],
         [ 23, 111, 184],
         [ 32, 117, 190],
         ...,
         [ 84,  82,  97],
         [ 84,  78,  92],
         [ 74,  66,  78]],
 
        [[ 16, 103, 188],
         [ 10, 101, 189],
         [  4,  91, 179],
  

In [4]:
from ultralytics.data.annotator import auto_annotate

auto_annotate(data="../test_images/road.png", det_model="yolov11n.pt", sam_model="mobile_sam.pt")


image 1/1 c:\Users\rbris\OneDrive\Desktop\Learning\embed\smart_glasses\..\test_images\road.png: 384x640 1 person, 6 cars, 143.9ms
Speed: 5.0ms preprocess, 143.9ms inference, 35.7ms postprocess per image at shape (1, 3, 384, 640)


In [12]:
from ultralytics import YOLOE

# Initialize a YOLOE model
model = YOLOE("yoloe-11s-seg.pt")  # or select yoloe-11s/m-seg.pt for different sizes

# Set text prompt to detect person and bus. You only need to do this once after you load the model.
# Set text prompt to detect various objects. You only need to do this once after you load the model.
names =  [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 
    'hair drier', 'toothbrush'
]
model.set_classes(names, model.get_text_pe(names))

# Run detection on the given image
results = model.predict("../test_images/path.png")

# Show results
results[0].show()

Ultralytics 8.3.111  Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)
YOLOe-11s-seg summary (fused): 137 layers, 13,693,398 parameters, 1,857,958 gradients, 36.4 GFLOPs

image 1/1 c:\Users\rbris\OneDrive\Desktop\Learning\embed\smart_glasses\..\test_images\path.png: 384x640 2 cars, 35.4ms
Speed: 3.3ms preprocess, 35.4ms inference, 3.8ms postprocess per image at shape (1, 3, 384, 640)


In [13]:
from ultralytics import YOLOE
import cv2
import pyttsx3
import time

# Initialize text-to-speech engine
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Adjust speech speed (words per minute)

# Initialize YOLOE model
model = YOLOE("yoloe-11s-seg.pt")  # Use smaller model for better performance

# Set class names (COCO dataset classes)qaqq
class_names = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 
    'hair drier', 'toothbrush'
]
model.set_classes(class_names, model.get_text_pe(class_names))

# Initialize video capture
cap = cv2.VideoCapture(0)  # Use 0 for default camera
if not cap.isOpened():
    raise IOError("Cannot open webcam")

# Speech cooldown variables
last_announcement_time = 0
speech_cooldown = 5  # seconds
last_spoken_objects = set()

def speak_objects(objects):
    global last_announcement_time, last_spoken_objects
    if not objects:
        return
    
    # Create announcement text
    announcement = "In front: " + ", ".join(objects)
    
    # Speak the announcement
    engine.say(announcement)
    engine.runAndWait()
    
    # Update last announcement time and objects
    last_announcement_time = time.time()
    last_spoken_objects = set(objects)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Perform object detection
    results = model.predict(frame)
    
    # Get detected objects
    detected_objects = []
    for result in results:
        boxes = result.boxes
        for box in boxes:
            class_id = int(box.cls[0])
            confidence = box.conf[0]
            if confidence > 0.5:  # Only consider confident detections
                object_name = class_names[class_id]
                detected_objects.append(object_name)
    
    # Remove duplicates and keep order
    unique_objects = []
    seen = set()
    for obj in detected_objects:
        if obj not in seen:
            seen.add(obj)
            unique_objects.append(obj)
    
    # Visual feedback (optional, for debugging)
    annotated_frame = results[0].plot()
    cv2.imshow('AI Glasses View', annotated_frame)
    
    # Audio feedback logic
    current_time = time.time()
    if (current_time - last_announcement_time) > speech_cooldown:
        if unique_objects and set(unique_objects) != last_spoken_objects:
            speak_objects(unique_objects)
    
    # Exit on 'q' press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Clean up
cap.release()
cv2.destroyAllWindows()
engine.stop()

Ultralytics 8.3.111  Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)
YOLOe-11s-seg summary (fused): 137 layers, 13,693,398 parameters, 1,857,958 gradients, 36.4 GFLOPs

0: 480x640 1 person, 1 couch, 2 beds, 1 tv, 140.5ms
Speed: 3.1ms preprocess, 140.5ms inference, 6.2ms postprocess per image at shape (1, 3, 480, 640)
Ultralytics 8.3.111  Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

0: 480x640 1 person, 2 beds, 1 tv, 26.4ms
Speed: 2.0ms preprocess, 26.4ms inference, 4.2ms postprocess per image at shape (1, 3, 480, 640)
Ultralytics 8.3.111  Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

0: 480x640 1 person, 1 couch, 2 beds, 1 tv, 26.6ms
Speed: 2.1ms preprocess, 26.6ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)
Ultralytics 8.3.111  Python-3.12.4 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

0: 480x640 1 person, 2 beds, 1 tv, 2

In [14]:
!pip install edge-tts

Collecting edge-tts
  Downloading edge_tts-7.0.1-py3-none-any.whl.metadata (5.5 kB)
Downloading edge_tts-7.0.1-py3-none-any.whl (26 kB)
Installing collected packages: edge-tts
Successfully installed edge-tts-7.0.1


In [15]:

import asyncio

import edge_tts

TEXT = "Hello World!"
VOICE = "en-GB-SoniaNeural"
OUTPUT_FILE = "test.mp3"


async def amain() -> None:
    """Main function"""
    communicate = edge_tts.Communicate(TEXT, VOICE)
    await communicate.save(OUTPUT_FILE)


if __name__ == "__main__":
    asyncio.run(amain())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [16]:
import os
import asyncio
from mistralai import Mistral

class QueryCategorizer:
    def __init__(self):
        self.api_key = os.getenv("MISTRAL_API_KEY")
        self.client = Mistral(api_key=self.api_key)
        self.model = "mistral-tiny"
        
        self.system_prompt = """REACT PROMPT:
You are a classification agent. Analyze the user input and categorize it into EXACTLY ONE of these categories:
1) Object detection - queries about identifying objects in images
2) Scene analyzer - requests to describe or analyze entire scenes
3) Conversation - general chat, greetings, or non-visual queries
4) Text recognition - requests to read/extract text from images

Respond ONLY with the category name in lowercase, without any punctuation or formatting.
Example responses: "object detection", "scene analyzer", etc."""

    async def categorize(self, text: str) -> str:
        try:
            response = await self.client.chat.async(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": text}
                ]
            )
            
            category = response.choices[0].message.content.strip().lower()
            return self._validate_category(category)
            
        except Exception as e:
            print(f"Error: {e}")
            return "conversation"

    def _validate_category(self, category: str) -> str:
        valid_categories = {
            "object detection", 
            "scene analyzer",
            "conversation",
            "text recognition"
        }
        
        # Simple fuzzy matching
        if any(c in category for c in valid_categories):
            return next(c for c in valid_categories if c in category)
        return "conversation"

async def main():
    categorizer = QueryCategorizer()
    
    examples = [
        "What objects are in this picture?",
        "Can you describe this scene?",
        "Hello, how are you today?",
        "Read the text from this document image",
        "What's the main subject in this photo?"
    ]
    
    for query in examples:
        category = await categorizer.categorize(query)
        print(f"Query: {query}\nCategory: {category}\n")

if __name__ == "__main__":
    asyncio.run(main())

SyntaxError: invalid syntax (1842384898.py, line 23)