In [1]:
!pip install -q ultralytics transformers gtts opencv-python ffmpeg-python

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.0/1.0 MB[0m [31m40.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, json, cv2, numpy as np, torch
from PIL import Image
from gtts import gTTS
from IPython.display import Audio, display
from ultralytics import YOLO
from transformers import CLIPProcessor, CLIPModel
from collections import deque

# --- 📁 Config ---
VIDEO_PATH = "/content/WhatsApp Video 2025-06-15 at 18.15.58_1b92e557.mp4"
END_IMG_PATH = "/content/ending.jpg"
OUTPUT_DIR = "room_graph"
FRAME_INTERVAL = 10  # Save every 1 sec if video is 30fps
SIM_THRESHOLD = 0.87
TURN_THRESHOLD_DEGREES = 60
TURN_FLOW_MAG = 2.5
BLUR_THRESHOLD = 100.0
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 🔌 Load models
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
yolo = YOLO("yolov8m.pt")

# --- 🔊 Speak
def speak(text):
    print("🔊", text)
    tts = gTTS(text=text, lang='en')
    tts.save("speech.mp3")
    display(Audio("speech.mp3", autoplay=True))

# --- 🔍 Embedding + Similarity
def get_clip_embedding(img):
    inputs = clip_processor(images=Image.fromarray(img), return_tensors="pt").to(device)
    with torch.no_grad():
        return clip_model.get_image_features(**inputs)[0].cpu().numpy()

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# --- 📐 Blur Check
def is_clear_image(img, threshold=BLUR_THRESHOLD):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return cv2.Laplacian(gray, cv2.CV_64F).var() >= threshold

# --- 🧭 Estimate Turn
def estimate_rotation(prev, curr):
    prev_gray, curr_gray = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY), cv2.cvtColor(curr, cv2.COLOR_BGR2GRAY)
    flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
    dx, dy = np.mean(flow[..., 0]), np.mean(flow[..., 1])
    angle = np.degrees(np.arctan2(dy, dx))
    mag = np.sqrt(dx**2 + dy**2)
    if mag < TURN_FLOW_MAG: return "forward", 0
    if angle < -TURN_THRESHOLD_DEGREES: return "left", abs(angle)
    if angle > TURN_THRESHOLD_DEGREES: return "right", abs(angle)
    return "forward", abs(angle)

# --- 🧱 Room Graph Builder
def build_room_graph(video_path):
    cap = cv2.VideoCapture(video_path)
    room_id = 0
    prev_emb, prev_frame = None, None
    frame_idx = 0
    current_room = f"room_{room_id}"
    room_images, room_graph = {current_room: []}, {}
    room_filenames = {current_room: []}

    while True:
        ret, frame = cap.read()
        if not ret: break
        if frame_idx % FRAME_INTERVAL != 0:
            frame_idx += 1
            continue

        emb = get_clip_embedding(frame)
        is_new_room = False

        if prev_emb is not None:
            sim = cosine_sim(emb, prev_emb)
            direction, angle = estimate_rotation(prev_frame, frame)
            is_new_room = (sim < SIM_THRESHOLD and angle >= TURN_THRESHOLD_DEGREES)
            if is_new_room:
                room_id += 1
                new_room = f"room_{room_id}"
                room_graph.setdefault(current_room, []).append({"room": new_room, "direction": direction})
                current_room = new_room
                room_images[current_room] = []
                room_filenames[current_room] = []

        # ✅ Save only clear frames
        if is_clear_image(frame):
            frame_filename = f"{OUTPUT_DIR}/{current_room}_frame_{frame_idx}.jpg"
            cv2.imwrite(frame_filename, frame)
            room_images[current_room].append(frame)
            room_filenames[current_room].append(frame_filename)

        prev_emb, prev_frame = emb, frame.copy()
        frame_idx += 1

    cap.release()

    # ✅ Save 3 start and 3 end images per room
    for room, files in room_filenames.items():
        if len(files) >= 1:
            num_start = min(3, len(files))
            num_end = min(3, len(files))

            # Start images
            for i in range(num_start):
                start_img = files[i]
                dest = os.path.join(OUTPUT_DIR, f"{room}_start_{i+1}.jpg")
                cv2.imwrite(dest, cv2.imread(start_img))

            # End images
            for i in range(num_end):
                end_img = files[-num_end + i]
                dest = os.path.join(OUTPUT_DIR, f"{room}_end_{i+1}.jpg")
                cv2.imwrite(dest, cv2.imread(end_img))

    # ✅ Save graph
    with open(f"{OUTPUT_DIR}/graph.json", "w") as f:
        json.dump(room_graph, f, indent=2)

    print("✅ Room graph:\n", json.dumps(room_graph, indent=2))
    return room_graph, room_images


# --- 🔎 Room Matching
def match_image_to_room(img, room_images):
    if isinstance(img, str):
        img = cv2.imread(img)
    img_emb = get_clip_embedding(img)
    best_score, best_room = -1, None
    for room, imgs in room_images.items():
        for ref in imgs:
            sim = cosine_sim(img_emb, get_clip_embedding(ref))
            if sim > best_score:
                best_score, best_room = sim, room
    return best_room

# --- 📍 Navigation Path (BFS)
def bfs_path(graph, start, goal):
    queue, visited = deque([(start, [])]), set()
    while queue:
        node, path = queue.popleft()
        if node == goal: return path
        for edge in graph.get(node, []):
            neighbor = edge["room"]
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append((neighbor, path + [(node, edge["direction"], neighbor)]))
    return []

# --- 🚀 Main
graph, room_images = build_room_graph(VIDEO_PATH)
end_room = match_image_to_room(END_IMG_PATH, room_images)
start_room = match_image_to_room(list(room_images.values())[0][0], room_images)

print(f"🎯 Destination Room: {end_room}")
print(f"🚪 Starting Room: {start_room}")

path = bfs_path(graph, start_room, end_room)
print("🧭 Navigation Path:")
for s, d, e in path:
    print(f"{s} --{d}--> {e}")
    speak(f"From {s}, go {d} to reach {e}")


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:00<00:00, 216MB/s]


✅ Room graph:
 {
  "room_0": [
    {
      "room": "room_1",
      "direction": "right"
    }
  ],
  "room_1": [
    {
      "room": "room_2",
      "direction": "left"
    }
  ]
}
🎯 Destination Room: room_2
🚪 Starting Room: room_0
🧭 Navigation Path:
room_0 --right--> room_1
🔊 From room_0, go right to reach room_1


room_1 --left--> room_2
🔊 From room_1, go left to reach room_2


In [None]:
url="Input_Live_Video_URL"

In [10]:
import cv2, time, torch, numpy as np, os, json, re
from PIL import Image
from gtts import gTTS
from IPython.display import Audio, display
from transformers import CLIPProcessor, CLIPModel
from collections import deque

# --- 📁 Config ---
ROOM_GRAPH_PATH = "/content/room_graph/graph.json"
ROOM_IMAGES_DIR = "/content/room_graph"
DEST_IMG_PATH = "/content/ending.jpg"
VIDEO_URL = url  # Replace with your IP webcam stream
USER_START_ROOM = "room_0"  # ✅ Set this to your actual starting room
CHECK_INTERVAL = 2
SIM_THRESHOLD = 0.70
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- 🔌 Load models
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# --- 🔊 Text-to-Speech
def speak(text):
    print("🔊", text)
    tts = gTTS(text=text, lang='en')
    tts.save("speech.mp3")
    display(Audio("speech.mp3", autoplay=True))

# --- 🔍 Embedding + Similarity
def get_clip_embedding(img):
    inputs = clip_processor(images=Image.fromarray(img), return_tensors="pt").to(device)
    with torch.no_grad():
        return clip_model.get_image_features(**inputs)[0].cpu().numpy()

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# --- 📥 Load room images and infer middle
def load_room_images(directory):
    rooms = {}
    for file in os.listdir(directory):
        if not file.endswith(".jpg"): continue
        path = os.path.join(directory, file)
        parts = file.split("_")
        room = "_".join(parts[:2])
        if "_start_" in file:
            tag = "start"
        elif "_end_" in file:
            tag = "end"
        elif "_frame_" in file:
            tag = "frame"
        else:
            continue
        rooms.setdefault(room, {"start": [], "end": [], "frame": [], "middle": []})
        img = cv2.imread(path)
        if img is not None:
            rooms[room][tag].append((path, img))
    for room, sections in rooms.items():
        frame_images = sections["frame"]
        if len(frame_images) >= 3:
            sorted_frames = sorted(frame_images, key=lambda x: int(re.findall(r'frame_(\d+)', x[0])[0]))
            mid_index = len(sorted_frames) // 2
            middle_imgs = [sorted_frames[mid_index][1]]
            if mid_index + 1 < len(sorted_frames):
                middle_imgs.append(sorted_frames[mid_index + 1][1])
            rooms[room]["middle"] = middle_imgs
        rooms[room]["start"] = [img for _, img in sections["start"]]
        rooms[room]["end"] = [img for _, img in sections["end"]]
    return rooms

# --- Match one section
def match_room_section(frame, room_images, section, verbose=False):
    emb = get_clip_embedding(frame)
    best_score, best_room = -1, None
    for room, sections in room_images.items():
        for img in sections.get(section, []):
            sim = cosine_sim(emb, get_clip_embedding(img))
            if verbose:
                print(f"📏 Similarity with {room} ({section}): {sim:.4f}")
            if sim > best_score:
                best_score = sim
                best_room = room
    if best_score >= SIM_THRESHOLD:
        return best_room
    return None

# --- Path planning
def bfs_path(graph, start, goal):
    queue, visited = deque([(start, [])]), set()
    while queue:
        node, path = queue.popleft()
        if node == goal:
            return path
        for edge in graph.get(node, []):
            neighbor = edge["room"]
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append((neighbor, path + [(node, edge["direction"], neighbor)]))
    return []

# --- Live navigation
def live_navigate(video_url, room_graph, room_images, dest_img, start_room):
    dest_room = match_room_section(dest_img, room_images, "end", verbose=True)

    print(f"🚪 Provided Start Room: {start_room}")
    print(f"🎯 Detected Destination Room: {dest_room}")
    if not start_room or not dest_room:
        speak("Could not detect destination room.")
        return

    path = bfs_path(room_graph, start_room, dest_room)
    if not path:
        speak("No path found from start to destination.")
        return

    print(f"🧭 Navigation path:\n{path}")
    current_step = 0
    state = "start"
    cap = cv2.VideoCapture(video_url)
    last_time = time.time()

    while cap.isOpened() and current_step < len(path):
        ret, frame = cap.read()
        if not ret: continue
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
        now = time.time()
        if now - last_time < CHECK_INTERVAL:
            continue
        last_time = now

        current_room, direction, next_room = path[current_step]

        if state == "start":
            match = match_room_section(frame, room_images, "start", verbose=True)
            if match == current_room:
                speak(f"You are in {current_room} start. Move forward.")
                state = "middle"
        elif state == "middle":
            match = match_room_section(frame, room_images, "middle", verbose=True)
            if match == current_room:
                speak(f"You are in the middle of {current_room}. Keep going.")
                state = "end"
        elif state == "end":
            match = match_room_section(frame, room_images, "end", verbose=True)
            if match == current_room:
                speak(f"You reached the end of {current_room}. Turn {direction} to enter {next_room}.")
                current_step += 1
                state = "start"

    speak("You have reached your destination.")
    cap.release()

# --- Run
with open(ROOM_GRAPH_PATH) as f:
    graph = json.load(f)

room_images = load_room_images(ROOM_IMAGES_DIR)
dest_img = cv2.imread(DEST_IMG_PATH)

live_navigate(VIDEO_URL, graph, room_images, dest_img, USER_START_ROOM)


📏 Similarity with room_2 (end): 0.6936
📏 Similarity with room_2 (end): 0.7264
📏 Similarity with room_2 (end): 0.7042
📏 Similarity with room_0 (end): 0.5789
📏 Similarity with room_0 (end): 0.6034
📏 Similarity with room_0 (end): 0.5397
📏 Similarity with room_1 (end): 0.5299
📏 Similarity with room_1 (end): 0.6438
📏 Similarity with room_1 (end): 0.6618
🚪 Provided Start Room: room_0
🎯 Detected Destination Room: room_2
🧭 Navigation path:
[('room_0', 'right', 'room_1'), ('room_1', 'left', 'room_2')]
📏 Similarity with room_2 (start): 0.7050
📏 Similarity with room_2 (start): 0.7450
📏 Similarity with room_2 (start): 0.7359
📏 Similarity with room_0 (start): 0.7999
📏 Similarity with room_0 (start): 0.7685
📏 Similarity with room_0 (start): 0.8033
📏 Similarity with room_1 (start): 0.8232
📏 Similarity with room_1 (start): 0.8370
📏 Similarity with room_1 (start): 0.8066
📏 Similarity with room_2 (start): 0.6741
📏 Similarity with room_2 (start): 0.7405
📏 Similarity with room_2 (start): 0.7284
📏 Similari

📏 Similarity with room_2 (middle): 0.6749
📏 Similarity with room_2 (middle): 0.6744
📏 Similarity with room_0 (middle): 0.7158
📏 Similarity with room_0 (middle): 0.6500
📏 Similarity with room_1 (middle): 0.7180
📏 Similarity with room_1 (middle): 0.6714
📏 Similarity with room_2 (middle): 0.6596
📏 Similarity with room_2 (middle): 0.6777
📏 Similarity with room_0 (middle): 0.7328
📏 Similarity with room_0 (middle): 0.6894
📏 Similarity with room_1 (middle): 0.7379
📏 Similarity with room_1 (middle): 0.6683
📏 Similarity with room_2 (middle): 0.6961
📏 Similarity with room_2 (middle): 0.6418
📏 Similarity with room_0 (middle): 0.7095
📏 Similarity with room_0 (middle): 0.6440
📏 Similarity with room_1 (middle): 0.6941
📏 Similarity with room_1 (middle): 0.7118
📏 Similarity with room_2 (middle): 0.6765
📏 Similarity with room_2 (middle): 0.6993
📏 Similarity with room_0 (middle): 0.7883
📏 Similarity with room_0 (middle): 0.7146
📏 Similarity with room_1 (middle): 0.7434
📏 Similarity with room_1 (middle):

📏 Similarity with room_2 (end): 0.7335
📏 Similarity with room_2 (end): 0.7609
📏 Similarity with room_2 (end): 0.6808
📏 Similarity with room_0 (end): 0.8195
📏 Similarity with room_0 (end): 0.8393
📏 Similarity with room_0 (end): 0.7946
📏 Similarity with room_1 (end): 0.7058
📏 Similarity with room_1 (end): 0.7902
📏 Similarity with room_1 (end): 0.7741
🔊 You reached the end of room_0. Turn right to enter room_1.


📏 Similarity with room_2 (start): 0.7025
📏 Similarity with room_2 (start): 0.7425
📏 Similarity with room_2 (start): 0.7307
📏 Similarity with room_0 (start): 0.7983
📏 Similarity with room_0 (start): 0.7630
📏 Similarity with room_0 (start): 0.8060
📏 Similarity with room_1 (start): 0.7199
📏 Similarity with room_1 (start): 0.7572
📏 Similarity with room_1 (start): 0.7353
📏 Similarity with room_2 (start): 0.7365
📏 Similarity with room_2 (start): 0.7239
📏 Similarity with room_2 (start): 0.7220
📏 Similarity with room_0 (start): 0.7707
📏 Similarity with room_0 (start): 0.7466
📏 Similarity with room_0 (start): 0.7795
📏 Similarity with room_1 (start): 0.7337
📏 Similarity with room_1 (start): 0.7417
📏 Similarity with room_1 (start): 0.7412
📏 Similarity with room_2 (start): 0.8017
📏 Similarity with room_2 (start): 0.7670
📏 Similarity with room_2 (start): 0.8005
📏 Similarity with room_0 (start): 0.8201
📏 Similarity with room_0 (start): 0.8341
📏 Similarity with room_0 (start): 0.8202
📏 Similarity wit

📏 Similarity with room_2 (middle): 0.7661
📏 Similarity with room_2 (middle): 0.7225
📏 Similarity with room_0 (middle): 0.8041
📏 Similarity with room_0 (middle): 0.7481
📏 Similarity with room_1 (middle): 0.7952
📏 Similarity with room_1 (middle): 0.8562
🔊 You are in the middle of room_1. Keep going.


📏 Similarity with room_2 (end): 0.7127
📏 Similarity with room_2 (end): 0.7858
📏 Similarity with room_2 (end): 0.7103
📏 Similarity with room_0 (end): 0.7081
📏 Similarity with room_0 (end): 0.7441
📏 Similarity with room_0 (end): 0.7108
📏 Similarity with room_1 (end): 0.7396
📏 Similarity with room_1 (end): 0.7843
📏 Similarity with room_1 (end): 0.8039
🔊 You reached the end of room_1. Turn left to enter room_2.


🔊 You have reached your destination.
