In [None]:
# Cell 0 – Install dependencies (YOLO, OpenCV, OpenAI, etc.)
!pip -q install ultralytics opencv-python matplotlib pillow pandas imageio openai>=1.40.0 seaborn

print("Dependencies installed.")


In [None]:
# Cell 1 – Load YOLO model (YOLOModel6_withRobo + KEEP_IDS)

import os, glob, math
from collections import Counter

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO

# KEEP_IDS (filtered COCO classes)
# Numeric class ids the detector is restricted to. The list is forwarded as the
# `classes=` argument of the model call inside run_yolo_video (via
# gpt_predictions_every_3s), so only these classes are reported.
# NOTE(review): several ids are above 79, i.e. outside the usual 80-class COCO
# index range; presumably they are valid for this custom model – confirm
# against det_model.model.names.
KEEP_IDS = [
     0, 15, 16, 24, 25,
    28, 39, 40, 41, 42, 43, 44, 45, 46,
    47, 48, 49, 50, 51, 52, 53, 54, 55,
    56, 57, 59, 60, 61, 63, 64, 65, 66,
    67, 68, 70, 71, 73, 74, 75, 76,
    77, 78, 79, 80, 82, 83, 85, 86, 88,
    89, 90, 91,
]

# Location of the trained weights inside the Colab runtime; the file has to be
# uploaded before this cell runs, otherwise the assertion below stops the cell.
WEIGHTS_PATH = "/content/YOLOModel6_withRobo.pt"
assert os.path.exists(WEIGHTS_PATH), f"Model missing: {WEIGHTS_PATH}"

# Detector instance shared by all later cells.
det_model = YOLO(WEIGHTS_PATH)

# Quick sanity report: class count, the first 20 class names, and filter size.
print("Loaded YOLO model from:", WEIGHTS_PATH)
print("Number of classes:", len(det_model.model.names))
print("Some sample classes:", list(det_model.model.names.values())[:20])
print("KEEP_IDS:", len(KEEP_IDS), "classes")


In [None]:
# Cell 2 – Mount Google Drive and list indoor videos

from google.colab import drive

# Expose the user's Google Drive under /content/drive.
drive.mount('/content/drive')

# Every entry of the folder is treated as a video; the list is sorted so that
# positions in `videos` are stable between runs.
VIDEO_DIR = "/content/drive/MyDrive/indoor_videos"
videos = glob.glob(os.path.join(VIDEO_DIR, "*"))
videos.sort()

print("Found videos:", len(videos))
for v in videos:
    print(v)


In [None]:
# Cell 3 – YOLO video processor using positions (center, w, h, conf)
#           + frame skipping + monitor-specific threshold

import cv2
import numpy as np
import pandas as pd
from collections import Counter

# Class names (lower-case) that all count as a screen: computer monitor / TV.
MONITOR_LABELS = {
    "monitor",
    "tv",
    "computer monitor",
}

# Screen detections with a confidence below 40% are ignored.
MONITOR_CONF_THRESH = 0.40

def run_yolo_video(
    model,
    video_path,
    save_every=10,
    conf_thresh=0.25,
    keep_ids=None,
    frame_skip=5,
):
    """
    Run YOLO on a video, analysing one frame out of every `frame_skip`.

    Files written to the current working directory (names derive from the
    video's base name):
      - <name>_annotated.mp4          annotated copy of the analysed frames
      - <name>_frame_detections.csv   one row per kept detection
      - <name>_frames_debug/*.jpg     annotated frames for visual inspection

    Args:
      model:       callable YOLO model; called as model(frame[, classes=...]).
      video_path:  path of the input video.
      save_every:  an analysed frame is also saved as a debug JPG when its
                   source frame index is a multiple of this value.
      conf_thresh: minimum confidence for a detection to be kept. Labels in
                   MONITOR_LABELS must additionally reach MONITOR_CONF_THRESH.
                   NOTE(review): the model is called without a confidence
                   argument, so boxes it already dropped internally cannot be
                   recovered by lowering this value – confirm against the
                   model's own default.
      keep_ids:    optional list of class ids passed to the model as a filter.
      frame_skip:  analyse only frames whose index is a multiple of this value.

    Returns:
      df (DataFrame):         kept detections with columns frame, t_sec, label,
                              conf, x_center, y_center, w, h (columns are
                              present even when nothing was detected)
      frame_summaries (list): one dict per analysed frame with keys
                              frame, t_sec, text, counts
      frames_dir (str):       directory holding the debug JPGs
      fps (float):            frame rate of the source video (30.0 if unknown)

    Raises:
      ValueError:     if frame_skip or save_every is not positive.
      AssertionError: if the video cannot be opened.
    """
    if frame_skip <= 0:
        raise ValueError(f"frame_skip must be a positive number, got {frame_skip!r}")
    if save_every <= 0:
        raise ValueError(f"save_every must be a positive number, got {save_every!r}")

    base_name = os.path.splitext(os.path.basename(video_path))[0]
    out_video = f"{base_name}_annotated.mp4"
    out_csv = f"{base_name}_frame_detections.csv"
    frames_dir = f"{base_name}_frames_debug"
    os.makedirs(frames_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    assert cap.isOpened(), f"Could not open video: {video_path}"

    detections = []
    frame_summaries = []
    writer = None

    # try/finally guarantees the capture and the writer are released even when
    # inference or file writing fails half-way through the video.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Only every `frame_skip`-th frame is written, so the output frame rate
        # is scaled down accordingly; otherwise the annotated video would play
        # `frame_skip` times faster than the source.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(out_video, fourcc, fps / frame_skip, (w, h))

        # Class-name table of the model, if it exposes one (best effort).
        model_names = getattr(getattr(model, "model", None), "names", None)

        frame_idx = -1
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            # Skip frames for speed – only run YOLO every `frame_skip` frames
            if frame_idx % frame_skip != 0:
                continue

            t = frame_idx / fps

            # YOLO inference with optional class filter
            if keep_ids is not None:
                results = model(frame, classes=keep_ids)
            else:
                results = model(frame)
            res = results[0]

            # annotate BGR frame
            annotated = res.plot()
            writer.write(annotated)

            # Prefer the model-level name table, fall back to the result's own.
            names = model_names or getattr(res, "names", None)

            labels_this = []
            label_strings = []
            for label, conf, x, y, w_box, h_box in _filter_detections(res, names, conf_thresh):
                detections.append({
                    "frame": frame_idx,
                    "t_sec": t,
                    "label": label,
                    "conf": conf,
                    "x_center": x,
                    "y_center": y,
                    "w": w_box,
                    "h": h_box,
                })
                labels_this.append(label)
                label_strings.append(
                    f"{label}({conf:.2f}) center=({x:.0f},{y:.0f}) size=({w_box:.0f}x{h_box:.0f})"
                )

            # per-frame counts and summary text for GPT
            frame_summaries.append({
                "frame": frame_idx,
                "t_sec": t,
                "text": ", ".join(label_strings) if label_strings else "no detections",
                "counts": dict(Counter(labels_this)),
            })

            # save some debug frames (the test uses the source frame index)
            if frame_idx % save_every == 0:
                img_path = os.path.join(frames_dir, f"frame_{frame_idx:04d}.jpg")
                cv2.imwrite(img_path, annotated)
    finally:
        cap.release()
        if writer is not None:
            writer.release()

    # Explicit columns keep the schema (and the CSV header) stable even when
    # the video produced no detections at all.
    df = pd.DataFrame(
        detections,
        columns=["frame", "t_sec", "label", "conf", "x_center", "y_center", "w", "h"],
    )
    df.to_csv(out_csv, index=False)

    print("CSV saved:", out_csv)
    print("Annotated video saved:", out_video)
    print("Debug frames saved in:", frames_dir)

    return df, frame_summaries, frames_dir, fps


def _resolve_label(names, cls_id):
    """Translate a numeric class id into its name; fall back to 'id_<n>' when unknown."""
    label = None
    if isinstance(names, dict):
        label = names.get(cls_id, None)
    elif names is not None and 0 <= cls_id < len(names):
        label = names[cls_id]
    return label if label is not None else f"id_{cls_id}"


def _filter_detections(res, names, conf_thresh):
    """Return (label, conf, x, y, w, h) tuples of one YOLO result that pass the confidence thresholds."""
    boxes = res.boxes
    if boxes is None or len(boxes) == 0:
        return []

    kept = []
    rows = zip(
        boxes.xywh.cpu().numpy(),
        boxes.cls.cpu().numpy(),
        boxes.conf.cpu().numpy(),
    )
    for (x, y, w_box, h_box), cls_id, conf in rows:
        conf = float(conf)
        label = _resolve_label(names, int(cls_id))

        # monitor-specific threshold
        if label.lower() in MONITOR_LABELS and conf < MONITOR_CONF_THRESH:
            continue
        # generic threshold, applied to every label
        if conf < conf_thresh:
            continue

        kept.append((label, conf, float(x), float(y), float(w_box), float(h_box)))
    return kept


In [None]:
# Cell 4 – OpenAI GPT client setup

import os
from getpass import getpass
from openai import OpenAI

# Prompt for the key only when the environment does not already provide one,
# so the secret never has to be written into the notebook.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key: ")

# No key is passed explicitly; the client is expected to read OPENAI_API_KEY
# from the environment set up above.
client = OpenAI()
print("GPT client initialized.")


In [None]:
# Cell 5 – Summarizer + GPT environment classifier (with bounding box positions)

def summarize_interval(frame_summaries, t_start, t_end, max_frames=40):
    """
    Condense the frame summaries whose timestamp lies in [t_start, t_end).

    Returns a (text, counts) pair:
      - text:   one line per frame, at most `max_frames` lines, e.g.
                "Frame 120 @ 4.00s: chair(0.83) center=(300,200) size=(80x60), ..."
      - counts: per label, the largest count seen in any single frame of the
                window (a maximum, not a sum, so a chair visible in many
                frames is not counted many times). All frames of the window
                contribute, not only the ones that made it into the text.

    When no frame falls into the window the result is (None, {}).
    """
    in_window = []
    peak_counts = {}

    for summary in frame_summaries:
        if not (t_start <= summary["t_sec"] < t_end):
            continue
        in_window.append(summary)
        for label, n_seen in summary["counts"].items():
            peak_counts[label] = max(peak_counts.get(label, 0), n_seen)

    if len(in_window) == 0:
        return None, {}

    lines = []
    for summary in in_window[:max_frames]:
        lines.append(
            f"Frame {summary['frame']} @ {summary['t_sec']:.2f}s: {summary['text']}"
        )

    return "\n".join(lines), peak_counts


def classify_environment_from_yolo_summary_text(
    yolo_summary_text,
    env_counts,
    model="gpt-4o-mini",
    max_tokens=450,
):
    """
    Call GPT (text-only) to classify the indoor environment using YOLO summaries,
    including bounding-box centers and sizes.

    Args:
      yolo_summary_text: per-frame detection lines for one time interval.
      env_counts:        dict label -> count for the interval; an empty dict is
                         reported to the model as "no significant objects".
      model:             chat model name sent to the API.
      max_tokens:        upper bound on the length of the reply.

    Returns:
      The model's free-text answer as a string. An empty string is returned
      when the API delivers no choice or a message without text content, so
      callers can always slice/scan the result.

    Uses the module-level `client` (OpenAI client) for the request.
    """
    if env_counts:
        counts_lines = "\n".join([f"- {k}: {v}" for k, v in env_counts.items()])
    else:
        counts_lines = "No significant objects detected with sufficient confidence."

    prompt = f"""
You are a fine-tuned AI (text only) that classifies INDOOR environments using YOLO object
detection text. Each line below describes one video frame and the objects YOLO detected,
with confidence scores and bounding-box center/size information.

Your tasks:
1. Predict the most likely INDOOR environment type (e.g., classroom, office, lab,
   hallway, cafeteria, kitchen, lobby, colloquium room, workplace, study hall, bathroom, common area, etc.).
2. Explain your reasoning using:
   - detected object types
   - approximate counts (provided below)
   - bounding-box positions (center x/y)
   - bounding-box sizes (relative width/height)
   - spatial patterns (e.g., rows of chairs, scattered tables, etc.).
3. Provide a short list of the main objects with their approximate counts.

4.  -If there are many laptops/computers/monitors, weight more towards either a computer lab
   -If there are some laptops/computers/monitors, weight more towards either a computer lab or classroom
   -If there are tables/chairs in rows, weight more towards a classroom.
   -If there is a large number of tables/chairs, especially in clusters, weight more towards an
   auditorium or cafeteria.
   -Many chairs + couches implies a common area.
   -If there are very few objects or mostly empty space, consider a hallway or lobby.
   -A sink implies a bathroom unless there are other appliances

== YOLO detection summary for this time interval ==
{yolo_summary_text}

== Estimated object counts for this interval (max per label across frames) ==
{counts_lines}
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an assistant that reasons about INDOOR environments "
                    "from YOLO detection text."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
    )

    # The content of a chat message is optional in the API response; normalise
    # a missing answer to "" instead of handing None to the caller.
    if not response.choices:
        return ""
    return response.choices[0].message.content or ""


In [None]:
# Cell 6 – Run YOLO + GPT every 3 seconds on ONE video

def gpt_predictions_every_3s(
    model,
    video_path,
    interval_sec=3.0,
    conf_thresh=0.25,
    frame_skip=5,
    save_every=10,
):
    """
    Run YOLO on one video and ask GPT for an environment guess per time window.

    The video is cut into consecutive windows of `interval_sec` seconds
    ([0, T), [T, 2T), ...). The windows cover every analysed frame, including
    stretches in which YOLO detected nothing – those are exactly the cases the
    prompt's "few objects -> hallway/lobby" rule is meant for.

    Args:
      model:        YOLO model handed to run_yolo_video.
      video_path:   path of the video to analyse.
      interval_sec: window length in seconds (must be positive).
      conf_thresh:  detection confidence threshold passed to run_yolo_video.
      frame_skip:   analyse every `frame_skip`-th frame.
      save_every:   debug JPGs are written for analysed frames whose source
                    frame index is a multiple of this value.

    Returns:
      DataFrame with columns video, interval_idx, t_start, t_end, description –
      one row per window that contains at least one analysed frame. The columns
      are present even when there are no rows.

    Raises:
      ValueError: if interval_sec is not positive.
    """
    if interval_sec <= 0:
        raise ValueError(f"interval_sec must be positive, got {interval_sec!r}")

    print("\nRunning YOLO + GPT on:", video_path)

    _detections, frame_summaries, _frames_dir, _fps = run_yolo_video(
        model,
        video_path,
        save_every=save_every,
        conf_thresh=conf_thresh,
        keep_ids=KEEP_IDS,
        frame_skip=frame_skip,
    )

    columns = ["video", "interval_idx", "t_start", "t_end", "description"]
    if not frame_summaries:
        return pd.DataFrame(columns=columns)

    # The timeline comes from the analysed frames, not from the detections, so
    # detection-free videos/stretches are still classified. floor(...) + 1
    # makes sure the window containing the very last frame is included, also
    # when that frame sits exactly on a window boundary or at t = 0.
    last_t = max(fs["t_sec"] for fs in frame_summaries)
    n_intervals = int(math.floor(last_t / interval_sec)) + 1

    results = []
    for k in range(n_intervals):
        t0 = k * interval_sec
        t1 = (k + 1) * interval_sec

        interval_text, interval_counts = summarize_interval(frame_summaries, t0, t1)
        if not interval_text:
            continue

        desc = classify_environment_from_yolo_summary_text(interval_text, interval_counts)

        print(f"\n=== Interval {k} ({t0:.1f}s – {t1:.1f}s) ===")
        # `or ""` keeps the preview from failing if no text came back.
        print((desc or "")[:700], "\n")

        results.append({
            "video": os.path.basename(video_path),
            "interval_idx": k,
            "t_start": t0,
            "t_end": t1,
            "description": desc,
        })

    return pd.DataFrame(results, columns=columns)


In [None]:
# Cell 7 – Run YOLO + GPT with flexible mode

RUN_MODE = "single"        # "single" = one video, "all" = every entry of `videos`
SINGLE_VIDEO_INDEX = 9     # position in `videos` used when RUN_MODE == "single"

if RUN_MODE == "single":
    # Fail with a readable message instead of a bare IndexError when the
    # folder holds fewer videos than the chosen index assumes.
    if not -len(videos) <= SINGLE_VIDEO_INDEX < len(videos):
        raise IndexError(
            f"SINGLE_VIDEO_INDEX={SINGLE_VIDEO_INDEX} is out of range: "
            f"only {len(videos)} video(s) were found."
        )
    single_video = videos[SINGLE_VIDEO_INDEX]
    print("Running only this video:", single_video)
    selected_videos = [single_video]
elif RUN_MODE == "all":
    print("Running ALL videos...")
    selected_videos = list(videos)
else:
    # A typo must not silently trigger a (paid) run over every video.
    raise ValueError(f"Unknown RUN_MODE {RUN_MODE!r}; use 'single' or 'all'.")

per_video_preds = []
for v in selected_videos:
    df_preds = gpt_predictions_every_3s(det_model, v, interval_sec=3.0)
    per_video_preds.append(df_preds)

# save predictions
# pd.concat raises on an empty list, so an empty run yields an empty table
# that still carries the expected columns.
if per_video_preds:
    all_interval_preds = pd.concat(per_video_preds, ignore_index=True)
else:
    all_interval_preds = pd.DataFrame(
        columns=["video", "interval_idx", "t_start", "t_end", "description"]
    )
all_interval_preds.to_csv("gpt_interval_predictions.csv", index=False)

print("\nSaved predictions to gpt_interval_predictions.csv")




In [None]:
# Cell 8 – Load & peek at interval predictions (works for 1 or many videos)

import pandas as pd

# Prefer the predictions still held in memory from Cell 7 of this session;
# when that table is not available, read the CSV that Cell 7 wrote to disk.
if isinstance(globals().get("all_interval_preds"), pd.DataFrame):
    interval_df = all_interval_preds.copy()
    print("Loaded interval_df from memory. Shape:", interval_df.shape)
else:
    interval_df = pd.read_csv("gpt_interval_predictions.csv")
    print("Loaded interval_df from CSV. Shape:", interval_df.shape)

# Show the first rows of the columns every later cell relies on.
print("\nSample 3-second predictions:")
print(
    interval_df.loc[:, ["video", "interval_idx", "t_start", "t_end", "description"]].head()
)



In [None]:
# Cell 9 – Ground truth per video

# Hand-labelled environment of every recorded clip, keyed by file name
# (the base name of the video, as stored in the "video" column).
GROUND_TRUTH = dict((
    ("IMG_1709.MOV", "Cafeteria"),
    ("IMG_1710.MOV", "Cafeteria"),
    ("IMG_6404.MOV", "Computer Lab"),
    ("IMG_6405.MOV", "Computer Lab"),
    ("IMG_6406.MOV", "Hallway"),
    ("IMG_6407.MOV", "Classroom"),
    ("IMG_6409.MOV", "Office"),
    ("IMG_6410.MOV", "Office"),
    ("IMG_6411.MOV", "Office"),
    ("IMG_6412.MOV", "Office"),
    ("IMG_6413.MOV", "Lobby"),
    ("IMG_6414.MOV", "Study Hall"),
    ("IMG_6415.MOV", "Study Hall"),
    ("IMG_6416.MOV", "Colloquium"),
    ("IMG_6417.MOV", "Colloquium"),
    ("IMG_6418.MOV", "Dining Area"),
    ("IMG_6419.MOV", "Study Hall"),
    ("IMG_6420.MOV", "Kitchen"),
    ("IMG_6429.MOV", "Classroom"),
    ("IMG_6432.MOV", "Classroom"),
    ("IMG_4049.MOV", "Bedroom"),
    ("IMG_4051.MOV", "Bathroom"),
    ("IMG_4052.MOV", "Kitchen"),
    ("IMG_4055.MOV", "Living Room"),
    ("IMG_4057.MOV", "Bathroom"),
))

print("Ground truth for", len(GROUND_TRUTH), "videos loaded.")


In [None]:
# Cell 10 – Canonical environments, synonyms, and label extractor

import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

# 1) Canonical set of environments we care about.
#    The order matters: it decides ties in extract_env_label and supplies the
#    fallback label CANON_ENVS[0]; new entries are therefore appended.
CANON_ENVS = [
    "Cafeteria",
    "Classroom",
    "Computer Lab",
    "Dining Area",
    "Hallway",
    "Kitchen",
    "Lobby",
    "Lounge",        # instead of "Living Room"
    "Office",
    "Study Hall",
    "Workplace",
    "Colloquium",
    # The GPT prompt offers "bathroom" as a candidate and the ground truth
    # contains Bathroom and Bedroom videos, so both must be predictable.
    "Bathroom",
    "Bedroom",
]

# 2) Synonyms / phrases that GPT might use -> map to the canon labels.
#    Keys are lower-case because they are matched against lower-cased text;
#    every value has to be a member of CANON_ENVS.
ENV_SYNONYMS = {
    # Cafeteria / dining
    "cafeteria": "Cafeteria",
    "cafe": "Cafeteria",
    "café": "Cafeteria",
    "food court": "Cafeteria",

    "dining area": "Dining Area",
    "dining room": "Dining Area",

    # Computer lab / lab
    "computer lab": "Computer Lab",
    "lab": "Computer Lab",
    "laboratory": "Computer Lab",

    # Classroom-ish
    "classroom": "Classroom",
    "lecture hall": "Classroom",

    # Hallway
    "hallway": "Hallway",
    "corridor": "Hallway",

    # Kitchen
    "kitchen": "Kitchen",

    # Lobby / reception
    "lobby": "Lobby",
    "reception": "Lobby",

    # Lounge – replaces "living room"
    "lounge": "Lounge",
    "lounge area": "Lounge",
    "informal lounge": "Lounge",
    "informal lounge area": "Lounge",
    "living room": "Lounge",   # map living room -> Lounge

    # Offices / workspaces
    "office": "Office",
    "workspace": "Workplace",
    "workplace": "Workplace",

    # Study spaces
    "study hall": "Study Hall",
    "study space": "Study Hall",
    "study area": "Study Hall",

    # Colloquium
    "colloquium room": "Colloquium",
    "colloquium": "Colloquium",

    # Bathroom
    "bathroom": "Bathroom",
    "restroom": "Bathroom",
    "washroom": "Bathroom",

    # Bedroom
    "bedroom": "Bedroom",
}


def extract_env_label(text: str) -> str:
    """
    Pick the canonical environment a GPT description talks about most.

    Every mention of a phrase from ENV_SYNONYMS counts as one vote for the
    environment it maps to; the environment with the most votes wins. Ties go
    to the environment listed first in CANON_ENVS.

    Matching rules:
      - case-insensitive, whole words/phrases only, so "lab" is not found
        inside "label" or "available";
      - a plural ending ("s"/"es") is accepted, e.g. "offices", "labs";
      - at each position the longest phrase wins and matches do not overlap,
        so "computer lab" is one vote, not one for "computer lab" plus one
        for "lab".

    Returns 'UNKNOWN' if `text` is not a string or contains no known phrase.
    """
    if not isinstance(text, str):
        return "UNKNOWN"

    # Longest phrases first: regex alternation takes the first alternative
    # that fits, which makes the longest synonym win at each position.
    phrases = sorted((p for p in ENV_SYNONYMS if p), key=len, reverse=True)
    if not phrases:
        return "UNKNOWN"

    matcher = re.compile(
        r"(?<!\w)(" + "|".join(re.escape(p) for p in phrases) + r")(?:e?s)?(?!\w)"
    )

    scores = {env: 0 for env in CANON_ENVS}
    for match in matcher.finditer(text.lower()):
        env = ENV_SYNONYMS[match.group(1)]
        # .get() tolerates a synonym whose target is missing from CANON_ENVS.
        scores[env] = scores.get(env, 0) + 1

    best_env = "UNKNOWN"
    best_score = 0
    for env, score in scores.items():
        if score > best_score:
            best_env, best_score = env, score

    return best_env


In [None]:
# Cell 11 – Interval-level + video-level accuracy & confusion matrices

# ---- Helpers ----------------------------------------------------------------
def _canonical_truth(video_names):
    """
    Map video file names to ground-truth labels in canonical form.

    Ground-truth labels are passed through ENV_SYNONYMS (case-insensitively),
    the same table the predictions come from. Without this a truth label such
    as "Living Room" could never equal its prediction, which is reported as
    "Lounge". Labels without a synonym entry are kept unchanged; videos that
    are missing from GROUND_TRUTH stay NaN.
    """
    return video_names.map(GROUND_TRUTH).map(
        lambda env: ENV_SYNONYMS.get(env.lower(), env),
        na_action="ignore",
    )


def _majority_label(labels):
    """Return the first value of labels.mode(), or CANON_ENVS[0] if mode() is empty."""
    modes = labels.mode()
    return modes.iloc[0] if not modes.empty else CANON_ENVS[0]


def _plot_confusion(true_labels, pred_labels, title):
    """Plot a confusion-matrix heatmap over the labels that actually occur; return (matrix, labels)."""
    labels = sorted(set(true_labels) | set(pred_labels))
    matrix = confusion_matrix(true_labels, pred_labels, labels=labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        xticklabels=labels,
        yticklabels=labels,
    )
    plt.xlabel("Predicted environment")
    plt.ylabel("True environment")
    plt.title(title)
    plt.tight_layout()
    plt.show()
    return matrix, labels


# ---- 1) Add predicted + true environment labels to each 3-second window ----
interval_df = interval_df.copy()

interval_df["pred_env"] = interval_df["description"].apply(extract_env_label)
interval_df["true_env"] = _canonical_truth(interval_df["video"])

print("Sample 3-second predictions:")
print(
    interval_df[["video", "interval_idx", "t_start", "t_end", "true_env", "pred_env"]]
    .head()
)

# Keep only rows where we know the ground-truth env
valid_int = interval_df.dropna(subset=["true_env"])
if valid_int.empty:
    raise ValueError(
        "None of the predicted intervals belongs to a video listed in GROUND_TRUTH; "
        "nothing to evaluate."
    )

# -------- Interval-level accuracy --------
acc_int = accuracy_score(valid_int["true_env"], valid_int["pred_env"])
print("\nInterval-level overall accuracy (3-sec windows):", acc_int)

# Use only envs that actually appear in either true or predicted labels
cm_int, used_int_labels = _plot_confusion(
    valid_int["true_env"],
    valid_int["pred_env"],
    "Interval-level Confusion Matrix (all 3-second predictions)",
)

# -------- 2) Video-level majority vote over intervals --------
video_env = (
    valid_int
    .groupby("video")["pred_env"]
    .agg(_majority_label)
    .reset_index()
    .rename(columns={"video": "basename"})
)

video_env["true_env"] = _canonical_truth(video_env["basename"])

print("\nVideo-level predictions (majority over 3-second intervals):")
print(video_env)

valid_videos = video_env.dropna(subset=["true_env"])
video_acc = accuracy_score(valid_videos["true_env"], valid_videos["pred_env"])
print("\nVideo-level overall accuracy:", video_acc)

cm_video, used_video_labels = _plot_confusion(
    valid_videos["true_env"],
    valid_videos["pred_env"],
    "Video-level Confusion Matrix (majority over 3-second intervals)",
)


In [None]:
# Cell 12 – Display some annotated debug frames

import glob
import cv2
import matplotlib.pyplot as plt

def display_frames(frames_dir, max_frames=12):
    """
    Show annotated debug frames from `frames_dir` in a four-column grid.

    The JPG files of the directory are sorted by name and the first
    `max_frames` of them are plotted, each titled with its file name.
    If the directory holds no JPG, a message is printed and nothing is drawn.
    """
    jpg_pattern = os.path.join(frames_dir, "*.jpg")
    files = sorted(glob.glob(jpg_pattern))[:max_frames]
    if len(files) == 0:
        print("No frames found in", frames_dir)
        return

    n_cols = 4
    n_rows = (len(files) + n_cols - 1) // n_cols   # ceiling division

    plt.figure(figsize=(20, 4 * n_rows))
    for position, path in enumerate(files, start=1):
        # OpenCV loads images as BGR; matplotlib expects RGB.
        rgb = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        plt.subplot(n_rows, n_cols, position)
        plt.imshow(rgb)
        plt.title(os.path.basename(path))
        plt.axis("off")

    plt.show()

# Example: display frames of the first video that has debug frames on disk.
# In "single" run mode only the selected video is processed, so the first
# listed video does not necessarily have a "<name>_frames_debug" directory;
# the first video's directory is used as fallback when none exists yet.
if videos:
    _debug_dirs = [
        os.path.splitext(os.path.basename(video_file))[0] + "_frames_debug"
        for video_file in videos
    ]
    first = next((d for d in _debug_dirs if os.path.isdir(d)), _debug_dirs[0])
    display_frames(first, max_frames=12)
else:
    print("No videos available.")


In [None]:
# Cell 13 – GIF creation

import numpy as np
import imageio
from IPython.display import Image as IImage, display as idisplay

def make_gif(frames_dir, gif_path="annotated.gif", fps=10):
    """
    Assemble the JPG frames of `frames_dir` into an animated GIF and show it.

    Frames are taken in file-name order and resized to the size of the first
    frame. The GIF is written to `gif_path` and displayed inline. If the
    directory holds no JPG, a message is printed and nothing is written.

    NOTE(review): the per-frame `duration` is passed as 1 / fps, i.e. in
    seconds; newer imageio releases may interpret GIF durations in
    milliseconds – confirm against the installed version.
    """
    files = sorted(glob.glob(os.path.join(frames_dir, "*.jpg")))
    if len(files) == 0:
        print("No frames found.")
        return

    # The first frame defines the size every frame is resized to.
    first_frame = imageio.imread(files[0])
    H, W = first_frame.shape[:2]

    frames = [cv2.resize(imageio.imread(path), (W, H)) for path in files]

    imageio.mimsave(gif_path, frames, duration=1 / fps)
    print("GIF created:", gif_path)
    idisplay(IImage(filename=gif_path))

# Example: build a GIF from the first video that has debug frames on disk.
# In "single" run mode only the selected video is processed, so the first
# listed video does not necessarily have a "<name>_frames_debug" directory;
# the first video's directory is used as fallback when none exists yet.
if videos:
    _debug_dirs = [
        os.path.splitext(os.path.basename(video_file))[0] + "_frames_debug"
        for video_file in videos
    ]
    first = next((d for d in _debug_dirs if os.path.isdir(d)), _debug_dirs[0])
    make_gif(first, "annotated.gif", fps=10)
else:
    print("No videos available.")
