# Ironsite Spatial Awareness Pipeline

**Video → Undistort → Grounded SAM 2 → VGGT-X → Scene Graphs → FAISS Memory → VLM**

Processes body-camera video from construction workers and produces structured spatial data (detections, depth, camera trajectory, and scene graphs) for LLM-based activity analysis.

---
**Setup:** Run Cell 1 first (one-time install per instance), then run the remaining cells in order. Cell 9 (VLM) is optional and is skipped when no API key is configured.

In [None]:
# ============================================================
# Cell 1: Setup & Installation (run once per instance)
# ============================================================
# Installs the Python dependencies, clones the two external repositories the
# pipeline uses (Grounded-SAM-2 and VGGT-X) into the current working
# directory, and prints a short GPU / PyTorch report.
# Lines starting with `!` are shell escapes and lines starting with `%` are
# IPython magics, so this cell only runs inside a notebook kernel.
# NOTE(review): `subprocess` and `sys` are imported but not used in this cell.
import subprocess, os, sys

# Install dependencies
!pip install -q torch torchvision torchaudio
!pip install -q transformers supervision opencv-python-headless numpy plotly matplotlib
!pip install -q huggingface_hub openai scipy Pillow
!pip install -q pycolmap
# Try faiss-gpu first (its stderr is discarded); fall back to faiss-cpu if
# that install command fails.
!pip install -q faiss-gpu 2>/dev/null || pip install -q faiss-cpu

# Clone & install Grounded SAM 2
# The directory's existence is the only "already installed" check, so a
# partially failed clone/install is not retried automatically.
if not os.path.exists("Grounded-SAM-2"):
    !git clone https://github.com/IDEA-Research/Grounded-SAM-2.git
    %cd Grounded-SAM-2
    # Editable install with SAM2_BUILD_CUDA=0 set in the environment
    # (presumably disables building the CUDA extension — TODO confirm).
    !SAM2_BUILD_CUDA=0 pip install -e ".[notebooks]" -q
    %cd checkpoints
    !bash download_ckpts.sh
    # Two levels up from Grounded-SAM-2/checkpoints: back to the starting directory.
    %cd ../..
else:
    print("Grounded-SAM-2 already installed")

# Clone & install VGGT-X
if not os.path.exists("VGGT-X"):
    # --recursive also fetches the repository's git submodules.
    !git clone --recursive https://github.com/Linketic/VGGT-X.git
    !pip install -q -r VGGT-X/requirements.txt
else:
    print("VGGT-X already installed")

# GPU check
import torch
if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    # total_memory is reported in bytes; dividing by 1e9 gives decimal gigabytes.
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {name} | VRAM: {vram:.1f} GB | BF16: {torch.cuda.is_bf16_supported()}")
else:
    print("WARNING: No GPU detected!")
print(f"PyTorch: {torch.__version__}")
print("\nSetup complete!")

In [None]:
# ============================================================
# Cell 2: Configuration
# ============================================================
# Edit these settings before running the pipeline.
# Every later cell reads its parameters from the constants defined here.

# Imported here (not only in Cell 1) so this cell still works after a kernel
# restart, when the one-time setup cell is skipped.
import os

# --- INPUT ---
VIDEO_PATH = "videos/IronsiteHackathonData/11_prep_masonry.mp4"  # <-- change this
OUTPUT_DIR = "output"

# --- VIDEO PREPROCESSING ---
KEYFRAME_INTERVAL = 10   # extract every Nth frame (10 = ~0.67s at 15fps)
MAX_FRAMES = 0           # 0 = no limit
# Fisheye undistortion parameters, forwarded to extract_keyframes() in Cell 3.
FISHEYE_K_SCALE = 0.8
FISHEYE_D = [-0.3, 0.1, 0.0, 0.0]
FISHEYE_BALANCE = 0.5

# --- GROUNDED SAM 2 ---
# Detection classes, written as a " . "-separated prompt string.
TEXT_PROMPT = (
    "person . concrete block . cinder block . rebar . trowel . bucket . "
    "hard hat . safety vest . gloved hand . scaffolding . crane . "
    "mortar . pipe . wall . ladder . wheelbarrow"
)
DETECTION_THRESHOLD = 0.3
REDETECT_EVERY = 50
# NOTE(review): TRACK_CHUNK_SIZE is defined here but is not passed to
# run_grounded_sam2() in Cell 4 — confirm whether it is still needed.
TRACK_CHUNK_SIZE = 100
SAM2_CHECKPOINT = "Grounded-SAM-2/checkpoints/sam2.1_hiera_small.pt"
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_s.yaml"

# --- VGGT-X ---
VGGTX_DIR = "VGGT-X"
VGGTX_CHUNK_SIZE = 512    # reduce to 128-256 if OOM
VGGTX_MAX_QUERY_PTS = 2048
VGGTX_SHARED_CAMERA = True
VGGTX_USE_GA = True       # global alignment
VGGTX_SAVE_DEPTH = True

# --- VLM ---
GROK_API_KEY = ""         # <-- paste your xAI/Grok key here (or leave empty to skip)
if not GROK_API_KEY:
    # Fall back to the environment so the key does not have to be stored in
    # the notebook; stays empty (VLM skipped) when the variable is unset.
    GROK_API_KEY = os.environ.get("XAI_API_KEY", "")
GROK_MODEL = "grok-3-fast"
GROK_BASE_URL = "https://api.x.ai/v1"

print(f"Video: {VIDEO_PATH}")
print(f"Exists: {os.path.exists(VIDEO_PATH)}")

In [None]:
# ============================================================
# Cell 3: Video Preprocessing — Fisheye Undistortion + Keyframes
# ============================================================
# Extracts keyframes from VIDEO_PATH into <OUTPUT_DIR>/scene/images using the
# fisheye settings from Cell 2, then previews up to six evenly spaced frames.
import time
import numpy as np
import matplotlib.pyplot as plt
from utils.preprocess import extract_keyframes

# Scene directory for VGGT-X (expects images/ subdirectory)
scene_dir = os.path.join(OUTPUT_DIR, "scene")
frames_dir = os.path.join(scene_dir, "images")

t0 = time.time()
keyframes, timestamps, frame_indices, fps, w, h = extract_keyframes(
    VIDEO_PATH, frames_dir,
    interval=KEYFRAME_INTERVAL,
    k_scale=FISHEYE_K_SCALE,
    D=FISHEYE_D,
    balance=FISHEYE_BALANCE,
    max_frames=MAX_FRAMES,
)
print(f"Completed in {time.time() - t0:.1f}s")

# Show sample keyframes
n_show = min(6, len(keyframes))
# Evenly spaced keyframe indices; an empty array when there are no keyframes.
sample_idx = np.linspace(0, len(keyframes) - 1, n_show, dtype=int)
if n_show:
    # squeeze=False keeps `axes` two-dimensional, so a single keyframe does not
    # come back as a bare (non-iterable) Axes object.
    fig, axes = plt.subplots(1, n_show, figsize=(4 * n_show, 4), squeeze=False)
    for ax, si in zip(axes[0], sample_idx):
        ax.imshow(keyframes[si])
        ax.set_title(f"Frame {frame_indices[si]} | t={timestamps[si]:.1f}s")
        ax.axis("off")
    plt.suptitle(f"Undistorted Keyframes ({len(keyframes)} total)", fontsize=14)
    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() rejects zero columns, so report the problem instead.
    print("No keyframes were extracted — check VIDEO_PATH and KEYFRAME_INTERVAL.")

In [None]:
# ============================================================
# Cell 4: Grounded SAM 2 — Detection + Segmentation + Tracking
# ============================================================
# Runs run_grounded_sam2() over the keyframes from Cell 3, then previews the
# bounding boxes on up to five frames and the masks of up to four detections
# from the first sampled frame.
import torch
from utils.detection import run_grounded_sam2

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

t0 = time.time()
all_detections, object_labels = run_grounded_sam2(
    keyframes, frames_dir, device,
    text_prompt=TEXT_PROMPT,
    threshold=DETECTION_THRESHOLD,
    redetect_every=REDETECT_EVERY,
    sam2_checkpoint=SAM2_CHECKPOINT,
    sam2_config=SAM2_CONFIG,
)
print(f"\nCompleted in {time.time() - t0:.1f}s")

# Show detections on sample frames
n_show = min(5, len(keyframes))
# Evenly spaced keyframe indices; an empty array when there are no keyframes.
sample_idx = np.linspace(0, len(keyframes) - 1, n_show, dtype=int)

if n_show:
    # squeeze=False keeps `axes` two-dimensional, so a single keyframe does not
    # come back as a bare (non-iterable) Axes object.
    fig, axes = plt.subplots(1, n_show, figsize=(5 * n_show, 5), squeeze=False)

    for ax, si in zip(axes[0], sample_idx):
        ax.imshow(keyframes[si])
        for det in all_detections[si]:
            x1, y1, x2, y2 = det["bbox"]
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                 linewidth=2, edgecolor='lime', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, y1 - 3, det["label"], color='yellow', fontsize=7,
                    weight='bold', bbox=dict(boxstyle='round,pad=0.1',
                    facecolor='black', alpha=0.7))
        ax.set_title(f"t={timestamps[si]:.1f}s | {len(all_detections[si])} dets")
        ax.axis("off")
    plt.suptitle("Grounded SAM 2 Detections", fontsize=14)
    plt.tight_layout()
    plt.show()

    # Show segmentation masks for first sample
    si = sample_idx[0]
    if all_detections[si]:
        mask_dets = all_detections[si][:4]  # preview at most four detections
        fig, axes = plt.subplots(1, len(mask_dets), figsize=(16, 4), squeeze=False)
        for ax, det in zip(axes[0], mask_dets):
            # Detections without a mask leave their panel blank.
            if det.get("mask") is not None:
                ax.imshow(det["mask"], cmap='viridis')
                ax.set_title(f"{det['label']} (id={det['id']})")
            ax.axis("off")
        plt.suptitle(f"Segmentation Masks (frame {si})", fontsize=12)
        plt.tight_layout()
        plt.show()
else:
    # plt.subplots() rejects zero columns and sample_idx[0] would not exist.
    print("No keyframes available — skipping detection previews.")

In [None]:
# ============================================================
# Cell 5: VGGT-X — 3D Reconstruction + Depth + Trajectory
# ============================================================
# Runs run_full_3d_pipeline() on the scene directory from Cell 3, then shows up
# to five sampled keyframes above their cached depth maps.
from utils.depth import run_full_3d_pipeline

t0 = time.time()
recon_data = run_full_3d_pipeline(
    scene_dir=scene_dir,
    vggtx_dir=VGGTX_DIR,
    chunk_size=VGGTX_CHUNK_SIZE,
    max_query_pts=VGGTX_MAX_QUERY_PTS,
    shared_camera=VGGTX_SHARED_CAMERA,
    use_ga=VGGTX_USE_GA,
    save_depth=VGGTX_SAVE_DEPTH,
    num_keyframes=len(keyframes),
)
print(f"\nCompleted in {time.time() - t0:.1f}s")

# Show depth maps
n_show = min(5, len(keyframes))
# Evenly spaced keyframe indices; an empty array when there are no keyframes.
sample_idx = np.linspace(0, len(keyframes) - 1, n_show, dtype=int)

if n_show:
    # squeeze=False guarantees a (2, n_show) array, so axes[row, col] also works
    # when only one keyframe is shown.
    fig, axes = plt.subplots(2, n_show, figsize=(5 * n_show, 8), squeeze=False)

    for col, si in enumerate(sample_idx):
        axes[0, col].imshow(keyframes[si])
        axes[0, col].set_title(f"t={timestamps[si]:.1f}s")
        axes[0, col].axis("off")

        # Depth maps are looked up by a zero-padded keyframe index file name.
        fname = f"{si:06d}.jpg"
        depth = recon_data["depth_map_cache"].get(fname)
        if depth is not None:
            im = axes[1, col].imshow(depth, cmap='turbo')
            axes[1, col].set_title(f"Depth: {depth.min():.1f}-{depth.max():.1f}m")
            plt.colorbar(im, ax=axes[1, col], fraction=0.046)
        else:
            axes[1, col].text(0.5, 0.5, "No depth", ha='center', va='center',
                              transform=axes[1, col].transAxes)
        axes[1, col].axis("off")

    plt.suptitle("Keyframes + VGGT-X Depth Maps", fontsize=14)
    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() rejects zero columns, so there is nothing to draw.
    print("No keyframes available — skipping depth preview.")

In [None]:
# ============================================================
# Cell 6: 3D Point Cloud + Worker Trajectory (Interactive)
# ============================================================
# Draws the reconstructed point cloud together with the smoothed camera path
# in a single interactive plotly figure.
import plotly.graph_objects as go

pts = recon_data["points_xyz"]
cam = recon_data["cam_positions_smooth"]

# Start from the full cloud and thin it to 30000 random points when larger,
# which keeps the browser responsive.
viz_pts, viz_rgb = pts, recon_data["points_rgb"]
if len(pts) > 30000:
    idx = np.random.choice(len(pts), 30000, replace=False)
    viz_pts, viz_rgb = viz_pts[idx], viz_rgb[idx]

fig = go.Figure()

# Point cloud: one CSS-style colour string per point
if len(viz_pts) > 0:
    colors = [f'rgb({red},{green},{blue})' for red, green, blue in viz_rgb]
    cloud_marker = dict(size=1, color=colors, opacity=0.5)
    fig.add_trace(go.Scatter3d(
        x=viz_pts[:, 0], y=viz_pts[:, 1], z=viz_pts[:, 2],
        mode='markers',
        marker=cloud_marker,
        name='Point Cloud'
    ))

# Worker trajectory, with diamond markers on its first and last positions
if len(cam) > 0:
    fig.add_trace(go.Scatter3d(
        x=cam[:, 0], y=cam[:, 1], z=cam[:, 2],
        mode='lines+markers',
        marker=dict(size=3, color='red'),
        line=dict(color='red', width=4),
        name='Worker Path'
    ))
    for endpoint_name, endpoint_row, endpoint_color in (
        ('Start', 0, 'green'),
        ('End', -1, 'blue'),
    ):
        fig.add_trace(go.Scatter3d(
            x=[cam[endpoint_row, 0]],
            y=[cam[endpoint_row, 1]],
            z=[cam[endpoint_row, 2]],
            mode='markers',
            marker=dict(size=8, color=endpoint_color, symbol='diamond'),
            name=endpoint_name
        ))

fig.update_layout(
    title=f"3D Workspace ({len(pts)} points) + Worker Trajectory ({recon_data['total_distance']:.1f}m)",
    width=900, height=700,
    scene=dict(aspectmode='data')
)
fig.show()

In [None]:
# ============================================================
# Cell 7: Scene Graph Builder
# ============================================================
# Builds one scene graph per keyframe from the detections (Cell 4) and the
# reconstruction data (Cell 5), then pretty-prints the middle one as a sample.
import json

from utils.scene_graph import build_scene_graphs

t0 = time.time()
scene_graphs = build_scene_graphs(
    keyframes, all_detections, recon_data, timestamps, frame_indices
)
print(f"Completed in {time.time() - t0:.1f}s")

# Show a sample scene graph
if len(scene_graphs) > 0:
    sample_sg = scene_graphs[len(scene_graphs) // 2]  # middle frame
    print(f"\n--- Sample Scene Graph (frame {sample_sg['original_frame']}, t={sample_sg['timestamp_str']}) ---")
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(sample_sg, indent=2, default=str))
else:
    # Indexing the middle element of an empty result would raise IndexError.
    print("\nNo scene graphs were produced — nothing to preview.")

In [None]:
# ============================================================
# Cell 8: FAISS Spatial Memory
# ============================================================
# Ingests the scene graphs into a SpatialMemory store kept under
# <OUTPUT_DIR>/memory_store, saves it, and runs a few example queries.
from utils.memory import SpatialMemory

memory_dir = os.path.join(OUTPUT_DIR, "memory_store")
memory = SpatialMemory(memory_dir)
memory.ingest(scene_graphs, VIDEO_PATH)
memory.save()

print(f"\nMemory stats: {memory.stats()}")

# --- Demo queries ---
print("\n--- Spatial Queries ---")

# Label lookups
blocks = memory.query_label("block")
print("Frames with 'block':", len(blocks))

rebar = memory.query_label("rebar")
print("Frames with 'rebar':", len(rebar))

# Depth-range lookup
close = memory.query_depth_range(0.5, 3.0)
print("Objects in work range (0.5-3m):", len(close), "frames")

# Pairwise proximity lookups
person_near_block = memory.query_proximity("person", "block", max_m=2.0)
print("Person near block (<2m):", len(person_near_block), "frames")

hand_near_block = memory.query_proximity("hand", "block", max_m=1.0)
print("Hand near block (<1m):", len(hand_near_block), "frames")

In [None]:
# ============================================================
# Cell 9: VLM Reasoning via Grok (optional)
# ============================================================
# Leaves `analysis_json` as an empty dict unless an API key is configured, in
# which case it holds the result of run_vlm_analysis().
analysis_json = {}

if not GROK_API_KEY:
    print("Skipping VLM — set GROK_API_KEY in Cell 2 to enable.")
    print("The pipeline still produces structured spatial data without VLM.")
else:
    # Imported only on this path, so the VLM module is not loaded when skipped.
    from utils.vlm import run_vlm_analysis

    vlm_options = dict(
        model=GROK_MODEL,
        base_url=GROK_BASE_URL,
        num_samples=30,
        temperature=0.3,
        max_tokens=4000,
    )

    t0 = time.time()
    analysis_json = run_vlm_analysis(
        scene_graphs, VIDEO_PATH, GROK_API_KEY, **vlm_options
    )
    print(f"\nCompleted in {time.time() - t0:.1f}s")

In [None]:
# ============================================================
# Cell 10: Visualizations
# ============================================================
# Calls the plotting helpers with OUTPUT_DIR and then displays the PNG files
# found there inline.
from utils.visualize import (
    plot_annotated_frames, plot_3d_scene, plot_trajectory_topdown,
    plot_activity_timeline, plot_object_frequency, export_results
)

# One depth map (or None when missing) per keyframe, looked up by the
# zero-padded keyframe index file name.
depth_cache = recon_data["depth_map_cache"]
depth_maps_list = [
    depth_cache.get(f"{frame_no:06d}.jpg") for frame_no in range(len(keyframes))
]

# Annotated frames + depth
plot_annotated_frames(keyframes, scene_graphs, depth_maps_list, timestamps, OUTPUT_DIR)

# Top-down trajectory
plot_trajectory_topdown(recon_data["cam_positions_smooth"], OUTPUT_DIR)

# Object frequency
plot_object_frequency(scene_graphs, OUTPUT_DIR)

# Activity timeline (if VLM was run)
if analysis_json:
    plot_activity_timeline(analysis_json, OUTPUT_DIR)

# Show the saved plots inline, silently skipping any file that is not there
for img_name in ("annotated_frames.png", "trajectory_topdown.png", "object_frequency.png"):
    path = os.path.join(OUTPUT_DIR, img_name)
    if not os.path.exists(path):
        continue
    from IPython.display import display, Image as IPImage
    display(IPImage(filename=path))

In [None]:
# ============================================================
# Cell 11: Export Results
# ============================================================
# Passes the pipeline outputs to export_results() together with OUTPUT_DIR and
# prints a list of the expected result files.

# Imported here as well so this cell does not depend on Cell 10 having run.
from utils.visualize import export_results

summary = export_results(
    scene_graphs, analysis_json, recon_data["cam_positions_smooth"],
    object_labels, timestamps, VIDEO_PATH, OUTPUT_DIR
)

print("\n" + "=" * 60)
print("PIPELINE COMPLETE")
print("=" * 60)
print(f"\nResults saved to: {os.path.abspath(OUTPUT_DIR)}/")
# Plain strings below: no placeholders, so the f-prefix was unnecessary.
# NOTE(review): this list is printed unconditionally — confirm that
# vlm_analysis.json is still written when the VLM step was skipped.
print("  scene_graphs.json    — structured spatial data for LLM")
print("  vlm_analysis.json    — activity classification")
print("  camera_trajectory.npy — worker path")
print("  summary.json         — overview stats")
print("  memory_store/        — FAISS queryable memory")
print("  *.png, *.html        — visualizations")

In [None]:
# ============================================================
# Cell 12: Interactive Memory Queries (run anytime after Cell 8)
# ============================================================
# Re-run this cell as often as you like with different settings; it only reads
# from the `memory` object created in Cell 8.

# Change these to explore:
QUERY_LABEL = "block"        # search for any label
DEPTH_MIN = 0.5              # meters
DEPTH_MAX = 2.0
PROX_A = "person"            # proximity: label A near label B
PROX_B = "block"
PROX_MAX_M = 2.0

# 1) Label search, with the matching detections of the first hit as an example
print(f"--- Label query: '{QUERY_LABEL}' ---")
label_hits = memory.query_label(QUERY_LABEL)
print(f"  Found in {len(label_hits)} frames")
if label_hits:
    first_hit = label_hits[0]
    print(f"  Example: frame {first_hit['frame_idx']} at {first_hit['timestamp_str']}")
    needle = QUERY_LABEL.lower()  # case-insensitive substring match
    for det in first_hit['detections']:
        if needle not in det['label'].lower():
            continue
        print(f"    {det['label']}: depth={det['depth_m']}m, pos={det['position_3d']}")

# 2) Depth-range search
print(f"\n--- Depth range: {DEPTH_MIN}-{DEPTH_MAX}m ---")
depth_hits = memory.query_depth_range(DEPTH_MIN, DEPTH_MAX)
print(f"  {len(depth_hits)} frames with objects in range")

# 3) Proximity search; only the first three hits are listed
print(f"\n--- Proximity: '{PROX_A}' within {PROX_MAX_M}m of '{PROX_B}' ---")
prox_hits = memory.query_proximity(PROX_A, PROX_B, max_m=PROX_MAX_M)
print(f"  {len(prox_hits)} frames")
for hit in prox_hits[:3]:
    # '_dist' may be absent from a hit, in which case '?' is shown.
    print(f"    frame {hit['frame_idx']} at {hit['timestamp_str']} — dist={hit.get('_dist', '?')}m")

In [None]:
# ============================================================
# Cell 13: Batch Processing — Run on Multiple Videos
# ============================================================
# Lists every .mp4 file in VIDEO_DIR. The batch run itself is left commented
# out below and has to be enabled by hand.
import glob

VIDEO_DIR = "videos/IronsiteHackathonData"
BATCH_OUTPUT = "output_batch"

# Sorted so the listing (and any batch run) is in a stable, alphabetical order.
video_pattern = os.path.join(VIDEO_DIR, "*.mp4")
videos = sorted(glob.glob(video_pattern))
print(f"Found {len(videos)} videos:")
for video_file in videos:
    print(f"  {os.path.basename(video_file)}")

# Uncomment below to run the full pipeline on all videos:
# for video_path in videos:
#     name = os.path.splitext(os.path.basename(video_path))[0]
#     out = os.path.join(BATCH_OUTPUT, name)
#     print(f"\n{'='*60}")
#     print(f"Processing: {name}")
#     print(f"{'='*60}")
#     !python pipeline.py --video "{video_path}" --output "{out}" --skip-vlm
#     print(f"Done: {name}")