In [None]:
%reload_ext autoreload
%autoreload 2

# db: simple read test
import json
import yaml
import types
import open_clip
import torch
import cv2
import numpy as np
import matplotlib as mpl
from scipy.spatial import KDTree

from utils.db_utils import get_df, get_data, connect_db, DB
from utils.plotly_utils import *
from utils.vis import *
from utils.predict_scenegraph import PredictSceneGraph
from utils.imagine_nav_planner import ImagineNavPlanner

# Root folder of the experiment dump; later cells read `args.json` and the
# experiment config from here.
dump_folder = 'dump/hm3d_tradeoff_path_sigmoid_schedule'
# Sub-folder holding `result.db` and the per-episode `steps/<episode>.db` files
# that the cells below open.
output_folder = f'{dump_folder}/objectnav-dino'

# list db size (IPython shell escape; `$output_folder` is interpolated from the kernel namespace)
! ls -lh $output_folder

# load results: the `result` table of result.db
results = get_df(f'{output_folder}/result.db', 'result')
print(f'Loaded {len(results)} results')
# NOTE(review): this divides the `success` value of the LAST row by the number of rows,
# which is only a success rate if `success` stores a running count - confirm against the writer.
print(f'Current success rate: {results.tail(1)["success"].values[0]/len(results):.2%}')
# Mean of the `spl` column over all loaded rows.
print(f'Current SPL: {results["spl"].mean():.2f}')


In [None]:
# load agent modules
device = torch.device("cuda")
# Read the saved run arguments inside a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open(f'{dump_folder}/args.json') as f:
    args = types.SimpleNamespace(**json.load(f))
# The experiment config path is stored relative to the dump folder in `args.exp_config`.
with open(f'{dump_folder}/{args.exp_config}') as f:
    exp_config = yaml.safe_load(f)
# OpenCLIP ViT-H-14 with the laion2b_s32b_b79k weights, moved to `device` and cast to half precision.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    "ViT-H-14", "laion2b_s32b_b79k"
)
clip_model = clip_model.to(device).half()
clip_tokenizer = open_clip.get_tokenizer("ViT-H-14")
# (model, preprocess, tokenizer) triple handed to the planner here and to PredictSceneGraph later.
clip_model_list = (clip_model, clip_preprocess, clip_tokenizer)
imagine_nav_planner = ImagineNavPlanner(args, exp_config, clip_model_list)

In [None]:
# access db: print the `result` table object of result.db
import os
import pathlib
with DB(f'{output_folder}/result.db') as con:
    table = con.table('result')
    print(table)
# episode infos
# Unfiltered variant of the query below, kept for switching back to all episodes:
# steps_df = get_df(f'{output_folder}/result.db', 'result', select=['count_steps', 'episode', 'target', 'habitat_success', 'switch_upstair_count', 'switch_downstair_count'])
# Keep only episodes whose `count_steps` exceeds 490; the evaluation cell later reads step 490
# of one of these episodes.
steps_df = get_df(f'{output_folder}/result.db', 'result', filter=lambda x:x['count_steps']>490, select=['count_steps', 'episode', 'target', 'habitat_success', 'switch_upstair_count', 'switch_downstair_count'])
print(steps_df.head(30))
# step infos: print the `step_data` table of the first selected episode's step database
# (raises IndexError if the filter above matched no episode)
sample_episode_label = steps_df['episode'].values[0]
with DB(f'{output_folder}/steps/{sample_episode_label}.db') as con:
    table = con.table('step_data')
    print(table)

In [None]:
def get_sg_data(episode_label, step):
    """Return the newest `step_data` record of one episode step.

    Reads `{output_folder}/steps/{episode_label}.db`, keeps the rows whose
    `episode_label` and `step` columns equal the arguments, and returns the
    row with the largest `timestamp` (the first one on ties).
    """
    wanted_columns = [
        'global_scene_graph',
        'gt_scenegraph',
        'timestamp',
        'cate_object',
        'origins_grid',
        'current_grid_pose',
        'camera_position_tensor',
        'global_bev_rgb_map_tensor',
        # 'occupancy_map_tensor',
        'gradient_map_tensor',
    ]

    def _is_requested_row(row):
        # `&` (not `and`) so the predicate also works on column expressions
        return (row['episode_label'] == episode_label) & (row['step'] == step)

    rows = get_data(
        f'{output_folder}/steps/{episode_label}.db',
        'step_data',
        filter=_is_requested_row,
        select=wanted_columns,
    )
    # Several rows can exist for the same step; keep the most recent one.
    timestamps = [row['timestamp'] for row in rows]
    newest = np.argmax(timestamps)
    return rows[newest]

def xyz_to_grid(xyz, grid_size, origins_grid):
    """Convert an (N, 3) array of points to an (N, 2) array of integer grid cells.

    Columns 0 and 2 of `xyz` are scaled by 100 / `grid_size` (presumably
    metres -> centimetres -> cells; confirm the units with the caller),
    floored, and shifted by the integer part of `origins_grid[0]` and
    `origins_grid[1]` respectively. Column 1 is ignored.
    """
    offset_first = int(origins_grid[0])
    offset_second = int(origins_grid[1])
    cells_first = np.floor(xyz[:, 0] * 100 / grid_size).astype(int) + offset_first
    cells_second = np.floor(xyz[:, 2] * 100 / grid_size).astype(int) + offset_second
    # Stack as rows, then transpose so every point becomes one (first, second) row.
    return np.vstack((cells_first, cells_second)).T

def update_region(region, grid_size, origins_grid):
    """Attach a grid-space bounding polygon to `region` in place and return it.

    Regions that carry a 'bbox' key (ground truth) get that box converted
    with `xyz_to_grid`. All other regions (observed) get the minimum-area
    rotated rectangle around their objects' centers; for those the region
    'center' is overwritten with the rectangle center as well.
    """
    if 'bbox' in region:
        # gt: the stored box is converted to grid cells as-is
        corners = xyz_to_grid(np.array(region['bbox']), grid_size, origins_grid)
        region['grid_bbox'] = corners
        return corners

    # obs: fit a rotated rectangle around the (integer-truncated) object centers
    centers = np.array([member['center'] for member in region.get('objects')]).astype(np.int32)
    min_rect = cv2.minAreaRect(centers)
    corners = np.array(cv2.boxPoints(min_rect))
    region['grid_bbox'] = corners
    region['center'] = np.array(min_rect[0])
    # NOTE(review): with an empty object list cv2.minAreaRect above fails before this
    # line is reached, so this fallback looks unreachable - confirm the intended condition.
    if 'caption' not in region and len(region['objects']) == 0:
        region['caption'] = 'unknown'
    return corners

def plot_region_cv2(img, regions, map_size, show_objects=False):
    """Draw every region's outline, center dot and "(id: caption)" label on `img`.

    Each region is outlined in its own colour sampled from the 'gist_rainbow'
    colormap. With `show_objects`, the centers of the region's objects are
    drawn as circles too. Stored coordinates are converted to pixels by
    swapping the two components and subtracting the first one from `map_size`.
    Returns the annotated image.
    """
    palette = mpl.colormaps['gist_rainbow'].resampled(len(regions))
    outline_colors = (palette(range(len(regions)))[:, :3] * 255).astype(np.uint8)

    def _to_pixel(center):
        # (a, b) -> (b, map_size - a)
        return np.array([center[1], map_size - center[0]]).astype(np.int32)

    for outline_color, region in zip(outline_colors, regions):
        if show_objects:
            for member in region.get('objects'):
                cv2.circle(img, _to_pixel(member['center']), 5, (255, 0, 0), 2)

        # Same coordinate conversion as `_to_pixel`, applied to all corners at once.
        corners = region['grid_bbox'].reshape((-1, 1, 2)).astype(np.int32)
        first_component = corners[..., 0].copy()
        corners[..., 0] = corners[..., 1]
        corners[..., 1] = map_size - first_component
        cv2.polylines(img, [corners], isClosed=True, color=outline_color.tolist(), thickness=2)

        anchor = _to_pixel(region['center'])
        img = cv2.circle(img, anchor, 5, (0, 255, 0), -1)
        label = f"({region['id']}: {region['caption']})"
        img, _ = add_text(img, label, anchor, font_scale=0.35, color=(200,200,200), thickness=1, horizontal_align='center', vertical_align='center')
    return img


def plot_matches_cv2(k_vis, q_vis, matches, reversed=False, map_size=480):
    """Place the key and query images side by side and connect matched centers.

    `k_vis` goes on the left, `q_vis` on the right. By default query centers
    are shifted by `map_size` pixels into the right panel; with `reversed`
    the key centers are shifted instead. Every match is drawn as two circles
    joined by a line, with its `corr_score` printed at the midpoint.
    """
    canvas = np.concatenate((k_vis, q_vis), axis=1)
    panel_shift = np.array([map_size, 0])
    for pair in matches:
        key_center = pair['k']['center']
        query_center = pair['q']['center']
        key_px = np.array([key_center[1], map_size - key_center[0]]).astype(np.int32)
        query_px = np.array([query_center[1], map_size - query_center[0]]).astype(np.int32)
        # Move whichever endpoint belongs to the right-hand panel.
        if reversed:
            key_px += panel_shift
        else:
            query_px += panel_shift
        for endpoint in (key_px, query_px):
            cv2.circle(canvas, endpoint, 5, (255, 0, 0), 2)
        cv2.line(canvas, key_px, query_px, (255, 0, 0), 2)
        midpoint = (key_px + query_px) // 2
        add_text(canvas, f"{pair['corr_score']:.2f}", midpoint, font_scale=0.7, color=(200,200,0), thickness=2, horizontal_align='center', vertical_align='center')
    return canvas


In [None]:
from matplotlib.path import Path
def check_overlap(region, matched_region, relaxed=False):
    """Test whether two regions contain each other's center.

    Each region's 'grid_bbox' is treated as a polygon. Strict mode (default)
    requires both centers to lie inside the other region's polygon; relaxed
    mode is satisfied when either one does.
    """
    outline_self = Path(region['grid_bbox'])
    outline_other = Path(matched_region['grid_bbox'])
    self_inside_other = outline_other.contains_point(region['center'])
    other_inside_self = outline_self.contains_point(matched_region['center'])
    return (
        (self_inside_other or other_inside_self)
        if relaxed
        else (self_inside_other and other_inside_self)
    )
def detect_match(scene_graph, keys, queries, knn=None, distance=None, overlap_relaxed=False, corr_score=0.5):
    """
    Match every query node to at most one key node.

    scene_graph is just used for calculating the corr_score, through
    scene_graph.get_text_sim_score(key caption, query caption)
    keys / queries: dicts with 'center' and 'caption'; queries also need 'id',
    and both need 'grid_bbox' when the overlap check is enabled
    set either knn or distance or corr_score to filter matches
    knn: number of nearest keys considered per query (None: all keys)
    distance: maximum center distance of a candidate (None: no limit)
    overlap_relaxed: True/False/None, None means no check for region match
    corr_score: minimum caption similarity of a candidate (None: no limit)

    Returns a list of dicts with keys 'k', 'q', 'dist' and 'corr_score',
    sorted by descending corr_score, holding the best candidate per query id.
    An empty `keys` yields an empty list.
    """
    # A KDTree cannot be built from an empty point set; nothing can match anyway.
    if len(keys) == 0:
        return []
    knn = len(keys) if knn is None else min(knn, len(keys))

    tree = KDTree([x['center'] for x in keys])
    matches = []
    for query in queries:
        distances, indices = tree.query(query['center'], k=knn)
        # query() returns scalars instead of arrays when k == 1
        distances, indices = np.atleast_1d(distances), np.atleast_1d(indices)
        matches += [dict(k=keys[i], q=query, dist=dist) for i, dist in zip(indices, distances)]

    if distance is not None:
        matches = [m for m in matches if m['dist'] <= distance]
    if overlap_relaxed is not None:
        matches = [m for m in matches if check_overlap(m['k'], m['q'], relaxed=overlap_relaxed)]

    # calculate corr_score (always needed: it also ranks the candidates below)
    for match in matches:
        match['corr_score'] = scene_graph.get_text_sim_score(match['k']['caption'], match['q']['caption'])
    if corr_score is not None:
        matches = [m for m in matches if m['corr_score'] >= corr_score]

    # keep only one match for each query according to corr_score
    # (sorted() is stable, so ties keep the nearest-first candidate order)
    best_per_query = []
    seen = set()
    for match in sorted(matches, key=lambda m: m['corr_score'], reverse=True):
        query_id = match['q']['id']
        if query_id in seen:
            continue
        seen.add(query_id)
        best_per_query.append(match)
    return best_per_query

def evaluate_sg(
    predict_sg,
    obs_regions,
    gt_regions,
    obs_objects,
    gt_objects,
    knn_region = 3,
    knn_object = 5,
    max_object_dist = 100.0/5.0,
):
    """Score an observed scene graph against the ground truth.

    Recall searches the observed nodes (keys) for every ground-truth node
    (query); precision swaps the two roles. Regions are matched by mutual
    polygon overlap only, in a relaxed and a strict variant, with no caption
    threshold. Objects are matched by center distance (at most
    `max_object_dist`) and a caption similarity of at least 0.9.

    predict_sg: object providing get_text_sim_score, forwarded to detect_match
    knn_region / knn_object: nearest-neighbour candidates examined per query

    Returns `(matches, scores)`, two dicts with the same six keys: the match
    lists and the fraction of queries that found a match. A score whose
    query list is empty is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def _match_regions(keys, queries, relaxed):
        # Regions: overlap test only, no distance or caption threshold.
        return detect_match(
            scene_graph=predict_sg,
            keys=keys,
            queries=queries,
            knn=knn_region,
            overlap_relaxed=relaxed,
            corr_score=None,
        )

    def _match_objects(keys, queries):
        # Objects: distance-limited and caption-gated, no overlap test.
        return detect_match(
            scene_graph=predict_sg,
            keys=keys,
            queries=queries,
            knn=knn_object,
            distance=max_object_dist,
            overlap_relaxed=None,
            corr_score=0.9,
        )

    def _fraction(matched, queries):
        total = len(queries)
        return len(matched) / total if total else 0.0

    matches = {
        'region_recall_relaxed': _match_regions(obs_regions, gt_regions, True),
        'region_recall_strict': _match_regions(obs_regions, gt_regions, False),
        'region_precision_relaxed': _match_regions(gt_regions, obs_regions, True),
        'region_precision_strict': _match_regions(gt_regions, obs_regions, False),
        'object_recall': _match_objects(obs_objects, gt_objects),
        'object_precision': _match_objects(gt_objects, obs_objects),
    }
    # Each score is normalised by the number of queries of its matching run.
    query_sets = {
        'region_recall_relaxed': gt_regions,
        'region_recall_strict': gt_regions,
        'region_precision_relaxed': obs_regions,
        'region_precision_strict': obs_regions,
        'object_recall': gt_objects,
        'object_precision': obs_objects,
    }
    scores = {name: _fraction(found, query_sets[name]) for name, found in matches.items()}
    return matches, scores

In [None]:
def crop_minAreaRect(img, rect):
    """Cut the rotated rectangle `rect` out of `img` as an upright patch.

    `rect` is a `((cx, cy), (w, h), angle)` tuple as returned by
    `cv2.minAreaRect`. Its four corner points are mapped by a perspective
    warp onto an image of `int(w)` x `int(h)` pixels, which is returned.
    """
    corners = np.intp(cv2.boxPoints(rect))
    # cv2.drawContours(img, [corners], 0, (0, 0, 255), 2)
    rect_size = rect[1]
    width, height = int(rect_size[0]), int(rect_size[1])
    right, bottom = width - 1, height - 1
    # Destination corners, listed in the order the source corners are mapped to.
    target = np.array(
        [[0, bottom], [0, 0], [right, 0], [right, bottom]],
        dtype="float32",
    )
    transform = cv2.getPerspectiveTransform(corners.astype("float32"), target)
    return cv2.warpPerspective(img, transform, (width, height))

def check_visibility(center_i, center_j, wall_map):
    """Sum the `wall_map` values inside a thin strip spanning two centers.

    The strip is the minimum-area rectangle around the two (integer-truncated)
    points, widened to at least 2 along its first side and 10 along its
    second. It is cropped from the transposed wall map and the sum of the
    cropped values is returned; lower means fewer wall pixels between the points.
    """
    endpoints = np.array([center_i, center_j]).astype(np.int32)
    rect_center, rect_size, rect_angle = cv2.minAreaRect(endpoints)
    # Two points give a degenerate (zero-thickness) rectangle, so pad both sides.
    padded_rect = (rect_center, (max(rect_size[0], 2), max(rect_size[1], 10)), rect_angle)
    strip = crop_minAreaRect(wall_map.T, padded_rect)
    return strip.sum()

def plot_visible_edge(fig, center_i, center_j, text, map_size=480):
    """Add a red-marker / orange-line segment between two centers to `fig`.

    Does nothing when `fig` is None. The second component of each center is
    used as x, and `map_size` minus the first component as y.
    """
    if fig is not None:
        xs = [center_i[1], center_j[1]]
        ys = [map_size-center_i[0], map_size-center_j[0]]
        segment = go.Scatter(
            x=xs,
            y=ys,
            mode='markers+lines',
            marker=dict(size=6, color='red'),
            line=dict(color='orange', width=3),
            text=text,
        )
        fig.add_trace(segment)
def plot_object(fig, center_i, text, map_size=480):
    """Add a single blue marker for one center to `fig`.

    Does nothing when `fig` is None. The second component of the center is
    used as x, and `map_size` minus the first component as y.
    """
    if fig is not None:
        marker_trace = go.Scatter(
            x=[center_i[1]],
            y=[map_size-center_i[0]],
            mode='markers',
            marker=dict(size=6, color='blue'),
            line=dict(color='pink', width=3),
            text=text,
        )
        fig.add_trace(marker_trace)

class UnionFind:
    """Disjoint-set forest over hashable items (path compression, union by rank)."""

    def __init__(self, items):
        # Materialise first: `items` is walked twice below, which would leave
        # `rank` empty for a one-shot iterable such as a generator.
        items = list(items)
        self.parent = {i: i for i in items}
        self.rank   = {i: 0 for i in items}

    def find(self, x):
        """Return the representative of the set containing `x` (KeyError if unknown)."""
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: point every node on the walked path directly at the root.
        while self.parent[x] != root:
            next_item = self.parent[x]
            self.parent[x] = root
            x = next_item
        return root

    def union(self, a, b):
        """Merge the sets containing `a` and `b`; no-op if they are already joined."""
        ra, rb = self.find(a), self.find(b)
        if ra == rb:
            return
        # Attach the lower-rank root under the higher-rank one.
        if self.rank[ra] < self.rank[rb]:
            ra, rb = rb, ra
        self.parent[rb] = ra
        if self.rank[ra] == self.rank[rb]:
            self.rank[ra] += 1
from collections import defaultdict
def detect_regions(objects, wall_map, knn_object=10, distance_threshold=30, wall_threshold=15, min_num=0, vis_fig=None):
    """Group objects into regions by linking nearby objects not separated by walls.

    Each object is compared with its `knn_object` nearest objects (itself
    included). A pair is linked when its center distance is at most
    `distance_threshold` and `check_visibility` reports at most
    `wall_threshold` wall pixels between the two centers. Linked objects are
    merged with a union-find; every resulting group becomes one region.

    objects: dicts with 'id', 'center' and 'caption'
    wall_map: 2-D map handed to `check_visibility`; its first dimension is
        also used as the map size when plotting
    min_num: groups with fewer objects are dropped when > 0
    vis_fig: optional figure receiving one trace per examined pair

    Returns a list of region dicts with keys 'id', 'objects', 'center' (mean
    of the member centers) and 'caption' ('unknown'). Ids are the group
    indices before `min_num` filtering, so they can have gaps. An empty
    `objects` yields an empty list.
    """
    # A KDTree cannot be built from an empty point set.
    if len(objects) == 0:
        return []

    # 1) Build id->object map and collect all ids
    obj_map = {obj['id']: obj for obj in objects}
    ids = list(obj_map.keys())

    # 2) Initialize Union-Find on all ids
    uf = UnionFind(ids)

    # 3) For each object, query its nearby neighbors
    object_tree = KDTree([obj['center'] for obj in objects])
    for id_i in ids:
        obj_i = obj_map[id_i]
        center_i = obj_i['center']
        distances, indices = object_tree.query(center_i, k=knn_object)
        if not isinstance(distances, np.ndarray):
            # k == 1 returns scalars
            distances, indices = np.array([distances]), np.array([indices])
        else:
            # k > len(objects) pads the tail with placeholder entries; drop them
            distances, indices = distances[:len(objects)], indices[:len(objects)]
        for dist, j in zip(list(distances), list(indices)):
            if dist > distance_threshold:
                continue
            obj_j = objects[j]
            center_j = obj_j['center']
            wall_pixels = check_visibility(center_i, center_j, wall_map)
            if wall_pixels<=wall_threshold:
                uf.union(id_i, obj_j['id'])
                plot_visible_edge(vis_fig, center_i, center_j, f'{obj_i["caption"]} - {obj_j["caption"]}\nwall_pixels={wall_pixels}', map_size=wall_map.shape[0])
            else:
                plot_object(vis_fig, center_i, f'{obj_i["caption"]}\nwall_pixels={wall_pixels}', map_size=wall_map.shape[0])

    # 4) Bucket by root parent to form regions
    groups = {}
    for obj_id in ids:
        groups.setdefault(uf.find(obj_id), []).append(obj_map[obj_id])

    sg_regions = []
    for group_index, members in enumerate(groups.values()):
        if min_num > 0 and len(members) < min_num:
            continue
        sg_regions.append({
            'id': group_index,
            'objects': members,
            'center': np.mean([obj['center'] for obj in members], axis=0).tolist(),
            'caption': 'unknown',
        })
    return sg_regions

In [None]:
# Episodes noted while debugging:
# 6s7QHgap2fW_55 switch floor error
# wcojb4TFT35_4 shift error

# Pick the 4th episode of the filtered table and inspect its state at step 490.
episode_label = steps_df['episode'].iloc[3]
step = 490
data = get_sg_data(episode_label, step)

# Build two scene-graph wrappers from the same record: the agent's observed graph
# and the ground-truth graph.
obs_sg = PredictSceneGraph(data['cate_object'], clip_model_list=clip_model_list)
obs_sg.init_from_json(data['global_scene_graph'])
gt_sg = PredictSceneGraph(data['cate_object'], clip_model_list=clip_model_list)
gt_sg.init_from_json(data['gt_scenegraph'])

origins_grid = data['origins_grid']
grid_size = args.map_resolution
# NOTE(review): get_sg_data selects 'camera_position_tensor', 'global_bev_rgb_map_tensor' and
# 'gradient_map_tensor', while this notebook reads the keys without the '_tensor' suffix -
# presumably get_data strips it when decoding; confirm.
# First three rows of column 3 (presumably the translation of a 4x4 pose matrix - confirm).
camera_position = data['camera_position'][:3, 3]
# camera_position = np.array(data['camera_position']).reshape(4,4)[:3, 3]
map_size = data['global_bev_rgb_map'].shape[0]


# Select the ground-truth floor whose average height is closest to the camera's second coordinate.
floor_avg_heights = [floor['floor_avg_height'] for floor in gt_sg.scene_graph['floors']]
floor_id = np.argmin(np.abs(np.array(floor_avg_heights) - camera_position[1]))

print('processing gt regions')
gt_objects = []
gt_regions = gt_sg.scene_graph['floors'][floor_id]['regions']
for region in gt_regions:
    # update_region mutates the region dict in place (adds 'grid_bbox')
    update_region(region, grid_size, origins_grid)
    gt_objects.extend(region.get('objects'))

print('processing obs regions')
# Flatten room -> region -> object of the observed graph into two flat lists.
obs_regions = []
obs_objects = []
for room in obs_sg.scene_graph.get('rooms'):
    if len(room.get('regions')) == 0:
        continue
    for region in room.get('regions'):
        update_region(region, grid_size, origins_grid)
        obs_regions.append(region)
        obs_objects.extend(region.get('objects'))

In [None]:
# Score the agent's own scene graph against the ground truth of the selected floor.
matches, scores = evaluate_sg(
    predict_sg=obs_sg,
    obs_regions=obs_regions,
    gt_regions=gt_regions,
    obs_objects=obs_objects,
    gt_objects=gt_objects,
    knn_region=3,
    knn_object=5,
    # 100 divided by the map resolution (presumably 100 cm expressed in grid cells - confirm units)
    max_object_dist=100.0/grid_size,
)
for k, v in scores.items():
    print(f'{k}: {v:.2%}')

# `[...,::-1]` reverses the last axis of the BEV image (presumably BGR -> RGB - confirm).
obs_vis = create_fig(img=data['global_bev_rgb_map'][...,::-1])
obs_vis = plot_region(obs_vis, obs_regions, map_size, show_objects=True)

gt_vis = create_fig(img=data['global_bev_rgb_map'][...,::-1])
gt_vis = plot_region(gt_vis, gt_regions, map_size, show_objects=True)

# Show the relaxed region matches; the remaining views are kept commented out for quick toggling.
plot_matches(obs_vis, gt_vis, matches['region_recall_relaxed']).show()
plot_matches(obs_vis, gt_vis, matches['region_precision_relaxed'], reversed=True).show()
# plot_matches(obs_vis, gt_vis, matches['region_recall_strict']).show()
# plot_matches(obs_vis, gt_vis, matches['region_precision_strict'], reversed=True).show()
# plot_matches(obs_vis, gt_vis, matches['object_recall']).show()
# plot_matches(obs_vis, gt_vis, matches['object_precision'], reversed=True).show()

In [None]:
import copy
# Work on a copy so the observed objects used by the earlier evaluation stay untouched.
objects = copy.deepcopy(obs_objects)
bev_map = data['global_bev_rgb_map']
gradient_map = data['gradient_map']
# Binary wall mask: cells whose gradient value exceeds 1.2
wall_map = (gradient_map>1.2).astype(np.uint8)

# detect rooms
vis_fig = px.imshow(wall_map[::-1])
new_rooms = detect_regions(objects, wall_map, knn_object=30, distance_threshold=50, wall_threshold=10, min_num=3, vis_fig=vis_fig)
# vis_fig.show()
for region in new_rooms:
    update_region(region, grid_size, origins_grid)
# room_vis = create_fig(img=bev_map[...,::-1])
room_vis = px.imshow(wall_map[::-1])
plot_region(room_vis, new_rooms, map_size, show_objects=True).show()

# detect regions in the whole map
# vis_fig = px.imshow(wall_map[::-1])
# new_regions = detect_regions(objects, wall_map, knn_object=15, distance_threshold=30, wall_threshold=10, min_num=3,vis_fig=vis_fig)
# vis_fig.show()
# for region in new_regions:
#     update_region(region, grid_size, origins_grid)
# region_vis = create_fig(img=data['global_bev_rgb_map'][...,::-1])
# plot_region(region_vis, new_regions, map_size, show_objects=True).show()

# Rebuild the scene graph!!!
# detect regions in each room and update the room
new_sg = copy.deepcopy(new_rooms)
new_regions = []
new_objects = []
for room in new_sg:
    # Tighter thresholds than the room pass: split each room into smaller regions.
    # (The debug traces are appended to the room-pass `vis_fig`.)
    regions = detect_regions(room['objects'], wall_map, knn_object=15, distance_threshold=30, wall_threshold=10, min_num=3,vis_fig=vis_fig)
    for region in regions:
        update_region(region, grid_size, origins_grid)
        new_objects.extend(region['objects'])
    # update region and objects: the room now owns regions instead of a flat object list
    room['regions'] = regions
    new_regions.extend(regions)
    room.pop('objects')
    # update id: hierarchical "<room>.<region>.<object>" string ids
    room['id'] = f"{room['id']}"
    for region in regions:
        region['id'] = f"{room['id']}.{region['id']}"
        for obj in region['objects']:
            # region['id'] already carries the room prefix at this point; prepending
            # room['id'] again would produce "<room>.<room>.<region>.<object>".
            obj['id'] = f"{region['id']}.{obj['id']}"
region_vis = px.imshow(wall_map[::-1])
plot_region(region_vis, new_regions, map_size, show_objects=True).show()

In [None]:
# set to new_rooms to compare rooms, new_regions to compare regions
new_obs_regions = new_regions


# Same evaluation as for the agent's own graph, but with the re-detected regions/objects
# as the observation. `obs_sg` is still passed because evaluate_sg only needs an object
# that provides the caption-similarity score.
# Note: this overwrites `matches`, `scores`, `obs_vis` and `gt_vis` from the earlier evaluation cell.
matches, scores = evaluate_sg(
    predict_sg=obs_sg,
    obs_regions=new_obs_regions,
    gt_regions=gt_regions,
    obs_objects=new_objects,
    gt_objects=gt_objects,
    knn_region=3,
    knn_object=5,
    # 100 divided by the map resolution (presumably 100 cm expressed in grid cells - confirm units)
    max_object_dist=100.0/grid_size,
)
for k, v in scores.items():
    print(f'{k}: {v:.2%}')

# `[...,::-1]` reverses the last axis of the BEV image (presumably BGR -> RGB - confirm).
obs_vis = create_fig(img=data['global_bev_rgb_map'][...,::-1])
obs_vis = plot_region(obs_vis, new_obs_regions, map_size, show_objects=True)

gt_vis = create_fig(img=data['global_bev_rgb_map'][...,::-1])
gt_vis = plot_region(gt_vis, gt_regions, map_size, show_objects=True)

# Show the relaxed region matches; the remaining views are kept commented out for quick toggling.
plot_matches(obs_vis, gt_vis, matches['region_recall_relaxed']).show()
plot_matches(obs_vis, gt_vis, matches['region_precision_relaxed'], reversed=True).show()
# plot_matches(obs_vis, gt_vis, matches['region_recall_strict']).show()
# plot_matches(obs_vis, gt_vis, matches['region_precision_strict'], reversed=True).show()
# plot_matches(obs_vis, gt_vis, matches['object_recall']).show()
# plot_matches(obs_vis, gt_vis, matches['object_precision'], reversed=True).show()

In [None]:
# Display the three maps used for region detection: the BEV image, the raw gradient
# map, and the binary wall mask thresholded from it.
show_image(bev_map)
show_image(gradient_map)
show_image(wall_map)

In [None]:
# Dump the observed scene graph for manual inspection (regions were annotated in place by update_region).
obs_sg.scene_graph