In [None]:
from bddl.knowledge_base import *

In [None]:
# Build the set of categories to leave out of mass annotation: synsets that have a
# particle density in the substance hyperparameter sheet, plus all visual substances.
import csv, json
synset_has_particle_density = set()
with open("/scr/BEHAVIOR-1K/asset_pipeline/metadata/substance_hyperparams.csv") as f:
    for row in csv.DictReader(f):
        synset = Synset.get(row["synset"].strip())
        if synset is None:
            print(f"Synset {row['synset']} not found")
            continue
        if "particle_density" in json.loads(row["hyperparams"]):
            synset_has_particle_density.add(synset)
            continue
        # Report rows lacking a density, except for visual substances.
        if "visualSubstance" not in synset.property_names:
            print(f"Synset {synset} does not have particle density")

synset_is_visual = set(filter(lambda s: "visualSubstance" in s.property_names, Synset.all_objects()))
ignore_synsets = synset_has_particle_density.union(synset_is_visual)
# Flatten the categories of every ignored synset into one set.
ignore_categories = set().union(*(s.categories for s in ignore_synsets))

In [None]:
import string

# System prompt sent verbatim as the "system" message by both query backends.
# It asks for a JSON object with a single "mass" key (kilograms) and lists the
# naming conventions the model should assume when interpreting category names.
raw_prompt = """
You are a helpful assistant that generates average mass annotations for objects in a dataset of 3D models.
Give me the estimated average mass of the below category of objects in JSON format. The output should be a JSON
dictionary containing one key "mass", whose value is the category mass in kilograms. Output only the JSON - no additional conversation.

* When you have objects that are named like x_box or x_bottle (e.g. wine_bottle, rice_bag) that are suffixed with a container name, assume that those are empty containers.
* Objects that are prefixed with a container type like bottle_of_x or box_of_x (e.g. bottle_of_wine, bag_of_rice) should be assumed to be full.
* When you have a particle based object (e.g. rice, flour), assume that it is a single particle of that object.
* If you see "atomizer", assume that it is a spray bottle containing that liquid (and is full).
* If you see cooked_ in the name, assume that it is a cooked version of that object (e.g. cooked_rice, cooked_pasta) and has roughly the same mass as the raw version.
* If you see "half" in the name, assume that it is the object that you would obtain by cutting the object once with a knife (e.g. half_banana, half_apple).
* If you see "diced" in the name, assume that it is one of each of the pieces that you would obtain by cutting the object into small cubes with a knife (e.g. diced_banana, diced_apple).
* If I give you images, assume that they are some example photos of some objects that belong to the category.
"""

# Characters accepted by the strict name check: lowercase ASCII letters and spaces.
valid_chars = set(string.ascii_lowercase) | {" "}
def valid_name(s, strict=False):
    """Return whether ``s`` is an acceptable name.

    Args:
        s: The name to check.
        strict: When False (the default) every name is accepted, which matches
            the previous behaviour where the check was short-circuited. When
            True, the name must consist solely of characters in ``valid_chars``.

    Returns:
        bool: True if the name is accepted.
    """
    # The character check used to sit behind an unconditional `return True` and
    # could never run; it is now opt-in instead of dead code.
    if not strict:
        return True
    return all(c in valid_chars for c in s)

In [None]:
from getpass import getpass
# Read the OpenRouter key from the environment so it never has to be pasted into
# (and saved with) the notebook. Falls back to an empty string when the variable
# is unset; in that case assign it interactively instead, e.g.
# OPENAI_API_TOKEN = getpass()
import os
OPENAI_API_TOKEN = os.environ.get("OPENROUTER_API_KEY", "")

In [None]:
# Maps model id -> (use_ollama, providers).
#   use_ollama: True routes the query through the local Ollama client, False
#               through the OpenRouter (OpenAI-compatible) client.
#   providers:  optional list passed as the OpenRouter "provider.only" restriction;
#               None leaves provider selection unrestricted.
MODELS = {
  # "gemma3:27b": (True, None),
  # "llama4:scout": (True, None),
  "google/gemini-2.5-flash-preview": (False, None),
  "openai/gpt-4o-mini": (False, None),
  "meta-llama/llama-4-maverick": (False, ["lambda", "deepinfra", "novita"]),
  "qwen/qwen2.5-vl-72b-instruct": (False, ["nebius"]),
}

In [None]:
import asyncio
from openai import AsyncOpenAI
from ollama import AsyncClient
from asynciolimiter import Limiter

# Throttle OpenRouter requests (awaited at the start of query_model_openai).
# Limiter(1) is 1 request per second, assuming asynciolimiter's rate argument is
# in calls per second - confirm against its docs; use Limiter(2) for 2 per second.
rate_limiter = Limiter(1)

import json

def strip_code_tags(msg):
    """Remove a surrounding Markdown code fence from a model reply.

    Handles replies wrapped as ```json ... ``` or ``` ... ```, with or without
    a newline after the opening fence and with surrounding whitespace.

    Args:
        msg: Raw reply text from the model.

    Returns:
        str: The reply with the opening/closing fence and outer whitespace removed.
    """
    # Strip first so leading/trailing whitespace cannot hide the fences.
    msg = msg.strip()
    if msg.startswith("```"):
        msg = msg[3:]
        # Drop the optional language tag by its real length; slicing a fixed
        # 8 characters would eat the first payload character whenever no
        # newline follows "```json".
        if msg[:4].lower() == "json":
            msg = msg[4:]
    if msg.endswith("```"):
        msg = msg[:-3]
    return msg.strip()

ollama_client = AsyncClient()
async def query_model_ollama(model_id, obj_name, message, images=None):
    """Ask an Ollama-served model for a mass estimate.

    Args:
        model_id: Name of the Ollama model to chat with.
        obj_name: Identifier echoed back unchanged as the first return element.
        message: User prompt text describing the category.
        images: Optional images attached to the user message.

    Returns:
        tuple: ``(obj_name, parsed)`` where ``parsed`` is the JSON-decoded reply
        after code fences are stripped.
    """
    user_turn = {"role": "user", "content": message}
    if images is not None:
        user_turn["images"] = images

    reply = await ollama_client.chat(
        model=model_id,
        messages=[{"role": "system", "content": raw_prompt}, user_turn],
    )
    parsed = json.loads(strip_code_tags(reply.message.content))
    return obj_name, parsed

openai_client = AsyncOpenAI(
    api_key=OPENAI_API_TOKEN,
    base_url="https://openrouter.ai/api/v1",
)
async def query_model_openai(model_id, obj_name, message, images=None, providers=None):
    """Ask a model behind the OpenRouter endpoint for a mass estimate.

    Waits on the shared rate limiter before sending the request.

    Args:
        model_id: Model identifier passed to the chat completions API.
        obj_name: Identifier echoed back unchanged as the first return element.
        message: User prompt text describing the category.
        images: Optional iterable of base64-encoded JPEG strings, each attached
            as a data-URL image part.
        providers: Optional list of providers to restrict routing to; sent as
            ``{"provider": {"only": providers}}`` in the request body.

    Returns:
        tuple: ``(obj_name, parsed)`` where ``parsed`` is the JSON-decoded reply
        after code fences are stripped.
    """
    await rate_limiter.wait()

    user_parts = [{"type": "text", "text": message}]
    if images is not None:
        user_parts.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
            for encoded in images
        )

    routing = None if providers is None else {"provider": {"only": providers}}
    completion = await openai_client.chat.completions.create(
        model=model_id,
        # Structured-output request, currently disabled:
        # response_format={
        #     "type": "json_schema",
        #     "json_schema": {
        #         "name": "object_mass",
        #         "strict": True,
        #         "schema": {
        #             "type": "object",
        #             "properties": {
        #                 "mass": {
        #                     "type": "number",
        #                     "description": "Mass of object category in kilograms"
        #                 },
        #             },
        #             "required": ["mass"],
        #             "additionalProperties": False
        #         }
        #     }
        # },
        messages=[
            {"role": "system", "content": raw_prompt},
            {"role": "user", "content": user_parts},
        ],
        extra_body=routing,
    )
    reply_text = completion.choices[0].message.content
    return obj_name, json.loads(strip_code_tags(reply_text))

async def query_model(model_id, obj_name, message, images=None):
    """Dispatch a query to the backend configured for ``model_id`` in ``MODELS``.

    Args:
        model_id: Key into ``MODELS``.
        obj_name: Identifier echoed back by the backend.
        message: User prompt text.
        images: Optional images forwarded to the backend.

    Returns:
        Whatever the selected backend returns: ``(obj_name, parsed_reply)``.
    """
    use_ollama, providers = MODELS[model_id]
    backend_call = (
        query_model_ollama(model_id, obj_name, message, images=images)
        if use_ollama
        else query_model_openai(model_id, obj_name, message, images=images, providers=providers)
    )
    return await backend_call

async def query_models(obj_name, message, images=None):
    """Query every model in ``MODELS`` concurrently for the same object.

    Args:
        obj_name: Identifier of the object/category being queried.
        message: User prompt text.
        images: Optional images forwarded to each backend.

    Returns:
        tuple: ``(obj_name, {model_id: parsed_reply})`` with one entry per model.
    """
    # Iterating MODELS yields model-id strings; the (use_ollama, providers) tuple
    # is the value. Dispatch through query_model so each backend receives
    # obj_name and its provider restriction.
    model_ids = list(MODELS)
    results = await asyncio.gather(
        *(query_model(model_id, obj_name, message, images=images) for model_id in model_ids)
    )
    return obj_name, {model_id: result[1] for model_id, result in zip(model_ids, results)}

In [None]:
import collections
import cv2
import base64
import random
import pathlib

def sample_frames_from_image(filename, n):
    """Sample up to ``n`` evenly spaced frames from the first half of a video.

    A random start frame is chosen and the remaining frames are spaced evenly
    from it, wrapping around within the first half of the video.

    Args:
        filename: Path to the video file (``str`` or path-like).
        n: Number of frames to sample.

    Returns:
        list[str]: Base64-encoded JPEG images in ascending frame order. May hold
        fewer than ``n`` entries if frames cannot be read or encoded, and is
        empty when ``n <= 0`` or the video reports no frames.
    """
    if n <= 0:
        return []

    # Convert explicitly: not every OpenCV build accepts pathlib.Path here.
    cap = cv2.VideoCapture(str(filename))
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            return []
        # Only sample from the first half; keep at least one frame available.
        usable_frames = max(1, frame_count // 2)

        # Randomly pick a frame and make the other ones be equally spaced from it.
        start_frame = random.randrange(usable_frames)
        step = usable_frames / n
        frame_indices = sorted(int(start_frame + i * step) % usable_frames for i in range(n))

        frames = []
        for index in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, index)
            ok, frame = cap.read()
            if not ok:
                # Skip unreadable frames instead of handing None to imencode.
                continue
            ok, encoded = cv2.imencode('.jpg', frame)
            if not ok:
                continue
            frames.append(base64.b64encode(encoded.tobytes()).decode('utf-8'))
        return frames
    finally:
        # Release the capture even if decoding raised.
        cap.release()

NUM_IMAGES = 4
def sample_category_images(category):
    """Sample example frames from the videos of a category's objects.

    ``NUM_IMAGES`` videos are drawn with replacement from the objects that have
    a video on disk; each drawn video contributes as many frames as the number
    of times it was drawn.

    Args:
        category: Object exposing ``objects``, each of which has a ``name``.

    Returns:
        list | None: Shuffled list of sampled frames, or None when no object of
        the category has a video file.
    """
    available = []
    for obj in list(category.objects):
        video = pathlib.Path(f"/scr/BEHAVIOR-1K/asset_pipeline/artifacts/pipeline/object_images/{obj.name}.mp4")
        if video.exists():
            available.append(video)
    if not available:
        return None

    # Counter collapses repeated draws into (video, number of frames to take).
    draws = collections.Counter(random.choices(available, k=NUM_IMAGES))
    sampled = []
    for video, count in draws.items():
        sampled.extend(sample_frames_from_image(video, count))
    random.shuffle(sampled)
    return sampled

In [None]:
from IPython import display
from base64 import b64decode

# Sanity check: render the first sampled frame of the "walls" category inline.
# NOTE(review): sample_category_images returns None when the category has no
# videos on disk, in which case the [0] subscript below raises TypeError.
display.Image(b64decode(sample_category_images(Category.get("walls"))[0]))

In [None]:
import traceback
import numpy as np
import tqdm.notebook as tqdm
import pathlib
import asyncio

# Directory holding one "<model suffix>.json" predictions file per model; run_pass
# reads and rewrites these files and the aggregation cell loads them back.
RESULT_DIR = pathlib.Path("/scr/BEHAVIOR-1K/asset_pipeline/metadata/vlm_mass_predictions/")

def human_readable_size(size):
    """Convert a size in meters to a human-readable string with a unit of mm, cm, or m.

    Args:
        size: Length in meters.

    Returns:
        str: The size with two decimals, in mm below 1 cm, in cm below 1 m,
        and in m otherwise.
    """
    # Switch to millimeters below one centimeter (0.01 m). The previous 0.001 m
    # cut-off printed e.g. 1.2 mm as "0.12 cm", discarding precision.
    if size < 0.01:
        return f"{size * 1e3:.2f} mm"
    elif size < 1:
        return f"{size * 1e2:.2f} cm"
    else:
        return f"{size:.2f} m"

async def _query_category_mass(model_id, cat, message, images):
    """Query one category and always report which category the outcome belongs to.

    Returns:
        tuple: ``(cat, result_dict, None)`` on success or ``(cat, None, exc)``
        when the query or reply parsing raised.
    """
    try:
        _, result_dict = await query_model(model_id, cat, message, images)
    except Exception as exc:
        return cat, None, exc
    return cat, result_dict, None

async def run_pass(model_id):
    """Query ``model_id`` for the mass of every category not yet in its results file.

    Existing results are loaded from ``RESULT_DIR/<model suffix>.json``; only
    categories missing from that file, having at least one object and not in
    ``ignore_categories`` are queried. The results file is rewritten after every
    successful reply so an interrupted run can be resumed.

    Args:
        model_id: Key into ``MODELS``; the part after the last "/" names the
            results file.
    """
    model_suffix = model_id.split("/")[-1]
    result_path = RESULT_DIR / f"{model_suffix}.json"

    results = {}
    if result_path.exists():
        with open(result_path, "r") as f:
            results = json.load(f)

    gpt_missing = [c for c in Category.all_objects() if c.name not in results and len(c.objects) > 0 and c not in ignore_categories]
    print(len(gpt_missing))
    # gpt_missing = gpt_missing[:100]

    async_futures = []
    for c in gpt_missing:
        cat = c.name
        synset = c.synset
        assert synset is not None

        if synset.definition:
            message = f'"{cat}": a kind of "{synset.name}", defined as "{synset.definition}"\n'
        else:
            # Fall back to the first parent's definition when the synset has none.
            parent_synset = synset.parents[0]
            assert parent_synset.definition
            message = f'"{cat}": a kind of "{synset.name}", which is a child of {parent_synset.name}, defined as "{parent_synset.definition}"\n'

        # Only mention dimensions when at least one object has a bounding box;
        # averaging an empty array yields NaN and cannot be iterated per axis.
        known_dims = [list(obj.bounding_box_size) for obj in c.objects if obj.bounding_box_size is not None]
        if known_dims:
            avg_dim = np.array(known_dims).mean(axis=0)
            avg_dim_readable = ", ".join([human_readable_size(d) for d in avg_dim])
            message += "Average dimensions: " + avg_dim_readable + "\n"

        images = sample_category_images(c) or []
        async_futures.append(_query_category_mass(model_id, cat, message, images))

    # Make sure the output directory exists before the first save.
    result_path.parent.mkdir(parents=True, exist_ok=True)

    for future in tqdm.tqdm(asyncio.as_completed(async_futures), total=len(async_futures)):
        # The helper returns the category with the outcome, so failures are
        # attributed to the right category rather than a stale loop variable.
        cat, result_dict, error = await future
        if error is not None:
            print("Error in cat", cat)
            traceback.print_exception(type(error), error, error.__traceback__)
            continue
        try:
            results[cat] = result_dict["mass"] if "mass" in result_dict else None

            # Save after every result
            with open(result_path, "w") as f:
                json.dump(results, f)
        except Exception:
            # Exception (not a bare except) so cancellation/interrupts still stop the run.
            print("Error in cat", cat)
            traceback.print_exc()

In [None]:
# Run the models one after another. run_pass loads any existing results file for
# the model and only queries the categories missing from it, so this cell can be
# re-run to resume. The top-level `await` relies on the notebook's event loop.
for model in MODELS:
    print("Running pass for model", model)
    await run_pass(model)

In [None]:
from collections import defaultdict


# Merge the per-model prediction files into results[category][model] = mass.
# Models without a results file on disk are skipped.
results = defaultdict(dict)
for model in MODELS:
    result_path = RESULT_DIR / (model.split("/")[-1] + ".json")
    if result_path.exists():
        with open(result_path, "r") as f:
            model_masses = json.load(f)
        for cat, mass in model_masses.items():
            results[cat][model] = mass

print(results)
# print("Missing categories", len([c for c in Category.all_objects() if c.name not in results]))
# cat_names = {c.name for c in Category.all_objects()}
# print("Extra categories", len([x for x in results.keys() if x not in cat_names]))

In [None]:
# Check the max/min ratio for each category: a large ratio means the models
# disagree strongly about that category's mass.
ratios = {}
for cat, masses in results.items():
    assert len(masses) == len(MODELS)
    # Keep only numeric predictions; None marks a reply without a "mass" key.
    valid_masses = [m for m in masses.values() if isinstance(m, (int, float)) and not isinstance(m, bool)]
    if not valid_masses:
        # min()/max() of an empty list would raise; report and move on.
        print(f"No usable mass predictions for {cat}")
        continue
    min_mass = min(valid_masses)
    max_mass = max(valid_masses)
    # A non-positive minimum would divide by zero (or flip the sign); rank such
    # categories as maximally inconsistent instead.
    ratios[cat] = max_mass / min_mass if min_mass > 0 else float("inf")
ratios = sorted(ratios.items(), key=lambda x: x[1])
print(ratios[:10], ratios[-10:])

In [None]:
# Now print everything in the order they show up in the category sheet so that we can paste this there.
# Output: a header row of model ids (sorted), then one comma-separated row per
# category in sheet order; cells are left empty where no prediction exists.
import csv
# newline="" is what the csv module expects for files handed to its readers.
with open("/scr/BEHAVIOR-1K/asset_pipeline/metadata/category_mapping.csv", "r", newline="") as f:
    category_list = list(csv.DictReader(f))

model_keys = sorted(MODELS.keys())
print(",".join(model_keys))
for cat in category_list:
    name = cat["category"]
    # .get avoids inserting into the defaultdict; a model missing for this
    # category yields an empty cell instead of a KeyError.
    per_model = results.get(name, {})
    values = [str(per_model[model]) if model in per_model else "" for model in model_keys]
    print(",".join(values))