In [1]:
import torch
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Ask PyTorch to release cached GPU memory it is not currently using before the
# model is loaded below. NOTE(review): on a freshly started kernel there is
# presumably nothing cached yet, so this mainly helps when the cell is re-run.
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Report the selected device: the GPU's name on CUDA, a fixed notice on CPU.
if device.type == "cuda":
    print(f"Device Name: {torch.cuda.get_device_name(device)}")
else:
    print("Using cpu")

Device Name: NVIDIA GeForce RTX 3090


In [3]:
# The model weights and the processor are loaded from the same checkpoint id.
checkpoint_name = "Qwen/Qwen2-VL-7B-Instruct"

# torch_dtype="auto" leaves the dtype choice to transformers; device_map is the
# device selected earlier in the notebook.
load_options = {"torch_dtype": "auto", "device_map": device}
model = Qwen2VLForConditionalGeneration.from_pretrained(checkpoint_name, **load_options)

# The processor is used later for chat templating, input preparation and decoding.
processor = AutoProcessor.from_pretrained(checkpoint_name)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 5/5 [00:02<00:00,  2.28it/s]


In [5]:
# Read the two text prompts used to build the chat message below.
# The encoding is pinned to UTF-8 so the prompts decode identically on every
# platform; without it, open() falls back to the locale's preferred encoding,
# which can corrupt or reject non-ASCII characters.
with open("./prompts/komo_tutorial.txt", 'r', encoding="utf-8") as file:
    komo_tutorial = file.read()

with open("./prompts/problem_definition.txt", 'r', encoding="utf-8") as file:
    problem_definition = file.read()

image_path = "./data/example_images/cam1_2.jpg"

# The single user turn is made of three parts, in this order:
# the tutorial text, the example image, and the problem definition text.
tutorial_part = {"type": "text", "text": komo_tutorial}
image_part = {"type": "image", "image": image_path}
problem_part = {"type": "text", "text": problem_definition}

messages = [
    {"role": "user", "content": [tutorial_part, image_part, problem_part]},
]

# Preparation for inference
# Render the conversation as a prompt string (tokenize=False) with the
# generation prompt appended.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Extract the image and video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)

# Build a one-element batch as PyTorch tensors and move it to the chosen device.
processor_kwargs = {
    "text": [text],
    "images": image_inputs,
    "videos": video_inputs,
    "padding": True,
    "return_tensors": "pt",
}
inputs = processor(**processor_kwargs).to(device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1024)

# Drop the first len(prompt) tokens from every generated sequence so that only
# the tokens following the prompt are decoded (presumably the output echoes the
# prompt tokens first -- confirm against the generate() documentation).
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    prompt_length = len(prompt_ids)
    generated_ids_trimmed.append(full_ids[prompt_length:])

# Decode the batch back to text and show the answer for the only sample.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text[0])

komo = ry.KOMO(C, 1, 1, 0, False)
komo.addControlObjective([], 0, 1e-1)
komo.addControlObjective([], 1, 1e0)
komo.addObjective([1], ry.FS.positionDiff, ['l_gripper', 'blob'], ry.OT.eq, [1e1])
komo.addObjective([1], ry.FS.jointLimits, [], ry.OT.ineq)
komo.addObjective([1], ry.FS.accumulatedCollisions, [], ry.OT.eq)
komo.addObjective([1], ry.FS.jointState, [], ry.OT.eq, [1e1], [], order=1)
komo.addJointLimitConstraint(['l_gripper'], ry.JL.min, [0.1], [0.1])
komo.addJointLimitConstraint(['l_gripper'], ry.JL.max, [1.5], [1.5])
komo.addCollisionConstraint(['l_gripper'], 'blob', ry.CC.min, [0.1], [0.1])
komo.addCollisionConstraint(['l_gripper'], 'blob', ry.CC.max, [0.1], [0.1])
