In [1]:
import sys
import numpy as np
from rlbench.action_modes.action_mode import MoveArmThenGripper
from rlbench.action_modes.arm_action_modes import ArmActionMode, JointVelocity, JointPosition, EndEffectorPoseViaPlanning, EndEffectorPoseViaIK
from rlbench.action_modes.gripper_action_modes import Discrete
from rlbench.environment import Environment
from rlbench.observation_config import ObservationConfig, CameraConfig
from rlbench.tasks import ReachTarget, PickAndLift, StackBlocks, PushButton, StackBlocks, PickUpCup, PlaceHangerOnRack
import matplotlib.pyplot as plt
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from PIL import Image
import torch
from transformers import BitsAndBytesConfig
from pyquaternion import Quaternion
from rlbench.backend.robot import Robot
from scipy.spatial.transform import Rotation
from rlbench.backend.scene import Scene
from pathlib import Path
import os, json

from transformers import AutoModelForVision2Seq
from peft import PeftModel
import argparse
import torch
from vla.action_tokenizer import RLbenchActionTokenizer
from vla.dataset import RLbenchDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model_path = "/media/lawrence/Work/checkpoints/openvla-7b"
adapter_path = "adapter-tmp/openvla-7b+test2+e4+b8+lr-2e-05+lora-r8+dropout-0.0+q-4bit"

In [3]:
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
action_tokenizer = RLbenchActionTokenizer(processor.tokenizer)

In [4]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", #llm_int8_skip_modules = ['projector'],
        )
base_model = AutoModelForVision2Seq.from_pretrained(
        base_model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
        quantization_config=quantization_config,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map = "auto"
    )
vla = PeftModel.from_pretrained(base_model, adapter_path, offload_buffers=True)
vla = vla.merge_and_unload()


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 3/3 [00:34<00:00, 11.44s/it]


In [5]:
from vla.base_prompter import PurePromptBuilder
vla_dataset = RLbenchDataset(
    "./datasets/pick_up_cup/data.pt",
    action_tokenizer,
    processor.tokenizer,
    image_transform=processor.image_processor.apply_transform,
    prompt_builder_fn=PurePromptBuilder,
)

In [6]:
test_action = torch.tensor([1,-0.5,0.5,-3.14,0,3.14,1])
processor.tokenizer(action_tokenizer(test_action))

{'input_ids': [1, 29871, 31697, 31698, 31798, 31898, 31948, 31997, 31999], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
prompt = "In: What action should the robot take to <INSTRUCTION>?\nOut:"
# prompt = prompt.replace("<INSTRUCTION>", instr.lower())
image = Image.fromarray(np.random.random([224,224]))
inputs = processor(prompt, image).to(vla.device, dtype=torch.bfloat16)
inputs['input_ids'] = torch.cat((inputs['input_ids'], torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(inputs['input_ids'].device)), dim=1)

In [8]:
action_dict = vla.generate(**inputs, max_new_tokens = 7, output_scores = True, return_dict_in_generate=True, do_sample=True)

In [9]:
processor.tokenizer.decode(torch.tensor([31879, 31857, 31889, 31885, 31877, 31870, 31744]))

'▓ਿദ巴飛Ħ忠'

In [10]:
class Agent(object):
    def __init__(self, vla, processor, action_tokenizer):
        self.vla = vla
        self.processor = processor
        self.action_tokenizer = action_tokenizer
        
    def act(self, obs, instr):
        prompt = "In: What action should the robot take to <INSTRUCTION>?\nOut:"
        prompt = prompt.replace("<INSTRUCTION>", instr.lower())
        image = Image.fromarray(obs.front_rgb)
        inputs = self.processor(prompt, image).to(self.vla.device, dtype=torch.bfloat16)
        inputs['input_ids'] = torch.cat((inputs['input_ids'], torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(inputs['input_ids'].device)), dim=1)        
        action_dict = self.vla.generate(**inputs, max_new_tokens = 7, output_scores = True, return_dict_in_generate=True)
        ation_score = torch.stack(action_dict['scores']).squeeze(1)[:,self.action_tokenizer.action_token_begin_idx:self.processor.tokenizer.vocab_size]
        pred_action = self.action_tokenizer.decode_token_score_to_actions(ation_score, soft = False)
        return pred_action



In [11]:
agent = Agent(vla, processor, action_tokenizer)

In [12]:
camera = CameraConfig(image_size=(224, 224), depth=False, point_cloud=False, mask=False)
obs_config = ObservationConfig(left_shoulder_camera=camera, right_shoulder_camera=camera, front_camera=camera, overhead_camera=camera)

env = Environment(
    action_mode=MoveArmThenGripper(
        arm_action_mode=EndEffectorPoseViaPlanning(absolute_mode=True, collision_checking=False), gripper_action_mode=Discrete()),
    obs_config=obs_config,
    headless=False)
env.launch()



In [13]:
task = env.get_task(PickUpCup)

In [14]:
descriptions, obs = task.reset()

In [15]:
action_dict['sequences']

tensor([[    1,   512, 29901,  1724,  3158,   881,   278, 19964,  2125,   304,
           529,  1177, 10810, 29965,  9838, 29958, 29973,    13,  3744, 29901,
         29871, 31874, 31999, 31905, 31805, 31871, 31912, 31864]],
       device='cuda:0')

In [16]:
instr = descriptions[1]
prompt = "In: What action should the robot take to {<INSTRUCTION>}?\nOut:"
prompt = prompt.replace("<INSTRUCTION>", instr.lower())
image = Image.fromarray(obs.front_rgb)

inputs = processor(prompt, image).to(vla.device, dtype=torch.bfloat16)
inputs['input_ids'] = torch.cat((inputs['input_ids'], torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(inputs['input_ids'].device)), dim=1)        
action_dict = vla.generate(**inputs, max_new_tokens = 7, output_scores = True, return_dict_in_generate=True)

agent.act(obs, instr)


tensor([ 0.4150,  0.3050,  1.3450,  2.6075, -2.9217, -1.6650,  1.0000],
       device='cuda:0')

In [17]:
agent.act(obs,descriptions[0])

tensor([ 0.2350,  0.3050,  1.2950, -0.9111, -2.4819, -2.5447,  1.0000],
       device='cuda:0')

In [24]:
task = env.get_task(PickUpCup)
training_steps = 1000
episode_length = 100
obs = None
for i in range(training_steps):
    if i % episode_length == 0:
        print('Reset Episode')
        descriptions, obs = task.reset()
        print(descriptions[1])
    try:
        action = agent.act(obs,descriptions[0]).cpu().numpy()
        action_rotation = Rotation.from_euler('xyz', action[3:6])
        action_quaternion = action_rotation.as_quat()
        # print(delta_quaternion)  # returns (qx, qy, qz, qw)
        action = np.concatenate([action[0:3], action_quaternion, action[-1:]])
        print(action)
        obs, reward, terminate = task.step(action)
        print(reward)
    except Exception as e:
        print(e)
        continue

Reset Episode
grasp the red cup and lift it
[ 0.125       0.24499999  1.42499995 -0.06633946  0.64631956 -0.06633946
 -0.75727747  0.        ]
0.0
[ 0.29499999  0.345       1.33500004 -0.6062427  -0.75008563 -0.21538193
 -0.15314035  1.        ]
0.0
[ 0.345       0.39499998  1.47500002  0.61580367 -0.24922724  0.65987038
  0.35105939  1.        ]
A path could not be found. Most likely due to the target being inaccessible or a collison was detected.
[ 0.345       0.39499998  1.47500002  0.61580367 -0.24922724  0.65987038
  0.35105939  1.        ]
A path could not be found. Most likely due to the target being inaccessible or a collison was detected.
[ 0.345       0.39499998  1.47500002  0.61580367 -0.24922724  0.65987038
  0.35105939  1.        ]
A path could not be found. Most likely due to the target being inaccessible or a collison was detected.


KeyboardInterrupt: 

In [None]:
env.shutdown()

[CoppeliaSim:loadinfo]   done.


In [None]:
action

array([-0.02001152,  0.02458013, -0.01649398,  0.00402673, -0.01346854,
        0.02819962,  0.99607843])