In [1]:

import torch
import tqdm
import wandb
from accelerate import PartialState
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, PeftConfig
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from torch.optim.lr_scheduler import StepLR
from transformers import get_linear_schedule_with_warmup

from vla.base_prompter import PurePromptBuilder
from vla.utils import PaddedCollatorForPosePrediction, runningLoss
from vla.action_tokenizer import RLbenchPoseTokenizer
from vla.dataset import RLbenchCotDataset
import numpy as np
import torch.nn.functional as F

import numpy as np
from PIL import Image

from rlbench.action_modes.action_mode import MoveArmThenGripper
from rlbench.action_modes.arm_action_modes import ArmActionMode, JointVelocity, JointPosition, EndEffectorPoseViaPlanning, EndEffectorPoseViaIK


from rlbench.action_modes.gripper_action_modes import Discrete
from rlbench.environment import Environment
from rlbench.observation_config import ObservationConfig, CameraConfig
# from rlbench.tasks.pick_described_object import PickDescribedObject
from rlbench.tasks import PutGroceriesInCupboard, PickAndLift, StackBlocks, PlaceHangerOnRack, PickDescribedObject, TakeLidOffSaucepan, SetTheTable, PutGroceriesInCupboard
from scipy.spatial.transform import Rotation as R
from matplotlib import pyplot as plt
from PIL import Image
from pyrep.const import RenderMode

2024-08-04 15:29:54.592518: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 15:29:54.592580: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 15:29:54.594420: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 15:29:54.602413: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Checkpoint / dataset locations used by the rest of the notebook.
# NOTE(review): base_model_path is an absolute, machine-specific path; the other
# paths are relative to the working directory the notebook is launched from.
base_model_path = "/media/lawrence/Work/checkpoints/ecot-openvla-7b-bridge"
# Three LoRA adapter checkpoints; they are loaded side by side under the adapter
# names "actor_critic", "actor_critic1" and "actor_critic2" further down.
adapter_path = "adapter-tmp/2_sample_data_q+nll+pick_described_object+e1+b8+lr-0.0001+lora-r16+dropout-0.0+q-4bit"
adapter_path1 = "adapter-tmp/weighted_loss_cot_1+nll+pick_described_object2+e1+b8+lr-0.0001+lora-r16+dropout-0.0+q-4bit"
adapter_path2 = "adapter-tmp/weighted_loss+weighted+pick_described_object+e1+b8+lr-0.0005+lora-r16+dropout-0.0+q-4bit"
# Training data file handed to RLbenchCotDataset.
data_path = "datasets/pick_described_object/train_data.pt"


In [5]:
# Load the base vision-language model with 4-bit NF4 weights and bfloat16 compute.
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", #llm_int8_skip_modules = ['projector'],
        )
# trust_remote_code=True executes the modelling code shipped with the checkpoint,
# so base_model_path must point at a trusted checkpoint.
base_model = AutoModelForVision2Seq.from_pretrained(
        base_model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
        quantization_config=quantization_config,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map = "cuda"
    )
# NOTE(review): prepare_model_for_kbit_training is a training-time helper, while the
# visible cells only run inference. Presumably kept so the model matches the setup
# the adapters were trained with - confirm before removing.
base_model = prepare_model_for_kbit_training(base_model)

Loading checkpoint shards: 100%|██████████| 3/3 [00:31<00:00, 10.66s/it]


In [6]:
# Number of <item_i> and <stage_i> marker tokens to register.
item_num = 5
stage_num = 2 
# Extra marker tokens appended to the tokenizer. Their order determines their ids
# (the tokenizer repr further down shows <g> -> 32001 and </g> -> 32002), so do not
# reorder this list without re-checking any code that relies on specific ids.
add_tokens = ['<g>', '</g>'] + [f'<item_{i}>' for i in np.arange(item_num)] + ['<o>', '</o>', '<t>', '</t>'] + [f'<stage_{i}>' for i in np.arange(stage_num)] + ['<a>', '</a>', '<q>', '<cot>']

# trust_remote_code=True runs processor code shipped with the checkpoint.
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
processor.tokenizer.add_tokens(add_tokens)
# (min, max) arrays with 7 entries each, passed to the pose tokenizer.
# NOTE(review): presumably ordered as xyz position, three Euler angles, gripper
# open flag - confirm against RLbenchPoseTokenizer.
dataset_statistics: tuple = (np.array([-0.2, -0.35,  0.75199986, -np.pi/2, -np.pi/2, -np.pi/2,  0. ]), np.array([0.5, 0.35, 1.3, np.pi/2, 0, np.pi/2, 1.])) # Min-Max normalization statistics

action_tokenizer = RLbenchPoseTokenizer(processor.tokenizer, dataset_statistics)
trainset = RLbenchCotDataset(
    data_path,
    action_tokenizer,
    processor.tokenizer,
    image_transform=processor.image_processor.apply_transform,
)

# Pads each batch on the right using the tokenizer's pad token id.
collator = PaddedCollatorForPosePrediction(
    processor.tokenizer.model_max_length, processor.tokenizer.pad_token_id, padding_side="right"
)

# shuffle=False keeps the sample order fixed, so `next(iter(train_dataloader))`
# always yields the same first batch for the inspection cells below.
train_dataloader = DataLoader(
    trainset,
    batch_size=2,
    shuffle=False,
    sampler=None,
    collate_fn=collator,
    num_workers=1,  # Important =>> Set to 0 if using RLDS; TFDS rolls its own parallelism!
)

In [7]:
# Wrap the quantized base model with the first adapter, then attach the two other
# adapters to the same PeftModel under their own names. Switch between them with
# vla.set_adapter(<name>).
vla = PeftModel.from_pretrained(base_model, adapter_path, adapter_name="actor_critic")
vla.load_adapter(adapter_path1, adapter_name="actor_critic1")
vla.load_adapter(adapter_path2, adapter_name="actor_critic2")

_IncompatibleKeys(missing_keys=['base_model.model.vision_backbone.featurizer.pos_embed', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.base_layer.weight', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.base_layer.bias', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.lora_A.actor_critic.weight', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.lora_A.actor_critic1.weight', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.lora_B.actor_critic.weight', 'base_model.model.vision_backbone.featurizer.patch_embed.proj.lora_B.actor_critic1.weight', 'base_model.model.vision_backbone.featurizer.blocks.0.norm1.weight', 'base_model.model.vision_backbone.featurizer.blocks.0.norm1.bias', 'base_model.model.vision_backbone.featurizer.blocks.0.attn.qkv.base_layer.weight', 'base_model.model.vision_backbone.featurizer.blocks.0.attn.qkv.base_layer.bias', 'base_model.model.vision_backbone.featurizer.blocks.0.attn.qkv.lora_A.actor_critic.

In [8]:
def get_instruct_prompt(action_tokenizer, gripper, instruction: str):
    """Build the "In: ... Out: <cot> " text prompt for a key-pose query.

    Args:
        action_tokenizer: Accepted but not used; `gripper` is interpolated as given.
        gripper: Value placed between the `<g>` and `</g>` markers.
        instruction: Task description inserted into the question.

    Returns:
        The prompt string. It ends with a space after `<cot>`.
    """
    question = f"In: What should be the next key pose of the gripper to {instruction}? The current gripper pose is <g>{gripper} </g>.\n "
    answer_prefix = "Out: <cot> "
    return question + answer_prefix

In [9]:
batch = next(iter(train_dataloader))

In [10]:
torch.tensor(processor.tokenizer(get_instruct_prompt(None,1,1)).input_ids)

tensor([    1,   512, 29901,  1724,   881,   367,   278,  2446,  1820, 18593,
          310,   278,   330,   374,  2496,   304, 29871, 29896, 29973,   450,
         1857,   330,   374,  2496, 18593,   338, 32001, 29896, 32002, 29889,
           13,  4451, 29901, 32017, 29871])

In [11]:
batch['input_ids'][:,]

tensor([[    1,   512, 29901,  1724,   881,   367,   278,  2446,  1820, 18593,
           310,   278,   330,   374,  2496,   304,  4337,   278, 26438,  3800,
           304,   278, 25972, 29973,   450,  1857,   330,   374,  2496, 18593,
           338, 32001, 31417, 31501, 31668, 31747, 31888, 31995, 31998, 32002,
         29889,    13,  4451, 29901, 32017, 32007, 29892, 32008, 31437, 31588,
         31613, 32009, 29892, 32010, 31398, 31501, 31643, 32011, 29892, 32012,
         29892, 32014, 31417, 31501, 31668, 31747, 31888, 31995, 31999, 32015,
         29892, 32016, 32000,     2],
        [    1,   512, 29901,  1724,   881,   367,   278,  2446,  1820, 18593,
           310,   278,   330,   374,  2496,   304,  1925,   278, 26438,  3800,
           297,   278, 25972, 29973,   450,  1857,   330,   374,  2496, 18593,
           338, 32001, 31425, 31579, 31669, 31764, 31856, 31926, 31999, 32002,
         29889,    13,  4451, 29901, 32017, 32007, 29892, 32008, 31437, 31588,
         31613

In [12]:
# prompt = get_instruct_prompt(gripper,instr)
# image = Image.fromarray(obs.front_rgb)
# inputs = processor(prompt, image).to(vla.device, dtype=torch.bfloat16)


In [13]:
batch['input_ids'][:,:-29]

tensor([[    1,   512, 29901,  1724,   881,   367,   278,  2446,  1820, 18593,
           310,   278,   330,   374,  2496,   304,  4337,   278, 26438,  3800,
           304,   278, 25972, 29973,   450,  1857,   330,   374,  2496, 18593,
           338, 32001, 31417, 31501, 31668, 31747, 31888, 31995, 31998, 32002,
         29889,    13,  4451, 29901, 32017],
        [    1,   512, 29901,  1724,   881,   367,   278,  2446,  1820, 18593,
           310,   278,   330,   374,  2496,   304,  1925,   278, 26438,  3800,
           297,   278, 25972, 29973,   450,  1857,   330,   374,  2496, 18593,
           338, 32001, 31425, 31579, 31669, 31764, 31856, 31926, 31999, 32002,
         29889,    13,  4451, 29901, 32017]])

In [14]:
batch["pixel_values"][1:2].shape

torch.Size([1, 3, 224, 224])

In [15]:
# Greedy-decode a continuation for the second sample of the batch using the
# "actor_critic" adapter.
vla.set_adapter("actor_critic")
with torch.no_grad(), torch.autocast("cuda", dtype=torch.bfloat16):
    output_dict = vla.generate(
        # Drop the last 29 token positions so the input stops at the <cot> token
        # (id 32017 in the slice printed above).
        # NOTE(review): 29 is specific to this batch's sequence length/padding.
        input_ids = batch['input_ids'][1:2,:-29].to(vla.device),
        pixel_values=batch["pixel_values"][1:2].to(torch.bfloat16).to(vla.device),
        max_new_tokens = 28,
        # do_sample=False selects greedy decoding.
        do_sample=False,
        temperature=1,
        # Return a structured output exposing `.sequences` and per-step `.scores`.
        return_dict_in_generate=True,
        output_scores = True,
        
    )

In [16]:
_, _, _, _,_, action_mask = action_tokenizer.get_mask(output_dict.sequences)

In [17]:
processor.tokenizer

LlamaTokenizerFast(name_or_path='/media/lawrence/Work/checkpoints/ecot-openvla-7b-bridge', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<PAD>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<PAD>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<g>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	32002: AddedToken("</g>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	32003: AddedT

In [18]:
processor.tokenizer.decode(output_dict.sequences[0].cpu().numpy())

'<s> In: What should be the next key pose of the gripper to put the sugar box in the basket? The current gripper pose is<g>七思节ɫ터𝓝给</g>.\n Out:<cot><item_4>,<o>认ˆ态</o>,<t>交ペམ</t>,<stage_0>,<a>认ˆ್면ḳ收弘</a>,<q>,'

In [19]:
class Agent(object):
    def __init__(self, vla, processor, action_tokenizer):
        self.vla = vla
        self.processor = processor
        self.action_tokenizer = action_tokenizer

    def get_openvla_prompt(self, instruction: str):
        SYSTEM_PROMPT = "You are an assistant helping to control a robotic manipulator. The robot performs tasks by following a series of steps to interact with objects in its environment. The environment includes items like soup cans and baskets, and the robot uses a gripper to pick up and move these items.\n\nInstructions format:\n- 'USER': Describes the task to be performed.\n- 'ASSISTANT': Provides a detailed step-by-step plan for the robot to execute the task.\n\nThe 'ASSISTANT' response includes:\n1. A logical step-by-step plan for the task.\n2. The current positions of relevant objects and the gripper.\n3. The current state of the gripper (whether it has grasped the object or not).\n4. The next key pose of the gripper to achieve the task.\n\nExample:\n\nUSER: What action should the robot take to pick up the soup and place it in the basket?\nASSISTANT: Let's think step by step. The plan is to move the gripper to the soup and pick it up, then move over the basket, and then place the soup in the basket. The soup is located at <object>ĉ‖호 </object>. The basket is located at <target>Ζ‖ご </target>. The gripper pose is <gripper>阳‖素군雅导弘 </gripper>. The gripper hasn't grasped the soup. So the current step is to move the gripper to the soup and pick it up. The next key pose of the gripper is <action>机‖素秀麻방弘 </action>. \n <current conversation>"
        return f"{SYSTEM_PROMPT} USER: What action should the robot take to {instruction.lower()}? ASSISTANT: Let's think step by step,"

    def get_instruct_prompt(self, gripper, instruction: str):
        gripper = self.action_tokenizer(gripper)
        prompt = (
            f"In: What should be the next key pose of the gripper to {instruction}? The current gripper pose is <g>{gripper} </g>.\n "
            "Out: <cot>"
        )
        return prompt

    def act(self, obs, instr, temperature = 1, deterministic = False):
        gripper = np.concatenate([obs.gripper_pose,[obs.gripper_open]])
        prompt = self.get_instruct_prompt(gripper,instr)
        image = Image.fromarray(obs.front_rgb)
        inputs = self.processor(prompt, image).to(self.vla.device, dtype=torch.bfloat16)
        while True:
            with torch.autocast("cuda", dtype=torch.bfloat16):
                output_dict = vla.generate(
                    **inputs,
                    max_new_tokens = 28,
                    do_sample=False,
                    temperature=1,
                    return_dict_in_generate=True,
                    output_scores = True,
                )
            # output_dict = vla.generate(**inputs, max_new_tokens = 50, output_scores = True, return_dict_in_generate=True, do_sample = True, temperature = 0.5)
            gripper_mask, item_mask, object_mask, target_mask, stage_mask, action_mask = action_tokenizer.get_mask(output_dict.sequences)
            if action_mask.sum().item() != 7:
                print("Action mask is not correct")
                continue
            break
        print(processor.tokenizer.decode(output_dict.sequences[0]))
        output_logits = torch.stack(output_dict.scores, dim = 1)
        action_logits = output_logits[action_mask[:,-output_logits.size(1):]][:,action_tokenizer.action_token_begin_idx:processor.tokenizer.vocab_size].view(1,-1,action_tokenizer.n_bins)
        action = self.action_tokenizer.get_action(action_logits,temperature = temperature, deterministic = deterministic)
        # get_action(self, logits: torch.tensor, temperature: float = 1, deterministic: bool = False)
        q = output_logits[:,-1, 32016]

        return action, q
        

In [20]:
agent = Agent(vla,processor,action_tokenizer)

In [21]:
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.

    Transitions are stored in preallocated float32 numpy arrays. Once `size`
    transitions have been stored, new ones overwrite the oldest.

    Args:
        obs_dim: Observation dimension; an int or a shape tuple.
        act_dim: Action dimension; an int or a shape tuple.
        size: Maximum number of transitions kept.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(self._combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(self._combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(self._combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        # ptr: next write slot; size: number of valid entries; max_size: capacity.
        self.ptr, self.size, self.max_size = 0, 0, size

    @staticmethod
    def _combined_shape(length, shape=None):
        """Return the array shape `(length, *shape)`, accepting a scalar or tuple `shape`."""
        if shape is None:
            return (length,)
        return (length, shape) if np.isscalar(shape) else (length, *shape)

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the current slot and advance the ring pointer."""
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        """Sample `batch_size` stored transitions uniformly, with replacement.

        Returns:
            Dict with keys `obs`, `obs2`, `act`, `rew`, `done`, each a float32 tensor.

        Raises:
            ValueError: If nothing has been stored yet.
        """
        if self.size == 0:
            raise ValueError("Cannot sample from an empty ReplayBuffer")
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}


# RLbench Env

In [6]:
env.shutdown()

[CoppeliaSim:loadinfo]   done.


In [2]:
# RGB-only camera config at 224x224, shared by all four cameras.
camera = CameraConfig(image_size=(224, 224), depth=False, point_cloud=False, mask=False)
obs_config = ObservationConfig(left_shoulder_camera=camera, right_shoulder_camera=camera, front_camera=camera, overhead_camera=camera)
# NOTE(review): all four cameras were given the same `camera` object above, so
# this assignment presumably changes the render mode for every camera - confirm.
obs_config.front_camera.render_mode = RenderMode.OPENGL

# Actions are an absolute end-effector pose reached via planning (collision
# checking disabled) followed by a discrete gripper command.
env = Environment(
    action_mode=MoveArmThenGripper(
        arm_action_mode=EndEffectorPoseViaPlanning(absolute_mode=True, collision_checking=False), gripper_action_mode=Discrete()),
    obs_config=obs_config,
    headless=False, shaped_rewards = True)
# headless=False opens the simulator window; call env.shutdown() when finished.
env.launch()
task = env.get_task(PickDescribedObject)

In [34]:
agent.vla.set_adapter("actor_critic")
agent.vla.active_adapter

'actor_critic1'

In [38]:
import gc
gc.collect()

8083

In [39]:
torch.cuda.empty_cache()

In [16]:
task = env.get_task(PickDescribedObject)
task.sample_variation()
descriptions, obs = task.reset()
task.step(np.concatenate([task._task.get_waypoints()[1]._waypoint.get_pose(),[0]]))

(<rlbench.backend.observation.Observation at 0x7b9e0397e310>,
 -0.8147093484236813,
 False)

In [17]:
task.step(np.concatenate([task._task.get_waypoints()[0]._waypoint.get_pose(),[0]]))

(<rlbench.backend.observation.Observation at 0x7b9e0397da90>, 0, False)

In [18]:
task.step(np.concatenate([task._task.get_waypoints()[1]._waypoint.get_pose(),[1]]))

(<rlbench.backend.observation.Observation at 0x7b9e0397e5d0>, 0, True)

In [21]:
task._task.robot.gripper.get_joint_positions()

[0.038983672857284546, 0.03901934623718262]

In [4]:
task._task.get_waypoints()[1]._waypoint.get_pose()

array([-1.99999928e-01, -3.24999958e-01,  1.20000005e+00,  4.37113883e-08,
        1.00000000e+00,  7.54979013e-08, -3.30011808e-15])

In [41]:
# Roll out one episode with the "actor_critic" adapter.
# Upper bound on agent/environment interactions, so the cell always finishes even
# if the policy keeps producing failing actions or the task never terminates.
MAX_EPISODE_STEPS = 25

task.sample_variation()
descriptions, obs = task.reset()
agent.vla.set_adapter("actor_critic")
torch.cuda.empty_cache()

for step_idx in range(MAX_EPISODE_STEPS):
    try:
        action, q = agent.act(obs, descriptions[0], temperature=1, deterministic=False)
        # The policy emits xyz Euler angles; task.step is given a quaternion instead.
        action_rotation = R.from_euler('xyz', action[3:6])
        action_quaternion = action_rotation.as_quat()
        action = np.concatenate([action[0:3], action_quaternion, action[-1:]])
        obs, reward, terminate = task.step(action)
    except Exception as e:
        # Best effort: report the failure and try again from the current
        # observation. MAX_EPISODE_STEPS bounds the number of attempts.
        print(f"step {step_idx} failed: {type(e).__name__}: {e}")
        continue
    print(reward, terminate)
    if terminate:
        break
else:
    print(f"Episode did not terminate within {MAX_EPISODE_STEPS} steps")

<s> In: What should be the next key pose of the gripper to put the chocolate dessert mix in the basket? The current gripper pose is<g>ड广শữḳ洲</g>.
 Out:<cot><item_0>,<o>创요関</o>,<t>交ペམ</t>,<stage_0>,<a>认马共면ḳ收弘</a>,<q>,
-2.799954216043461 False
Action mask is not correct
Action mask is not correct
Action mask is not correct
Action mask is not correct


KeyboardInterrupt: 

In [26]:
reward

NameError: name 'reward' is not defined

In [None]:
agent.act(obs, descriptions[0], temperature=10, deterministic=False)

<s> In: What should be the next key pose of the gripper to put the sugar box in the basket? The current gripper pose is<g>景ἱ߬ữ飛舞</g>.
 Out:<cot><item_4>,<o>认ˆ态</o>,<t>交ペམ</t>,<stage_0>,<a>认ˆ್면ḳ收弘</a>,<q>,


(array([ 0.0485    , -0.0525    ,  0.86981989, -2.71747765, -0.00785399,
        -1.52367244,  1.        ]),
 tensor([2.7188], device='cuda:0'))