In [5]:
from models.openvla_oft.configuration_prismatic import OpenVLAConfig
from models.openvla_oft.modeling_prismatic import OpenVLAForActionPrediction
from models.openvla_oft.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
# from models.openvla_oft.openvla_utils import update_auto_map, check_model_logic_mismatch
from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
# Register the locally vendored OpenVLA classes with the transformers Auto* factories
# under the "openvla" model type / OpenVLAConfig, so the Auto*.from_pretrained calls in
# later cells can be pointed at this checkpoint.
AutoConfig.register("openvla", OpenVLAConfig)
AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)
# Checkpoint directory consumed by the from_pretrained calls in later cells.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
local_path = "/file_system/common-models/SimpleVLA-RL/Openvla-oft-SFT-libero10-trajall"
# if self.rank == 0:
#update_auto_map(local_path)
#check_model_logic_mismatch(local_path)

In [6]:
from models.openvla_oft.configuration_prismatic import OpenVLAConfig
from models.openvla_oft.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
print("*********USE VLA tokenizer*************")
# These two registrations repeat the ones made in the first cell.
AutoConfig.register("openvla", OpenVLAConfig)
AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
# Load the processor stored alongside the checkpoint; `processor` is used as a
# notebook-level global by process_input and _generate_one_step.
processor = AutoProcessor.from_pretrained(local_path, trust_remote_code=True)
# Convenience alias for the processor's tokenizer.
tokenizer=processor.tokenizer

*********USE VLA tokenizer*************


In [None]:
import os
import torch
import json
# Weights are loaded in float32; _generate_one_step runs the forward pass under
# bfloat16 autocast.
torch_dtype = torch.float32

# override model kwargs
actor_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=True)

actor_module = AutoModelForVision2Seq.from_pretrained(
    pretrained_model_name_or_path=local_path,
    torch_dtype=torch_dtype,
    #attn_implementation="flash_attention_2",
    config=actor_model_config,
    trust_remote_code=True,
)
print(actor_module)
#oft add
# Tell the vision backbone that each sample carries a single image
# (_obs_to_input only supplies "full_image", no wrist image).
actor_module.vision_backbone.set_num_images_in_input(1)

# Attach the statistics stored next to the checkpoint, if present, as
# `actor_module.norm_stats` (the warning below ties their absence to a missing `unnorm_key`).
dataset_statistics_path = os.path.join(local_path, "dataset_statistics.json")
if os.path.isfile(dataset_statistics_path):
    with open(dataset_statistics_path, "r") as f:
        norm_stats = json.load(f)
    actor_module.norm_stats = norm_stats
else:
    # Adjacent string literals are concatenated verbatim, so each sentence must carry
    # its own separator (the original printed "checkpoint.Otherwise").
    print(
        "WARNING: No local dataset_statistics.json file found for current checkpoint.\n"
        "You can ignore this if you are loading the base VLA (i.e. not fine-tuned) checkpoint. "
        "Otherwise, you may run into errors when trying to call `predict_action()` due to an absent `unnorm_key`."
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  7.14it/s]

OpenVLAForActionPrediction(
  (vision_backbone): PrismaticVisionBackbone(
    (featurizer): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp)




In [9]:
import numpy as np
from env.libero_env import LiberoEnv
from env.libero_utils import get_libero_image, quat2axisangle
def _obs_to_input(obs):
    """Build the policy input dict from a raw environment observation.

    Args:
        obs: mapping that provides "robot0_eef_pos", "robot0_eef_quat" and
            "robot0_gripper_qpos", plus whatever `get_libero_image` reads from it.

    Returns:
        dict with two entries:
            "full_image": result of `get_libero_image(obs, 224)`.
            "state": end-effector position, the `quat2axisangle` conversion of the
                end-effector quaternion, and the gripper joint positions,
                concatenated in that order.
    """
    # remove the wrist image
    full_image = get_libero_image(obs, 224)
    state_parts = [
        obs["robot0_eef_pos"],
        quat2axisangle(obs["robot0_eef_quat"]),
        obs["robot0_gripper_qpos"],
    ]
    return {"full_image": full_image, "state": np.concatenate(state_parts)}







Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [None]:
from PIL import Image
import torchvision.transforms.functional as F
from torch.nn.utils.rnn import pad_sequence
def center_crop_image(image: Image.Image) -> Image.Image:
    """Center-crop an image to 90% of each side, then resize back to the original size.

    Args:
        image: PIL image to crop.

    Returns:
        An RGB PIL image with the same width and height as `image`.
    """
    # NOTE(review): the scale is applied to each side, so the kept area is ~0.81 of the
    # original — confirm this matches the crop used when the checkpoint was trained.
    crop_scale = 0.9
    width, height = image.size
    target_hw = (int(height * crop_scale), int(width * crop_scale))

    tensor = F.to_tensor(image)
    cropped = F.center_crop(tensor, target_hw)
    # Resize takes (height, width), the reverse of PIL's (width, height) `size`.
    restored = F.resize(cropped, (height, width))
    return F.to_pil_image(restored).convert("RGB")

def process_input(inputs:list, task_descriptions:list, config):
    """Convert per-sample observations into one left-padded batch on the GPU.

    Uses the notebook-level `processor` and `center_crop_image`.

    Args:
        inputs: list of dicts with a "full_image" array and optionally a "wrist_image" array.
        task_descriptions: list of task strings, indexed in parallel with `inputs`.
        config: mapping with a "center_crop" flag; when truthy every image is center-cropped.

    Returns:
        dict with "input_ids", "attention_mask" and "pixel_values" tensors on the CUDA
        device. Padding positions are moved to the front of every row.
    """
    # Token id every prompt must end with.
    # NOTE(review): presumably the tokenizer's empty/space token — confirm.
    trailing_token_id = 29871

    ids_per_sample, mask_per_sample, pixels_per_sample = [], [], []

    for i, obs_input in enumerate(inputs):
        task_description = task_descriptions[i]

        image = Image.fromarray(obs_input["full_image"]).convert("RGB")
        if config["center_crop"]:
            image = center_crop_image(image)
        prompt = f"In: What action should the robot take to {task_description.lower()}?\nOut:"
        features = processor(prompt, image)
        pixel_values = features["pixel_values"]

        if "wrist_image" in obs_input.keys():
            wrist_image = Image.fromarray(obs_input["wrist_image"]).convert("RGB")
            if config["center_crop"]:
                wrist_image = center_crop_image(wrist_image)
            wrist_features = processor(prompt, wrist_image)
            # Stack the wrist view after the primary view along dim 1.
            pixel_values = torch.cat((pixel_values, wrist_features["pixel_values"]), dim=1)

        input_ids = features["input_ids"]
        attention_mask = features["attention_mask"]

        # Append the trailing token (and a matching attended position) when it is missing.
        if bool((input_ids[:, -1] != trailing_token_id).any()):
            extra_id = torch.tensor([[trailing_token_id]], dtype=torch.long, device=input_ids.device)
            extra_mask = torch.ones((1, 1), dtype=torch.bool, device=attention_mask.device)
            input_ids = torch.cat((input_ids, extra_id), dim=1)
            attention_mask = torch.cat((attention_mask, extra_mask), dim=1)

        ids_per_sample.append(input_ids)
        mask_per_sample.append(attention_mask)
        pixels_per_sample.append(pixel_values)

    device = torch.device('cuda')
    pad_id = processor.tokenizer.pad_token_id

    # pad_sequence pads along the first dimension: transpose each sample, pad to the
    # longest one, then squeeze away the trailing dimension again.
    input_ids = pad_sequence(
        [ids.transpose(0, 1) for ids in ids_per_sample],
        batch_first=True,
        padding_value=pad_id,
    ).squeeze(-1).to(device)
    attention_mask = pad_sequence(
        [mask.transpose(0, 1) for mask in mask_per_sample],
        batch_first=True,
        padding_value=0,
    ).squeeze(-1).to(device)

    # Pad positions found via the token id must agree with the attention mask.
    is_pad = input_ids.eq(pad_id)
    assert torch.all((~is_pad) == attention_mask.ne(0))

    # A stable descending sort on the pad flag moves padding to the left while keeping
    # the relative order of the real tokens.
    order = torch.argsort(is_pad.int(), dim=1, descending=True, stable=True)
    input_ids = torch.gather(input_ids, 1, order)
    attention_mask = torch.gather(attention_mask, 1, order)

    pixel_values = torch.cat(pixels_per_sample, dim=0).to(device)
    assert torch.all(attention_mask.ne(0) == input_ids.ne(pad_id))

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
    }
    

In [None]:
# Move the loaded model to the GPU; process_input places its tensors on 'cuda' as well.
actor_module = actor_module.to('cuda')
# Switch the model to evaluation mode before running rollouts.
actor_module.eval()
# Last expression of the cell: displays the model's device attribute.
actor_module.device

TypeError: 'torch.device' object is not callable

In [22]:
# Default sampling settings; _generate_one_step uses them unless the prompt dict
# carries its own 'do_sample' / 'temperature' entries.
DO_SAMPLE = True
TEMP = 1.6
# Passed to the model as `unnorm_key`; the final value is "libero_10_no_noops".
UNNORM_KEY = "libero_10"
UNNORM_KEY = f"{UNNORM_KEY}_no_noops"
# Length to which _generate_one_step left-pads input_ids and attention_mask.
MAX_PROMPT_LENGTH = 512
def pad_sequence_to_length(tensors, max_seq_len, pad_token_id, left_pad=False):
    """
    Pad the last dimension of a tensor (e.g. responses, logprobs) up to `max_seq_len`.

    input shape: [bs, seq_length]
    output shape: [bs, max_seq_len]

    Args:
        tensors: tensor to pad.
        max_seq_len: target size of the last dimension.
        pad_token_id: constant used to fill the new positions.
        left_pad: pad on the left when True, on the right (default) otherwise.

    Returns:
        The padded tensor, or `tensors` itself (same object) when its last
        dimension is already at least `max_seq_len`.
    """
    missing = max_seq_len - tensors.shape[-1]
    if missing <= 0:
        return tensors
    # The pad tuple is (left, right) for the last dimension.
    if left_pad:
        widths = (missing, 0)
    else:
        widths = (0, missing)
    return torch.nn.functional.pad(tensors, widths, mode='constant', value=pad_token_id)

@torch.no_grad()
def _generate_one_step(prompts: dict):
    """Run one action-generation call on a prepared batch.

    Uses the notebook-level `actor_module`, `processor`, `DO_SAMPLE`, `TEMP`,
    `UNNORM_KEY` and `MAX_PROMPT_LENGTH`.

    Args:
        prompts: dict with 'input_ids' (bs, prompt_length), 'attention_mask'
            (left-padded) and 'pixel_values', all on a CUDA device. Optional
            'do_sample' and 'temperature' entries override the notebook defaults.

    Returns:
        dict with 'responses' and 'action' (the two values returned by
        `actor_module.generate_action_verl`), 'input_ids' and 'attention_mask'
        left-padded to MAX_PROMPT_LENGTH, and the unchanged 'pixel_values'.

    Raises:
        AssertionError: if the tokenizer has no pad token, if input_ids or
            attention_mask is not 2-D, or if any tensor is not on a CUDA device.
    """
    idx = prompts['input_ids']  # (bs, prompt_length)
    attention_mask = prompts['attention_mask']  # left-padded attention_mask
    pixel_values = prompts["pixel_values"]

    # Validate inputs before the expensive model call, so a bad batch fails fast
    # instead of after generation has already consumed it.
    pad_token_id = processor.tokenizer.pad_token_id
    assert pad_token_id is not None
    assert idx.ndim == 2
    assert attention_mask.ndim == 2
    assert idx.device.type == 'cuda'
    assert attention_mask.device.type == 'cuda'
    assert pixel_values.device.type == 'cuda'

    # make sampling args can be overriden by inputs
    do_sample = prompts.get('do_sample', DO_SAMPLE)
    temperature = prompts.get('temperature', TEMP)

    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        actions, response = actor_module.generate_action_verl(
            input_ids=idx,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            padding_idx=pad_token_id,
            do_sample=do_sample,
            unnorm_key=UNNORM_KEY,
            temperature=temperature,
        )

    assert response.device.type == 'cuda'

    # Left-pad the prompt tensors to a fixed length so every step's record has the same width.
    idx = pad_sequence_to_length(idx, max_seq_len=MAX_PROMPT_LENGTH, pad_token_id=pad_token_id, left_pad=True)
    attention_mask = pad_sequence_to_length(attention_mask, max_seq_len=MAX_PROMPT_LENGTH, pad_token_id=0, left_pad=True)

    batch = {
        'responses': response,
        'input_ids': idx,
        'attention_mask': attention_mask,
        "pixel_values": pixel_values,
        "action": actions,
    }

    return batch

In [None]:
from collections import defaultdict
import copy
import torch.cuda.profiler as profiler

# Options shared by the environment and by process_input.
config = {
    "center_crop": True,
    "num_steps_wait": 10
}

# Step budget for the rollout and the amount `step` advances per policy call.
max_steps = 512
action_chunks_len = 8

# Profile the rollout; traces are written to ./traces for TensorBoard.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    profile_memory=True,  # enable collection of memory data
    record_shapes=True,  # enable collection of operator input shapes
    with_stack=True,
    with_flops=True,
    with_modules=True,
    schedule=torch.profiler.schedule(wait=10, warmup=1, active=3, repeat=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./traces")
) as prof:
    # NOTE(review): the checkpoint path and UNNORM_KEY refer to libero_10 while this
    # environment uses task_name="libero_goal" — confirm the pairing is intended.
    libero_env = LiberoEnv(task_name="libero_goal", task_id=0, trial_id=0, is_valid=True, max_steps=50, config=config)
    valid_video = defaultdict(list)
    vla_history = []
    init_data = libero_env.get_initial_state()
    print("Initial data:", init_data)
    task_descriptions = [init_data['task_description']]

    valid_video[init_data['task_file_name']].extend(init_data['valid_images'])
    env_data = copy.deepcopy(init_data)
    env_obs = env_data['obs']

    # Advance by one action chunk per policy call. Reassigning the loop variable of
    # `for step in range(max_steps)` has no effect on the next iteration, so the
    # stride is expressed in the range itself.
    # NOTE(review): the loop does not inspect `result` for episode termination —
    # confirm whether the env reports completion and stop early if it does.
    for step in range(0, max_steps, action_chunks_len):
        prof.step()
        inputs = [_obs_to_input(env_obs)]
        vla_input = process_input(inputs, task_descriptions, config)
        vla_output = _generate_one_step(vla_input)
        actions = vla_output["action"]
        # Keep a record of every policy call (the tensors stay on their current device).
        step_data = {
            "responses": vla_output["responses"],
            "input_ids": vla_output["input_ids"],
            "attention_mask": vla_output["attention_mask"],
            "pixel_values": vla_output["pixel_values"],
            "action": actions,
            "step": step
        }
        vla_history.append(step_data)

        # Batch size is 1 here, so pass the first (only) sample's actions to the env.
        result = libero_env.step(actions[0])
        valid_video[init_data['task_file_name']].extend(result['valid_images'])
        env_obs = result["obs"]
    

[info] using task orders [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Initial data: {'type': 'init', 'obs': OrderedDict([('robot0_joint_pos', array([ 0.00656868, -0.18201341,  0.01442465, -2.47342319,  0.00391101,
        2.23496506,  0.80100066])), ('robot0_joint_pos_cos', array([ 0.99997843,  0.98348124,  0.99989597, -0.78495709,  0.99999235,
       -0.61640478,  0.69598853])), ('robot0_joint_pos_sin', array([ 0.00656864, -0.18101009,  0.01442415, -0.61955014,  0.003911  ,
        0.78742946,  0.7180529 ])), ('robot0_joint_vel', array([-2.01874574e-03,  1.03248609e-05,  7.92422787e-06, -2.52319442e-05,
        5.34783824e-03, -4.40832558e-05,  1.03695310e-02])), ('robot0_eef_pos', array([-0.21680813,  0.00942231,  1.17117443])), ('robot0_eef_quat', array([ 9.99600340e-01,  1.55205698e-03, -2.82250684e-02,  3.09803869e-04])), ('robot0_gripper_qpos', array([ 0.03872147, -0.03872411])), ('robot0_gripper_qvel', array([ 0.00326971, -0.00325662])), ('agentview_image', array([[[180, 165, 147],
        [1

  return Image.fromarray(npimg, mode=mode)


In [None]:
# Display the per-step records collected by the rollout loop. Each entry holds
# tensors, so the rendered output can be very large.
vla_history

In [25]:
# Show which task-file keys have frames collected for them.
print(valid_video.keys())

dict_keys(['libero_goal_task_0_trial_0'])


In [26]:
import imageio
import random
def save_rollout_video(rollout_images, exp_name, task_name, step_idx, success ):
    """Saves an MP4 replay of an episode.

    Args:
        rollout_images: iterable of frames, each passed to the writer's `append_data`.
        exp_name: sub-directory of ./rollouts that receives the file (created if missing).
        task_name: task identifier embedded in the file name.
        step_idx: step index embedded in the file name.
        success: success flag embedded in the file name.

    Returns:
        Path of the written MP4 file.
    """
    rollout_dir = f"./rollouts/{exp_name}"
    os.makedirs(rollout_dir, exist_ok=True)
    # Random suffix so repeated saves for the same task/step are unlikely to overwrite each other.
    ran_id = random.randint(1, 10000)
    mp4_path = f"{rollout_dir}/step={step_idx}--task={task_name}--success={success}--ran={ran_id}.mp4"
    video_writer = imageio.get_writer(mp4_path, fps=30)
    try:
        for img in rollout_images:
            video_writer.append_data(img)
    finally:
        # Always release the writer, even if a frame is rejected, so the file
        # handle is not leaked.
        video_writer.close()
    print(f"Saved rollout MP4 at path {mp4_path}")
    return mp4_path

In [27]:
# Write one MP4 per task file that has collected frames.
for task_file in valid_video:
    images = valid_video[task_file]
    # complete = any(r['complete'] for r in task_records if r['task_file_name'] == task_file)
    # Episode success is not tracked in this notebook, so every clip is tagged False.
    complete = False
    save_rollout_video(images, "0", task_file, 0, complete)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Saved rollout MP4 at path ./rollouts/0/step=0--task=libero_goal_task_0_trial_0--success=False--ran=2133.mp4
