In [2]:
import json
import os
import re
import time
from scripts.generate_embodied_data.primitive_movements import get_move_primitives_episode
import tensorflow_datasets as tfds
from prismatic import load # ?important for tfds to not cost GPU memory

2024-10-20 00:49:50.484811: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 00:49:50.484893: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 00:49:50.485783: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 00:49:50.491674: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Directory of a locally prepared TFDS dataset; the builder is reconstructed from it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
local_path="/home/lixiang/codebase/embodied-CoT/datasets/libero/libero_spatial_no_noops/1.0.0"
builder = tfds.builder_from_directory(builder_dir=local_path)

In [4]:
# One index per example in the 'train' split.
# NOTE(review): the name is misspelled ("episide") and no later visible cell reads it —
# confirm before renaming or removing.
episide_ids = range(builder.info.splits['train'].num_examples)

In [5]:
def build_prompt(features, language_instruction, caption=None, list_only_moves=False):
    """Render the trajectory-annotation prompt for one episode.

    Args:
        features: Mapping from feature name to a list holding one value per step.
            The number of steps is taken from the list stored under the first key.
            Must contain "move_primitive" when `list_only_moves` is true.
        language_instruction: Task instruction quoted verbatim in the prompt.
        caption: Optional scene description. When given, a "Scene description"
            section is inserted before the objective.
        list_only_moves: When true, each step lists only its move primitive;
            otherwise every feature is listed per step.

    Returns:
        The complete prompt as a single string.
    """
    keys = list(features.keys())

    # Collect the rendered pieces and join them once instead of re-growing a string.
    feature_lines = ["{\n"]

    for i in range(len(features[keys[0]])):
        if list_only_moves:
            move = features["move_primitive"][i]
            # The trailing comma keeps the rendered block a valid Python dict literal,
            # matching the comma-terminated entries of the full-feature branch below.
            feature_lines.append(f'    {i}: "{move}",\n')
        else:
            feature_lines.append(f"    {i}: {{\n")

            for key in keys:
                feature_value = features[key][i]
                if isinstance(feature_value, str):
                    feature_value = f'"{feature_value}"'

                feature_lines.append(f'        "{key}": {feature_value},\n')

            feature_lines.append("    },\n")

    feature_lines.append("}")
    structured_features = "".join(feature_lines)

    if list_only_moves:
        features_desc = (
            "Each entry in that dictionary corresponds to a single step on the "
            "trajectory and describes the move that is about to be executed."
        )
    else:
        features_desc = (
            "Each entry in that dictionary corresponds to a single step on "
            "the trajectory. The provided features are the following:\n"
            "\n"
            '- "state_3d" are the current 3d coordinates of the robotic arm end effector; '
            "moving forward increases the first coordinate; moving left increases the second "
            "coordinate; moving up increases the third coordinate,\n"
            '- "move_primitive" describes the move that is about to be executed,\n'
            '- "gripper_position" denotes the location of the gripper in the 256x256 image observation'
        )

    if caption is None:
        caption = ""
    else:
        caption = f"""## Scene description

The robot is operating in the following environment. {caption}

"""

    # `break_line` expands to nothing: it only lets a paragraph be wrapped across source
    # lines (the newline sits inside the replacement field) without adding newlines to
    # the prompt text.
    break_line = ""

    return f"""# Annotate the training trajectory with reasoning

## Specification of the experimental setup

You're an expert reinforcement learning researcher. You've trained an optimal policy for controlling a robotic arm. The
robot successfully completed a task specified by the instruction: "{language_instruction}". For that purpose, the
robotic arm executed a sequence of actions. Consecutive moves that were executed are the following:


```python
trajectory_features = {structured_features}
```

{features_desc}

{caption}## Your objective

I want you to annotate the given trajectory with reasoning. That is, for each step, I need to know not only {
break_line}which action should be chosen, but importantly what reasoning justifies that action choice. I want you to {
break_line}be descriptive and include all the relevant information available. The reasoning should include the task {
break_line}to complete, the remaining high-level steps, the high-level movements that should be executed and why they {
break_line}are required, the premises that allow inferring the direction of each move, including the locations of {
break_line}relevant objects, possible obstacles or difficulties to avoid, and any other relevant justification.

### Begin by describing the task

Start by giving an overview of the task. Make it more comprehensive than the simple instruction. Include the activity, {
break_line}the objects the robotic arm interacts with, and their relative locations in the environment. Then, describe {
break_line}the high-level movements that were most likely executed, based on the task that was completed and the {
break_line}primitive movements that were executed. Then, for each high-level movement write the interval of steps that {
break_line}movement consists of. Also, for each high-level movement write a justification for why it should be {
break_line}executed. Write an answer for this part using markdown and natural language. Be descriptive and highlight {
break_line}all the relevant details, but ensure that your description is consistent with the trajectory that was {
break_line}executed, specified by the features listed above in the `trajectory_features` dictionary.

### List the reasonings for each step

Finally, for each step describe the reasoning that allows to determine the correct action. For each step describe the {
break_line}remaining part of the objective, the current progress, the objects that are still relevant for determining {
break_line}the plan, and the plan for the next steps, based on the available features. Start the reasoning from a high {
break_line}level and gradually add finer features. I need you to be descriptive and very precise. Ensure that the {
break_line}reasoning is consistent with the task and the executed trajectory. Write the answer for this part as a {
break_line}Python-executable dictionary. For every step in the initial trajectory there should be exactly one separate {
break_line}item of the form <step id>:<reasoning>. Do not group the answers. The final dictionary should have exactly {
break_line}the same set of integer keys as the dictionary of features provided in the `trajectory_features` dictionary {
break_line}above. The reasoning should be a single string that describes the reasoning in natural language and {
break_line}includes all the required features.

Each reasoning string should have the following form:
- Describe the full task that remains to be completed (but only describe what remains), and place it inside a {
break_line}tag <task>.
- Describe the complete high-level plan for completing the remaining task (the list of remaining high-level steps), {
break_line}and place it inside a tag <plan>.
- Describe the high-level step that should be executed now (chosen from the list of high-level steps), and place it {
break_line}inside a tag <subtask>.
- Describe why the chosen high-level step should be executed now, which features of the current environment influence {
break_line}that decision, and how it should be done. Place it within a tag <subtask_reason>.
- Describe the current primitive movement of the arm that needs to be executed, and place it inside a tag <move>.
- Describe why the chosen movement should be executed now and which features of the current environment influence that {
break_line}decision. Place it inside a tag <move_reason>.

## Task summary

Here is a breakdown of what needs to be done:

- Describe the task.
- Describe the high-level movements that were executed, based on the completed task and the listed features.
- Describe the plan for the solution that allowed the robot to complete the task successfully.
- For each step on the trajectory, describe the reasoning that leads to determining the correct action. The reasoning {
break_line}should be descriptive and precise. You should provide exactly one reasoning string for each step on the {
break_line}trajectory specified by `trajectory_features`.
- At the very end of the response, write a single label FINISHED to indicate that the answer is complete."""


In [6]:
def build_prompt2(features, language_instruction, caption=None, list_only_moves=False):
    """Render the trajectory-annotation prompt, allowing 'same' as a per-step answer.

    The prompt text mirrors `build_prompt` except that the per-step instructions let
    the model write 'same' when a step's reasoning equals the previous step's.

    Args:
        features: Mapping from feature name to a list holding one value per step.
            The number of steps is taken from the list stored under the first key.
            Must contain "move_primitive" when `list_only_moves` is true.
        language_instruction: Task instruction quoted verbatim in the prompt.
        caption: Optional scene description. When given, a "Scene description"
            section is inserted before the objective.
        list_only_moves: When true, each step lists only its move primitive;
            otherwise every feature is listed per step.

    Returns:
        The complete prompt as a single string.
    """
    keys = list(features.keys())

    # Collect the rendered pieces and join them once instead of re-growing a string.
    feature_lines = ["{\n"]

    for i in range(len(features[keys[0]])):
        if list_only_moves:
            move = features["move_primitive"][i]
            # The trailing comma keeps the rendered block a valid Python dict literal,
            # matching the comma-terminated entries of the full-feature branch below.
            feature_lines.append(f'    {i}: "{move}",\n')
        else:
            feature_lines.append(f"    {i}: {{\n")

            for key in keys:
                feature_value = features[key][i]
                if isinstance(feature_value, str):
                    feature_value = f'"{feature_value}"'

                feature_lines.append(f'        "{key}": {feature_value},\n')

            feature_lines.append("    },\n")

    feature_lines.append("}")
    structured_features = "".join(feature_lines)

    if list_only_moves:
        features_desc = (
            "Each entry in that dictionary corresponds to a single step on the "
            "trajectory and describes the move that is about to be executed."
        )
    else:
        features_desc = (
            "Each entry in that dictionary corresponds to a single step on "
            "the trajectory. The provided features are the following:\n"
            "\n"
            '- "state_3d" are the current 3d coordinates of the robotic arm end effector; '
            "moving forward increases the first coordinate; moving left increases the second "
            "coordinate; moving up increases the third coordinate,\n"
            '- "move_primitive" describes the move that is about to be executed,\n'
            '- "gripper_position" denotes the location of the gripper in the 256x256 image observation'
        )

    if caption is None:
        caption = ""
    else:
        caption = f"""## Scene description

The robot is operating in the following environment. {caption}

"""

    # `break_line` expands to nothing: it only lets a paragraph be wrapped across source
    # lines (the newline sits inside the replacement field) without adding newlines to
    # the prompt text.
    break_line = ""

    return f"""# Annotate the training trajectory with reasoning

## Specification of the experimental setup

You're an expert reinforcement learning researcher. You've trained an optimal policy for controlling a robotic arm. The
robot successfully completed a task specified by the instruction: "{language_instruction}". For that purpose, the
robotic arm executed a sequence of actions. Consecutive moves that were executed are the following:


```python
trajectory_features = {structured_features}
```

{features_desc}

{caption}## Your objective

I want you to annotate the given trajectory with reasoning. That is, for each step, I need to know not only {
break_line}which action should be chosen, but importantly what reasoning justifies that action choice. I want you to {
break_line}be descriptive and include all the relevant information available. The reasoning should include the task {
break_line}to complete, the remaining high-level steps, the high-level movements that should be executed and why they {
break_line}are required, the premises that allow inferring the direction of each move, including the locations of {
break_line}relevant objects, possible obstacles or difficulties to avoid, and any other relevant justification.

### Begin by describing the task

Start by giving an overview of the task. Make it more comprehensive than the simple instruction. Include the activity, {
break_line}the objects the robotic arm interacts with, and their relative locations in the environment. Then, describe {
break_line}the high-level movements that were most likely executed, based on the task that was completed and the {
break_line}primitive movements that were executed. Then, for each high-level movement write the interval of steps that {
break_line}movement consists of. Also, for each high-level movement write a justification for why it should be {
break_line}executed. Write an answer for this part using markdown and natural language. Be descriptive and highlight {
break_line}all the relevant details, but ensure that your description is consistent with the trajectory that was {
break_line}executed, specified by the features listed above in the `trajectory_features` dictionary.

### List the reasonings for each step

Finally, for each step describe the reasoning that allows to determine the correct action. For each step describe the {
break_line}remaining part of the objective, the current progress, the objects that are still relevant for determining {
break_line}the plan, and the plan for the next steps, based on the available features. Start the reasoning from a high {
break_line}level and gradually add finer features. I need you to be descriptive and very precise. Ensure that the {
break_line}reasoning is consistent with the task and the executed trajectory. Write the answer for this part as a {
break_line}Python-executable dictionary. For every step in the initial trajectory there should be exactly one separate {
break_line}item of the form <step id>:<reasoning>. <reasoning> can be set to 'same' if it is the same as last step. The final dictionary should have exactly {
break_line}the same set of integer keys as the dictionary of features provided in the `trajectory_features` dictionary {
break_line}above. The reasoning should be a single string that describes the reasoning in natural language and {
break_line}includes all the required features.

Each reasoning string should have the following form:
- Describe the full task that remains to be completed (but only describe what remains), and place it inside a {
break_line}tag <task>.
- Describe the complete high-level plan for completing the remaining task (the list of remaining high-level steps), {
break_line}and place it inside a tag <plan>.
- Describe the high-level step that should be executed now (chosen from the list of high-level steps), and place it {
break_line}inside a tag <subtask>.
- Describe why the chosen high-level step should be executed now, which features of the current environment influence {
break_line}that decision, and how it should be done. Place it within a tag <subtask_reason>.
- Describe the current primitive movement of the arm that needs to be executed, and place it inside a tag <move>.
- Describe why the chosen movement should be executed now and which features of the current environment influence that {
break_line}decision. Place it inside a tag <move_reason>.

## Task summary

Here is a breakdown of what needs to be done:

- Describe the task.
- Describe the high-level movements that were executed, based on the completed task and the listed features.
- Describe the plan for the solution that allowed the robot to complete the task successfully.
- For each step on the trajectory, describe the reasoning that leads to determining the correct action. Reasoning can be set to 'same' if it is the same as last step.
- At the very end of the response, write a single label FINISHED to indicate that the answer is complete."""

In [7]:
reasonings = {}
# Scene captions used in the prompt; a later cell indexes this by file path and episode id.
# Read explicitly as UTF-8 so the result does not depend on the machine's locale encoding.
with open("/home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/bounding_boxes/descriptions/full_descriptions.json", "r", encoding="utf-8") as captions_file:
    captions_dict = json.load(captions_file)

In [8]:
# Load the whole 'train' split; the slice bounds are spelled out as 0%..100%.
ds = builder.as_dataset(split="train[0%:100%]")

In [None]:
# Rebinds `ds` to the slice train[16:17].
# NOTE(review): this cell has no execution count. If it is run, `ds` presumably holds a single
# episode, and the `i == 16` test in the selection loop that follows can never succeed —
# confirm which of the two `ds` definitions is intended.
ds = builder.as_dataset(f'train[16:17]')

In [10]:
# Position (within `ds`) of the single episode to annotate.
target_index = 16
for i, episode in enumerate(ds):
    if i == target_index:
        # The enumeration index is used as the id; the dataset's own
        # episode["episode_metadata"]["episode_id"] is not used here.
        episode_id = i
        file_path = episode["episode_metadata"]["file_path"].numpy().decode()
        break
else:
    # Fail loudly: otherwise `episode` would stay bound to the last element of `ds`
    # and `episode_id` would be unset (or stale from an earlier run of this cell).
    raise IndexError(f"dataset has no episode at index {target_index}")

In [11]:
# Per-step features of the selected episode. "state_3d" is inserted first, so it is the key
# whose list length defines the number of steps when the prompt is built.
ft = {
    # First three components of each step's observation state.
    "state_3d": [list(step["observation"]["state"][:3].numpy()) for step in episode["steps"]],
}
move_primitives = get_move_primitives_episode(episode)
# Keep only the first element of each returned item.
ft["move_primitive"] = [primitive[0] for primitive in move_primitives]

In [12]:
# Metadata for the selected episode.
first_step = next(iter(episode["steps"]))
mt = {
    "episode_id": str(episode_id),
    # Decode the bytes value instead of slicing its repr (str(b"...")[2:-1]), which would
    # leave escape sequences in the path for any non-ASCII or escaped byte.
    "file_path": episode["episode_metadata"]["file_path"].numpy().decode(),
    "n_steps": len(episode["steps"]),
    # The instruction is read from the first step of the episode.
    "language_instruction": first_step["language_instruction"].numpy().decode(),
}

In [13]:
# Attach the scene caption, looked up by file path and then by episode id (a string).
# NOTE(review): raises KeyError when the episode has no caption entry, even though the
# prompt-building cell treats "caption" as optional.
mt["caption"] = captions_dict[mt["file_path"]][mt["episode_id"]]["caption"]

In [26]:
language_instruction = mt["language_instruction"]
# The caption is optional: None when the metadata carries no "caption" entry.
caption = mt.get("caption")

# build_prompt2 lets the model answer 'same' for a repeated step, which saves output tokens;
# build_prompt(ft, language_instruction, caption=caption, list_only_moves=True) is the
# variant without that shortcut.
prompt = build_prompt2(ft, language_instruction, caption=caption, list_only_moves=True)
print("metadata:", mt, "\nprompt:", prompt)


metadata: {'episode_id': '1', 'file_path': '/iris/u/moojink/prismatic-dev/LIBERO/libero/datasets/regenerated--no_noops/libero_spatial/pick_up_the_black_bowl_in_the_top_drawer_of_the_wooden_cabinet_and_place_it_on_the_plate_demo.hdf5', 'n_steps': 138, 'language_instruction': 'pick up the black bowl in the top drawer of the wooden cabinet and place it on the plate', 'caption': 'A silver robot arm is picking up a black bowl from the top drawer of a wooden cabinet, which has a white plate on the countertop nearby.'} 
prompt: # Annotate the training trajectory with reasoning

## Specification of the experimental setup

You're an expert reinforcement learning researcher. You've trained an optimal policy for controlling a robotic arm. The
robot successfully completed a task specified by the instruction: "pick up the black bowl in the top drawer of the wooden cabinet and place it on the plate". For that purpose, the
robotic arm executed a sequence of actions. Consecutive moves that were execut

In [27]:
from call_llm import call_qianwen
from utils import parse_trajectory_file, replace_same_content, save_trajectory_file

In [28]:
# Send the prompt to the LLM helper. Per the captured output, the helper reports saving a
# prompt file and an answer file named after the second argument — TODO confirm in call_llm.
call_qianwen(prompt, episode_id)

/home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt/1.txt
prompt file save to: /home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt/1.txt
/home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt_answer/1.txt
answer file save to: /home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt_answer/1.txt


In [16]:
# Raw LLM answer for this episode, and the file that receives the expanded version.
input_file_path = f'/home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt_answer/{episode_id}.txt'
output_file_path = f'/home/lixiang/codebase/embodied-CoT/scripts/generate_embodied_data/prompt_answer/{episode_id}_new.txt'

# Parse the answer, replace the "same" entries, then save the result to the new file.
trajectory_data = parse_trajectory_file(input_file_path)
updated_trajectory_data = replace_same_content(trajectory_data)
save_trajectory_file(updated_trajectory_data, output_file_path)


In [17]:
# Read back the expanded answer file as one string and show it.
with open(output_file_path) as answer_file:
    reasoning_output = answer_file.read()
print(reasoning_output)

0: <task>Pick up the black bowl next to the cookie box and place it on the plate.</task><plan>1. Move to the black bowl, 2. Pick up the black bowl, 3. Move to the plate, 4. Place the black bowl on the plate.</plan><subtask>Move to the black bowl</subtask><subtask_reason>The robot needs to approach the black bowl to be able to pick it up. It starts by stopping to ensure it is ready to move.</subtask_reason><move>stop</move><move_reason>The robot is initially stationary and needs to prepare to start moving.</move_reason>
1: <task>Pick up the black bowl next to the cookie box and place it on the plate.</task><plan>1. Move to the black bowl, 2. Pick up the black bowl, 3. Move to the plate, 4. Place the black bowl on the plate.</plan><subtask>Move to the black bowl</subtask><subtask_reason>The robot needs to approach the black bowl to be able to pick it up. It starts by stopping to ensure it is ready to move.</subtask_reason><move>stop</move><move_reason>The robot is initially stationary an

In [18]:
import re
def find_task_occurrences(input_string, tags):
    """Find every step record made of a step id followed by the given tags in order.

    A record is a run of digits and a colon, then one `<tag>text</tag>` element per
    entry of `tags`, each optionally preceded by whitespace. The text of an element
    may not contain the '<' character.

    Args:
        input_string: Text to search.
        tags: Tag names, in the order they must appear.

    Returns:
        The result of `re.findall`: for a non-empty `tags`, a list of tuples holding
        the step id followed by the text of each tag (all as strings).
    """
    tag_patterns = [r"\s*<" + tag + r">([^<]*)<\/" + tag + ">" for tag in tags]
    pattern = r"(\d+):" + "".join(tag_patterns)
    return re.findall(pattern, input_string)

tags = ("task", "plan", "subtask", "subtask_reason", "move", "move_reason")

matches = find_task_occurrences(reasoning_output, tags)

# Map each integer step id to a {tag: text} dict; a later record with the same id wins.
trajectory = {int(step_id): dict(zip(tags, texts)) for step_id, *texts in matches}

In [19]:
# Sanity check of the parse result.
# NOTE(review): steps whose text does not match the tag regex are silently absent from
# `trajectory`; compare len(trajectory) with mt["n_steps"] before relying on it.
print('len(trajectory)', len(trajectory), 'trajectory[0]', trajectory[0])

len(trajectory) 110 trajectory[0] {'task': 'Pick up the black bowl next to the cookie box and place it on the plate.', 'plan': '1. Move to the black bowl, 2. Pick up the black bowl, 3. Move to the plate, 4. Place the black bowl on the plate.', 'subtask': 'Move to the black bowl', 'subtask_reason': 'The robot needs to approach the black bowl to be able to pick it up. It starts by stopping to ensure it is ready to move.', 'move': 'stop', 'move_reason': 'The robot is initially stationary and needs to prepare to start moving.'}


In [20]:
# Bundle the parsed reasoning with the features and metadata of the episode under the
# keys "reasoning", "features" and "metadata".
reasoning = trajectory
entry = {"reasoning": reasoning, "features": ft, "metadata": mt}

In [1]:
import json
# Existing reasoning dataset, loaded for inspection in the cells below.
reasoning_dataset_path = "/home/lixiang/codebase/embodied-CoT/datasets/embodied_features_bridge/embodied_features_bridge.json"
# Read explicitly as UTF-8 so the result does not depend on the machine's locale encoding.
with open(reasoning_dataset_path, "r", encoding="utf-8") as f:
    reasoning_dataset = json.load(f)

In [3]:
# Number of top-level keys in the dataset.
len(reasoning_dataset)

3200

In [4]:
# Take the first top-level key of the dataset as a sample to inspect.
file_name = next(iter(reasoning_dataset))
file_name

'/nfs/kun2/users/homer/datasets/bridge_data_all/numpy_256/bridge_data_v2/deepthought_folding_table/stack_blocks/19/train/out.npy'

In [5]:
# Take the first episode key stored under the sample file.
# (The variable name is kept as-is because the following cells read it.)
epsoid_id = next(iter(reasoning_dataset[file_name]))
epsoid_id

'43'

In [8]:
# Sections stored for a single episode entry.
reasoning_dataset[file_name][epsoid_id].keys()

dict_keys(['features', 'metadata', 'reasoning'])

In [10]:
# Feature names stored for the sample episode.
reasoning_dataset[file_name][epsoid_id]["features"].keys()

dict_keys(['move_primitive', 'gripper_position', 'bboxes'])

In [11]:
# Metadata fields stored for the sample episode.
reasoning_dataset[file_name][epsoid_id]["metadata"].keys()

dict_keys(['episode_id', 'file_path', 'n_steps', 'language_instruction'])

In [12]:
# Keys of the per-step reasoning dict (the captured output shows step ids stored as strings).
reasoning_dataset[file_name][epsoid_id]["reasoning"].keys()

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39'])

In [14]:
# Reasoning record stored under step key '0'.
reasoning_dataset[file_name][epsoid_id]["reasoning"]['0']

{'task': 'Move the wooden arch onto the table.',
 'plan': 'Reach for the wooden arch. Grasp the wooden arch. Move the wooden arch to the table. Drop the wooden arch onto the table.',
 'subtask': 'Reach for the wooden arch.',
 'subtask_reason': 'The wooden arch is the object that needs to be moved, so the first step is to reach for it.',
 'move': 'stop',
 'move_reason': 'The arm is already in a good position to reach for the wooden arch.'}

In [15]:
# Full metadata record of the sample episode.
reasoning_dataset[file_name][epsoid_id]["metadata"]

{'episode_id': 43,
 'file_path': '/nfs/kun2/users/homer/datasets/bridge_data_all/numpy_256/bridge_data_v2/deepthought_folding_table/stack_blocks/19/train/out.npy',
 'n_steps': 40,
 'language_instruction': 'move the wooden arch to the table'}

In [20]:
# Move primitive stored at index 0.
reasoning_dataset[file_name][epsoid_id]["features"]['move_primitive'][0]

'stop'

In [21]:
# Gripper position stored at index 0 (a two-element list in the captured output).
reasoning_dataset[file_name][epsoid_id]["features"]['gripper_position'][0]

[97, 45]

In [22]:
# Bounding-box entries stored at index 0.
# Each entry looks like [score, label, box] in the captured output — TODO confirm the schema.
reasoning_dataset[file_name][epsoid_id]["features"]['bboxes'][0]

[[0.3405100107192993, 'wooden blocks', [150, 4, 188, 100]]]

In [19]:
# Number of stored move primitives; presumably equal to the metadata's n_steps — TODO confirm.
len(reasoning_dataset[file_name][epsoid_id]["features"]['move_primitive'])

40