In [9]:
from absl import app, flags, logging
import flax
import jax
import optax
import tensorflow as tf
import numpy as np
import tqdm
import wandb

from octo.data.dataset import make_single_dataset
from octo.data.utils.data_utils import NormalizationType
from octo.model.components.action_heads import L1ActionHead
from octo.model.components.tokenizers import LowdimObsTokenizer
from octo.model.octo_model import OctoModel
from octo.utils.jax_utils import initialize_compilation_cache
from octo.utils.spec import ModuleSpec
from octo.utils.train_utils import ( freeze_weights, merge_params, process_text, TrainState )
import tensorflow_datasets as tfds


In [6]:
from octo.model.octo_model import OctoModel

model = OctoModel.load_pretrained("hf://rail-berkeley/octo-base")

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 59283.45it/s]


In [10]:
print(model.dataset_statistics.keys())

# dict_keys(['austin_buds_dataset_converted_externally_to_rlds', 'austin_sailor_dataset_converted_externally_to_rlds', 'austin_sirius_dataset_converted_externally_to_rlds', 'bc_z', 
#            'berkeley_autolab_ur5', 'berkeley_cable_routing', 'berkeley_fanuc_manipulation', 'bridge_dataset', 'cmu_stretch', 'dlr_edan_shared_control_converted_externally_to_rlds', 
#            'fractal20220817_data', 'furniture_bench_dataset_converted_externally_to_rlds', 'iamlab_cmu_pickup_insert_converted_externally_to_rlds', 'jaco_play', 'kuka', 'language_table', 
#            'nyu_door_opening_surprising_effectiveness', 'nyu_franka_play_dataset_converted_externally_to_rlds', 'roboturk', 'stanford_hydra_dataset_converted_externally_to_rlds', 'taco_play', 
#            'toto', 'ucsd_kitchen_dataset_converted_externally_to_rlds', 'utaustin_mutex', 'viola'])

dict_keys(['austin_buds_dataset_converted_externally_to_rlds', 'austin_sailor_dataset_converted_externally_to_rlds', 'austin_sirius_dataset_converted_externally_to_rlds', 'bc_z', 'berkeley_autolab_ur5', 'berkeley_cable_routing', 'berkeley_fanuc_manipulation', 'bridge_dataset', 'cmu_stretch', 'dlr_edan_shared_control_converted_externally_to_rlds', 'fractal20220817_data', 'furniture_bench_dataset_converted_externally_to_rlds', 'iamlab_cmu_pickup_insert_converted_externally_to_rlds', 'jaco_play', 'kuka', 'language_table', 'nyu_door_opening_surprising_effectiveness', 'nyu_franka_play_dataset_converted_externally_to_rlds', 'roboturk', 'stanford_hydra_dataset_converted_externally_to_rlds', 'taco_play', 'toto', 'ucsd_kitchen_dataset_converted_externally_to_rlds', 'utaustin_mutex', 'viola'])


In [11]:
# create RLDS dataset builder
builder = tfds.builder_from_directory(builder_dir='gs://gresearch/robotics/iamlab_cmu_pickup_insert_converted_externally_to_rlds/0.1.0/')
ds = builder.as_dataset(split='train[:1]')

# # sample episode + resize to 256x256 (default third-person cam resolution)
# episode = next(iter(ds))
# steps = list(episode['steps'])
# images = [cv2.resize(np.array(step['observation']['image']), (256, 256)) for step in steps]

# # extract goal image & language instruction
# goal_image = images[-1]
# language_instruction = steps[0]['observation']['natural_language_instruction'].numpy().decode()

# # visualize episode
# print(f'Instruction: {language_instruction}')
# media.show_video(images, fps=10)

2024-05-09 12:35:54.937889: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".
2024-05-09 12:35:57.206003: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
import mediapy as media
import cv2
# sample episode + resize to 256x256 (default third-person cam resolution)
episode = next(iter(ds))
steps = list(episode['steps'])
images = [cv2.resize(np.array(step['observation']['image']), (256, 256)) for step in steps]

In [14]:
a = steps[0]

In [15]:
a.keys()

dict_keys(['action', 'discount', 'is_first', 'is_last', 'is_terminal', 'language_embedding', 'language_instruction', 'observation', 'reward'])

In [None]:
# extract goal image & language instruction
goal_image = images[-1]
language_instruction = steps[0]['observation']['language_instruction'].numpy().decode()

# visualize episode
print(f'Instruction: {language_instruction}')
media.show_video(images, fps=10)

In [41]:
batch_size = 1
horizon = 2

# set up the model:
pretrained_path = "hf://rail-berkeley/octo-base"
initialize_compilation_cache()
pretrained_model = OctoModel.load_pretrained(pretrained_path)
print(pretrained_model.get_pretty_spec())
config = pretrained_model.config
del config["model"]["observation_tokenizers"]["wrist"]
text_processor = pretrained_model.text_processor
text = "pick up the ball and throw it to the big red dog, then slap the ball with the bat."
batch = {"task": {"language_instruction": [text.encode('utf-8')]},
        "observation": {"image_primary": np.random.uniform(0, 256, size=(batch_size, horizon, 256, 256, 3)).astype(np.int8),
                        "pad_mask": np.ones((batch_size, horizon)).astype(np.float32)}}
example_batch = process_text(batch, text_processor)
config["model"]["heads"]["action"] = ModuleSpec.create(L1ActionHead, pred_horizon=horizon, action_dim=7, readout_key="readout_action")
model = OctoModel.from_config(config, example_batch, text_processor, verbose=True)
merged_params = merge_params(model.params, pretrained_model.params)
model = model.replace(params=merged_params)
print(model.get_pretty_spec())
del pretrained_model


Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 79324.90it/s]



This model is trained with a window size of 2, predicting 7 dimensional actions 4 steps into the future.
Observations and tasks conform to the following spec:

Observations: {
    image_primary: ('batch', 'history_window', 256, 256, 3),
    image_wrist: ('batch', 'history_window', 128, 128, 3),
}
Tasks: {
    image_primary: ('batch', 256, 256, 3),
    image_wrist: ('batch', 128, 128, 3),
    language_instruction: {
        attention_mask: ('batch', 16),
        input_ids: ('batch', 16),
    },
}

At inference, you may pass in any subset of these observation and task keys, with a history window up to 2 timesteps.



    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})
    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    obs_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})
    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    obs_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    readout_action: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})



[3m                               OctoModule Summary                               [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃[1m [0m[1mpath         [0m[1m [0m┃[1m [0m[1mmodule       [0m[1m [0m┃[1m [0m[1minputs       [0m[1m [0m┃[1m [0m[1moutputs      [0m[1m [0m┃[1m [0m[1mparams      [0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│               │ OctoModule    │ -             │ - obs:        │              │
│               │               │ image_primar… │     mask:     │              │
│               │               │ [2mint8[0m[1,2,256… │ [2mbool[0m[1,2,256] │              │
│               │               │   pad_mask:   │     tokens:   │              │
│               │               │ [2mfloat32[0m[1,2]  │ [2mfloat32[0m[1,2,… │              │
│               │               │ -             │   obs_primar… │              │
│               │            




This model is trained with a window size of 2, predicting 7 dimensional actions 2 steps into the future.
Observations and tasks conform to the following spec:

Observations: {
    image_primary: ('batch', 'history_window', 256, 256, 3),
}
Tasks: {
    language_instruction: {
        attention_mask: ('batch', 16),
        input_ids: ('batch', 16),
    },
}

At inference, you may pass in any subset of these observation and task keys, with a history window up to 2 timesteps.



In [47]:
model.

OctoModel(module=OctoModule(
    # attributes
    octo_transformer = OctoTransformer(
        # attributes
        observation_tokenizers = {'primary': ImageTokenizer(
            # attributes
            encoder = {'module': 'octo.model.components.vit_encoders', 'name': 'SmallStem16', 'args': [], 'kwargs': {}}
            use_token_learner = False
            num_tokens = 8
            conditioning_type = 'none'
            obs_stack_keys = ['image_primary']
            task_stack_keys = ['image_primary']
            task_film_keys = ()
            proper_pad_mask = True
        )}
        task_tokenizers = {'language': LanguageTokenizer(
            # attributes
            encoder = 't5-base'
            finetune_encoder = False
            proper_pad_mask = True
        )}
        readouts = {'action': 1}
        transformer_kwargs = {'attention_dropout_rate': 0.0, 'add_position_embedding': False, 'num_layers': 12, 'mlp_dim': 3072, 'num_attention_heads': 12, 'dropout_rate': 0.0}
  

In [33]:
batch_size = 1
horizon = 2

def process_batch(batch):
    batch = process_text(batch, text_processor)
    return batch

pretrained_path = "hf://rail-berkeley/octo-base"

initialize_compilation_cache()

tf.config.set_visible_devices([], "GPU")

pretrained_model = OctoModel.load_pretrained(pretrained_path)
print(pretrained_model.get_pretty_spec())

config = pretrained_model.config
del config["model"]["observation_tokenizers"]["wrist"]

text_processor = pretrained_model.text_processor

text = "pick up the ball and throw it to the big red dog, then slap the ball with the bat."
batch = {
        "task": {"language_instruction": [text.encode('utf-8')]},
        "observation": {
                        "proprio": np.random.uniform(-2.0, 2.0, size=(batch_size, horizon, 14)).astype(np.float32),
                        "image": np.random.uniform(0, 256, size=(batch_size, horizon, 256, 256, 3)).astype(np.int8),
                        "pad_mask": np.ones((batch_size, horizon)).astype(np.float32)
                        }
    }
example_batch = process_batch(batch)

config["model"]["observation_tokenizers"]["proprio"] = ModuleSpec.create(LowdimObsTokenizer, n_bins=256, bin_type="normal", low=-2.0, high=2.0, obs_keys=["proprio"])
config["model"]["heads"]["action"] = ModuleSpec.create(L1ActionHead, pred_horizon=horizon, action_dim=14, readout_key="readout_action")
model = OctoModel.from_config(config, example_batch, text_processor, verbose=True)

merged_params = merge_params(model.params, pretrained_model.params)
model = model.replace(params=merged_params)
del pretrained_model

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 20497.51it/s]



This model is trained with a window size of 2, predicting 7 dimensional actions 4 steps into the future.
Observations and tasks conform to the following spec:

Observations: {
    image_primary: ('batch', 'history_window', 256, 256, 3),
    image_wrist: ('batch', 'history_window', 128, 128, 3),
}
Tasks: {
    image_primary: ('batch', 256, 256, 3),
    image_wrist: ('batch', 128, 128, 3),
    language_instruction: {
        attention_mask: ('batch', 16),
        input_ids: ('batch', 16),
    },
}

At inference, you may pass in any subset of these observation and task keys, with a history window up to 2 timesteps.



    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})
    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    obs_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})
    task_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    obs_*: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
    readout_action: <AttentionRule.CAUSAL: 'other.timestep <= self.timestep'>,
})



[3m                               OctoModule Summary                               [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃[1m [0m[1mpath         [0m[1m [0m┃[1m [0m[1mmodule       [0m[1m [0m┃[1m [0m[1minputs       [0m[1m [0m┃[1m [0m[1moutputs      [0m[1m [0m┃[1m [0m[1mparams      [0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│               │ OctoModule    │ - image:      │ - obs:        │              │
│               │               │ [2mint8[0m[1,2,256… │     mask:     │              │
│               │               │   pad_mask:   │ [2mbool[0m[1,2,14]  │              │
│               │               │ [2mfloat32[0m[1,2]  │     tokens:   │              │
│               │               │   proprio:    │ [2mfloat32[0m[1,2,… │              │
│               │               │ [2mfloat32[0m[1,2,… │   obs_propri… │              │
│               │    



In [39]:
# run the model once and get an output
# language_instruction = batch["task"]["language_instruction"]
task = model.create_tasks(texts="please pick up the ball and place on the red matt")



obs = {
        "proprio": np.random.uniform(-2.0, 2.0, size=(batch_size, horizon, 14)).astype(np.float32),
        "image": np.random.uniform(0, 256, size=(batch_size, horizon, 256, 256, 3)).astype(np.int8),
        "pad_mask": np.ones((batch_size, horizon)).astype(np.float32)
    }

actions = model.sample_actions(jax.tree_map(lambda x: x[None], obs), task, rng=jax.random.PRNGKey(0))
actions = actions[0]


ERROR:root:'observations' contains mismatched shapes compared to example_batch: {
    image: '(1, 1, 2, 256, 256, 3) != (1, 2, 256, 256, 3)',
    pad_mask: '(1, 1, 2) != (1, 2)',
    proprio: '(1, 1, 2, 14) != (1, 2, 14)',
}


AssertionError: observations does not match example batch.

In [None]:
pretrained_path = "hf://rail-berkeley/octo-base"

initialize_compilation_cache()
# prevent tensorflow from using GPU memory since it's only used for data loading
tf.config.set_visible_devices([], "GPU")

# load pre-trained model
logging.info("Loading pre-trained model...")
pretrained_model = OctoModel.load_pretrained(pretrained_path)

In [None]:
def print_nested(val, nesting = -5): 
	if type(val) == dict: 
		print('') 
		nesting += 5 
		for k in val: 
			print(nesting * ' ', end='') 
			print(k, end=':') 
			print_nested(val[k],nesting) 
	else: 
		print(val)

print_nested(pretrained_model.config)
print(pretrained_model.get_pretty_spec())