In [None]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback


# Roll out a saved TQC checkpoint in the GUI simulator, printing the reward of
# every step.
NUM_RENDER_STEPS = int(1e4)

model = TQC.load("data/tqc1/tqc_model_873000_steps")
env = roboverse.make("Widow250PickPlace-v2",
                     gui=True,
                     observation_mode="pixels",
                     transpose_image=False)
model.set_env(env)
# From here on, step the environment as the model holds it (the loop below
# unpacks the 4-tuple that this env's step() returns).
env = model.get_env()

try:
    obs = env.reset()
    print("start render")
    for i in range(NUM_RENDER_STEPS):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        print(rewards)
        env.render("human")
finally:
    # Release the simulator even when the rollout is interrupted or raises.
    env.close()

In [3]:
import os
# Restrict CUDA to a single device: order devices by PCI bus id and expose
# only the one with index 2.
# NOTE(review): these variables presumably need to be set before CUDA is first
# initialised in this kernel to take effect - confirm the cell order.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

In [1]:
import time
import numpy as np
from roboverse.policies import policies
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id: str, rank: int, seed: int = 0):
    """
    Build a factory for one headless, pixel-observation roboverse environment.

    :param env_id: the environment ID forwarded to ``roboverse.make``
    :param rank: index of the subprocess (currently unused, because the
        per-rank seeded reset is disabled)
    :param seed: the initial seed for RNG, passed to ``set_random_seed`` when
        this function is called
    :return: a zero-argument callable that creates, resets and returns the env
    """
    # Seeding happens once, when the factory is requested - not inside the
    # returned callable.
    set_random_seed(seed)

    def _init():
        env_kwargs = dict(gui=False, observation_mode="pixels", transpose_image=False)
        created_env = roboverse.make(env_id, **env_kwargs)
        # Disabled alternatives kept for reference:
        #   created_env = TimeFeatureWrapper(created_env)
        #   created_env.reset(seed=seed + rank)
        created_env.reset()
        return created_env

    return _init


def collect_data(env, model, policy, target, num_trajectories=100, num_timesteps=30,
                 noise=0.1, success_return=70, min_success_step=23, reset_pause=0.1):
    """
    Roll out a noisy scripted policy and store every transition in
    ``model.replay_buffer``.

    Each trajectory ends when the environment or the scripted policy reports
    done, when the trajectory counts as successful and has run past
    ``min_success_step``, or when ``num_timesteps`` steps were taken. The last
    stored transition of every trajectory is flagged as done so that an
    episode-aware buffer (e.g. ``HerReplayBuffer``) does not merge consecutive
    trajectories into one episode. When the environment itself did not report
    done, the transition is additionally marked ``TimeLimit.truncated``.

    :param env: environment providing ``reset``, ``get_observation``,
        ``action_space`` and a ``step`` that returns
        ``(observation, reward, done, info)``; observations are dicts with an
        ``"image"`` entry
    :param model: object whose ``replay_buffer.add`` receives the transitions
    :param policy: key into ``policies`` selecting the scripted policy class
    :param target: unused; kept for backward compatibility (it appears to have
        been the ``info`` key that used to decide trajectory acceptance)
    :param num_trajectories: number of trajectories to roll out
    :param num_timesteps: maximum number of steps per trajectory
    :param noise: standard deviation of the Gaussian noise added to each action
    :param success_return: a trajectory is successful once its summed reward
        exceeds this value
    :param min_success_step: a successful trajectory is cut off once the step
        index exceeds this value
    :param reset_pause: seconds to sleep after every reset
    :return: None; progress and the final success rate are printed
    """
    scripted_policy = policies[policy](env)
    env_action_dim = env.action_space.shape[0]
    # Margin that keeps the noisy actions strictly inside (-1, 1).
    EPSILON = 0.1
    num_success = 0
    num_saved = 0

    while num_saved < num_trajectories:
        num_saved += 1
        rewards = []
        env.reset()
        scripted_policy.reset()
        time.sleep(reset_pause)
        success = False
        for j in range(num_timesteps):
            action, agent_info = scripted_policy.get_action()

            # Out-of-place add, so the array handed out by the policy is never
            # modified behind its back.
            action = action + np.random.normal(scale=noise, size=(env_action_dim,))
            action = np.clip(action, -1 + EPSILON, 1 - EPSILON)

            # Move the last image axis to the front (presumably HWC -> CHW, to
            # match the channel-first layout of the model's wrapped env).
            observation = env.get_observation()
            observation["image"] = np.transpose(observation["image"], (2, 0, 1))
            next_observation, reward, done, info = env.step(action)
            next_observation["image"] = np.transpose(next_observation["image"], (2, 0, 1))
            rewards.append(reward)
            success = sum(rewards) > success_return

            # Decide *before* storing whether this is the trajectory's last
            # transition, so the buffer sees the episode boundary.
            episode_over = bool(
                done
                or agent_info['done']
                or (success and j > min_success_step)
                or j == num_timesteps - 1
            )
            model.replay_buffer.add(
                observation,
                next_observation,
                action,
                reward,
                np.array([episode_over]),
                # Cut off by the collector rather than terminated by the env.
                [{"TimeLimit.truncated": episode_over and not done}],
            )
            if episode_over:
                break

        if success:
            num_success += 1
        if num_saved % 100 == 0:
            print(f"num_trajectories: {num_saved} success rate: {num_success/num_saved} Reward: {sum(rewards)}")

    # Guard against a request for zero trajectories.
    success_rate = num_success / num_saved if num_saved else 0.0
    print("success rate: {}".format(success_rate))


pybullet build time: Oct 14 2023 15:44:17


In [2]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback


from stable_baselines3.common.utils import set_random_seed

# Seed the Python, NumPy and PyTorch RNGs before the environment and the
# networks are created, so the run stored under data/seed_<seed>/ really
# depends on `seed` instead of only being labelled with it.
seed = 1
set_random_seed(seed)

env = roboverse.make("Widow250PickPlace-v2",
                         gui=False,
                         observation_mode="pixels",
                         transpose_image=False)
#env = TimeFeatureWrapper(env)
#env = DummyVecEnv([make_env("Widow250PickPlace-v1", i) for i in range(4)])
obs = env.reset()

# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(
  save_freq=1000,
  save_path=f"./data/seed_{seed}/",
  name_prefix="tqc_model",
  save_replay_buffer=False,
  save_vecnormalize=False,
)

model = TQC(env=env, batch_size=2048, buffer_size=200_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
             policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
             replay_buffer_class=HerReplayBuffer,
             replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
             tau=0.05, learning_starts=200, verbose=1)

#model = TQC.load("data/tqc")
#model.set_env(env)

# Phase 1: fill the replay buffer with scripted-expert trajectories, or reuse
# a buffer saved by an earlier run.
COLLECT=True
if COLLECT:
    collect_data(env, model, "pickplace", "place_success_target", 3500, 30)
    model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
else:
    print("load_replay_buffer")
    model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")

# Phase 2: gradient updates on the expert buffer only.
print("start pre-training from buffer only")
# Zero-timestep learn() call: presumably here so that learn()'s setup
# (logger, callback) has run before train() is called directly - TODO confirm.
model.learn(total_timesteps=0, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
# train() has its own batch_size default (64); pass the configured one so
# pre-training uses the same batch size as the updates done inside learn().
model.train(gradient_steps=20000, batch_size=model.batch_size)

# Phase 3: regular online training.
print("start learning")
model.learn(total_timesteps=480_000, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
model.save(f"data/seed_{seed}/tqc_pick_place")
model.save_replay_buffer(f"data/seed_{seed}/tqc_trained_pick_place")

# Phase 4: swap the expert buffer back in, extend it with more scripted
# trajectories (overwriting the saved expert buffer) and train again.
print("load_replay_buffer")
model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
collect_data(env, model, "pickplace", "place_success_target", 10000, 35)
model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")

model.learn(total_timesteps=500_000, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
model.save(f"data/seed_{seed}/tqc_pick_place")
model.save_replay_buffer(f"data/seed_{seed}/tqc_trained_pick_place")

print("finish learning")

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


num_trajectories: 100 success rate: 0.79 Reward: 133.0
num_trajectories: 200 success rate: 0.84 Reward: 81.0
num_trajectories: 300 success rate: 0.86 Reward: 80.0
num_trajectories: 400 success rate: 0.8725 Reward: 133.0
num_trajectories: 500 success rate: 0.88 Reward: 137.0
num_trajectories: 600 success rate: 0.88 Reward: 76.0
num_trajectories: 700 success rate: 0.8885714285714286 Reward: 81.0
num_trajectories: 800 success rate: 0.89 Reward: 84.0
num_trajectories: 900 success rate: 0.8944444444444445 Reward: 84.0
num_trajectories: 1000 success rate: 0.893 Reward: 130.0
num_trajectories: 1100 success rate: 0.8927272727272727 Reward: 80.0
num_trajectories: 1200 success rate: 0.895 Reward: 81.0
num_trajectories: 1300 success rate: 0.8946153846153846 Reward: 80.0
num_trajectories: 1400 success rate: 0.8935714285714286 Reward: 78.0
num_trajectories: 1500 success rate: 0.8946666666666667 Reward: 129.0
num_trajectories: 1600 success rate: 0.89375 Reward: 82.0
num_trajectories: 1700 success ra

KeyboardInterrupt: 

In [2]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback


from stable_baselines3.common.utils import set_random_seed

# Seed the Python, NumPy and PyTorch RNGs before the environment and the
# networks are created, so the data stored under data/seed_<seed>/ really
# depends on `seed` instead of only being labelled with it.
seed = 2
set_random_seed(seed)

env = roboverse.make("Widow250PickPlace-v2",
                         gui=False,
                         observation_mode="pixels",
                         transpose_image=False)
#env = TimeFeatureWrapper(env)
#env = DummyVecEnv([make_env("Widow250PickPlace-v1", i) for i in range(4)])
obs = env.reset()

# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(
  save_freq=1000,
  save_path=f"./data/seed_{seed}/",
  name_prefix="tqc_model",
  save_replay_buffer=False,
  save_vecnormalize=False,
)

model = TQC(env=env, batch_size=2048, buffer_size=300_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
             policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
             replay_buffer_class=HerReplayBuffer,
             replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
             tau=0.05, learning_starts=200, verbose=1)

#model = TQC.load("data/tqc")
#model.set_env(env)

# Fill the replay buffer with scripted-expert trajectories and save it, or
# reuse a buffer saved by an earlier run.
COLLECT=True
if COLLECT:
    collect_data(env, model, "pickplace", "place_success_target", 10000, 35)
    model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
else:
    print("load_replay_buffer")
    model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")



  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


num_trajectories: 100 success rate: 0.85 Reward: 82.0
num_trajectories: 200 success rate: 0.87 Reward: 84.0
num_trajectories: 300 success rate: 0.8766666666666667 Reward: 133.0
num_trajectories: 400 success rate: 0.8925 Reward: 80.0
num_trajectories: 500 success rate: 0.9 Reward: 86.0
num_trajectories: 600 success rate: 0.9016666666666666 Reward: 84.0
num_trajectories: 700 success rate: 0.9085714285714286 Reward: 75.0
num_trajectories: 800 success rate: 0.91 Reward: 90.0
num_trajectories: 900 success rate: 0.91 Reward: 135.0
num_trajectories: 1000 success rate: 0.9 Reward: 82.0
num_trajectories: 1100 success rate: 0.9009090909090909 Reward: 75.0
num_trajectories: 1200 success rate: 0.8983333333333333 Reward: -35.0
num_trajectories: 1300 success rate: 0.9 Reward: 78.0
num_trajectories: 1400 success rate: 0.8985714285714286 Reward: 84.0
num_trajectories: 1500 success rate: 0.898 Reward: 82.0
num_trajectories: 1600 success rate: 0.9 Reward: 78.0
num_trajectories: 1700 success rate: 0.9 Re



In [2]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback


from stable_baselines3.common.utils import set_random_seed

# Seed the Python, NumPy and PyTorch RNGs before the environment and the
# networks are created, so the data stored under data/seed_<seed>/ really
# depends on `seed` instead of only being labelled with it.
seed = 3
set_random_seed(seed)

env = roboverse.make("Widow250PickPlace-v2",
                         gui=False,
                         observation_mode="pixels",
                         transpose_image=False)
#env = TimeFeatureWrapper(env)
#env = DummyVecEnv([make_env("Widow250PickPlace-v1", i) for i in range(4)])
obs = env.reset()

# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(
  save_freq=1000,
  save_path=f"./data/seed_{seed}/",
  name_prefix="tqc_model",
  save_replay_buffer=False,
  save_vecnormalize=False,
)

model = TQC(env=env, batch_size=2048, buffer_size=300_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
             policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
             replay_buffer_class=HerReplayBuffer,
             replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
             tau=0.05, learning_starts=200, verbose=1)

#model = TQC.load("data/tqc")
#model.set_env(env)

# Fill the replay buffer with scripted-expert trajectories and save it, or
# reuse a buffer saved by an earlier run.
COLLECT=True
if COLLECT:
    collect_data(env, model, "pickplace", "place_success_target", 10000, 35)
    model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
else:
    print("load_replay_buffer")
    model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")



  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


num_trajectories: 100 success rate: 0.95 Reward: 78.0
num_trajectories: 200 success rate: 0.89 Reward: -35.0
num_trajectories: 300 success rate: 0.8933333333333333 Reward: 82.0
num_trajectories: 400 success rate: 0.895 Reward: 79.0
num_trajectories: 500 success rate: 0.9 Reward: 84.0
num_trajectories: 600 success rate: 0.8916666666666667 Reward: 78.0
num_trajectories: 700 success rate: 0.8957142857142857 Reward: 82.0
num_trajectories: 800 success rate: 0.89125 Reward: 133.0
num_trajectories: 900 success rate: 0.8944444444444445 Reward: 86.0
num_trajectories: 1000 success rate: 0.899 Reward: 80.0
num_trajectories: 1100 success rate: 0.9018181818181819 Reward: 84.0
num_trajectories: 1200 success rate: 0.8991666666666667 Reward: 131.0
num_trajectories: 1300 success rate: 0.9023076923076923 Reward: 84.0
num_trajectories: 1400 success rate: 0.9014285714285715 Reward: 75.0
num_trajectories: 1500 success rate: 0.9013333333333333 Reward: 76.0
num_trajectories: 1600 success rate: 0.90125 Reward



In [1]:
import gymnasium as gym
import numpy as np
import roboverse
from roboverse.policies import policies

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from options.baselines.ppoc_int2.run_roboverse import train


# Headless pick-and-place environment with pixel observations.
env = roboverse.make("Widow250PickPlace-v2",
                         gui=False,
                         observation_mode="pixels",
                         transpose_image=False)
#env = TimeFeatureWrapper(env)
#env = DummyVecEnv([make_env("Widow250PickPlaceMultiObject-v0", i) for i in range(4)])
obs = env.reset()

# Scripted pick-and-place policy that is handed to `train` as the expert.
policy_class = policies["pickplace"]
expert = policy_class(env) #policy
# Reset the scripted policy after the environment reset and before it is
# used, matching the env.reset() / policy.reset() order used by the data
# collection loop elsewhere in this notebook.
expert.reset()

train(env, num_timesteps=1000000, seed=1, num_options=4, app="", saves=False,
          wsaves=False, epoch=0, dc=0, plots=False, w_intfc=True, switch=False,
          mainlr=1e-4, intlr=9e-5, piolr=1e-4, fewshot=False, expert=expert)

pybullet build time: Oct 14 2023 15:44:17
  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Logging to /tmp/openai-2024-04-26-00-42-42-020265


  hidden = tf.nn.relu(tf.compat.v1.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
  return layer.apply(inputs)


********** Iteration 0 ************


  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


mean opt dur: [1.6931818181818181, 1.935361216730038, 1.8618181818181818, 2.1323529411764706]
mean op probs: [0.24185774 0.26122376 0.25127313 0.24564521]
mean term p: [0.5623796 0.5138756 0.525594  0.4634874]
mean vpreds: [-0.63698894  0.47838026 -0.4623326   0.17520909]
batch size: 444
optim epochs: 10
Optimizing...
batch size: 510
optim epochs: 10
Optimizing...
batch size: 520
optim epochs: 10
Optimizing...
batch size: 574
optim epochs: 10
Optimizing...
-----------------------------
| EpisodesSoFar  | 20       |
| EpLenMean      | 99       |
| EpRewMean      | -99      |
| EpThisIter     | 20       |
| TimeElapsed    | 77.9     |
| TimestepsSoFar | 1980     |
-----------------------------
********** Iteration 1 ************
run: False 1
mean opt dur: [2.2063492063492065, 16.92452830188679, 13.352941176470589, 15.6]
mean op probs: [0.28757256 0.2857099  0.23175704 0.19496042]
mean term p: [0.6649604  0.05640931 0.0576329  0.06057607]
mean vpreds: [-19.254585 -11.448821 -12.237253 -13

  self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
  step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)


batch size: 234
optim epochs: 10
Optimizing...
-----------------------------
| EpisodesSoFar  | 1323     |
| EpLenMean      | 99       |
| EpRewMean      | -99      |
| EpThisIter     | 20       |
| TimeElapsed    | 4.8e+03  |
| TimestepsSoFar | 130977   |
-----------------------------
********** Iteration 64 ************
run: False 0.7249803359578534
action [nan nan nan nan nan nan nan nan]


RuntimeError: Action has NaN entries

In [1]:
import gymnasium as gym
import numpy as np
import roboverse
from roboverse.policies import policies

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from options.baselines.ppoc_int2.run_roboverse import train


# Headless pick-and-place environment with pixel observations.
# (Disabled variants: wrapping in TimeFeatureWrapper, or a 4-env DummyVecEnv
# built with make_env("Widow250PickPlaceMultiObject-v0", i).)
env = roboverse.make(
    "Widow250PickPlace-v2",
    gui=False,
    observation_mode="pixels",
    transpose_image=False,
)
obs = env.reset()

# Scripted pick-and-place policy, reset once and handed to `train` as the expert.
policy_class = policies["pickplace"]
expert = policy_class(env)
expert.reset()

# Keyword arguments for the option-critic training run, one per line.
train_kwargs = dict(
    num_timesteps=1000000,
    seed=1,
    num_options=4,
    app="",
    saves=False,
    wsaves=False,
    epoch=0,
    dc=0,
    plots=False,
    w_intfc=True,
    switch=False,
    mainlr=1e-4,
    intlr=9e-5,
    piolr=1e-4,
    fewshot=False,
    expert=expert,
)
train(env, **train_kwargs)

pybullet build time: Oct 14 2023 15:44:17
  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Logging to /tmp/openai-2024-04-27-15-58-07-046995


  hidden = tf.nn.relu(tf.compat.v1.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
  return layer.apply(inputs)


********** Iteration 0 ************
[[ 1.0476696  -0.8947305   0.30580634  1.2261183  -0.6319611   0.56723475
  -1.6117507   1.0984074 ]] 3
[[-1.11834789  0.3616853   0.          0.          0.          0.
   0.          0.        ]] 1
[[ 0.04071175  1.1467719  -2.1113846  -1.3884643  -1.5669872   1.3726074
  -0.18166116  0.48944873]] 1
[[-1.31277736  0.26155884  0.          0.          0.          0.
   0.          0.        ]] 1
[[-0.09271618 -1.7669677  -0.8267812  -1.0641131  -0.18071505  0.808223
  -0.6812957  -1.2416356 ]] 2
[[-1.06927217  0.05279882  0.          0.          0.          0.
   0.          0.        ]] 1
[[-1.5547423   0.61838925 -1.4925771  -1.5454214   0.04460224 -0.42470154
  -0.39340287 -0.0661606 ]] 0
[[-0.77294927 -0.07899596  0.          0.          0.          0.
   0.          0.        ]] 1
[[-0.06353461 -1.1847874   0.11839284 -0.5209143   1.0079409  -0.6051355
  -0.21371976  1.1939888 ]] 1
[[-0.5212588  -0.12385838  0.          0.          0.          0

  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


[[ 0.09640943 -0.877389    0.0026662  -0.39090765 -0.72064674 -1.8715702
  -1.1983695   1.9443511 ]] 2
[[-0.1654607  -0.08334889 -0.77602928  0.          0.          0.
   0.          0.        ]] 1
[[-0.31410727  0.607447   -0.7862106  -1.35703    -1.030429    1.3062845
   0.6584235  -1.4073217 ]] 1
[[-0.09533371 -0.10764493 -0.59165805  0.          0.          0.
   0.          0.        ]] 1
[[ 0.5047243   0.91103286 -0.2894291  -0.27956897 -0.49877143 -0.9055442
  -0.29083046  0.6825968 ]] 1
[[-0.05233867 -0.1136487  -0.38825383  0.          0.          0.
   0.          0.        ]] 1
[[ 0.8049183   1.8000774  -0.12666821 -1.2216297  -0.32362634 -2.3857734
  -0.6196189   1.0522237 ]] 2
[[-0.03147164 -0.10290223 -0.24077812  0.          0.          0.
   0.          0.        ]] 1
[[-0.45986032 -0.19819565  0.46059537 -1.5574381   0.50990814 -0.09840725
   0.11851154  0.4694676 ]] 2
[[-0.06160069 -0.15151966 -0.22170714  0.          0.          0.
   0.          0.        ]] 1
[[-0