In [None]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback


# Load a saved TQC checkpoint and watch it act in a GUI-enabled environment.
model = TQC.load("data/tqc1/tqc_model_873000_steps")
env = roboverse.make("Widow250PickPlace-v2",
                         gui=True,
                         observation_mode="pixels",
                         transpose_image=False)
model.set_env(env)
# From here on `env` is whatever the model wrapped the raw env into, so that
# reset()/step() below use the same interface the model was attached to.
env = model.get_env()

try:
    obs = env.reset()
    print("start render")
    for _step in range(int(1e4)):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        print(rewards)
        env.render("human")
finally:
    # Always release the simulator/GUI, including when the (long) loop is
    # interrupted from the notebook or a step raises.
    env.close()

In [1]:
import os
# Configure CUDA device selection for this kernel through environment
# variables: devices are ordered by PCI bus id and only device "2" is exposed.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "2",
    }
)

In [2]:
import time
import numpy as np
from roboverse.policies import policies
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id: str, rank: int, seed: int = 0):
    """
    Build a factory (thunk) that creates one headless, pixel-observation env.

    :param env_id: the environment ID handed to ``roboverse.make``
    :param rank: index of the subprocess; accepted but not used at the moment
        because per-rank seeding of the env is disabled (see note below)
    :param seed: seed given to ``set_random_seed`` when the factory is built
    :return: zero-argument callable that creates, resets and returns the env
    """
    env_kwargs = dict(gui=False, observation_mode="pixels", transpose_image=False)

    def _thunk():
        created_env = roboverse.make(env_id, **env_kwargs)
        # Disabled on purpose in the current version:
        #   created_env = TimeFeatureWrapper(created_env)
        #   created_env.reset(seed=seed + rank)
        created_env.reset()
        return created_env

    # Seeding happens once, when the factory is created, not inside the thunk.
    set_random_seed(seed)
    return _thunk


def collect_data(env, model, policy, target, num_trajectories=100, num_timesteps=30,
                 noise=0.1, success_return=70):
    """
    Roll out a scripted policy and add every transition to ``model.replay_buffer``.

    For each trajectory the env and the policy are reset, then up to
    ``num_timesteps`` noisy, clipped policy actions are executed. Each
    transition is added to the buffer with its ``"image"`` entries transposed
    with axes ``(2, 0, 1)`` (presumably HWC -> CHW to match the layout the
    model's buffer expects - TODO confirm).

    :param env: environment exposing ``reset()``, ``get_observation()``,
        ``step(action)`` (returning a 4-tuple) and ``action_space.shape``;
        observations are dicts with an ``"image"`` entry
    :param model: object exposing ``replay_buffer.add(...)``
    :param policy: key into ``policies`` selecting the scripted policy class
    :param target: name of the success key in ``info``; currently unused,
        success is decided from the accumulated reward instead
    :param num_trajectories: number of trajectories to roll out
    :param num_timesteps: maximum number of steps per trajectory
    :param noise: standard deviation of the Gaussian noise added to actions
    :param success_return: a trajectory counts as successful once its summed
        reward exceeds this value
    :return: None; progress and the final success rate are printed
    """
    policy_class = policies[policy]
    policy = policy_class(env)
    EPSILON = 0.1  # keeps clipped actions strictly inside (-1, 1)
    LAST_STEP_AFTER_SUCCESS = 23  # successful rollouts stop once j exceeds this
    env_action_dim = env.action_space.shape[0]
    num_success = 0
    num_saved = 0

    while num_saved < num_trajectories:
        num_saved += 1
        episode_return = 0
        env.reset()
        policy.reset()
        time.sleep(0.1)
        success = False
        for j in range(num_timesteps):
            action, agent_info = policy.get_action()

            # Build a new array instead of `+=` so the array owned by the
            # policy is never modified behind its back.
            action = action + np.random.normal(scale=noise, size=(env_action_dim,))
            action = np.clip(action, -1 + EPSILON, 1 - EPSILON)

            # Copy before transposing: the env may hand out a dict it keeps
            # using internally, and re-transposing it would corrupt the layout.
            observation = _channels_first(env.get_observation())
            next_observation, reward, done, _info = env.step(action)
            next_observation = _channels_first(next_observation)

            episode_return += reward
            success = episode_return > success_return
            model.replay_buffer.add(observation, next_observation, action, reward, np.array([done]), [{}])

            if success and j > LAST_STEP_AFTER_SUCCESS:
                break
            if done or agent_info['done']:
                break

        if success:
            num_success += 1
        if num_saved%100 == 0:
            print(f"num_trajectories: {num_saved} success rate: {num_success/num_saved} Reward: {episode_return}")

    if num_saved == 0:
        # Nothing was requested; avoid dividing by zero below.
        print("success rate: n/a (no trajectories collected)")
        return
    print("success rate: {}".format(num_success / (num_saved)))


def _channels_first(obs):
    """Return a shallow copy of ``obs`` with ``obs["image"]`` transposed by axes (2, 0, 1)."""
    converted = dict(obs)
    converted["image"] = np.transpose(obs["image"], (2, 0, 1))
    return converted


In [4]:
import gymnasium as gym
import numpy as np
import roboverse

from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3 import DDPG, HerReplayBuffer
from stable_baselines3 import TD3
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize


# `seed` names the output directory (data/seed_{seed}/), so also seed the
# Python, NumPy and PyTorch RNGs with it before anything stochastic is built;
# otherwise artefacts stored under that directory are not reproducible.
seed = 0
set_random_seed(seed)

env = roboverse.make("Widow250PickPlace-v2",
                         gui=False,
                         observation_mode="pixels",
                         transpose_image=False)
#env = TimeFeatureWrapper(env)
#env = DummyVecEnv([make_env("Widow250PickPlace-v1", i) for i in range(4)])
obs = env.reset()

# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(
  save_freq=1000,
  save_path=f"./data/seed_{seed}/",
  name_prefix="tqc_model",
  save_replay_buffer=False,
  save_vecnormalize=False,
)

# learning_starts=0: gradient updates are allowed from the first step, since
# the replay buffer is filled with scripted-policy data before training.
model = TQC(env=env, batch_size=2048, buffer_size=1_000_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
             policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
             #replay_buffer_class=HerReplayBuffer,
             #replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
             tau=0.05, learning_starts=0, verbose=1)

#model = TQC.load("data/tqc")
#model.set_env(env)
# Stage 1: fill (or reload) the expert buffer, pre-train on it, then train online.
COLLECT=True
if COLLECT:
    collect_data(env, model, "pickplace", "place_success_target", 10000, 35)
    model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
else:
    print("load_replay_buffer")
    model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")

print("start pre-training from buffer only")
# Zero-timestep learn() call made before invoking train() directly
# (presumably to run learn()'s setup of logger/callbacks - TODO confirm).
model.learn(total_timesteps=0, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
# train() has its own default batch size (64); pass the configured one so the
# buffer-only updates use the same batch size as the updates done by learn().
model.train(gradient_steps=20000, batch_size=model.batch_size)

print("start learning")
model.learn(total_timesteps=480_000, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
model.save(f"data/seed_{seed}/tqc_pick_place")
model.save_replay_buffer(f"data/seed_{seed}/tqc_trained_pick_place")

# Stage 2: reload the expert buffer, append more scripted rollouts to it,
# store it again, then run another online training phase.
expert_buffer_path = f"data/seed_{seed}/tqc_expert_pick_place"

print("load_replay_buffer")
model.load_replay_buffer(expert_buffer_path)
collect_data(
    env,
    model,
    "pickplace",
    "place_success_target",
    num_trajectories=10000,
    num_timesteps=35,
)
model.save_replay_buffer(expert_buffer_path)

stage2_learn_kwargs = dict(
    callback=checkpoint_callback,
    log_interval=5,
    tb_log_name="exp",
    reset_num_timesteps=False,
    progress_bar=True,
)
model.learn(total_timesteps=500_000, **stage2_learn_kwargs)

# Persist both the trained model and the buffer it ended up with.
model.save(f"data/seed_{seed}/tqc_pick_place")
model.save_replay_buffer(f"data/seed_{seed}/tqc_trained_pick_place")

print("finish learning")

  logger.warn(
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
num_trajectories: 100 success rate: 0.93 Reward: 82.0
num_trajectories: 200 success rate: 0.91 Reward: 82.0
num_trajectories: 300 success rate: 0.9033333333333333 Reward: 80.0
num_trajectories: 400 success rate: 0.9025 Reward: 75.0
num_trajectories: 500 success rate: 0.914 Reward: 72.0
num_trajectories: 600 success rate: 0.9166666666666666 Reward: 79.0
num_trajectories: 700 success rate: 0.9214285714285714 Reward: 129.0
num_trajectories: 800 success rate: 0.9175 Reward: 82.0
num_trajectories: 900 success rate: 0.9222222222222223 Reward: 78.0
num_trajectories: 1000 success rate: 0.923 Reward: 90.0
num_trajectories: 1100 success rate: 0.9209090909090909 Reward: -35.0
num_trajectories: 1200 success rate: 0.92 Reward: 78.0
num_trajectories: 1300 success rate: 0.9176923076923077 Reward: 74.0
num_trajectories: 1400 success rate: 0.9207142857142857 Reward: -3

Output()

Output()

start learning
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99        |
|    ep_rew_mean     | -99       |
| time/              |           |
|    episodes        | 5         |
|    fps             | 14        |
|    time_elapsed    | 34        |
|    total_timesteps | 495       |
| train/             |           |
|    actor_loss      | -3.62e+08 |
|    critic_loss     | 3.23e+06  |
|    ent_coef        | 1.63e+05  |
|    ent_coef_loss   | 2.52      |
|    learning_rate   | 0.001     |
|    n_updates       | 20494     |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99        |
|    ep_rew_mean     | -99       |
| time/              |           |
|    episodes        | 10        |
|    fps             | 14        |
|    time_elapsed    | 69        |
|    total_timesteps | 990       |
| train/             |           |
|    actor_loss      | -3.75e+08 |
|    

load_replay_buffer
num_trajectories: 100 success rate: 0.95 Reward: 130.0
num_trajectories: 200 success rate: 0.94 Reward: 80.0
num_trajectories: 300 success rate: 0.9333333333333333 Reward: 76.0
num_trajectories: 400 success rate: 0.925 Reward: 78.0
num_trajectories: 500 success rate: 0.924 Reward: 74.0
num_trajectories: 600 success rate: 0.925 Reward: 82.0
num_trajectories: 700 success rate: 0.9171428571428571 Reward: 78.0
num_trajectories: 800 success rate: 0.92 Reward: 78.0
num_trajectories: 900 success rate: 0.9255555555555556 Reward: 71.0
num_trajectories: 1000 success rate: 0.922 Reward: 80.0
num_trajectories: 1100 success rate: 0.9254545454545454 Reward: 75.0
num_trajectories: 1200 success rate: 0.9216666666666666 Reward: -35.0
num_trajectories: 1300 success rate: 0.9230769230769231 Reward: 16.0
num_trajectories: 1400 success rate: 0.9214285714285714 Reward: 80.0
num_trajectories: 1500 success rate: 0.9213333333333333 Reward: 80.0
num_trajectories: 1600 success rate: 0.918125 R

Output()

----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99.2      |
|    ep_rew_mean     | -98.5     |
| time/              |           |
|    episodes        | 4850      |
|    fps             | 11        |
|    time_elapsed    | 14        |
|    total_timesteps | 480173    |
| train/             |           |
|    actor_loss      | -5.72e+14 |
|    critic_loss     | 1.44e+13  |
|    ent_coef        | 9.72e+09  |
|    ent_coef_loss   | -443      |
|    learning_rate   | 0.001     |
|    n_updates       | 500172    |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99.2      |
|    ep_rew_mean     | -98.5     |
| time/              |           |
|    episodes        | 4855      |
|    fps             | 11        |
|    time_elapsed    | 57        |
|    total_timesteps | 480668    |
| train/             |           |
|    actor_loss      | -5.87e+14 |
|    critic_loss    

KeyboardInterrupt: 

In [None]:
# import gymnasium as gym
# import numpy as np
# import roboverse

# from stable_baselines3 import TD3
# from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
# from stable_baselines3 import DDPG, HerReplayBuffer
# from sb3_contrib import TQC
# from sb3_contrib.common.wrappers import TimeFeatureWrapper
# from stable_baselines3.common.vec_env import VecNormalize
# from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
# from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback


# env = roboverse.make("Widow250PickPlaceMultiObject-v0",
#                          gui=False,
#                          observation_mode="pixels",
#                          transpose_image=False)
# #env = TimeFeatureWrapper(env)
# #env = DummyVecEnv([make_env("Widow250PickPlace-v1", i) for i in range(4)])
# seed = 0
# obs = env.reset()

# # Save a checkpoint every 1000 steps
# checkpoint_callback = CheckpointCallback(
#   save_freq=1000,
#   save_path=f"./data/seed_{seed}/",
#   name_prefix="tqc_model",
#   save_replay_buffer=False,
#   save_vecnormalize=False,
# )

# model = TQC(env=env, batch_size=2048, buffer_size=1_000_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
#              policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
#              replay_buffer_class=HerReplayBuffer,
#              replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
#              tau=0.05, learning_starts=200, verbose=1)

# #model = TQC.load("data/tqc")
# #model.set_env(env)
# COLLECT=False
# if COLLECT:
#     collect_data(env, model, "pickplace", "place_success_target", 10000, 35)
#     model.save_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")
# else:
#     print("load_replay_buffer")
#     model.load_replay_buffer(f"data/seed_{seed}/tqc_expert_pick_place")

# print("start pre-training from buffer only")
# model.learn(total_timesteps=0, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
# model.train(gradient_steps=20000)

# print("start learning")
# model.learn(total_timesteps=480_000, callback=checkpoint_callback, log_interval=5, tb_log_name="exp", reset_num_timesteps = False, progress_bar=True)
# model.save(f"data/seed_{seed}/tqc_pick_place")
# model.save_replay_buffer(f"data/seed_{seed}/tqc_trained_pick_place")

# NOTE: everything that would define `model`, `env`, `seed` and
# `checkpoint_callback` in this cell is commented out above, so this code only
# runs when those names are still alive in the kernel from the training cell.
resume_buffer_path = f"data/seed_{seed}/tqc_expert_pick_place"

print("load_replay_buffer")
model.load_replay_buffer(resume_buffer_path)
collect_data(
    env,
    model,
    "pickplace",
    "place_success_target",
    num_trajectories=10000,
    num_timesteps=35,
)
model.save_replay_buffer(resume_buffer_path)

resume_learn_kwargs = dict(
    callback=checkpoint_callback,
    log_interval=5,
    tb_log_name="exp",
    reset_num_timesteps=False,
    progress_bar=True,
)
model.learn(total_timesteps=500_000, **resume_learn_kwargs)
model.save(f"data/seed_{seed}/tqc_pick_place")

print("finish learning")

load_replay_buffer
num_trajectories: 100 success rate: 0.93 Reward: 80.0
num_trajectories: 200 success rate: 0.91 Reward: 74.0
num_trajectories: 300 success rate: 0.9033333333333333 Reward: 78.0
num_trajectories: 400 success rate: 0.905 Reward: 73.0
num_trajectories: 500 success rate: 0.906 Reward: 80.0
num_trajectories: 600 success rate: 0.905 Reward: 90.0
num_trajectories: 700 success rate: 0.9014285714285715 Reward: 80.0
num_trajectories: 800 success rate: 0.9 Reward: 75.0
num_trajectories: 900 success rate: 0.8988888888888888 Reward: 84.0
num_trajectories: 1000 success rate: 0.897 Reward: 75.0
num_trajectories: 1100 success rate: 0.899090909090909 Reward: -35.0
num_trajectories: 1200 success rate: 0.8983333333333333 Reward: 86.0
num_trajectories: 1300 success rate: 0.9007692307692308 Reward: 90.0
num_trajectories: 1400 success rate: 0.9035714285714286 Reward: 81.0
num_trajectories: 1500 success rate: 0.9026666666666666 Reward: 78.0
num_trajectories: 1600 success rate: 0.905 Reward:

Output()

----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99.2      |
|    ep_rew_mean     | -98.3     |
| time/              |           |
|    episodes        | 4850      |
|    fps             | 12        |
|    time_elapsed    | 14        |
|    total_timesteps | 480173    |
| train/             |           |
|    actor_loss      | -2.87e+14 |
|    critic_loss     | 3.24e+13  |
|    ent_coef        | 5.2e+09   |
|    ent_coef_loss   | -429      |
|    learning_rate   | 0.001     |
|    n_updates       | 499972    |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 99.2      |
|    ep_rew_mean     | -98.3     |
| time/              |           |
|    episodes        | 4855      |
|    fps             | 12        |
|    time_elapsed    | 53        |
|    total_timesteps | 480668    |
| train/             |           |
|    actor_loss      | -3.03e+14 |
|    critic_loss    