In [None]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback


# Load a saved TQC checkpoint and watch it act in the GUI version of the task.
model = TQC.load("data/tqc1/tqc_model_873000_steps")
env = roboverse.make("Widow250PickPlace-v2",
                     gui=True,
                     observation_mode="pixels",
                     transpose_image=False)
model.set_env(env)
# From here on, drive the env object held by the model rather than the raw one
# (presumably the wrapped/vectorised version created by set_env -- confirm).
env = model.get_env()

# try/finally guarantees the GUI environment is closed even when the long
# rollout is interrupted from the notebook or a step raises.
try:
    obs = env.reset()
    print("start render")
    for i in range(int(1e4)):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        print(rewards)
        env.render("human")
finally:
    env.close()

In [3]:
import os
# Select the GPU for this kernel: order devices by PCI bus id and expose only
# device "2".
# NOTE(review): these variables are only honoured if they are set before CUDA is
# first initialised in this kernel -- confirm the cell is run early enough.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

In [4]:
import time
import numpy as np
from roboverse.policies import policies
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id: str, rank: int, seed: int = 0):
    """
    Build a zero-argument environment factory (e.g. for a vectorised env).

    :param env_id: the environment ID handed to ``roboverse.make``
    :param rank: index of the subprocess (accepted but currently unused)
    :param seed: the initial seed, passed to ``set_random_seed`` when the
        factory is created
    :return: a callable that creates the environment, resets it once and
        returns it
    """
    # Seeding happens once, at factory-creation time, not inside the factory.
    set_random_seed(seed)

    def _init():
        # Headless, pixel-observation environment without image transposition.
        env_kwargs = dict(gui=False,
                          observation_mode="pixels",
                          transpose_image=False)
        created_env = roboverse.make(env_id, **env_kwargs)
        # Seeded reset (seed + rank) is intentionally not used here; a plain
        # reset is performed instead.
        created_env.reset()
        return created_env

    return _init


def collect_data(env, model, policy, target, num_trajectories=100, num_timesteps=30,
                 noise=0.1, epsilon=0.1, success_return=70, min_success_step=23,
                 verbose=False):
    """
    Roll out a scripted policy with Gaussian action noise and store every
    transition in ``model.replay_buffer``.

    :param env: environment exposing ``reset``, ``step``, ``get_observation``
        and ``action_space``; observations are dicts with an ``"image"`` entry
    :param model: object whose ``replay_buffer.add`` receives each transition
    :param policy: key into ``policies`` selecting the scripted policy class
    :param target: info key originally meant for accepting trajectories;
        currently unused (success is decided from the episode return)
    :param num_trajectories: number of episodes to run; every attempted episode
        counts, successful or not
    :param num_timesteps: maximum number of steps per episode
    :param noise: standard deviation of the Gaussian noise added to each action
    :param epsilon: margin kept from the action bounds; actions are clipped to
        ``[-1 + epsilon, 1 - epsilon]``
    :param success_return: an episode is successful while its summed reward is
        strictly greater than this value
    :param min_success_step: a successful episode is cut short once the step
        index exceeds this value
    :param verbose: if True, print the first success step and the reward list
        of every successful episode
    :return: None; progress and the final success rate are printed
    """
    policy_class = policies[policy]
    scripted_policy = policy_class(env)
    # Loop-invariant, so look it up once instead of on every step.
    env_action_dim = env.action_space.shape[0]
    num_success = 0
    num_saved = 0

    while num_saved < num_trajectories:
        num_saved += 1
        first_success_step = None
        rewards = []
        episode_return = 0  # running total, replaces re-summing `rewards` each step
        env.reset()
        scripted_policy.reset()
        # Short pause after the reset (presumably to let the simulator settle -- confirm).
        time.sleep(0.1)
        success = False
        for j in range(num_timesteps):
            action, agent_info = scripted_policy.get_action()

            # Build a fresh float array instead of `action += ...`: the in-place
            # form would modify the array owned by the policy and misbehaves for
            # integer arrays or plain lists.
            # (Padding the action by one dimension for realNVP modelling is
            # currently disabled.)
            action = np.asarray(action, dtype=float) + np.random.normal(
                scale=noise, size=(env_action_dim,))
            action = np.clip(action, -1 + epsilon, 1 - epsilon)

            observation = env.get_observation()
            # Reorder image axes with (2, 0, 1) (presumably HWC -> CHW -- confirm).
            observation["image"] = np.transpose(observation["image"], (2, 0, 1))
            next_observation, reward, done, info = env.step(action)
            next_observation["image"] = np.transpose(next_observation["image"], (2, 0, 1))

            rewards.append(reward)
            episode_return += reward
            success = episode_return > success_return
            model.replay_buffer.add(observation, next_observation, action, reward,
                                    np.array([done]), [{}])

            if success and first_success_step is None:
                first_success_step = j

            # Keep stepping a little after success so later frames are recorded too.
            if success and j > min_success_step:
                break
            if done or agent_info['done']:
                break

        if success:
            if verbose:
                print("num_timesteps: ", first_success_step, rewards)
            num_success += 1
        if num_saved % 100 == 0:
            print(f"num_trajectories: {num_saved} success rate: {num_success/num_saved} Reward: {episode_return}")

    # Guard against num_trajectories <= 0, where no episode was run at all.
    success_rate = num_success / num_saved if num_saved else 0.0
    print("success rate: {}".format(success_rate))


In [2]:
import gymnasium as gym
import numpy as np
import roboverse

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG, HerReplayBuffer
from sb3_contrib import TQC
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback


# Fine-tuning run: restore a saved TQC agent on the multi-object pick-and-place
# task, refill its replay buffer from stored expert data and keep training.
env = roboverse.make(
    "Widow250PickPlaceMultiObject-v0",
    gui=False,
    observation_mode="pixels",
    transpose_image=False,
)
# Not used in this run: TimeFeatureWrapper(env), and a 4-env DummyVecEnv built
# with make_env("Widow250PickPlace-v1", i).
seed = 2
obs = env.reset()

# Save a checkpoint every 1000 steps (model only: no replay buffer, no VecNormalize stats).
checkpoint_callback = CheckpointCallback(
    save_freq=1000,
    save_path=f"./data/seed_{seed}/",
    name_prefix="tqc_model",
    save_replay_buffer=False,
    save_vecnormalize=False,
)

# Reference configuration for training from scratch (kept for the record):
# model = TQC(env=env, batch_size=2048, buffer_size=1_000_000, gamma=0.95, learning_rate=0.001, policy='MultiInputPolicy',
#              policy_kwargs=dict(net_arch=[512, 512, 512], n_critics=2),
#              replay_buffer_class=HerReplayBuffer,
#              replay_buffer_kwargs=dict(goal_selection_strategy='future', n_sampled_goal=4),
#              tau=0.05, learning_starts=200, verbose=1)

model = TQC.load("data/seed_1/tqc_pick_place", env=env)
model.set_env(env)

expert_buffer_path = f"data/seed_{seed}/tqc_expert_pick_place"
model_save_path = f"data/seed_{seed}/tqc_pick_place_test"
# Arguments shared by both training rounds; the timestep counter is not reset
# between rounds.
learn_kwargs = dict(
    callback=checkpoint_callback,
    log_interval=5,
    tb_log_name="exp",
    reset_num_timesteps=False,
    progress_bar=True,
)

# Two rounds: each one reloads the expert replay buffer, trains for the given
# budget and overwrites the same saved model file.
for round_index, timestep_budget in enumerate((480_000, 500_000)):
    print("load_replay_buffer")
    model.load_replay_buffer(expert_buffer_path)
    if round_index == 0:
        print("start learning")
    model.learn(total_timesteps=timestep_budget, **learn_kwargs)
    model.save(model_save_path)
print("finish learning")



Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
load_replay_buffer
start learning


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 193      |
| time/              |          |
|    episodes        | 9900     |
|    fps             | 10       |
|    time_elapsed    | 19       |
|    total_timesteps | 980198   |
| train/             |          |
|    actor_loss      | -82.9    |
|    critic_loss     | 11.6     |
|    ent_coef        | 0.00557  |
|    ent_coef_loss   | 250      |
|    learning_rate   | 0.001    |
|    n_updates       | 999997   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 177      |
| time/              |          |
|    episodes        | 9905     |
|    fps             | 10       |
|    time_elapsed    | 67       |
|    total_timesteps | 980693   |
| train/             |          |
|    actor_loss      | -123     |
|    critic_loss     | 12.6     |
|    ent_coef 

load_replay_buffer


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | -18.2    |
| time/              |          |
|    episodes        | 14750    |
|    fps             | 10       |
|    time_elapsed    | 34       |
|    total_timesteps | 1460348  |
| train/             |          |
|    actor_loss      | -92.8    |
|    critic_loss     | 5.35     |
|    ent_coef        | 0.0196   |
|    ent_coef_loss   | 12.3     |
|    learning_rate   | 0.001    |
|    n_updates       | 1480147  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | -21.4    |
| time/              |          |
|    episodes        | 14755    |
|    fps             | 10       |
|    time_elapsed    | 81       |
|    total_timesteps | 1460843  |
| train/             |          |
|    actor_loss      | -101     |
|    critic_loss     | 3.87     |
|    ent_coef 

finish learning


In [None]:
# Continue the fine-tuning run with two more rounds. Relies on `model`, `seed`
# and `checkpoint_callback` created by the earlier training cell.
expert_buffer_path = f"data/seed_{seed}/tqc_expert_pick_place"
model_save_path = f"data/seed_{seed}/tqc_pick_place_test"

for round_index in range(2):
    # Each round starts from the stored expert replay buffer.
    print("load_replay_buffer")
    model.load_replay_buffer(expert_buffer_path)
    if round_index == 0:
        print("start learning")
    model.learn(
        total_timesteps=500_000,
        callback=checkpoint_callback,
        log_interval=5,
        tb_log_name="exp",
        reset_num_timesteps=False,
        progress_bar=True,
    )
    # Both rounds write to the same file, so the later save replaces the earlier one.
    model.save(model_save_path)
print("finish learning")

load_replay_buffer


Output()

start learning
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 34.5     |
| time/              |          |
|    episodes        | 19800    |
|    fps             | 13       |
|    time_elapsed    | 21       |
|    total_timesteps | 1960298  |
| train/             |          |
|    actor_loss      | -91.1    |
|    critic_loss     | 5.23     |
|    ent_coef        | 0.0178   |
|    ent_coef_loss   | 15.9     |
|    learning_rate   | 0.001    |
|    n_updates       | 1980097  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 19       |
| time/              |          |
|    episodes        | 19805    |
|    fps             | 14       |
|    time_elapsed    | 56       |
|    total_timesteps | 1960793  |
| train/             |          |
|    actor_loss      | -97.4    |
|    critic_loss     | 4.24     |

load_replay_buffer


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 23.2     |
| time/              |          |
|    episodes        | 24850    |
|    fps             | 11       |
|    time_elapsed    | 21       |
|    total_timesteps | 2460248  |
| train/             |          |
|    actor_loss      | -87.5    |
|    critic_loss     | 6.44     |
|    ent_coef        | 0.0186   |
|    ent_coef_loss   | 16.1     |
|    learning_rate   | 0.001    |
|    n_updates       | 2480047  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 13.6     |
| time/              |          |
|    episodes        | 24855    |
|    fps             | 11       |
|    time_elapsed    | 65       |
|    total_timesteps | 2460743  |
| train/             |          |
|    actor_loss      | -99.1    |
|    critic_loss     | 4.79     |
|    ent_coef 