In [1]:
#python scripts/scripted_collect.py -n 100 -t 30 -e Widow250PickPlace-v1 -pl grasp -a grasp_success_target --noise=0.1

In [2]:
#python scripts/scripted_collect.py -n 100 -t 30 -e Widow250PickPlace-v0 -pl pickplace -a place_success_target --noise=0.1 --gui

In [3]:
#python scripts/scripted_collect.py -n 100 -t 30 -e Widow250PickPlaceMultiObject-v0 -pl pickplace -a place_success_target --noise=0.1 --gui

In [1]:
import numpy as np
import roboverse
from roboverse.policies import policies


def collect_data(env, model, policy, target, num_trajectories=100, num_timesteps=30,
                 noise=0.1, epsilon=0.1, max_attempts=None):
    """Roll out a scripted policy and push every transition into ``model.replay_buffer``.

    Episodes are repeated until ``num_trajectories`` of them end with
    ``info[target]`` truthy. Transitions from unsuccessful episodes are still
    added to the replay buffer; only the success counter ignores them.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``get_observation_stacked()`` and ``action_space.shape``.
        model: Object exposing ``replay_buffer.add(obs, next_obs, action,
            reward, done, infos)``.
        policy: Key into the module-level ``policies`` registry; the registered
            class is instantiated with ``env``.
        target: Key of the ``info`` dict returned by ``env.step`` that flags
            a successful trajectory.
        num_trajectories: Number of successful episodes to collect.
        num_timesteps: Maximum number of steps per episode (must be > 0).
        noise: Standard deviation of the Gaussian noise added to each action.
        epsilon: Margin kept from the +/-1 action bounds when clipping.
        max_attempts: Optional cap on the number of episodes attempted.
            ``None`` (default) retries until enough successes are collected.

    Raises:
        ValueError: If ``num_timesteps`` is not positive.
    """
    if num_timesteps <= 0:
        # Without at least one step there is no `info` to judge success with.
        raise ValueError("num_timesteps must be a positive integer")

    scripted_policy = policies[policy](env)
    accept_trajectory_key = target
    env_action_dim = env.action_space.shape[0]
    num_success = 0
    num_saved = 0
    num_attempts = 0

    while num_saved < num_trajectories:
        if max_attempts is not None and num_attempts >= max_attempts:
            print(f"giving up after {num_attempts} attempts")
            break

        num_attempts += 1
        num_steps = -1  # index of the first successful step, -1 while not yet successful
        rewards = []
        env.reset()
        scripted_policy.reset()
        for j in range(num_timesteps):
            action, agent_info = scripted_policy.get_action()

            # In case we need to pad actions by 1 for easier realNVP modelling
            #if env_action_dim - action.shape[0] == 1:
            #    action = np.append(action, 0)

            # Work on a copy so the array handed out by the policy is never mutated.
            action = np.array(action)
            action += np.random.normal(scale=noise, size=(env_action_dim,))
            action = np.clip(action, -1 + epsilon, 1 - epsilon)

            observation = env.get_observation_stacked() #env.get_observation()
            next_observation, reward, done, info = env.step(action)
            succeeded = info[accept_trajectory_key]
            if not succeeded:
                # Shaping bonus for steps before success; it grows as j approaches the horizon.
                reward += 0.99**(num_timesteps-j)/10
            rewards.append(reward)
            model.replay_buffer.add(observation, next_observation, action, reward, done, [{}])

            if succeeded and num_steps < 0:
                num_steps = j

            # After success keep stepping until step 21 so the buffer also
            # contains post-success transitions, then stop early.
            if succeeded and j > 20:
                break
            if done or agent_info['done']:
                break

        # Success is judged on the info of the last executed step.
        if info[accept_trajectory_key]:
            print("num_timesteps: ", num_steps)
            num_success += 1
            num_saved += 1
        print(f"num_trajectories: {num_saved} success rate: {num_success/num_attempts} Reward: {sum(rewards)}")

    # Guard against zero attempts (e.g. num_trajectories == 0).
    success_rate = num_success / num_attempts if num_attempts else 0.0
    print("success rate: {}".format(success_rate))

pybullet build time: Oct 14 2023 15:44:17


In [2]:
import gymnasium as gym
import numpy as np

from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise


# True: record scripted-expert rollouts to disk; False: train from the saved buffers.
COLLECT = False
#env = gym.make("Pendulum-v1", render_mode="rgb_array")
env = roboverse.make("Widow250PickPlace-v1", gui=False, transpose_image=False)
obs = env.reset()

# The noise objects for TD3
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = TD3(
    "MultiInputPolicy",
    env,
    buffer_size=10000,
    action_noise=action_noise,  # noise Required for deterministic policy
    tensorboard_log="data/td3",
    verbose=1,
    learning_starts=0,
)

if COLLECT:
    # The buffer is not cleared between rounds, so the second file also holds
    # whatever is still in the buffer from the first round.
    for i in range(2):
        collect_data(env, model, "grasp", "grasp_success_target", 250, 30)
        model.save_replay_buffer(f"data/td3_expert_grasp{i+1}")
else:
    # load_replay_buffer() replaces model.replay_buffer with the pickled one,
    # so no reset is needed before it and only one file is active at a time.
    model.load_replay_buffer("data/td3_expert_grasp1")
    # Zero-step learn() call; presumably needed to set up the logger/training
    # state that model.train() relies on -- TODO confirm.
    model.learn(total_timesteps=0, log_interval=5, tb_log_name="exp", progress_bar=True)

    print("start pre-training from buffer only")
    for buffer_idx in (1, 2):
        model.load_replay_buffer(f"data/td3_expert_grasp{buffer_idx}")
        model.train(gradient_steps=2500, batch_size=256)

    print("start learning")
    for i in range(20):
        # Re-seed the buffer with expert data at the start of every round.
        # Only grasp2 is loaded: loading grasp1 first would be discarded
        # immediately, because each load replaces the buffer.
        model.load_replay_buffer("data/td3_expert_grasp2")
        # reset_num_timesteps=False keeps one continuous timestep counter and
        # TensorBoard run instead of starting a fresh run on every round.
        model.learn(total_timesteps=2005, log_interval=5, tb_log_name="exp",
                    reset_num_timesteps=False, progress_bar=True)

    print("finish learning")
    model.save("data/td3_1")

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to data/td3/exp_3


Output()

start pre-training from buffer only
start learning
Logging to data/td3/exp_4


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 5        |
|    fps             | 31       |
|    time_elapsed    | 16       |
|    total_timesteps | 505      |
| train/             |          |
|    actor_loss      | 75.6     |
|    critic_loss     | 5.34e+03 |
|    learning_rate   | 0.001    |
|    n_updates       | 5404     |
---------------------------------


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 10       |
|    fps             | 30       |
|    time_elapsed    | 32       |
|    total_timesteps | 1010     |
| train/             |          |
|    actor_loss      | 98.6     |
|    critic_loss     | 6.87e+03 |
|    learning_rate   | 0.001    |
|    n_updates       | 5909     |
---------------------------------


: 

In [3]:
# start env with gui
env.close()
env = roboverse.make("Widow250PickPlace-v1", gui=True, transpose_image=False)
obs = env.reset()
model.set_env(env)
vec_env = model.get_env()

#del model # remove to demonstrate saving and loading
#model = TD3.load("data/td3_1")

# Upper bound on rendered steps so the cell terminates on its own and the
# notebook can be executed top to bottom; raise it to watch for longer.
MAX_RENDER_STEPS = 1000

obs = vec_env.reset()
print("start render")
for _ in range(MAX_RENDER_STEPS):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=Microsoft Corporation
GL_RENDERER=D3D12 (Intel(R) UHD Graphics 630)
GL_VERSION=4.1 (Core Profile) Mesa 23.2.1-1ubuntu3.1~22.04.2
GL_SHADING_LANGUAGE_VERSION=4.10
pthread_getconcurrency()=0
Version = 4.1 (Core Profile) Mesa 23.2.1-1ubuntu3.1~22.04.2
Vendor = Microsoft Corporation
Renderer = D3D12 (Intel(R) UHD Graphics 630)
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
ven = Microsoft Corporation
ven = Microsoft Corporation


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
start render




In [4]:
# Roll out the scripted "grasp" policy until 50 episodes report
# grasp_success_target; the transitions go into model.replay_buffer.
collect_data(
    env,
    model,
    policy="grasp",
    target="grasp_success_target",
    num_trajectories=50,
    num_timesteps=30,
)
#model.save_replay_buffer(f"data/td3_expert_grasp_test")

num_timesteps:  16
num_trajectories: 1 success rate: 1.0 Reward: 5.5452921640887824
num_timesteps:  17
num_trajectories: 2 success rate: 1.0 Reward: 6.364424431145807
num_timesteps:  13
num_trajectories: 3 success rate: 1.0 Reward: 10.0221039179569
num_timesteps:  15
num_trajectories: 4 success rate: 1.0 Reward: 8.19154401440478
num_timesteps:  16
num_trajectories: 5 success rate: 1.0 Reward: 7.277549849868909
num_timesteps:  17
num_trajectories: 6 success rate: 1.0 Reward: 6.364424431145807
num_timesteps:  15
num_trajectories: 7 success rate: 1.0 Reward: 8.19154401440478
num_timesteps:  15
num_trajectories: 8 success rate: 1.0 Reward: 8.19154401440478
num_timesteps:  15
num_trajectories: 9 success rate: 1.0 Reward: 5.454809185153187
num_timesteps:  15
num_trajectories: 10 success rate: 1.0 Reward: 8.19154401440478
num_timesteps:  16
num_trajectories: 11 success rate: 1.0 Reward: 7.277549849868909
num_timesteps:  14
num_trajectories: 12 success rate: 1.0 Reward: 9.106398237295291
num_t