# Solving the Pick-and-Place Environment in Robosuite

## Abstract

## Introduction




## Environment

Blah Blah about robosuite and Stable Baselines


## Implementation

1. Import numpy and robosuite



In [9]:
import yaml
import numpy as np
import torch
import os
import robosuite as suite

from gym.spaces import Box

from robosuite import load_controller_config
from robosuite.environments.base import register_env
from robosuite.controllers import load_controller_config, ALL_CONTROLLERS
from robosuite.wrappers import GymWrapper
from stable_baselines3 import PPO, DDPG

import rlkit.torch.pytorch_util as ptu
from rlkit.launchers.launcher_util import setup_logger
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.torch.sac.sac import SACTrainer
from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic
from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
from rlkit_custom import CustomTorchBatchRLAlgorithm

config = {}
print('Load configuration file config.yaml')
with open("config.yaml") as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as e:
        print(e)

np.random.seed(config["seed"])
torch.manual_seed(config["seed"])

THIS_DIR = os.getcwd()

print("Running environment with training =", str(config["training"]), " and simulation =", str(config["simulation"]))

Load configuration file config.yaml
Running environment with training = True  and simulation = True


2. Create Environment

In [10]:
# create environment instance

trainer_kwargs = dict(
    discount=0.99,
    soft_target_tau=5e-3,
    target_update_period=1,
    policy_lr=3e-4,
    qf_lr=3e-4,
    reward_scale=1,
    use_automatic_entropy_tuning=(not True),
)

expl_env_kwargs = dict(
    env_name="PickPlace",
    robots=config["robot_name"],
    controller=config["robot_controller"],
    horizon=500,
    control_freq=20,
    reward_scale=1,
    hard_reset=True,
    ignore_done=True,
)

eval_env_kwargs = dict(
    env_name="PickPlace",
    robots=config["robot_name"],
    controller=config["robot_controller"],
    horizon=500,
    control_freq=20,
    reward_scale=1,
    hard_reset=True,
    ignore_done=True,
)

variant = dict(
    algorithm="SAC",
    seed=config["seed"],
    version="normal",
    replay_buffer_size=int(1E6),
    qf_kwargs=dict(
        hidden_sizes=[256,256],
    ),
    policy_kwargs=dict(
        hidden_sizes=[256,256],
    ),
    algorithm_kwargs=dict(
        num_epochs=2000, # args.n_epochs
        num_eval_steps_per_epoch=500 * 10, #args.eval_horizon * args.num_eval
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop= 500 * 10,   # args.expl_horizon * args.expl_ep_per_train_loop
        min_num_steps_before_training=1000,
        expl_max_path_length=500, #args.expl_horizon
        eval_max_path_length=500, #args.eval_horizon
        batch_size=256,
    ),
    trainer_kwargs=trainer_kwargs,
    expl_environment_kwargs=expl_env_kwargs,
    eval_environment_kwargs=eval_env_kwargs,
)

ptu.set_gpu_mode(torch.cuda.is_available())

tmp_file_prefix = "{}_{}_{}_SEED{}".format("PickPlace", "".join(config["robot_name"]), config["robot_controller"], config["seed"])

abs_root_dir = os.path.join(THIS_DIR, "logs")
tmp_dir = setup_logger(tmp_file_prefix, variant=variant, base_log_dir=abs_root_dir)


2024-07-18 14:46:52.952848 CEST | [PickPlace_IIWA_OSC_POSE_SEED554641_2024_07_18_14_25_16_0000--s-0] [PickPlace_IIWA_OSC_POSE_SEED554641_2024_07_18_14_27_14_0000--s-0] Variant:
2024-07-18 14:46:52.953593 CEST | [PickPlace_IIWA_OSC_POSE_SEED554641_2024_07_18_14_25_16_0000--s-0] [PickPlace_IIWA_OSC_POSE_SEED554641_2024_07_18_14_27_14_0000--s-0] {
  "algorithm": "SAC",
  "seed": 554641,
  "version": "normal",
  "replay_buffer_size": 1000000,
  "qf_kwargs": {
    "hidden_sizes": [
      256,
      256
    ]
  },
  "policy_kwargs": {
    "hidden_sizes": [
      256,
      256
    ]
  },
  "algorithm_kwargs": {
    "num_epochs": 2000,
    "num_eval_steps_per_epoch": 5000,
    "num_trains_per_train_loop": 1000,
    "num_expl_steps_per_train_loop": 5000,
    "min_num_steps_before_training": 1000,
    "expl_max_path_length": 500,
    "eval_max_path_length": 500,
    "batch_size": 256
  },
  "trainer_kwargs": {
    "discount": 0.99,
    "soft_target_tau": 0.005,
    "target_update_period": 1,
  

3. Create Reinforcement Learning Model

In [11]:
if config["training"]:
    
    # Get environment configs for expl and eval envs and create the appropriate envs
    # suites[0] is expl and suites[1] is eval
    suites = []
    for env_config in (variant["expl_environment_kwargs"], variant["eval_environment_kwargs"]):
        # Load controller
        controller = env_config.pop("controller")
        controller_config = load_controller_config(default_controller=controller)
        # Create robosuite env and append to our list
        suites.append(suite.make(**env_config,
                                 has_renderer=False,
                                 has_offscreen_renderer=False,
                                 use_object_obs=True,
                                 use_camera_obs=False,
                                 reward_shaping=True,
                                 controller_configs=controller_config,
                                 ))
    # Create gym-compatible envs
    expl_env = NormalizedBoxEnv(GymWrapper(suites[0]))
    eval_env = NormalizedBoxEnv(GymWrapper(suites[1]))

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )

    # Instantiate trainer with appropriate agent
    expl_policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    eval_policy = MakeDeterministic(expl_policy)
    trainer = SACTrainer(
        env=eval_env,
        policy=expl_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    # Adjust observation space if needed
    obs_space = expl_env.observation_space
    if isinstance(obs_space, Box):
        low = np.where(obs_space.low == -np.inf, -1e10, obs_space.low)
        high = np.where(obs_space.high == np.inf, 1e10, obs_space.high)
        expl_env.observation_space = Box(low, high, dtype=obs_space.dtype)

    # Adjust action space if needed
    action_space = expl_env.action_space
    if isinstance(action_space, Box):
        low = np.where(action_space.low == -np.inf, -1e10, action_space.low)
        high = np.where(action_space.high == np.inf, 1e10, action_space.high)
        expl_env.action_space = Box(low, high, dtype=action_space.dtype)


    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    # Define algorithm
    algorithm = CustomTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


TypeError: Unknown space: Box(-inf, inf, (96,), float32)

3. Train Model

Starting learning iteration 0


4. Apply Model