# Solving the Pick-and-Place Environment in Robosuite

## Abstract

## Introduction




## Environment

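The simulation is provided by robosuite, a MuJoCo-based robot-learning framework; this notebook uses its "PickPlace" task. The reinforcement-learning algorithms (PPO, DDPG and SAC) come from Stable Baselines3, which expects Gym-style environments, so the robosuite environment is wrapped with robosuite's `GymWrapper` before training.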


## Implementation

1. Import the required dependencies (numpy, os, yaml, torch, robosuite, Stable Baselines3), select the compute device, and load the run configuration from `config.yaml`



In [None]:
import numpy as np
import os
import yaml
import robosuite as suite
import torch

from stable_baselines3.common.save_util import save_to_zip_file, load_from_zip_file
from robosuite.controllers import load_controller_config
from stable_baselines3 import PPO, DDPG, SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, SubprocVecEnv
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import CallbackList, EvalCallback, CheckpointCallback
from robosuite.wrappers import GymWrapper

# Select the PyTorch device: use the MPS backend when this PyTorch build
# reports it as available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS backend is available.")
else:
    device = torch.device("cpu")
    print("MPS backend is not available, using CPU.")

# Load the run configuration. Every later cell indexes `config` directly, so a
# file that cannot be parsed must stop the notebook here with the parser's
# message instead of surfacing later as an unrelated KeyError.
print('Load configuration file config.yaml')
with open("config.yaml") as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as e:
        raise ValueError(f"Could not parse config.yaml: {e}") from e

# An empty file parses to None, and a top-level list or scalar cannot be
# indexed by setting name.
if not isinstance(config, dict):
    raise ValueError("config.yaml must contain a top-level mapping of settings.")

print("Running environment with training =", str(config["training"]), " and simulation =", str(config["simulation"]))

2. Create a standardized (Gym-compatible) environment

In [None]:
# Stable Baselines only supports gym environments, so each robosuite
# environment is turned into one through GymWrapper.
def make_env(env_id, options, rank, seed=0):
    """
    Build a zero-argument factory that creates one monitored environment.

    The global random seed is set once, when the factory is built; the
    environment itself is created, wrapped and reset only when the returned
    function is called.

    :param env_id: (str) the environment ID
    :param options: (dict) additional arguments to pass to the specific environment class initializer
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (callable) function returning the wrapped, already reset environment
    """
    set_random_seed(seed)

    def _init():
        gym_env = GymWrapper(suite.make(env_id, **options))
        # Set on the GymWrapper instance itself, before Monitor wraps it
        gym_env.render_mode = 'mujoco'
        monitored_env = Monitor(gym_env)
        # Offset the seed by the rank so each environment gets its own seed
        monitored_env.reset(seed=seed + rank)
        return monitored_env

    return _init

# Load the default configuration of the controller named in config.yaml
controller_config = load_controller_config(
    default_controller=config["robot_controller"]
)

# Disabled: modify the controller configuration to include variable impedance
# controller_config["impedance_mode"] = "variable"
# controller_config["kp_limits"] = [0, 300]
# controller_config["damping_ratio_limits"] = [0, 10]

# Keyword arguments passed to suite.make() for the "PickPlace" environment
env_options = {
    # Robot and controller
    "robots": config["robot_name"],
    "controller_configs": controller_config,
    # Rendering and observation flags
    "has_renderer": True,
    "has_offscreen_renderer": True,
    "use_camera_obs": False,
    # Reward and timing settings
    "reward_shaping": True,
    "horizon": config["horizon"],
    "control_freq": config["control_freq"],
}

3. Build model and environment

In [None]:
# Build the vectorized training environment (plus an evaluation environment
# when training) and either create a new model or load the saved one, so that
# training can continue where it left off.
log_dir = "./logs/"

# Set up TensorBoard logging: the directory path is passed to the model through
# its `tensorboard_log` argument.
#tensor_logger = configure('./logs/', ["tensorboard"])
tensor_logger = log_dir + "tensorboard"
print("TensorBoard logging to", tensor_logger)

# Disabled: resume from the newest periodic checkpoint found in log_dir.
# def find_latest_checkpoint(log_dir, file_type=".zip", object_type=None):
#     files = [f for f in os.listdir(log_dir) if f.startswith(f"{config['algorithm']}_model_") and (object_type is None or object_type in f) and f.endswith(file_type)]
#     files.sort()
#     if len(files) == 0:
#         return None
#     latest_checkpoint = files[-1]
#     return os.path.join(log_dir, latest_checkpoint)

# Resolve the algorithm class first so that an unknown name fails immediately,
# before any environment (or subprocess) is created.
algorithm_classes = {"PPO": PPO, "DDPG": DDPG, "SAC": SAC}
if config["algorithm"] not in algorithm_classes:
    raise ValueError("Invalid algorithm specified in the configuration.")
algorithm_class = algorithm_classes[config["algorithm"]]

# Files written by the training cell after every learning iteration
model_latest_checkpoint = os.path.join(".", config["model_file_name"] + '.zip')
env_last_checkpoint = os.path.join(".", 'vec_normalize_' + config["model_file_name"] + '.pkl')

# A new model is created only when training is requested and no model has been saved yet
#create_new_model = config["training"] and find_latest_checkpoint(log_dir) is None
create_new_model = config["training"] and not os.path.isfile(config["model_file_name"] + ".zip")


def _build_vec_env():
    """Create one vectorized "PickPlace" environment with config["num_envs"] copies."""
    env_fns = [make_env("PickPlace", env_options, i, config["seed"]) for i in range(config["num_envs"])]
    if config["multiprocessing"]:
        # One subprocess per environment copy
        return SubprocVecEnv(env_fns, start_method='spawn')
    return DummyVecEnv(env_fns)


#Initialize environment
env = _build_vec_env()
if config["normalize"]:
    if create_new_model:
        # Wraps the environment to normalize observations and rewards and supports moving average
        env = VecNormalize(env)
    else:
        # VecNormalize.load is a static method that returns a NEW wrapper around
        # the given venv; its result must be kept, otherwise the saved running
        # statistics are silently discarded.
        env = VecNormalize.load(env_last_checkpoint, env)
    if not config["training"]:
        # Evaluation only: freeze the running statistics and report raw rewards
        env.training = False
        env.norm_reward = False

if config["training"]:
    eval_env = _build_vec_env()
    if config["normalize"]:
        # Wrap the evaluation environment the same way as the training
        # environment, but never update statistics from evaluation episodes
        # and keep the rewards unnormalized so evaluation scores are real.
        eval_env = VecNormalize(eval_env, training=False, norm_reward=False)

#Initialize model:
if create_new_model:
    print("No model found, creating a new one")

    model_kwargs = {
        "verbose": 1,
        "batch_size": config["batch_size"],
        "tensorboard_log": tensor_logger,
        "device": device,
    }
    if config["algorithm"] == "DDPG":
        # DDPG is created with Gaussian action noise (sigma 0.1 per action dimension)
        n_actions = env.action_space.shape[-1]
        model_kwargs["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = algorithm_class(config["policy"], env, **model_kwargs)
    print("Created a new model")
else:
    print("Loading existing model")
    # Disabled: load the latest periodic checkpoint instead of the fixed file names.
    # model_latest_checkpoint = find_latest_checkpoint(log_dir)
    # print("Loading model from", model_latest_checkpoint)
    # env_last_checkpoint = find_latest_checkpoint(log_dir, object_type="_vecnormalize", file_type=".pkl")
    # replay_latest_checkpoint = find_latest_checkpoint(log_dir, object_type="_replay_buffer", file_type=".pkl")
    model = algorithm_class.load(model_latest_checkpoint, env=env, tensorboard_log=tensor_logger, device=device)
    # Disabled: restore the replay buffer saved next to the checkpoint.
    # if replay_latest_checkpoint is not None:
    #     model.load_replay_buffer(replay_latest_checkpoint)

4. Train model

In [None]:
# Train the model in repeated learn() calls. After every call the model (and,
# when normalization is enabled, the normalization statistics) is saved, so an
# interrupted run loses at most one iteration. How much is trained comes from
# config.yaml ("training_total_timesteps", "training_repetitions").
if config["training"]:
    # Disabled: periodic checkpoints that tolerate serialization errors.
    # class CustomCheckpointCallback(CheckpointCallback):
    #     def __init__(self, *args, **kwargs):
    #         super(CustomCheckpointCallback, self).__init__(*args, **kwargs)
    #
    #     def _on_step(self) -> bool:
    #         try:
    #             return super(CustomCheckpointCallback, self)._on_step()
    #         except TypeError as e:
    #             print(f"Serialization error encountered: {e}")
    #             return True  # Continue training despite the error
    #
    # checkpoint_callback = CustomCheckpointCallback(
    #     save_freq=2048,
    #     save_path="./logs/",
    #     name_prefix=f"{config['algorithm']}_model",
    #     save_replay_buffer=True,
    #     save_vecnormalize=True,
    #     verbose=0,
    #     )

    # Save during training the best model based on the evaluation score
    eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
                                 log_path="./logs/", eval_freq=2048,
                                 deterministic=True, render=False,
                                 verbose=0)

    # Create the callback list
    #callback = CallbackList([checkpoint_callback, eval_callback])
    callback = eval_callback

    # Define the model with the TensorBoard logger
    #model.set_logger(tensor_logger)

    # Define the number of training repetitions
    training_repetitions = config["training_repetitions"]
    total_timesteps = config["training_total_timesteps"]

    if config["algorithm"] == "PPO":
        # One learn() call below collects one rollout: n_steps from every environment
        n_steps_per_iteration = model.n_steps
        n_steps_per_iteration_all_envs = n_steps_per_iteration * config["num_envs"]

        # Each iteration consumes n_steps * num_envs timesteps, so the budget
        # must be divided by that amount; dividing by n_steps alone trains
        # num_envs times longer than requested.
        num_iterations = total_timesteps // n_steps_per_iteration_all_envs
        print(f"Total timesteps: {total_timesteps}, Steps per iteration: {n_steps_per_iteration_all_envs}, Number of iterations: {num_iterations}")

        for i in range(num_iterations):  # Number of learning iterations
            print(f"Starting learning iteration {i}")
            # Only the first call starts a fresh timestep counter and
            # TensorBoard run; later calls continue it instead of opening a
            # new run per iteration.
            #model.learn(total_timesteps=n_steps_per_iteration_all_envs, callback=callback, progress_bar=True)
            model.learn(total_timesteps=n_steps_per_iteration_all_envs, tb_log_name="H_3000_CF_100", callback=callback, reset_num_timesteps=(i == 0))
            model.save(config["model_file_name"] + '.zip')
            if config["normalize"]:
                env.save('vec_normalize_' + config["model_file_name"] +'.pkl')
            print(f"Completed learning iteration {i} successfully")

    else:
        for i in range(training_repetitions):  # Number of learning iterations
            print(f"Starting learning iteration {i}")
            # Continue the timestep counter across repetitions (see PPO branch)
            model.learn(total_timesteps=total_timesteps, callback=callback, progress_bar=True, reset_num_timesteps=(i == 0))
            model.save(config["model_file_name"] + '.zip')
            if config["normalize"]:
                env.save('vec_normalize_' + config["model_file_name"] +'.pkl')
            print(f"Completed learning iteration {i} successfully")

#Load the best model based on the evaluation score (total_reward):
# The file must be loaded with the class of the configured algorithm; loading
# it with a different algorithm class is not valid.
best_model_path = './logs/best_model' + '.zip'
best_model_class = {"PPO": PPO, "DDPG": DDPG, "SAC": SAC}.get(config["algorithm"])
if best_model_class is None:
    raise ValueError("Invalid algorithm specified in the configuration.")
if os.path.isfile(best_model_path):
    model = best_model_class.load(best_model_path, env=env, device=device)
else:
    # No evaluation has stored a best model yet: keep the model built earlier
    print("No best model found at", best_model_path, "- keeping the current model")

5. Evaluate model

In [None]:
# Evaluate the model by simulating and visualizing its actions in the renderer.
# Each repetition resets all environments, runs the deterministic policy until
# every environment reports `done` (or the step cap is reached) and records the
# per-environment sum of rewards.
episode_rewards = []
evaluation_repetitions = config["evaluation_repetitions"]
max_steps_per_episode = 2048  # upper bound on steps simulated per repetition

try:
    for i_episode in range(evaluation_repetitions):
        obs = env.reset()
        total_reward = 0  # becomes an array with one entry per environment
        for t in range(max_steps_per_episode):
            env.render()
            #print(obs)
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done.all():
                print("Episode finished after {} timesteps".format(t+1))
                break
        episode_rewards.append(total_reward)

    if episode_rewards:
        # Average over repetitions first (one value per environment), then over environments
        average_reward_per_environment = sum(episode_rewards) / len(episode_rewards)
        average_reward = np.mean(average_reward_per_environment)
        print(f"Episodes {len(episode_rewards)}/{evaluation_repetitions}, Average Reward per Environment: {average_reward_per_environment}, Average Reward: {average_reward}")
    else:
        # Nothing to average when no repetition was requested
        print("No evaluation episodes were run.")
finally:
    #Close environment
    # Always release the environments (and their subprocesses when
    # multiprocessing is used), even if the evaluation fails part-way.
    env.close()
    if config["training"]:
        # The evaluation environment only exists when training was enabled
        eval_env.close()