In [1]:
!pip install robosuite
!pip install tianshou

Collecting robosuite
  Downloading robosuite-1.4.0-py3-none-any.whl (193.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.5/193.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting mujoco>=2.3.0 (from robosuite)
  Downloading mujoco-2.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.0->robosuite)
  Downloading glfw-2.6.2-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.2/208.2 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco, robosuite
Successfully installed glfw-2.6.2 mujoco-2.3.7 robosuite-1.4.0
Collecting tianshou
  Downloading tianshou-0.5.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
from collections import OrderedDict
import numpy as np

import torch
from torch import nn
from torch.nn.utils import clip_grad_norm_
import torch.optim as optim
import torch.nn.functional as F

from typing import Any, List, Optional, Tuple, Union
import copy
import matplotlib.pyplot as plt

from robosuite.environments.manipulation.single_arm_env import SingleArmEnv
from robosuite.models.arenas import TableArena
from robosuite.models.objects import BoxObject
from robosuite.models.tasks import ManipulationTask
from robosuite.utils.mjcf_utils import CustomMaterial
from robosuite.utils.observables import Observable, sensor
from robosuite.utils.placement_samplers import UniformRandomSampler
from robosuite.utils.transform_utils import convert_quat

from tianshou.data import Batch, ReplayBuffer, SegmentTree, to_numpy

import imageio
from base64 import b64encode
from IPython.display import HTML



In [3]:
class Stack(SingleArmEnv):
    """
    This class corresponds to the stacking task for a single robot arm.

    Args:
        robots (str or list of str): Specification for specific robot arm(s) to be instantiated within this env
            (e.g: "Sawyer" would generate one arm; ["Panda", "Panda", "Sawyer"] would generate three robot arms)
            Note: Must be a single single-arm robot!

        env_configuration (str): Specifies how to position the robots within the environment (default is "default").
            For most single arm environments, this argument has no impact on the robot setup.

        controller_configs (str or list of dict): If set, contains relevant controller parameters for creating a
            custom controller. Else, uses the default controller for this specific task. Should either be single
            dict if same controller is to be used for all robots or else it should be a list of the same length as
            "robots" param

        gripper_types (str or list of str): type of gripper, used to instantiate
            gripper models from gripper factory. Default is "default", which is the default grippers(s) associated
            with the robot(s) the 'robots' specification. None removes the gripper, and any other (valid) model
            overrides the default gripper. Should either be single str if same gripper type is to be used for all
            robots or else it should be a list of the same length as "robots" param

        initialization_noise (dict or list of dict): Dict containing the initialization noise parameters.
            The expected keys and corresponding value types are specified below:

            :`'magnitude'`: The scale factor of uni-variate random noise applied to each of a robot's given initial
                joint positions. Setting this value to `None` or 0.0 results in no noise being applied.
                If "gaussian" type of noise is applied then this magnitude scales the standard deviation applied,
                If "uniform" type of noise is applied then this magnitude sets the bounds of the sampling range
            :`'type'`: Type of noise to apply. Can either specify "gaussian" or "uniform"

            Should either be single dict if same noise value is to be used for all robots or else it should be a
            list of the same length as "robots" param

            :Note: Specifying "default" will automatically use the default noise settings.
                Specifying None will automatically create the required dict with "magnitude" set to 0.0.

        table_full_size (3-tuple): x, y, and z dimensions of the table.

        table_friction (3-tuple): the three mujoco friction parameters for
            the table.

        use_camera_obs (bool): if True, every observation includes rendered image(s)

        use_object_obs (bool): if True, include object (cube) information in
            the observation.

        reward_scale (None or float): Scales the normalized reward function by the amount specified.
            If None, environment reward remains unnormalized

        reward_shaping (bool): if True, use dense rewards.

        placement_initializer (ObjectPositionSampler): if provided, will
            be used to place objects on every reset, else a UniformRandomSampler
            is used by default.

        has_renderer (bool): If true, render the simulation state in
            a viewer instead of headless mode.

        has_offscreen_renderer (bool): True if using off-screen rendering

        render_camera (str): Name of camera to render if `has_renderer` is True. Setting this value to 'None'
            will result in the default angle being applied, which is useful as it can be dragged / panned by
            the user using the mouse

        render_collision_mesh (bool): True if rendering collision meshes in camera. False otherwise.

        render_visual_mesh (bool): True if rendering visual meshes in camera. False otherwise.

        render_gpu_device_id (int): corresponds to the GPU device id to use for offscreen rendering.
            Defaults to -1, in which case the device will be inferred from environment variables
            (GPUS or CUDA_VISIBLE_DEVICES).

        control_freq (float): how many control signals to receive in every second. This sets the amount of
            simulation time that passes between every action input.

        horizon (int): Every episode lasts for exactly @horizon timesteps.

        ignore_done (bool): True if never terminating the environment (ignore @horizon).

        hard_reset (bool): If True, re-loads model, sim, and render object upon a reset call, else,
            only calls sim.reset and resets all robosuite-internal variables

        camera_names (str or list of str): name of camera to be rendered. Should either be single str if
            same name is to be used for all cameras' rendering or else it should be a list of cameras to render.

            :Note: At least one camera must be specified if @use_camera_obs is True.

            :Note: To render all robots' cameras of a certain type (e.g.: "robotview" or "eye_in_hand"), use the
                convention "all-{name}" (e.g.: "all-robotview") to automatically render all camera images from each
                robot's camera list).

        camera_heights (int or list of int): height of camera frame. Should either be single int if
            same height is to be used for all cameras' frames or else it should be a list of the same length as
            "camera names" param.

        camera_widths (int or list of int): width of camera frame. Should either be single int if
            same width is to be used for all cameras' frames or else it should be a list of the same length as
            "camera names" param.

        camera_depths (bool or list of bool): True if rendering RGB-D, and RGB otherwise. Should either be single
            bool if same depth setting is to be used for all cameras or else it should be a list of the same length as
            "camera names" param.

        camera_segmentations (None or str or list of str or list of list of str): Camera segmentation(s) to use
            for each camera. Valid options are:

                `None`: no segmentation sensor used
                `'instance'`: segmentation at the class-instance level
                `'class'`: segmentation at the class level
                `'element'`: segmentation at the per-geom level

            If not None, multiple types of segmentations can be specified. A [list of str / str or None] specifies
            [multiple / a single] segmentation(s) to use for all cameras. A list of list of str specifies per-camera
            segmentation setting(s) to use.

    Raises:
        AssertionError: [Invalid number of robots specified]
    """

    def __init__(
        self,
        robots,
        env_configuration="default",
        controller_configs=None,
        gripper_types="default",
        initialization_noise="default",
        table_full_size=(0.8, 0.8, 0.05),
        table_friction=(1.0, 5e-3, 1e-4),
        use_camera_obs=True,
        use_object_obs=True,
        reward_scale=1.0,
        reward_shaping=True,
        placement_initializer=None,
        has_renderer=False,
        has_offscreen_renderer=True,
        render_camera="frontview",
        render_collision_mesh=False,
        render_visual_mesh=True,
        render_gpu_device_id=-1,
        control_freq=20,
        horizon=1000,
        ignore_done=False,
        hard_reset=True,
        camera_names="frontview",
        camera_heights=256,
        camera_widths=256,
        camera_depths=False,
        camera_segmentations=None,  # {None, instance, class, element}
        renderer="mujoco",
        renderer_config=None,
    ):
        """
        Stores the task-specific settings on the instance and then delegates the
        remaining arguments to the parent environment constructor.

        See the class docstring for the meaning of each argument.
        """
        # settings for table top
        self.table_full_size = table_full_size
        self.table_friction = table_friction
        self.table_offset = np.array((0, 0, 0.8))

        # reward configuration
        self.reward_scale = reward_scale
        self.reward_shaping = reward_shaping

        # whether to use ground-truth object states
        self.use_object_obs = use_object_obs

        # object placement initializer
        self.placement_initializer = placement_initializer

        # Fixed pose / joint configuration read by _reset_internal.
        # NOTE(review): the parent constructor may overwrite `deterministic_reset`;
        # confirm its value after construction if deterministic resets are intended.
        self.deterministic_reset = True
        self.obj_pos = np.array([0, 0, 0])
        # A free-joint qpos is 3 position values followed by a 4-component quaternion,
        # so the stored orientation must have four entries. The identity rotation is
        # written scalar-first here — presumably the order the simulator expects; verify.
        self.obj_quat = np.array([1.0, 0.0, 0.0, 0.0])
        self.joint_pos = np.array([0.0, 0.5, 0.0, -1.3, 0.0, 1.0, 0.785])

        super().__init__(
            robots=robots,
            env_configuration=env_configuration,
            controller_configs=controller_configs,
            mount_types="default",
            gripper_types=gripper_types,
            initialization_noise=initialization_noise,
            use_camera_obs=use_camera_obs,
            has_renderer=has_renderer,
            has_offscreen_renderer=has_offscreen_renderer,
            render_camera=render_camera,
            render_collision_mesh=render_collision_mesh,
            render_visual_mesh=render_visual_mesh,
            render_gpu_device_id=render_gpu_device_id,
            control_freq=control_freq,
            horizon=horizon,
            ignore_done=ignore_done,
            hard_reset=hard_reset,
            camera_names=camera_names,
            camera_heights=camera_heights,
            camera_widths=camera_widths,
            camera_depths=camera_depths,
            camera_segmentations=camera_segmentations,
            renderer=renderer,
            renderer_config=renderer_config,
        )

    def reward(self, action):
        """
        Computes the scalar task reward from the staged reward components.

        With reward shaping enabled the result is the largest of the three
        staged components returned by ``staged_rewards`` (reach/grasp,
        lift/align, stack). Without shaping it is 2.0 when the stacking
        component is positive and 0.0 otherwise.

        When ``reward_scale`` is not None the value is multiplied by
        ``reward_scale / 2.0``, so a successful stack (2.0) maps to
        ``reward_scale``.

        Args:
            action (np array): [NOT USED]

        Returns:
            float: reward value
        """
        reach_term, lift_term, stack_term = self.staged_rewards()

        if not self.reward_shaping:
            # sparse setting: only a completed stack pays out
            total = 2.0 if stack_term > 0 else 0.0
        else:
            total = max(reach_term, lift_term, stack_term)

        if self.reward_scale is None:
            return total
        return total * (self.reward_scale / 2.0)

    def staged_rewards(self):
        """
        Evaluates the three staged reward components from the current simulation state.

        Returns:
            3-tuple:

                - (float): reach term in [0, 0.25], plus 0.25 while cubeA is grasped
                - (float): 1.0 once cubeA is lifted plus an alignment term in [0, 0.5], else 0.0
                - (float): 2.0 if cubeA is lifted, released and touching cubeB, else 0
        """
        body_positions = self.sim.data.body_xpos
        pos_a = body_positions[self.cubeA_body_id]
        pos_b = body_positions[self.cubeB_body_id]
        eef_pos = self.sim.data.site_xpos[self.robots[0].eef_site_id]

        # reach: decays with the gripper-site distance to the centre of cubeA
        reach_gap = np.linalg.norm(eef_pos - pos_a)
        reach_grasp = (1 - np.tanh(10.0 * reach_gap)) * 0.25

        # grasp bonus
        holding_a = self._check_grasp(gripper=self.robots[0].gripper, object_geoms=self.cubeA)
        if holding_a:
            reach_grasp += 0.25

        # lift: cubeA must clear the table top by a 4 cm margin
        is_lifted = pos_a[2] > self.table_offset[2] + 0.04
        if is_lifted:
            # align: reward shrinking the horizontal offset between the two cubes
            planar_gap = np.linalg.norm(np.array(pos_a[:2]) - np.array(pos_b[:2]))
            lift_align = 1.0 + 0.5 * (1 - np.tanh(planar_gap))
        else:
            lift_align = 0.0

        # stack: lifted, no longer held, and in contact with cubeB
        touching = self.check_contact(self.cubeA, self.cubeB)
        is_stacked = (not holding_a) and lift_align > 0 and touching
        stack = 2.0 if is_stacked else 0

        return reach_grasp, lift_align, stack

    def _load_model(self):
        """
        Assembles the task model (table arena, robot, two cubes) and stores it in self.model.

        Also prepares ``self.placement_initializer``: a caller-supplied sampler is
        reset and given the cubes; otherwise a UniformRandomSampler is created.
        """
        super()._load_model()

        # Position the robot base using the offset it defines for a table of this length
        base_offset = self.robots[0].robot_model.base_xpos_offset["table"](self.table_full_size[0])
        self.robots[0].robot_model.set_base_xpos(base_offset)

        # Table-top workspace; the arena itself always sits at the zero origin
        arena = TableArena(
            table_full_size=self.table_full_size,
            table_friction=self.table_friction,
            table_offset=self.table_offset,
        )
        arena.set_origin([0, 0, 0])

        # Both wood materials share the same texture / material attributes
        texture_opts = {"type": "cube"}
        material_opts = {"texrepeat": "1 1", "specular": "0.4", "shininess": "0.1"}

        def wood(texture, tex_name, mat_name):
            return CustomMaterial(
                texture=texture,
                tex_name=tex_name,
                mat_name=mat_name,
                tex_attrib=texture_opts,
                mat_attrib=material_opts,
            )

        red_mat = wood("WoodRed", "redwood", "redwood_mat")
        green_mat = wood("WoodGreen", "greenwood", "greenwood_mat")

        # cubeA (red, the smaller one) and cubeB (green); min == max pins each size
        self.cubeA = BoxObject(
            name="cubeA",
            size_min=[0.02, 0.02, 0.02],
            size_max=[0.02, 0.02, 0.02],
            rgba=[1, 0, 0, 1],
            material=red_mat,
        )
        self.cubeB = BoxObject(
            name="cubeB",
            size_min=[0.025, 0.025, 0.025],
            size_max=[0.025, 0.025, 0.025],
            rgba=[0, 1, 0, 1],
            material=green_mat,
        )
        cubes = [self.cubeA, self.cubeB]

        if self.placement_initializer is None:
            # No sampler supplied: sample positions uniformly around the table centre
            self.placement_initializer = UniformRandomSampler(
                name="ObjectSampler",
                mujoco_objects=cubes,
                x_range=[-0.08, 0.08],
                y_range=[-0.08, 0.08],
                rotation=None,
                ensure_object_boundary_in_range=False,
                ensure_valid_placement=True,
                reference_pos=self.table_offset,
                z_offset=0.01,
            )
        else:
            self.placement_initializer.reset()
            self.placement_initializer.add_objects(cubes)

        # The task bundles arena, robot(s) and the objects of interest
        self.model = ManipulationTask(
            mujoco_arena=arena,
            mujoco_robots=[robot.robot_model for robot in self.robots],
            mujoco_objects=cubes,
        )

    def _setup_references(self):
        """
        Caches the simulator body ids of both cubes after the parent has set up
        its own references, so later lookups into the simulation data arrays can
        index by id.
        """
        super()._setup_references()

        # Resolve each cube's root body name to its id in the compiled model
        name_to_id = self.sim.model.body_name2id
        self.cubeA_body_id = name_to_id(self.cubeA.root_body)
        self.cubeB_body_id = name_to_id(self.cubeB.root_body)

    def _reset_internal(self):
        """
        Resets simulation internal configurations.

        After the parent reset, the robot joints are set to ``self.joint_pos``.
        Objects are then placed in one of two ways:

        - ``deterministic_reset`` False: every object gets a pose sampled from
          ``self.placement_initializer``.
        - ``deterministic_reset`` True: cubeA is placed at the stored pose
          (``self.obj_pos`` + ``self.obj_quat``).

        Raises:
            ValueError: in deterministic mode, if the stored pose does not have
                3 position and 4 quaternion components.
        """
        super()._reset_internal()

        self.robots[0].set_robot_joint_positions(self.joint_pos)

        if not self.deterministic_reset:
            # Sample from the placement initializer for all objects
            object_placements = self.placement_initializer.sample()

            # Loop through all objects and reset their positions
            for obj_pos, obj_quat, obj in object_placements.values():
                self.sim.data.set_joint_qpos(obj.joints[0], np.concatenate([np.array(obj_pos), np.array(obj_quat)]))
        else:
            # The original code referenced an undefined `obj` here and raised NameError.
            # NOTE(review): the stored pose is applied to cubeA only (a single pose cannot
            # be shared by both cubes without overlap) — confirm this is the intended target.
            fixed_pos = np.asarray(self.obj_pos, dtype=float).reshape(-1)
            fixed_quat = np.asarray(self.obj_quat, dtype=float).reshape(-1)
            if fixed_pos.size != 3 or fixed_quat.size != 4:
                raise ValueError(
                    "deterministic reset needs obj_pos with 3 values and obj_quat with 4 values, "
                    f"got {fixed_pos.size} and {fixed_quat.size}"
                )
            self.sim.data.set_joint_qpos(self.cubeA.joints[0], np.concatenate([fixed_pos, fixed_quat]))

    def _setup_observables(self):
        """
        Extends the parent's observables with object-state observables when
        ``use_object_obs`` is set: position and quaternion of each cube plus the
        relative vectors gripper->cubeA, gripper->cubeB and cubeA->cubeB.

        Returns:
            OrderedDict: Dictionary mapping observable names to its corresponding Observable object
        """
        observables = super()._setup_observables()
        if not self.use_object_obs:
            return observables

        # Robot prefix (used to find the end-effector entry in the cache) and modality
        pf = self.robots[0].robot_model.naming_prefix
        modality = "object"
        eef_key = f"{pf}eef_pos"

        def offset(cache, target_key, origin_key):
            # Relative vector target - origin; zeros until both entries are cached
            if target_key in cache and origin_key in cache:
                return cache[target_key] - cache[origin_key]
            return np.zeros(3)

        @sensor(modality=modality)
        def cubeA_pos(obs_cache):
            return np.array(self.sim.data.body_xpos[self.cubeA_body_id])

        @sensor(modality=modality)
        def cubeA_quat(obs_cache):
            raw = np.array(self.sim.data.body_xquat[self.cubeA_body_id])
            return convert_quat(raw, to="xyzw")

        @sensor(modality=modality)
        def cubeB_pos(obs_cache):
            return np.array(self.sim.data.body_xpos[self.cubeB_body_id])

        @sensor(modality=modality)
        def cubeB_quat(obs_cache):
            raw = np.array(self.sim.data.body_xquat[self.cubeB_body_id])
            return convert_quat(raw, to="xyzw")

        @sensor(modality=modality)
        def gripper_to_cubeA(obs_cache):
            return offset(obs_cache, "cubeA_pos", eef_key)

        @sensor(modality=modality)
        def gripper_to_cubeB(obs_cache):
            return offset(obs_cache, "cubeB_pos", eef_key)

        @sensor(modality=modality)
        def cubeA_to_cubeB(obs_cache):
            return offset(obs_cache, "cubeB_pos", "cubeA_pos")

        # Register every sensor under its function name, sampled at the control frequency
        for fn in (cubeA_pos, cubeA_quat, cubeB_pos, cubeB_quat, gripper_to_cubeA, gripper_to_cubeB, cubeA_to_cubeB):
            key = fn.__name__
            observables[key] = Observable(
                name=key,
                sensor=fn,
                sampling_rate=self.control_freq,
            )

        return observables

    def _check_success(self):
        """
        Reports task success, defined as a positive stacking component from
        ``staged_rewards``.

        Returns:
            bool: True if blocks are correctly stacked
        """
        stack_term = self.staged_rewards()[2]
        return stack_term > 0

    def visualize(self, vis_settings):
        """
        Runs the parent visualization and, when gripper visualization is enabled,
        additionally visualizes the gripper relative to cubeA.

        Args:
            vis_settings (dict): Visualization keywords mapped to T/F, determining whether that specific
                component should be visualized. Should have "grippers" keyword as well as any other relevant
                options specified.
        """
        super().visualize(vis_settings=vis_settings)

        if not vis_settings["grippers"]:
            return
        # Gripper-to-target visualization with cubeA as the target
        self._visualize_gripper_to_target(gripper=self.robots[0].gripper, target=self.cubeA)




  and should_run_async(code)


In [4]:
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer with prioritized sampling (Prioritized Experience Replay, arXiv:1511.05952).

    Priorities live in a segment tree (``self.weight``); indices are drawn in
    proportion to the stored priority, and each returned batch carries an
    importance-sampling ``weight`` field.

    :param float alpha: the prioritization exponent.
    :param float beta: the importance sample soft coefficient.
    :param bool weight_norm: whether to normalize returned weights with the maximum
        weight value within the batch. Default to True.

    .. seealso::

        Please refer to :class:`~tianshou.data.ReplayBuffer` for other APIs' usage.
    """

    def __init__(
        self,
        size: int,
        alpha: float,
        beta: float,
        weight_norm: bool = True,
        **kwargs: Any
    ) -> None:
        # The base initializer is invoked explicitly instead of through super();
        # the original author notes super() raises KeyError in PrioritizedVectorReplayBuffer.
        ReplayBuffer.__init__(self, size, **kwargs)
        assert alpha > 0.0 and beta >= 0.0
        self._alpha = alpha
        self._beta = beta
        self._max_prio = 1.0
        self._min_prio = 1.0
        # priorities are kept on this object rather than in self._meta
        self.weight = SegmentTree(size)
        self.__eps = np.finfo(np.float32).eps.item()
        self.options.update(alpha=alpha, beta=beta)
        self._weight_norm = weight_norm

    def init_weight(self, index: Union[int, np.ndarray]) -> None:
        """Give the entries at ``index`` the current maximum priority raised to alpha."""
        initial = self._max_prio**self._alpha
        self.weight[index] = initial

    def update(self, buffer: ReplayBuffer) -> np.ndarray:
        """Run the parent ``update`` with ``buffer`` and initialise priorities at the returned indices."""
        new_indices = super().update(buffer)
        self.init_weight(new_indices)
        return new_indices

    def add(
        self,
        batch: Batch,
        buffer_ids: Optional[Union[np.ndarray, List[int]]] = None
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Add ``batch`` through the parent and initialise the priority of the written slot(s)."""
        result = super().add(batch, buffer_ids)
        self.init_weight(result[0])
        ptr, ep_rew, ep_len, ep_idx = result
        return ptr, ep_rew, ep_len, ep_idx

    def sample_indices(self, batch_size: int) -> np.ndarray:
        """Draw ``batch_size`` indices in proportion to priority.

        Falls back to the parent implementation when ``batch_size`` is not
        positive or the buffer is empty.
        """
        if batch_size <= 0 or len(self) <= 0:
            return super().sample_indices(batch_size)
        # uniform points on [0, total priority) mapped to indices via prefix sums
        thresholds = np.random.rand(batch_size) * self.weight.reduce()
        return self.weight.get_prefix_sum_idx(thresholds)  # type: ignore

    def get_weight(self, index: Union[int, np.ndarray]) -> Union[float, np.ndarray]:
        """Get the importance sampling weight.

        The returned value is used as a per-sample weight on the loss to offset
        the non-uniform sampling (frequently sampled transitions get smaller weights).
        """
        # full form:       ((p_j/p_sum*N)**(-beta)) / ((p_min/p_sum*N)**(-beta))
        # simplified form: (p_j/p_min)**(-beta)
        ratio = self.weight[index] / self._min_prio
        return ratio**(-self._beta)

    def update_weight(
        self, index: np.ndarray, new_weight: Union[np.ndarray, torch.Tensor]
    ) -> None:
        """Update priority weight by index in this buffer.

        :param np.ndarray index: index you want to update weight.
        :param np.ndarray new_weight: new priority weight you want to update.
        """
        # magnitude plus a small epsilon keeps every priority strictly positive
        magnitude = np.abs(to_numpy(new_weight)) + self.__eps
        self.weight[index] = magnitude**self._alpha
        self._max_prio = max(self._max_prio, magnitude.max())
        self._min_prio = min(self._min_prio, magnitude.min())

    def __getitem__(self, index: Union[slice, int, List[int], np.ndarray]) -> Batch:
        """Return the batch at ``index`` with an added ``weight`` field."""
        if not isinstance(index, slice):
            indices = index  # type: ignore
        elif index == slice(None):
            # buffer[:] returns all available data
            indices = self.sample_indices(0)
        else:
            indices = self._indices[:len(self)][index]
        batch = super().__getitem__(indices)
        weight = self.get_weight(indices)
        # ref: https://github.com/Kaixhin/Rainbow/blob/master/memory.py L154
        if self._weight_norm:
            weight = weight / np.max(weight)
        batch.weight = weight
        return batch

    def set_beta(self, beta: float) -> None:
        """Replace the importance-sampling exponent."""
        self._beta = beta

In [5]:
class OrnsteinUhlenbeckProcess:
    """Discretized Ornstein-Uhlenbeck noise process with optional linear sigma annealing.

    Each ``sample`` advances the state by
    ``theta * (mu - x) * dt + sigma_t * sqrt(dt) * N(0, 1)``.
    When ``sigma_min`` is given, ``sigma_t`` falls linearly from ``sigma`` to
    ``sigma_min`` over ``n_steps_annealing`` samples and then stays there;
    otherwise it is constant.
    """

    def __init__(self, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.num_steps = 0

        # start from x0 when supplied, otherwise from the zero vector
        if self.x0 is None:
            self.x_prev = np.zeros(self.size)
        else:
            self.x_prev = self.x0

        # sigma schedule is the line m * step + c, floored at sigma_min
        annealed = sigma_min is not None
        self.m = -float(sigma - sigma_min) / float(n_steps_annealing) if annealed else 0
        self.c = sigma
        self.sigma_min = sigma_min if annealed else sigma

    def current_sigma(self):
        """Return the noise scale for the current step count."""
        scheduled = self.m * float(self.num_steps) + self.c
        return max(self.sigma_min, scheduled)

    def sample(self):
        """Advance the process by one step and return the new state."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.current_sigma() * np.sqrt(self.dt) * np.random.normal(size=self.size)
        state = self.x_prev + drift + diffusion
        self.x_prev = state
        self.num_steps += 1
        return state

In [6]:
class Actor(nn.Module):
    """Convolutional policy network mapping a single-channel image to an action vector.

    The first linear layer expects 1024 flattened features, which is what the
    three convolutions produce for a 64x64 input (64 channels x 4 x 4).
    The output is unbounded; no squashing is applied here.
    """

    def __init__(self, action_shape):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        head = [
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, action_shape),
        ]
        # Layer order (and therefore the state_dict indices) is conv stack, then head
        self.model = nn.Sequential(*conv_stack, *head)

    def forward(self, obs):
        """Return the raw action output for a batch of observations."""
        return self.model(obs)


class Critic(nn.Module):
    """Q-network that scores an (image observation, action) pair.

    The observation goes through a convolutional encoder; its flattened output
    is concatenated with the action and passed through a small MLP that
    returns one value per sample.
    """

    def __init__(self, action_shape):
        super(Critic, self).__init__()

        # CNN Layers for processing the observation
        self.obs_net = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten()
        )

        # Fully Connected Layers
        # The flattened CNN output is assumed to have 1024 features (true for 64x64 inputs);
        # it is concatenated with the action, so the input size is 1024 + action_shape
        self.fc_net = nn.Sequential(
            nn.Linear(1024 + action_shape, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, obs, act):
        """Return Q-values of shape (batch, 1).

        Args:
            obs: batch of single-channel images, tensor or array-like.
            act: batch of actions with one row per observation, tensor or array-like.

        Non-tensor inputs are converted to float32 tensors on the same device as
        the network weights. Inputs that are already tensors are used as given.
        """
        # Read the device from a weight attribute instead of self.parameters():
        # attribute access also works inside DataParallel replicas.
        weight_device = self.fc_net[0].weight.device

        if not isinstance(obs, torch.Tensor):
            obs = torch.as_tensor(obs, dtype=torch.float32, device=weight_device)
        if not isinstance(act, torch.Tensor):
            act = torch.as_tensor(act, dtype=torch.float32, device=weight_device)

        obs_repr = self.obs_net(obs)

        # Concatenate the CNN output with the action tensor along dimension 1 (columns)
        combined = torch.cat([obs_repr, act], dim=1)

        return self.fc_net(combined)

In [7]:
class Learner:
    def __init__(self, action_shape, num_agent, gamma=0.95,lr=0.001,batch_size=1024,memory_size=int(1e6),tau=0.01,grad_norm_clipping = 0.5):
        """Build the actor/critic networks, their targets, optimizers, replay buffer and noise.

        Args:
            action_shape: size of the action vector produced by the actor.
            num_agent: number of agents; stored on the instance.
            gamma: discount factor used in the TD target.
            lr: learning rate for both Adam optimizers.
            batch_size: number of transitions sampled per update.
            memory_size: capacity of the prioritized replay buffer.
            tau: interpolation factor for the soft target updates.
            grad_norm_clipping: max gradient norm used when clipping.

        Relies on a module-level ``device`` being defined before instantiation.
        """
        self.action_shape = action_shape
        self.gamma = gamma
        self.actor = Actor(self.action_shape)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),lr=lr)
        self.critic = Critic(self.action_shape)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),lr=lr)

        # Always move the networks to the target device first: DataParallel expects
        # the wrapped module's parameters to already live on the primary device.
        # The original skipped this step in the multi-GPU branch.
        self.actor = self.actor.to(device)
        self.target_actor = self.target_actor.to(device)
        self.critic = self.critic.to(device)
        self.target_critic = self.target_critic.to(device)

        # Wrap the models with DataParallel when several GPUs are visible
        if torch.cuda.device_count() > 1:
            print("Using", torch.cuda.device_count(), "GPUs!")
            self.actor = torch.nn.DataParallel(self.actor)
            self.target_actor = torch.nn.DataParallel(self.target_actor)
            self.critic = torch.nn.DataParallel(self.critic)
            self.target_critic = torch.nn.DataParallel(self.target_critic)

        self.pri_buffer = PrioritizedReplayBuffer(memory_size, alpha=0.6, beta=0.4)
        self.loss_fn = torch.nn.MSELoss()
        self.batch_size = batch_size
        # Store the result of the check; the original stored the function object itself
        self.is_gpu = torch.cuda.is_available()
        self.noise = OrnsteinUhlenbeckProcess(size=self.action_shape)
        self.grad_norm_clipping = grad_norm_clipping
        self.tau = tau
        self.num_agent = num_agent

    @torch.no_grad()
    def td_targeti(self,reward,obs,next_obs,done):
        next_action = torch.tanh(self.target_actor(obs))
        next_q = self.target_critic(next_obs,next_action)
        td_targeti = reward.unsqueeze(1) + self.gamma * next_q*(1.-done.unsqueeze(1))
        return td_targeti.float()

    def update(self):
      indice = self.pri_buffer.sample_indices(self.batch_size)
      sample = self.pri_buffer.__getitem__(indice)
      obs, action, reward, next_obs, done = sample['obs'], sample['act'], sample['rew'], sample['obs_next'], sample['terminated']

      obs = obs.to(device)
      next_obs = next_obs.to(device)
      action = action.to(device)

      reward = torch.FloatTensor(reward).to(device)
      done = np.array(done)
      done = torch.IntTensor(done).to(device)

      td_targeti = self.td_targeti(reward,obs,next_obs,done)
      current_q = self.critic(obs,action)

      critic_loss = self.loss_fn(current_q,td_targeti)
      """ Update priorities based on TD errors """
      td_errors = (td_targeti - current_q).t()          # Calculate the TD Errors
      self.pri_buffer.update_weight(indice, td_errors.data.detach().cpu().numpy())

      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      clip_grad_norm_(self.critic.parameters(),max_norm=self.grad_norm_clipping)
      self.critic_optimizer.step()
      ac_up = self.actor(obs)
      ac = torch.tanh(ac_up)
      pr = -self.critic(obs,ac).mean()
      pg = (ac.pow(2)).mean()
      actor_loss = pr + pg*1e-3
      self.actor_optimizer.zero_grad()
      clip_grad_norm_(self.actor.parameters(),max_norm=self.grad_norm_clipping)
      actor_loss.backward()
      self.actor_optimizer.step()

      for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
      for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

    def inference(self,obs,greedy=False):
        obs = obs.to(device)
        action = torch.tanh(self.actor(obs))
        if not greedy:
            action += torch.tensor(self.noise.sample(),dtype=torch.float).cuda()
        return np.clip(action.detach().cpu().numpy(),-1.0,1.0)

    def load_checkpoint(self, filename):
      checkpoint = torch.load(filename)

      self.actor.load_state_dict(checkpoint['actor_state_dict'])
      self.target_actor.load_state_dict(checkpoint['target_actor_state_dict'])
      self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state_dict'])

      self.critic.load_state_dict(checkpoint['critic_state_dict'])
      self.target_critic.load_state_dict(checkpoint['target_critic_state_dict'])
      self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer_state_dict'])

In [8]:
class Agent:
    """Thin wrapper that asks a learner for actions.

    Attributes:
        num_agent: Number of agents/environments this wrapper was created for.
        learner: Object exposing ``inference(obs, greedy)``; used to pick actions.
    """

    def __init__(self, num_agent, learner):
        self.num_agent = num_agent
        self.learner = learner

    def get_action(self, states, greedy=False):
        """Return the learner's action for ``states``.

        Args:
            states: Observation passed straight through to ``learner.inference``.
            greedy: Forwarded to ``learner.inference``. Previously this flag was
                accepted but dropped, so callers could not request greedy actions.

        Returns:
            Whatever ``learner.inference`` returns.
        """
        return self.learner.inference(states, greedy)

In [9]:
def imagePreProcess(obs):
    """Turn an (H, W, 3) image array into a (1, 1, 64, 64) grayscale float tensor.

    The array is moved to channels-first layout, the first three channels are
    combined with the weights 0.2989 / 0.5870 / 0.1140, and the result is
    resized to 64 x 64 with bilinear interpolation.

    Args:
        obs: Array exposing ``transpose(2, 0, 1)``, indexed as (H, W, C).

    Returns:
        torch.Tensor of dtype float32 and shape (1, 1, 64, 64).
    """
    # (H, W, C) -> (C, H, W), then add a leading batch axis -> (1, C, H, W).
    channels_first = torch.tensor(obs.transpose(2, 0, 1), dtype=torch.float32)
    batched = channels_first[None]

    red = batched[:, 0, :, :]
    green = batched[:, 1, :, :]
    blue = batched[:, 2, :, :]
    gray = 0.2989 * red + 0.5870 * green + 0.1140 * blue

    # Restore the channel axis so interpolate sees (N, C, H, W).
    gray = gray[:, None]
    return F.interpolate(gray, size=(64, 64), mode='bilinear', align_corners=False)

In [10]:
# --- Experiment configuration -------------------------------------------------
SEED = 0                     # seed for the NumPy / PyTorch global RNGs (see below)
num_agent = 2                # number of environments stepped side by side
num_episode = 1000           # training episodes
initial_memory_size = 1000   # random warm-up steps collected per environment
memory_size = 1000           # size argument handed to Learner
# NOTE(review): warm-up adds initial_memory_size * num_agent transitions, which is
# more than memory_size -- confirm the buffer is meant to wrap during warm-up.
episode_rewards = []         # summed reward of every finished training episode
num_average_epidodes = 100   # window length of the moving-average reward plot
save_every = 500             # checkpoint period, in episodes
batch_size=3072              # presumably read by Learner (not visible in this cell)
max_steps = 100              # step cap per episode

# Seed the global RNGs so that a fresh "Restart & Run All" starts from the same
# random actions and network initialisation.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

envs = [Stack('Panda') for _ in range(num_agent)]

# Action dimensionality: arm degrees of freedom plus gripper degrees of freedom.
action_shape = envs[0].robots[0].robot_model.dof + envs[0].robots[0].gripper.dof

learner = Learner(action_shape, num_agent, memory_size)
agent = Agent(num_agent, learner)

cuda


## Initialize memory buffer

In [11]:
""" Reset the environment """
# Per-environment bookkeeping shared with the training cell.
states = [OrderedDict() for _ in range(num_agent)]
next_states = [OrderedDict() for _ in range(num_agent)]
dones = [False for _ in range(num_agent)]
rewards = [0 for _ in range(num_agent)]

# Warm-up: fill the replay buffer with transitions produced by random actions.
for step in range(initial_memory_size):
  for i in range(num_agent):
    # Start a fresh episode every max_steps steps (this covers step 0, so no
    # separate initial reset is needed) and whenever the env reported done.
    if step % max_steps == 0 or dones[i]:
      states[i] = envs[i].reset()

    obs = imagePreProcess(states[i]['frontview_image'])

    # Sample a random action and clip it to [-1, 1], the same range that
    # Learner.inference returns, so stored and executed actions agree.
    action = np.clip(np.random.randn(action_shape), -1.0, 1.0)
    next_states[i], rewards[i], dones[i], info = envs[i].step(action)

    obs_next = imagePreProcess(next_states[i]['frontview_image'])

    # Drop the batch axis added by imagePreProcess before storing.
    obs = obs.squeeze(0)
    obs_next = obs_next.squeeze(0)
    action = torch.tensor(action, dtype=torch.float32)
    action = action.squeeze(0)

    batch = Batch({'obs': obs, 'act': action, 'rew': rewards[i], 'obs_next': obs_next, 'terminated': dones[i], 'truncated': dones[i]})

    agent.learner.pri_buffer.add(batch)
    states[i] = next_states[i]
print('%d Data collected' % (initial_memory_size*num_agent))

  and should_run_async(code)


2000 Data collected


## Train agent

In [None]:
""" Train model """
update_every = 5    # run one learner update every this many episodes
log_every = 100     # print the episode reward every this many episodes

for episode in range(num_episode):
  for i in range(num_agent):
    states[i] = envs[i].reset()
  episode_reward = 0
  for t in range(max_steps):
    for i in range(num_agent):
      obs = imagePreProcess(states[i]['frontview_image'])

      # get_action returns a batch of one action; the env takes the single row.
      action = agent.get_action(obs)
      next_states[i], rewards[i], dones[i], info = envs[i].step(action[0])

      obs_next = imagePreProcess(next_states[i]['frontview_image'])

      # Drop the batch axis before storing the transition.
      obs = obs.squeeze(0)
      obs_next = obs_next.squeeze(0)
      action = torch.tensor(action, dtype=torch.float32)
      action = action.squeeze(0)

      batch = Batch({'obs': obs, 'act': action, 'rew': rewards[i], 'obs_next': obs_next, 'terminated': dones[i], 'truncated': dones[i]})
      agent.learner.pri_buffer.add(batch)
      states[i] = next_states[i]

    episode_reward += sum(rewards)
    # End the episode once any environment is done. The check used to sit inside
    # the per-agent loop, where `break` only skipped the remaining agents for
    # this step and the finished environment kept being stepped.
    if any(dones):
      break

  if episode % update_every == 0:
    agent.learner.update()
  episode_rewards.append(episode_reward)
  if episode % log_every == 0:
    print("Episode %d finished | Episode reward %f" % (episode, episode_reward))
  # Also save after the last episode; otherwise the weights loaded for testing
  # would come from the last multiple of save_every, not the end of training.
  is_last_episode = episode == num_episode - 1
  if episode % save_every == 0 or is_last_episode:
    checkpoint = {'episode': episode,
    'actor_state_dict': agent.learner.actor.state_dict(),
    'target_actor_state_dict': agent.learner.target_actor.state_dict(),
    'actor_optimizer_state_dict': agent.learner.actor_optimizer.state_dict(),
    'critic_state_dict': agent.learner.critic.state_dict(),
    'target_critic_state_dict': agent.learner.target_critic.state_dict(),
    'critic_optimizer_state_dict': agent.learner.critic_optimizer.state_dict()}
    torch.save(checkpoint, 'check_point')
    print('Model Saved')

# Release the simulator resources of every training environment.
for i in range(num_agent):
  envs[i].close()

# Smooth the per-episode rewards with a uniform sliding window, then plot them.
averaging_kernel = np.ones(num_average_epidodes) / num_average_epidodes
moving_average = np.convolve(episode_rewards, averaging_kernel, mode='valid')

fig, ax = plt.subplots()
ax.plot(np.arange(len(moving_average)), moving_average)
ax.set_title('Average rewards in %d episodes' % num_average_epidodes)
ax.set_xlabel('episode')
ax.set_ylabel('rewards')
plt.show()

Episode 0 finished | Episode reward 0.000425
Model Saved


## Test Model

In [None]:
""" Reset the environment """
# Use evaluation-only names so the training cell's `states` / `rewards` / `dones`
# lists are not overwritten by this cell.
env = Stack('Panda')
state = env.reset()

agent.learner.load_checkpoint('check_point')

frames = []
try:
    for _ in range(100):
        frames.append(state['frontview_image'])  # Append the image to the frames list
        obs = imagePreProcess(state['frontview_image'])

        # Evaluate the deterministic policy: greedy=True disables exploration noise.
        actions = agent.learner.inference(obs, greedy=True)

        state, reward, done, info = env.step(actions[0])
        if done:
            # Stop recording once the environment reports the episode is over.
            break
finally:
    # Always release the simulator, even if the rollout raised.
    env.close()


imageio.mimwrite('robosuite_video.mp4', frames, fps=20)

In [None]:
""" Load video and encode it in base64 format """
video_path = 'robosuite_video.mp4'
# Read through a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(video_path, 'rb') as video_file:
    video_data = video_file.read()
video_encoded = b64encode(video_data).decode()

# Display video using HTML (embedded as a base64 data URI)
HTML(f"""
<video width="640" height="480" controls>
  <source src="data:video/mp4;base64,{video_encoded}" type="video/mp4">
</video>
""")