**Load the libraries**

In [1]:
import gymnasium as gym
import numpy as np
import torch as th

from rllte.env.utils import Gymnasium2Torch
from rllte.xplore.reward import ICM

**Create a fake Atari environment with image observations**

In [2]:
class FakeAtari(gym.Env):
    """Stand-in for an Atari environment that emits random image-shaped observations.

    Observations are sampled from a ``Box(low=0, high=1, shape=(4, 84, 84))`` space
    and actions come from ``Discrete(7)``. The extrinsic reward is always 0. Once an
    episode has run for more than 100 steps, every further step ends it with
    probability 0.1.
    """

    def __init__(self):
        self.action_space = gym.spaces.Discrete(7)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(4, 84, 84))
        # number of steps taken in the current episode
        self.count = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``, which drives episode termination.
            options: Accepted for compatibility with the Gymnasium API; unused.

        Returns:
            A ``(observation, info)`` tuple with a freshly sampled observation
            and an empty info dict.
        """
        # Vector environments forward `seed`/`options` as keyword arguments, so the
        # signature must accept them even though `options` is ignored here.
        super().reset(seed=seed)
        self.count = 0
        return self.observation_space.sample(), {}

    def step(self, action):
        """Advance one step; the action is ignored.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the reward
            is 0 and ``terminated``/``truncated`` always carry the same value.
        """
        self.count += 1
        # Use the environment's own generator rather than the global `np.random`
        # state, so termination follows the seed given to `reset()` and worker
        # processes do not all inherit one identical global random stream.
        done = bool(self.count > 100 and self.np_random.random() < 0.1)
        return self.observation_space.sample(), 0, done, done, {}

**Synchronous Mode**:

**In this mode, `.compute()` automatically invokes `.update()` for you. This is the usual choice for on-policy RL algorithms.**

In [3]:
# set the parameters
device = 'cuda' if th.cuda.is_available() else 'cpu'
n_steps = 128
n_envs = 8
# create the vectorized environments
envs = gym.vector.AsyncVectorEnv([FakeAtari for _ in range(n_envs)])
try:
    # wrap the environments to convert the observations to torch tensors
    envs = Gymnasium2Torch(envs, device)
    # create the intrinsic reward module
    irs = ICM(envs, device)
    # reset the environments and get the initial observations
    obs, infos = envs.reset()
    # create a dictionary to store the samples, one list of per-step tensors per key
    samples = {key: [] for key in ('observations',
                                   'actions',
                                   'rewards',
                                   'terminateds',
                                   'truncateds',
                                   'next_observations')}
    # sampling loop
    for _ in range(n_steps):
        # sample random actions
        actions = th.stack([th.as_tensor(envs.action_space.sample()) for _ in range(n_envs)])
        # environment step
        next_obs, rewards, terminateds, truncateds, infos = envs.step(actions)
        # gather the transition once so `watch` and the sample buffer see the same data
        transition = {'observations': obs,
                      'actions': actions,
                      'rewards': rewards,
                      'terminateds': terminateds,
                      'truncateds': truncateds,
                      'next_observations': next_obs}
        # watch the interactions and get necessary information for the intrinsic reward computation
        irs.watch(**transition)
        # store the samples
        for key, value in transition.items():
            samples[key].append(value)
        obs = next_obs
    # stack the per-step tensors along a new leading (step) dimension
    samples = {k: th.stack(v) for k, v in samples.items()}
    # compute the intrinsic rewards (the module is updated inside `.compute()` in this mode)
    intrinsic_rewards = irs.compute(samples=samples)
    print(intrinsic_rewards)
    print(intrinsic_rewards.shape)
finally:
    # AsyncVectorEnv runs every environment in its own worker process; close them so
    # they are not leaked when the cell finishes or fails.
    # NOTE(review): assumes Gymnasium2Torch forwards `.close()` to the wrapped
    # vector environment -- confirm against the rllte version in use.
    envs.close()

  return F.conv2d(input, weight, bias, self.stride,


tensor([[6.5928, 5.5006, 5.3346,  ..., 5.3286, 6.5831, 5.1960],
        [3.4611, 3.4754, 5.3265,  ..., 4.9442, 5.3422, 3.7767],
        [3.7612, 3.7736, 6.5909,  ..., 3.7735, 4.9679, 6.5922],
        ...,
        [3.4737, 4.9781, 6.5358,  ..., 5.2204, 5.3287, 6.5794],
        [3.7659, 5.3463, 5.3620,  ..., 6.5735, 5.3437, 3.7666],
        [5.4956, 4.9599, 5.3435,  ..., 6.5689, 5.2174, 3.7587]],
       device='cuda:0')
torch.Size([128, 8])


**Asynchronous Mode**:

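**In this mode, `.compute()` is called with `sync=False` and does not update the module; `.update()` must be invoked separately. This is the usual choice for off-policy RL algorithms.**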

In [4]:
# set the parameters
device = 'cuda' if th.cuda.is_available() else 'cpu'
n_steps = 128
n_envs = 8
# create the vectorized environments
envs = gym.vector.AsyncVectorEnv([FakeAtari for _ in range(n_envs)])
try:
    # wrap the environments to convert the observations to torch tensors
    envs = Gymnasium2Torch(envs, device)
    # create the intrinsic reward module
    irs = ICM(envs, device)
    # reset the environments and get the initial observations
    obs, infos = envs.reset()
    # create a dictionary to store the samples, one list of per-step tensors per key
    samples = {key: [] for key in ('observations',
                                   'actions',
                                   'rewards',
                                   'terminateds',
                                   'truncateds',
                                   'next_observations')}
    # sampling loop
    for _ in range(n_steps):
        # sample random actions
        actions = th.stack([th.as_tensor(envs.action_space.sample()) for _ in range(n_envs)])
        # environment step
        next_obs, rewards, terminateds, truncateds, infos = envs.step(actions)
        # gather the transition once so `watch`, `compute` and the buffer see the same data
        transition = {'observations': obs,
                      'actions': actions,
                      'rewards': rewards,
                      'terminateds': terminateds,
                      'truncateds': truncateds,
                      'next_observations': next_obs}
        # watch the interactions and get necessary information for the intrinsic reward computation
        irs.watch(**transition)
        # compute the intrinsic rewards at each step; `unsqueeze(0)` adds the leading
        # step dimension so a single transition is passed as a batch of one step
        intrinsic_rewards = irs.compute(samples={k: v.unsqueeze(0) for k, v in transition.items()},
                                        sync=False)
        print(intrinsic_rewards, intrinsic_rewards.shape)
        # store the samples
        for key, value in transition.items():
            samples[key].append(value)
        obs = next_obs
    # update the intrinsic reward module with all collected transitions
    samples = {k: th.stack(v) for k, v in samples.items()}
    irs.update(samples=samples)
finally:
    # AsyncVectorEnv runs every environment in its own worker process; close them so
    # they are not leaked when the cell finishes or fails.
    # NOTE(review): assumes Gymnasium2Torch forwards `.close()` to the wrapped
    # vector environment -- confirm against the rllte version in use.
    envs.close()

tensor([[2.5189, 2.5474, 2.5163, 2.5503, 2.1224, 2.1203, 2.5226, 2.6890]],
       device='cuda:0') torch.Size([1, 8])
tensor([[3.5146, 2.5905, 3.6144, 3.4424, 3.3997, 3.4378, 3.5162, 2.5951]],
       device='cuda:0') torch.Size([1, 8])
tensor([[3.9397, 3.0138, 3.0003, 3.9741, 3.3031, 4.1907, 2.9930, 3.3006]],
       device='cuda:0') torch.Size([1, 8])
tensor([[3.7179, 3.7295, 3.7109, 4.5688, 3.3561, 3.7105, 4.4071, 3.7139]],
       device='cuda:0') torch.Size([1, 8])
tensor([[4.8140, 3.6262, 4.7395, 4.9179, 3.6130, 4.7960, 3.6326, 4.8056]],
       device='cuda:0') torch.Size([1, 8])
tensor([[5.0398, 5.0510, 5.1055, 5.3550, 3.8483, 4.2502, 5.3718, 4.2484]],
       device='cuda:0') torch.Size([1, 8])
tensor([[5.6157, 4.4320, 4.0090, 5.6285, 5.4394, 4.4302, 5.2698, 5.5971]],
       device='cuda:0') torch.Size([1, 8])
tensor([[5.6835, 5.6864, 4.1652, 5.6790, 4.6121, 4.6099, 5.6657, 5.6570]],
       device='cuda:0') torch.Size([1, 8])
tensor([[5.6814, 5.7421, 4.7672, 4.7597, 4.7601, 4.3097,