# Test Door Key Offline Training with d3rlpy and Decision Transformer

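We will use the Door Key 16x16 environment from MiniGrid (Gymnasium) together with d3rlpy. Note: the cells in this notebook train a DQN agent online (`DQNConfig` + `fit_online`) with a curriculum / reward-shaping wrapper; the Decision Transformer algorithm itself is not trained here.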

In [1]:
# Test if we are running on CoLab or not
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
  !apt-get install -y xvfb ffmpeg > /dev/null 2>&1
  %pip install pyvirtualdisplay pygame moviepy > /dev/null 2>&1
  %pip install d3rlpy
else:
  print('Not running on CoLab')

Not running on CoLab


In [2]:
# Directory creation
import os

# Output folders used later for the saved model and the recorded video.
# exist_ok=True keeps the cell idempotent and removes the check-then-create race.
for path in ("./models", "./videos/video-doorkey-d3rlpy"):
  os.makedirs(path, exist_ok=True)

In [3]:
import random, math
from typing import Any, SupportsFloat

import gymnasium as gym
import numpy as np
from gymnasium import spaces
from gymnasium.core import ActType, ObsType
from gymnasium.envs.registration import register
from minigrid.envs import DoorKeyEnv

# Register the 16x16 DoorKey variant under the id used by the rest of the notebook.
# The captured cell output shows gymnasium warning that this id is already in the
# registry, so this call overrides the existing entry.
DOORKEY_ENV_SPEC = dict(
    id="MiniGrid-DoorKey-16x16-v0",
    entry_point="minigrid.envs:DoorKeyEnv",
    kwargs={"size": 16},
)
register(**DOORKEY_ENV_SPEC)
    
class CurriculumStatesWrapper(gym.Wrapper):
    """Reward-shaping and curriculum wrapper for the MiniGrid DoorKey task.

    Two things are layered on top of the wrapped environment:

    * Shaping: a one-off ``+0.25`` bonus the first time the agent is found
      carrying an object, and another ``+0.25`` the first time the door is
      opened with the ``toggle`` action.
    * Curriculum: with probability ``beta`` an episode starts from an advanced
      state (agent on a random free cell, key already carried, door already
      open). No shaping bonus is paid for the parts the curriculum skipped.

    While the wrapper considers the key to be carried, ``drop`` actions are
    replaced by ``pickup``.
    """

    def __init__(self, env, beta: float = 0.05):
        """Wraps ``env`` and initialises the per-episode bookkeeping.

        Args:
            env: The environment to apply the wrapper to.
            beta: Probability that ``reset`` starts the episode from the
                advanced curriculum state; ``0`` disables the curriculum.
        """
        super().__init__(env)
        self.state_carrying = False
        self.state_door_opened = False
        self.count_frames = 0
        # Also initialised here (not only in reset) so step() can never hit a
        # missing attribute.
        self.count_carryings = 0
        self.door = None
        self.key = None
        self.total_reward = 0

        self.beta = beta

    def reset(self, **kwargs):
        """Resets the environment and possibly applies the curriculum start.

        Args:
            **kwargs: Forwarded unchanged to the wrapped environment's ``reset``.

        Returns:
            The value returned by the wrapped environment's ``reset``.
        """
        env: DoorKeyEnv = self.env.unwrapped

        # Report the previous episode's shaped return when it earned any bonus.
        if self.total_reward > 0.2:
            print("Reward: ", self.total_reward)

        obs = self.env.reset(**kwargs)

        self.state_carrying = False
        self.state_door_opened = False
        self.count_frames = 0
        self.count_carryings = 0
        self.total_reward = 0

        # Draw deciding whether this episode uses the curriculum start.
        curriculum = random.random()

        # Locate the door and key objects of the freshly generated grid.
        for j in range(env.grid.height):
            for i in range(env.grid.width):
                tile = env.grid.get(i, j)
                if tile is None:
                    continue
                if tile.type == "door":
                    self.door = tile
                elif tile.type == "key":
                    self.key = tile

        if curriculum < self.beta:
            # Move the agent off-grid before sampling a new cell — presumably so
            # place_obj does not treat the old agent cell as occupied (confirm
            # against MiniGrid's place_obj).
            env.agent_pos = (-1, -1)
            pos = env.place_obj(None, top=(0, 0), size=None, max_tries=math.inf)
            env.agent_pos = pos

            # Actually open the door. The previous `self.door.state = 0` set an
            # attribute that nothing in this notebook reads (step() checks
            # `door.is_open`), so the door was reported as opened while staying
            # shut. Assumes MiniGrid's Door exposes is_open / is_locked.
            self.door.is_open = True
            self.door.is_locked = False
            self.state_door_opened = True

            # Hand the key to the agent and remove it from the grid.
            env.carrying = self.key
            env.grid.set(self.key.cur_pos[0], self.key.cur_pos[1], None)
            env.carrying.cur_pos = np.array([-1, -1])
            self.state_carrying = True

        # NOTE(review): `obs` was generated before the curriculum edits above.
        # The observation wrappers stacked on top in create_env appear to rebuild
        # the image from the live grid — confirm before using this raw value.
        return obs

    def step(self, action):
        """Steps through the environment with `action` and adds the shaping bonuses.

        Args:
            action: Action to execute; ``drop`` is replaced by ``pickup`` while
                the wrapper considers the key to be carried.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` from the wrapped
            environment, with ``reward`` including any shaping bonus.
        """
        env = self.env.unwrapped

        if self.state_carrying and action == env.actions.drop:
            action = env.actions.pickup

        obs, reward, terminated, truncated, info = self.env.step(action)

        self.count_frames += 1

        # One-off bonus the first time the agent is found carrying an object.
        if env.carrying is not None and not self.state_carrying and self.count_carryings < 1:
            self.count_carryings += 1
            reward += 0.25
            self.state_carrying = True

        # One-off bonus when the door is opened by a toggle. The action is now
        # compared explicitly; the bare `self.actions.toggle` used before was
        # always truthy and therefore checked nothing.
        if (
            self.door is not None
            and self.door.is_open
            and action == env.actions.toggle
            and not self.state_door_opened
        ):
            print("Is Opened", self.door.is_open)
            reward += 0.25
            self.state_door_opened = True

        self.total_reward += reward

        return obs, reward, terminated, truncated, info



pygame 2.5.2 (SDL 2.28.2, Python 3.10.0)
Hello from the pygame community. https://www.pygame.org/contribute.html


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [4]:
import minigrid
def create_env(env_key, max_episode_steps=100, is_video=False, curriculum_mode=True, beta=0.05):
    """Builds the environment together with the wrappers used in this notebook.

    Args:
        env_key: Gymnasium environment id passed to ``gym.make``.
        max_episode_steps: Step limit passed to ``gym.make``.
        is_video: When truthy the environment is created with
            ``render_mode='rgb_array'``; otherwise no render mode is set.
        curriculum_mode: When truthy the environment is wrapped in
            ``CurriculumStatesWrapper``.
        beta: Curriculum probability forwarded to ``CurriculumStatesWrapper``.

    Returns:
        The environment wrapped, in order, by the optional curriculum wrapper,
        ``FullyObsWrapper`` and ``ImgObsWrapper``.
    """
    render_mode = 'rgb_array' if is_video else None

    env = gym.make(
        env_key,
        max_episode_steps=max_episode_steps,
        render_mode=render_mode,
        see_through_walls=True,
    )

    # The curriculum wrapper goes innermost, below the observation wrappers.
    if curriculum_mode:
        env = CurriculumStatesWrapper(env, beta=beta)

    env = minigrid.wrappers.FullyObsWrapper(env)
    env = minigrid.wrappers.ImgObsWrapper(env)

    return env


In [5]:
import d3rlpy
import gymnasium as gym
import torch
import torch.nn as nn
from d3rlpy.models.encoders import EncoderFactory


env_key = "MiniGrid-DoorKey-16x16-v0"

# Training and evaluation environments are built with identical settings.
_env_settings = dict(max_episode_steps=200, curriculum_mode=True)
env = create_env(env_key, **_env_settings)
eval_env = create_env(env_key, **_env_settings)

class CustomConvEncoder(nn.Module):
    """Convolutional feature extractor for channel-last image observations.

    Five convolution layers (32, 64, 128, 256 and 512 output channels) followed
    by a flatten. Dropout is applied to the output of layers 2-5 before the ReLU.
    """

    def __init__(self, observation_shape):
        """Builds the convolution stack.

        Args:
            observation_shape: Shape of a single observation in channel-last
                layout, e.g. ``(16, 16, 3)``. Its last entry is used as the
                number of input channels.
        """
        super().__init__()
        print(observation_shape)

        # Derive the input channels from the observation instead of assuming 3.
        in_channels = observation_shape[-1]

        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=1, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.conv2Dropout = nn.Dropout(0.25)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.conv3Dropout = nn.Dropout(0.5)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
        self.conv4Dropout = nn.Dropout(0.5)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.conv5Dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Encodes a batch of observations into flat feature vectors.

        Args:
            x: Batch of observations in channel-last layout
                ``(batch, height, width, channels)``.

        Returns:
            A 2-D tensor of shape ``(batch, features)``.
        """
        # Conv2d expects channel-first input, so move channels to dimension 1.
        h = x.permute(0, 3, 1, 2)
        h = torch.relu(self.conv1(h))
        h = torch.relu(self.conv2Dropout(self.conv2(h)))
        h = torch.relu(self.conv3Dropout(self.conv3(h)))
        h = torch.relu(self.conv4Dropout(self.conv4(h)))
        h = torch.relu(self.conv5Dropout(self.conv5(h)))

        # Keep the batch dimension and flatten everything else.
        return torch.flatten(h, start_dim=1)
    
class CustomConvEncoderFactory(EncoderFactory):
    """Encoder factory that builds a ``CustomConvEncoder``."""

    def create(self, observation_shape):
        """Builds the encoder for the given observation shape."""
        encoder = CustomConvEncoder(observation_shape)
        return encoder

    @staticmethod
    def get_type() -> str:
        """Returns the type identifier of this factory."""
        factory_type = "custom"
        return factory_type
    


  logger.warn(
  logger.warn(


In [6]:
# Alternative algorithms that were tried, kept for reference:
#dqn = d3rlpy.algos.DiscreteSACConfig(
#    actor_encoder_factory=CustomConvEncoderFactory(),
#    critic_encoder_factory=CustomConvEncoderFactory(),
#).create(device="cuda:0")


#dqn = d3rlpy.algos.DoubleDQNConfig(
#    encoder_factory=CustomConvEncoderFactory()
#).create(device="cuda:0")

# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook also runs on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu:0"

dqn = d3rlpy.algos.DQNConfig(
    encoder_factory=CustomConvEncoderFactory(),
    batch_size=100,
    gamma=0.9,
    target_update_interval=1000,
    learning_rate=2.5e-4
).create(device=DEVICE)

In [7]:
import numpy as np

from collections import deque
from typing import Deque, List, Sequence, Tuple

from typing_extensions import Protocol

from d3rlpy.dataset.components import EpisodeBase

from d3rlpy.dataset.buffers import BufferProtocol

from d3rlpy.dataset.writers import ExperienceWriter, _ActiveEpisode, WriterPreprocessProtocol
from d3rlpy.dataset.components import Signature


class CustomReplayBuffer(d3rlpy.dataset.ReplayBuffer):
    """Replay buffer that discards unrewarded episodes that did not terminate.

    NOTE(review): this relies on private attributes of the d3rlpy writer and
    buffer (``_writer._active_episode``, ``_buffer._transitions``); re-check
    them when upgrading d3rlpy.
    """

    def clip_episode(self, terminated: bool) -> None:
        r"""Clips the current episode and drops it if it earned no reward.

        An episode is dropped when ``terminated`` is false and the mean of its
        rewards is zero or negative.

        Args:
            terminated: Flag to represent environment termination.
        """
        episode_to_remove = None
        active_episode = self._writer._active_episode
        if not terminated:
            rewards = active_episode.rewards
            # Skip empty episodes: the mean of an empty array is NaN (with a
            # runtime warning) and would never satisfy the comparison anyway.
            if len(rewards) > 0 and rewards.mean() <= 0:
                episode_to_remove = active_episode

        self._writer.clip_episode(terminated)

        if episode_to_remove is not None:
            # Remove all transitions associated with the episode to remove.
            transitions = self._buffer._transitions
            kept = [(ep, idx) for ep, idx in transitions if ep is not episode_to_remove]
            if isinstance(transitions, deque):
                # Keep the container a deque with its original bound; swapping
                # in a plain list would drop maxlen and deque-only methods the
                # underlying buffer may rely on.
                self._buffer._transitions = deque(kept, maxlen=transitions.maxlen)
            else:
                self._buffer._transitions = kept
            self._buffer.episodes.remove(episode_to_remove)


class CustomWriterPreprocess(d3rlpy.dataset.WriterPreprocessProtocol):
    """Writer preprocessor that passes every value through unchanged.

    The commented-out prints are leftover debugging hooks.
    """

    def process_observation(self, observation: d3rlpy.types.Observation) -> d3rlpy.types.Observation:
        """Returns the observation unchanged."""
        return observation

    def process_action(self, action: np.ndarray) -> np.ndarray:
        """Returns the action unchanged."""
        #print(action)
        return action

    def process_reward(self, reward: np.ndarray) -> np.ndarray:
        """Returns the reward unchanged."""
        #if (reward >= 0.2):
        #    print(reward)
        return reward
    
writer_preprocessor = CustomWriterPreprocess()

# FIFO storage (limit 10000) wrapped by the episode-filtering replay buffer.
buffer = CustomReplayBuffer(
    d3rlpy.dataset.FIFOBuffer(10000),
    env=env,
    writer_preprocessor=writer_preprocessor,
)

# Epsilon-greedy exploration; presumably start=0.8 and end=0.3 — confirm
# the argument order against the d3rlpy documentation.
explorer = d3rlpy.algos.LinearDecayEpsilonGreedy(0.8, 0.3)

online_fit_settings = dict(
    n_steps=1000000,  # total training steps (1M)
    eval_env=eval_env,
    n_steps_per_epoch=100000,  # 100K steps per epoch
    update_start_step=20000,  # parameter updates start after 20K steps
    update_interval=10,
)
dqn.fit_online(env, buffer, explorer, **online_fit_settings)

dqn.save_model("./models/model_door-key-dqn-16x16.d3")


[2m2024-01-19 12:18.22[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[()])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('uint8')], shape=[(16, 16, 3)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2024-01-19 12:18.22[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2024-01-19 12:18.22[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m7[0m
[2m2024-01-19 12:18.22[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DQN_online_20240119121822[0m
[2m2024-01-19 12:18.22[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
(16, 16, 3)
(16, 16, 3)
[2m2024-01-19 12:18.23[0m [[32m[1mdebug    [0m] [1mModel has been built.         

  logger.warn(


  0%|          | 0/1000000 [00:00<?, ?it/s]

  logger.warn(


Reward:  0.25
Reward:  0.25
Reward:  0.25


  logger.warn(


Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.9690625
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.9898046875
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:

In [10]:

# Second run: keep fitting the same `dqn` object with a shorter episode limit,
# a smaller buffer and a constant exploration rate.
env = create_env(env_key, max_episode_steps=100, curriculum_mode=True)
buffer = CustomReplayBuffer(
    d3rlpy.dataset.FIFOBuffer(2000),
    env=env,
    writer_preprocessor=writer_preprocessor,
)
# beta=0 disables the curriculum start for evaluation episodes.
eval_env = create_env(env_key, max_episode_steps=150, curriculum_mode=True, beta=0)
explorer = d3rlpy.algos.ConstantEpsilonGreedy(0.3)

online_fit_settings = dict(
    n_steps=1000000,  # total training steps (1M)
    eval_env=eval_env,
    n_steps_per_epoch=100000,  # 100K steps per epoch
    update_start_step=20000,  # parameter updates start after 20K steps
    update_interval=10,
)
dqn.fit_online(env, buffer, explorer, **online_fit_settings)

dqn.save_model("./models/model_door-key-dqn-16x16.d3")

[2m2024-01-19 14:24.07[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[()])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('uint8')], shape=[(16, 16, 3)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2024-01-19 14:24.07[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2024-01-19 14:24.07[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m7[0m
[2m2024-01-19 14:24.07[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DQN_online_20240119142407[0m
[2m2024-01-19 14:24.07[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [16, 16, 3], 'action_size': 7, 'config': {'type': 'dqn', 'params': {

  logger.warn(
  logger.warn(
  logger.warn(


  0%|          | 0/1000000 [00:00<?, ?it/s]

  logger.warn(


Reward:  0.25
Is Opened True
Reward:  1.4859375
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Is Opened True
Reward:  0.5
Is Opened True
Reward:  1.4750390625
Reward:  0.25
Is Opened True
Reward:  1.4729296875


  logger.warn(


Reward:  0.983125
Is Opened True
Reward:  1.4834765625
Is Opened True
Reward:  1.48171875
Is Opened True
Reward:  1.4855859375
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  1.4785546875
Is Opened True
Reward:  1.4799609375
Is Opened True
Reward:  1.4813671875
Is Opened True
Reward:  1.4658984375
Is Opened True
Reward:  1.4708203125
Is Opened True
Reward:  1.483828125
Is Opened True
Reward:  1.4792578125
Reward:  0.25
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  1.481015625
Reward:  0.25
Reward:  0.9912109375
Is Opened True
Reward:  1.4669531249999999
Is Opened True
Reward:  0.5
Reward:  0.25
Is Opened True
Reward:  1.4845312499999999
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  0.5
Reward:  0.25
Is Opened True
Reward:  1.489453125
Reward:  0.25
Reward:  0.25
Reward:  0.25
Is Opened True
Reward:  1.4785546875
Reward:  0.25
Is Opened True
Reward:  1.4658984375
Is Opened True
Reward:  1.4827734375000001
Reward:  0.25
Is Opened 

KeyboardInterrupt: 

In [9]:
import gymnasium as gym

import numpy as np
# The environments in this notebook are gymnasium environments, so use
# gymnasium's recorder rather than the one from the legacy `gym` package.
from gymnasium.wrappers import RecordVideo

# start virtual display
d3rlpy.notebook_utils.start_virtual_display()

# is_video=True makes create_env request the 'rgb_array' render mode.
env = create_env(env_key, max_episode_steps=200, is_video=True)

env = RecordVideo(env, './videos/video-doorkey-d3rlpy')

seed = 3

# Actions are sampled through an epsilon-greedy explorer (epsilon 0.3), so the
# recorded episode is not purely greedy.
explorer = d3rlpy.algos.ConstantEpsilonGreedy(0.3)

# reset() returns the observation and an info dict (not a reward).
observation, info = env.reset(seed=seed)

# Run a single episode until the environment terminates or is truncated.
while True:

    # Add a leading batch dimension before passing the observation to the policy.
    x = np.expand_dims(observation, axis=0)
    action = explorer.sample(dqn, x, 0)[0]

    observation, reward, done, truncated, _ = env.step(action)

    if done:
        print("reward:", reward)
        print("DONE!!!")
        break
    elif truncated:
        print("Truncated")
        break

env.close()

d3rlpy.notebook_utils.render_video("./videos/video-doorkey-d3rlpy/rl-video-episode-0.mp4")


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Moviepy - Building video /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-d3rlpy/rl-video-episode-0.mp4.
Moviepy - Writing video /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-d3rlpy/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-d3rlpy/rl-video-episode-0.mp4
Truncated
