# Test Door Key Offline Training with d3rlpy and Decision Transformer

We will use the Door Key 16x16 environment from Minigrid Gym to test the Decision Transformer algorithm from d3rlpy.

In [1]:
# Test if we are running on CoLab or not
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
  !apt-get install -y xvfb ffmpeg > /dev/null 2>&1
  %pip install pyvirtualdisplay pygame moviepy > /dev/null 2>&1
  %pip install d3rlpy
else:
  print('Not running on CoLab')

Not running on CoLab


In [2]:
# Directory creation
import os
path = "./models"
isExist = os.path.exists(path)
if not isExist:
  os.makedirs(path)

path = "./datasets"
isExist = os.path.exists(path)
if not isExist:
  os.makedirs(path)

path = "./videos/video-doorkey-dt-d3rlpy"
isExist = os.path.exists(path)
if not isExist:
  os.makedirs(path)

In [3]:
import gymnasium as gym
from gymnasium import spaces
from minigrid.envs import DoorKeyEnv
from gymnasium.core import ActType, ObsType
from typing import Any, SupportsFloat
import random, math

from gymnasium.envs.registration import register

register(
    id="MiniGrid-DoorKey-16x16-v0",
    entry_point="minigrid.envs:DoorKeyEnv",
    kwargs={"size": 16},
)
    

pygame 2.5.2 (SDL 2.28.2, Python 3.10.0)
Hello from the pygame community. https://www.pygame.org/contribute.html


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [4]:
import minigrid
def create_env(env_key, max_episode_steps=100, is_video=False):

    render_mode = None

    if is_video == True:
        render_mode = 'rgb_array'

    env = gym.make(env_key, max_episode_steps=max_episode_steps, render_mode=render_mode, see_through_walls=True)

    env = minigrid.wrappers.FullyObsWrapper(env)
    env = minigrid.wrappers.ImgObsWrapper(env)

    return env


In [5]:
import d3rlpy
import gymnasium as gym
import torch
import torch.nn as nn
from d3rlpy.models.encoders import EncoderFactory

class CustomConvEncoder(nn.Module):
    def __init__(self, observation_shape):
        super().__init__()
        print(observation_shape)

        self.conv1 = nn.Conv2d(3, 32, kernel_size=1, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.conv2Dropout = nn.Dropout(0.25)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.conv3Dropout = nn.Dropout(0.5)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
        self.conv4Dropout = nn.Dropout(0.5)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.conv5Dropout = nn.Dropout(0.5)


    def forward(self, x):
        h = x.permute(0, 3, 1, 2)
        h = torch.relu(self.conv1(h))
        h = torch.relu(self.conv2Dropout(self.conv2(h)))
        h = torch.relu(self.conv3Dropout(self.conv3(h)))
        h = torch.relu(self.conv4Dropout(self.conv4(h)))
        h = torch.relu(self.conv5Dropout(self.conv5(h)))

        h = torch.flatten(h, start_dim=1)
        #print(h.shape)

        return h
    
class CustomConvEncoderFactory(EncoderFactory):

    def create(self, observation_shape):
        return CustomConvEncoder(observation_shape)

    @staticmethod
    def get_type() -> str:
        return "custom"
    


In [6]:
config = {
    "seed": 1,
    "dataset_size": 1000,
    "epsilon": 0.3,
    "max_episode_steps": 200,
    "experiment_name": "door-key-14x14",
    "device": "cuda:0"
}

env_key = "MiniGrid-DoorKey-16x16-v0"

env = create_env(env_key, max_episode_steps=config["max_episode_steps"])
eval_env = create_env(env_key, max_episode_steps=config["max_episode_steps"])

dqn = d3rlpy.algos.DQNConfig(
    encoder_factory=CustomConvEncoderFactory(),
    batch_size=100,
    gamma=0.9,
    target_update_interval=1000,
    learning_rate=2.5e-4
).create(device=config["device"])

dqn.build_with_env(env)

dqn.load_model('./models/model_door-key-dqn-16x16.d3')


  logger.warn(
  logger.warn(


(16, 16, 3)
(16, 16, 3)


In [7]:
import numpy as np
from tqdm import tqdm

# prepare utilities
buffer = d3rlpy.dataset.InfiniteBuffer()
dataset = d3rlpy.dataset.ReplayBuffer(buffer, env=env, cache_size=config["max_episode_steps"])
explorer = d3rlpy.algos.ConstantEpsilonGreedy(epsilon=config["epsilon"])

seed = 1

done = False

observation, _ = env.reset(seed=seed)

num_steps = 0

for episode in tqdm(range(config["dataset_size"])):

    while True:
        x = np.expand_dims(observation, axis=0)

        action = explorer.sample(dqn, x, 0)[0]

        next_observation, reward, done, truncated, _ = env.step(action)

        clip_episode = done or truncated

        # store observation
        dataset.append(observation, action, float(reward))

        # reset if terminated
        if clip_episode:
            dataset.clip_episode(done)
            observation, _ = env.reset()
            break
        else:
            observation = next_observation

env.close()

dataset.dump('./datasets/dataset_door-key-dqn-16x16.d3')


[2m2024-01-19 15:54.04[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[()])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('uint8')], shape=[(16, 16, 3)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2024-01-19 15:54.04[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2024-01-19 15:54.04[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m7[0m


100%|██████████| 1000/1000 [04:40<00:00,  3.57it/s]


In [8]:
target_return = 1
experiment_name = config["experiment_name"]
experiment_seed = config["seed"]

dt = d3rlpy.algos.DiscreteDecisionTransformerConfig(
    batch_size=8,
    learning_rate=1e-4,
    context_size=20,
    num_heads=1,
    num_layers=3,
    encoder_factory=CustomConvEncoderFactory(),
).create(device=config["device"])


dt.fit(
    dataset,
    n_steps=10000,
    n_steps_per_epoch=1000,
    save_interval=10,
    eval_env=env,
    eval_target_return=target_return,
    experiment_name=f"DT_{experiment_name}_{experiment_seed}"
)

[2m2024-01-19 15:58.45[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(16, 16, 3)]), action_signature=Signature(dtype=[dtype('int64')], shape=[()]), reward_signature=Signature(dtype=[dtype('float32')], shape=[[1]]), action_space=<ActionSpace.DISCRETE: 2>, action_size=7)[0m
[2m2024-01-19 15:58.45[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DT_door-key-14x14_1_20240119155845[0m
[2m2024-01-19 15:58.45[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
(16, 16, 3)
[2m2024-01-19 15:58.47[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2024-01-19 15:58.47[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [16, 16, 3], 'action_size': 7, 'config': {'type': 'discrete_decision_transformer', 'params': {'batch_size': 8, 'gamma': 0.99, 'observation

Epoch 1/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2024-01-19 16:01.15[0m [[32m[1minfo     [0m] [1mDT_door-key-14x14_1_20240119155845: epoch=1 step=1000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.001347700595855713, 'time_algorithm_update': 0.12472886943817138, 'loss': 1.624453355550766, 'learning_rate': 9.662062632550849e-05, 'time_step': 0.12617412042617798, 'environment': 0.0}[0m [36mstep[0m=[35m1000[0m


Epoch 2/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2024-01-19 16:03.39[0m [[32m[1minfo     [0m] [1mDT_door-key-14x14_1_20240119155845: epoch=2 step=2000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0013260700702667236, 'time_algorithm_update': 0.1232634437084198, 'loss': 1.412864797592163, 'learning_rate': 9.998674832792865e-05, 'time_step': 0.12467044067382813, 'environment': 0.0}[0m [36mstep[0m=[35m2000[0m


Epoch 3/10:   0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
import gymnasium as gym

import numpy as np
from gym.wrappers import RecordVideo

# start virtual display
d3rlpy.notebook_utils.start_virtual_display()

env_video = create_env(env_key, max_episode_steps=200, is_video=True)

env_video = RecordVideo(env_video, './videos/video-doorkey-dt-d3rlpy')

seed = 14

explorer = d3rlpy.algos.ConstantEpsilonGreedy(0.3)

# wrap as stateful actor for interaction
actor = dt.as_stateful_wrapper(
    target_return=1,
    action_sampler=d3rlpy.algos.SoftmaxTransformerActionSampler(temperature=1.0,)
)

done = False

observation, reward = env_video.reset(seed=seed)

while True:

    #x = np.expand_dims(observation, axis=0)
    action = actor.predict(observation, 1) #explorer.sample(actor, x, 0)[0]

    observation, reward, done, truncated, _ = env_video.step(action)

    if done:
        print("reward:", reward)
        print("DONE!!!")
        break
    elif truncated:
        print("Truncated")
        break

env.close()

d3rlpy.notebook_utils.render_video("./videos/video-doorkey-dt-d3rlpy/rl-video-episode-0.mp4")


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Moviepy - Building video /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-dt-d3rlpy/rl-video-episode-0.mp4.
Moviepy - Writing video /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-dt-d3rlpy/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /home/drinf/Desktop/PROJECTS/machine_learning/fib_postgraduate/GameMindsDT/experiments/notebooks/videos/video-doorkey-dt-d3rlpy/rl-video-episode-0.mp4
Truncated


