In [13]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append(r'C:\Users\shaya\Documents\TU_projects\random\offline_multi_task_rl')

import gymnasium as gym
import dill

from four_room.env import FourRoomsEnv
from four_room.wrappers import gym_wrapper
from four_room.shortest_path import find_all_action_values
from four_room.utils import obs_to_state
from four_room_extensions import fourrooms_dataset_gen
from d3rlpy.algos import DiscreteDecisionTransformerConfig
from d3rlpy.metrics import EnvironmentEvaluator, TDErrorEvaluator, DiscreteActionMatchEvaluator, evaluate_transformer_with_environment
from d3rlpy.datasets import MDPDataset
from d3rlpy.logging import WanDBAdapterFactory
from d3rlpy.ope import FQEConfig, DiscreteFQE
from d3rlpy import load_learnable
import wandb
import numpy as np
import utils
from datetime import datetime
import imageio
import torch
from functools import partial
from tqdm import tqdm
from utils import get_DQN_checkpoints, create_env
import pickle
from four_room_extensions.fourrooms_dataset_gen import get_mixed_policy_dataset
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# --- Reproducibility: seed every RNG source used by this notebook ----------
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Task configuration names (passed to fourrooms_dataset_gen.get_config) --
train_config_path = 'train'
reachable_test_config_path = 'test_100'
unreachable_test_config_path = 'test_0'

# --- Training schedule and runtime options ---------------------------------
n_epochs, n_steps_per_epoch = 100, 50
render = False
# True when CUDA is available, otherwise None (same values as the original
# conditional expression); the value is later passed to d3rlpy's `create(device=...)`.
device = torch.cuda.is_available() or None

# --- Dataset selection ------------------------------------------------------
dataset_to_train_on_name = "mixed"
mixed_data_file = "[470000, 390000, 330000, 110000]--[0, 25, 50, 75].pkl"
DQN_mixed_data_path = os.path.join("..", "datasets", "dataset_from_models_", mixed_data_file)

# --- Weights & Biases bookkeeping ------------------------------------------
wandb_project_name = "DT_mixed"
wandb_run_name = f"{mixed_data_file}-seed_{seed}-DT_mixed"
# wandb_explanation = f"train on mixed {episode_length} with best_policy={best_policy}, test on (train, test_100, test_0)"

# if simulating, use these
# best_policy = True
# DQN_models_path = os.path.join("/kaggle/working/offline_multi_task_rl", "four_room_extensions", "DQN_models", "performance_per_model.txt")
# episode_length = [0, 25, 50, 75, 100]
# wandb_run_name = f"{episode_length}-best_policy-100epochs-50stepsPerEpoch"


In [25]:
def _to_mdp_dataset(raw_transitions):
    """Build a d3rlpy ``MDPDataset`` from a raw dataset mapping.

    Args:
        raw_transitions: Mapping that supports ``.get`` and holds the entries
            "observations", "actions", "rewards" and "terminals".

    Returns:
        An ``MDPDataset`` constructed from those four entries.
    """
    return MDPDataset(
        observations=raw_transitions.get("observations"),
        actions=raw_transitions.get("actions"),
        rewards=raw_transitions.get("rewards"),
        terminals=raw_transitions.get("terminals"),
    )


# Expert dataset + environment for the training configuration.
train_config = fourrooms_dataset_gen.get_config(train_config_path)
train_dataset, train_env, tasks_finished, tasks_failed = fourrooms_dataset_gen.get_expert_dataset_from_config(
    train_config, render=render, render_name="DT_train_expert"
)
train_dataset = _to_mdp_dataset(train_dataset)

# Expert dataset + environment for the "test_100" (reachable) configuration.
# Note: tasks_finished / tasks_failed are overwritten by each call below, so
# after this cell they describe the last (unreachable) configuration only.
test_config_reachable = fourrooms_dataset_gen.get_config(reachable_test_config_path)
test_dataset_reachable, test_env_reachable, tasks_finished, tasks_failed = fourrooms_dataset_gen.get_expert_dataset_from_config(
    test_config_reachable, render=render, render_name="DT_test_expert_reachable"
)
test_dataset_reachable = _to_mdp_dataset(test_dataset_reachable)

# Expert dataset + environment for the "test_0" (unreachable) configuration.
test_config_unreachable = fourrooms_dataset_gen.get_config(unreachable_test_config_path)
test_dataset_unreachable, test_env_unreachable, tasks_finished, tasks_failed = fourrooms_dataset_gen.get_expert_dataset_from_config(
    test_config_unreachable, render=render, render_name="DT_test_expert_unreachable"
)
test_dataset_unreachable = _to_mdp_dataset(test_dataset_unreachable)

# checkpoints = get_DQN_checkpoints(DQN_models_path, episode_length, best_policy=best_policy)
# The training environment returned above is replaced by a freshly created one.
train_env = create_env(train_config)
# mixed_dataset, finished, failed = get_mixed_policy_dataset(train_config, train_env, checkpoints)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load dataset files that were produced by this project / a trusted source.
with open(DQN_mixed_data_path, 'rb') as f:
    mixed_dataset = pickle.load(f)

mixed_dataset = _to_mdp_dataset(mixed_dataset)

2024-06-21 18:55.34 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('uint8')], shape=[(324,)]) reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)])
2024-06-21 18:55.34 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2024-06-21 18:55.34 [info     ] Action size has been automatically determined. action_size=3
2024-06-21 18:55.38 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('uint8')], shape=[(324,)]) reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)])
2024-06-21 18:55.38 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2024-06-21 18:55.38 [info     ] Action size has been automatically determined. action_size=3
2024-06-21 18:

<d3rlpy.dataset.compat.MDPDataset object at 0x000001ED27626680>


In [6]:
# Sanity check: report how many episodes the loaded mixed dataset contains.
print(len(mixed_dataset.episodes))

122


In [26]:
def model_saver_d3rlpy_callback(algo, epoch, total_step, n_epochs, n_steps_per_epoch, title_addition = "", save_every = 1000):
    """
    Training callback that periodically checkpoints the model.

    A checkpoint is written whenever ``total_step`` is a multiple of
    ``save_every`` (this includes ``total_step == 0``). The file name embeds
    ``title_addition``, the step and the current wall-clock time.

    Args:
        algo: The algorithm object; only its ``save(path)`` method is used.
        epoch: The current epoch (accepted for the callback signature, not used).
        total_step: The total number of steps taken so far.
        n_epochs: The total number of epochs (not used).
        n_steps_per_epoch: The number of steps in each epoch (not used).
        title_addition: Free-form text inserted into the checkpoint file name.
        save_every: Positive step interval between checkpoints. Defaults to
            1000, the interval that was previously hard-coded.

    Raises:
        ValueError: If ``save_every`` is not positive.
    """
    if save_every <= 0:
        raise ValueError("save_every must be a positive integer")

    if total_step % save_every == 0:
        timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
        algo.save(f"dt_{title_addition}_model_at_step_{total_step}_{timestamp}.d3")
        
def eval_model(policy, env, n_episodes):
    """
    Roll out ``policy`` in ``env`` for ``n_episodes`` episodes.

    Before each episode the policy is reset and the environment is reset with
    the notebook-level ``seed``. At every step the policy receives the current
    observation and the reward of the previous step (0.0 at episode start).

    Args:
        policy: Object exposing ``reset()`` and ``predict(observation, reward)``.
        env: Environment whose ``reset(seed=...)`` returns a sequence with the
            observation first and whose ``step(action)`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to run.

    Returns:
        Tuple ``(mean reward per episode, mean number of steps per episode)``.
    """
    # NOTE(review): the same global `seed` is passed to every reset — confirm
    # this is intended for environments whose reset depends on the seed.
    reward_sum = 0
    step_count = 0
    for _episode in range(n_episodes):
        policy.reset()
        observation = env.reset(seed=seed)[0]
        last_reward = 0.0

        while True:
            step_count += 1
            action = policy.predict(observation, last_reward)

            observation, raw_reward, terminated, truncated, _info = env.step(action)
            last_reward = float(raw_reward)
            reward_sum += last_reward
            if terminated or truncated:
                break

    return reward_sum / n_episodes, step_count / n_episodes

# dt = load_learnable("/kaggle/input/model-checkpoints/dt_model_at_epoch_1_20240522-124018.d3")

# Train and Test

In [27]:
# initialize neural networks with the given observation shape and action size.
# this is not necessary when you directly call fit or fit_online method.
# dt.build_with_dataset(dataset)

# NOTE: this deliberately overrides the `device` chosen in the configuration
# cell; every later cell that reads `device` will therefore also see "cpu".
device="cpu"
wandb_config = {
#     "explanation": wandb_explanation,
                "n_epochs": n_epochs,
                "n_steps_per_epoch": n_steps_per_epoch,
                "dataset": mixed_data_file,
                "group": "DT",
#                 "episode length": episode_length,
                # "checkpoints": checkpoints,
                # "best_policy": best_policy,
                }

dt = DiscreteDecisionTransformerConfig().create(device=device)

model_saver_d3rlpy_callback_partial = partial(model_saver_d3rlpy_callback, n_epochs=n_epochs, n_steps_per_epoch=n_steps_per_epoch, title_addition=wandb_run_name)

# Wrap each environment exactly once so that re-running this cell (or running
# another cell that wraps the same variables) does not stack wrappers.
if not isinstance(train_env, utils.ObservationFlattenerWrapper):
    train_env = utils.ObservationFlattenerWrapper(train_env)
if not isinstance(test_env_reachable, utils.ObservationFlattenerWrapper):
    test_env_reachable = utils.ObservationFlattenerWrapper(test_env_reachable)
if not isinstance(test_env_unreachable, utils.ObservationFlattenerWrapper):
    test_env_unreachable = utils.ObservationFlattenerWrapper(test_env_unreachable)

if dataset_to_train_on_name == "train":
    dataset_to_train_on = train_dataset
elif dataset_to_train_on_name == "mixed":
    dataset_to_train_on = mixed_dataset
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unknown dataset_to_train_on_name: {dataset_to_train_on_name!r} (expected 'train' or 'mixed')")

with wandb.init(project=wandb_project_name, name=wandb_run_name, config=wandb_config, entity="gold-ai"):
    for epoch in tqdm(range(n_epochs)):
        # Steps completed by the previous fit() calls of this loop.
        steps_before_this_fit = epoch * n_steps_per_epoch

        def _checkpoint_callback(algo, _fit_epoch, fit_step, _offset=steps_before_this_fit, _outer_epoch=epoch + 1):
            """Forward to the checkpoint callback using the global step count.

            fit() is called once per outer epoch with only n_steps_per_epoch
            steps, so the step count it reports restarts on every call and would
            never reach the callback's checkpoint interval. Adding the steps of
            the earlier calls makes the periodic checkpoints actually trigger.
            """
            model_saver_d3rlpy_callback_partial(algo, _outer_epoch, _offset + fit_step)

        # n_steps = n_steps_per_epoch, because we want to do manual evaluations
        # between consecutive fit() calls; checkpoints are handled by the callback.
        dt.fit(dataset_to_train_on, n_steps=n_steps_per_epoch, n_steps_per_epoch=n_steps_per_epoch, callback=_checkpoint_callback, show_progress=False, save_interval=1000)

        # Manual evaluation: a fresh stateful wrapper per environment, one
        # episode per topology listed in the corresponding config.
        train_eval_score, train_num_steps = eval_model(
            dt.as_stateful_wrapper(target_return=1, action_sampler=None),
            train_env,
            len(train_config["topologies"]),
        )
        test_reachable_eval_score, test_reachable_num_steps = eval_model(
            dt.as_stateful_wrapper(target_return=1, action_sampler=None),
            test_env_reachable,
            len(test_config_reachable["topologies"]),
        )
        test_unreachable_eval_score, test_unreachable_num_steps = eval_model(
            dt.as_stateful_wrapper(target_return=1, action_sampler=None),
            test_env_unreachable,
            len(test_config_unreachable["topologies"]),
        )

        # Both metric groups belong to the same step, so log them together.
        wandb.log(
            {
                "Cumulative Reward": {"Train": train_eval_score, "Test_reachable": test_reachable_eval_score, "Test_unreachable": test_unreachable_eval_score},
                "Number of steps taken": {"Train": train_num_steps, "Test_reachable": test_reachable_num_steps, "Test_unreachable": test_unreachable_num_steps},
            },
            step=(epoch+1) * n_steps_per_epoch,
        )
        


        # testing with d3rlpy library
        # train_eval_score = evaluate_transformer_with_environment(
        #                 algo=dt.as_stateful_wrapper(
        #                     target_return=len(train_config["topologies"]),
        #                     action_sampler=None,
        #                 ),
        #                 env=train_env,
        #                 n_trials=len(train_config["topologies"]),
        #             )
        
        # test_reachable_eval_score = evaluate_transformer_with_environment(
        #                 algo=dt.as_stateful_wrapper(
        #                     target_return=len(test_config_reachable["topologies"]),
        #                     action_sampler=None,
        #                 ),
        #                 env=test_env_reachable,
        #                 n_trials=len(test_config_reachable["topologies"]),
        #             )
        
        # test_unreachable_eval_score = evaluate_transformer_with_environment(
        #                 algo=dt.as_stateful_wrapper(
        #                     target_return=len(test_config_unreachable["topologies"]),
        #                     action_sampler=None,
        #                 ),
        #                 env=test_env_unreachable,
        #                 n_trials=len(test_config_unreachable["topologies"]),
        #             )
        


# offline training
# dt.fit(train_dataset,
#         n_steps=n_steps,
#         n_steps_per_epoch=n_steps_per_epoch,
#         eval_env=train_env,
#         eval_target_return=0,
#         logger_adapter=WanDBAdapterFactory(project="DT"),
#         callback=model_saver_d3rlpy_callback_partial)

# Save the final model after the training loop; the file name carries the
# current wall-clock timestamp so repeated runs do not overwrite each other.
dt.save(f"dt_final_model_{datetime.now().strftime('%Y%m%d-%H%M%S')}.d3")

  0%|          | 0/100 [00:00<?, ?it/s]

2024-06-21 18:55.53 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(324,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=3)
2024-06-21 18:55.53 [info     ] Directory is created at d3rlpy_logs\DiscreteDecisionTransformer_20240621185553
2024-06-21 18:55.53 [debug    ] Building models...            
2024-06-21 18:55.54 [debug    ] Models have been built.       
2024-06-21 18:55.54 [info     ] Parameters                     params={'observation_shape': [324], 'action_size': 3, 'config': {'type': 'discrete_decision_transformer', 'params': {'batch_size': 128, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'context_size': 20, 'max_timestep': 1000, 'learni

  0%|          | 0/100 [02:24<?, ?it/s]
Traceback (most recent call last):
  File "C:\Users\shaya\AppData\Local\Temp\ipykernel_30396\331117765.py", line 33, in <module>
    dt.fit(dataset_to_train_on, n_steps=n_steps_per_epoch, n_steps_per_epoch=n_steps_per_epoch, callback=model_saver_d3rlpy_callback_partial, show_progress=False, save_interval=1000)
  File "c:\Users\shaya\Documents\TU_projects\random\offline_multi_task_rl\.venv\lib\site-packages\d3rlpy\algos\transformer\base.py", line 479, in fit
    loss = self.update(batch)
  File "c:\Users\shaya\Documents\TU_projects\random\offline_multi_task_rl\.venv\lib\site-packages\d3rlpy\algos\transformer\base.py", line 536, in update
    loss = self._impl.update(torch_batch, self._grad_step)
  File "c:\Users\shaya\Documents\TU_projects\random\offline_multi_task_rl\.venv\lib\site-packages\d3rlpy\torch_utility.py", line 398, in wrapper
    return f(self, *args, **kwargs)  # type: ignore
  File "c:\Users\shaya\Documents\TU_projects\random\offline

KeyboardInterrupt: 

## Hyperparameter optimization

In [None]:
import wandb
from d3rlpy.algos import DiscreteDecisionTransformerConfig
from datetime import datetime
from tqdm import tqdm
from functools import partial
import utils

# Define the sweep configuration for Bayesian optimization
sweep_config = {
    'method': 'bayes',
    'metric': {
        # Nested wandb.log dictionaries are addressed with a dotted name.
        'name': 'Cumulative Reward.Test_unreachable',
        'goal': 'maximize'
    },
    # 'early_terminate': {
    #     'type': 'hyperband',
    #     'min_iter': 10
    # },
    'parameters': {
        'batch_size': {'values': [16, 32, 64, 128]},
        'gamma': {'values': [0.95, 0.98, 0.99, 0.999]},
        # TODO what is context_size?
        # (presumably the length of the history window fed to the transformer — confirm in the d3rlpy docs)
        'context_size': {'values': [5, 10, 15, 20, 25, 30]},
        # TODO what is max_timestep?
        # (presumably the largest environment timestep the model can embed — confirm in the d3rlpy docs)
        'max_timestep': {'values': [50, 100, 300, 700, 1000]},
        'learning_rate': {'values': [1e-2, 1e-3, 1e-4, 6e-4, 1e-5]},
        # NOTE(review): multi-head attention usually requires the embedding size
        # to be divisible by num_heads — verify that 3, 6 and 10 are valid here.
        'num_heads': {'values': [3, 6, 8, 10]},
        'num_layers': {'values': [2, 4, 6, 8]},
        'attn_dropout': {'values': [0.05, 0.1, 0.15, 0.2, 0.25]},
        'resid_dropout': {'values': [0.05, 0.1, 0.15, 0.2, 0.25]},
        'embed_dropout': {'values': [0.05, 0.1, 0.15, 0.2, 0.25]},
        'warmup_tokens': {'values': [512, 1024, 4096, 10240]},
        'n_steps_per_epoch': {'values': [50, 100, 250, 500, 1000, 2000]},
    }
}

# Total number of gradient steps per sweep run, independent of n_steps_per_epoch.
n_total_steps = 10000


# Wrap each environment exactly once: other cells wrap the same variables, and
# re-running this cell must not stack another wrapper on top.
if not isinstance(train_env, utils.ObservationFlattenerWrapper):
    train_env = utils.ObservationFlattenerWrapper(train_env)
if not isinstance(test_env_reachable, utils.ObservationFlattenerWrapper):
    test_env_reachable = utils.ObservationFlattenerWrapper(test_env_reachable)
if not isinstance(test_env_unreachable, utils.ObservationFlattenerWrapper):
    test_env_unreachable = utils.ObservationFlattenerWrapper(test_env_unreachable)

# Define the training function
def train(config=None):
    """Run one sweep trial: train a Decision Transformer and log evaluations.

    Hyperparameters are read from ``wandb.config`` after ``wandb.init``. The
    model is trained on ``train_dataset`` in chunks of ``n_steps_per_epoch``
    steps until ``n_total_steps`` steps are reached; after each chunk it is
    evaluated with ``eval_model`` on the train, reachable-test and
    unreachable-test environments. The final model and the run configuration
    are written to ``run_<run_name>.d3`` and ``run_<run_name>.txt``.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
    """
    run_name = f"DT_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    # Initialize wandb
    with wandb.init(config=config, name=run_name):
        # Retrieve hyperparameters
        config = wandb.config

        # Initialize Decision Transformer with hyperparameters
        dt_config = DiscreteDecisionTransformerConfig(
            learning_rate=config.learning_rate,
            batch_size=config.batch_size,
            num_layers=config.num_layers,
            num_heads=config.num_heads,
            context_size=config.context_size,
            attn_dropout=config.attn_dropout,
            resid_dropout=config.resid_dropout,
            embed_dropout=config.embed_dropout,
            max_timestep=config.max_timestep,
            warmup_tokens=config.warmup_tokens,
            gamma=config.gamma
        )
        dt = dt_config.create(device=device)

        n_steps_per_epoch = config.n_steps_per_epoch
        n_epochs = n_total_steps // n_steps_per_epoch

        # NOTE(review): every evaluation (including both test sets) conditions on
        # the number of *training* topologies, while the main training cell uses
        # target_return=1 — confirm which return target is intended.
        target_return = len(train_config["topologies"])

        # (log label, environment, config providing the number of episodes)
        eval_suites = (
            ("Train", train_env, train_config),
            ("Test_reachable", test_env_reachable, test_config_reachable),
            ("Test_unreachable", test_env_unreachable, test_config_unreachable),
        )

        for epoch in tqdm(range(n_epochs)):
            # Train the model
            dt.fit(train_dataset, n_steps=n_steps_per_epoch, n_steps_per_epoch=n_steps_per_epoch, show_progress=False, save_interval=10000000)    # Don't save here

            # Evaluate the model on training and test environments
            eval_scores = {}
            eval_num_steps = {}
            for label, eval_env, eval_config in eval_suites:
                eval_scores[label], eval_num_steps[label] = eval_model(
                    dt.as_stateful_wrapper(target_return=target_return, action_sampler=None),
                    eval_env,
                    len(eval_config["topologies"]),
                )

            # Both metric groups belong to the same step, so log them together.
            wandb.log(
                {
                    "Cumulative Reward": eval_scores,
                    "Total Number of steps taken": eval_num_steps,
                },
                step=(epoch+1) * n_steps_per_epoch,
            )

        # Save the final model, with config
        dt.save(f"run_{run_name}.d3")
        config_dict = config.as_dict()
        config_dict["run_name"] = run_name
        # save as txt
        with open(f"run_{run_name}.txt", "w") as f:
            f.write(str(config_dict))



# Initialize the sweep
sweep_id = wandb.sweep(sweep_config, project="DT_hyperparameter_tuning")

# Run the sweep
wandb.agent(sweep_id, function=train, count=50)


### Reachable test set evaluation

In [15]:
# Test the trained model with offline policy evaluator on reachable test set

imgs = []
# wrap as stateful actor for interaction
try:
    actor = dt.as_stateful_wrapper(target_return=0)

    offline_policy_evaluator = DiscreteFQE(algo=actor, config=FQEConfig())
    # test with offline dataset
    offline_policy_evaluator.fit(test_dataset_reachable,
                                n_steps=2,
                                n_steps_per_epoch=1,
                                evaluators={"metric_td_error": TDErrorEvaluator(),
                                            "metric_discrete_action_match": DiscreteActionMatchEvaluator()},
                                logger_adapter=WanDBAdapterFactory(project="DT")
                        )

    # reset history
    actor.reset()
except Exception as exc:
    # Catch Exception only, so that KeyboardInterrupt / SystemExit still stop
    # the cell, and report why the FQE path failed instead of hiding it.
    print(f"FQE evaluation failed ({exc!r}); falling back to explicit rollouts")
    # do hardcoded test
    # Test model explicitly
    # Wrap only once: other cells may already have wrapped this environment.
    if not isinstance(test_env_reachable, utils.ObservationFlattenerWrapper):
        test_env_reachable = utils.ObservationFlattenerWrapper(test_env_reachable)
    rewards = []
    # wrap as stateful actor for interaction
    actor = dt.as_stateful_wrapper(target_return=0)
    # explicitly evaluate the model, one episode per topology in the config
    for _ in range(len(test_config_reachable["topologies"])):
        # Clear the actor's history so one episode does not leak into the
        # next (eval_model resets its policy per episode in the same way).
        actor.reset()
        # NOTE(review): the frame is rendered before reset(), so it shows the
        # state left by the previous episode — confirm this is intended.
        img = test_env_reachable.render()
        imgs.append(img)
        observation, _ = test_env_reachable.reset()
        reward = 0.0
        done = False
        while not done:
            action = actor.predict(observation, reward)
            observation, reward, terminated, truncated, _ = test_env_reachable.step(action)
            done = terminated or truncated
            rewards.append(reward)

    actor.reset()

    print("Cumulative Reward: ", np.sum(rewards))
    imageio.mimsave('test_rendered_episode.gif', [np.array(img) for img in imgs], duration=200)

2024-05-16 12:56.43 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(324,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=3)


2024-05-16 12:56.50 [debug    ] Building models...            
here
2024-05-16 12:56.50 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('uint8')], shape=[(324,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=3)
2024-05-16 12:56.50 [info     ] Directory is created at d3rlpy_logs\DiscreteDecisionTransformer_20240516125650
2024-05-16 12:56.50 [info     ] Parameters                     params={'observation_shape': [324], 'action_size': 3, 'config': {'type': 'discrete_decision_transformer', 'params': {'batch_size': 128, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'context_size': 20, 'max_timestep': 1000, 'learning_rate': 0.0006, 'encoder_factory': {'type': 'default', '

KeyboardInterrupt: 

### Unreachable test set evaluation

In [None]:
# Test the trained model with offline policy evaluator on unreachable test set

imgs = []
# wrap as stateful actor for interaction
try:
    actor = dt.as_stateful_wrapper(target_return=0)

    offline_policy_evaluator = DiscreteFQE(algo=actor, config=FQEConfig())
    # test with offline dataset
    offline_policy_evaluator.fit(test_dataset_unreachable,
                                n_steps=2,
                                n_steps_per_epoch=1,
                                evaluators={"metric_td_error": TDErrorEvaluator(),
                                            "metric_discrete_action_match": DiscreteActionMatchEvaluator()},
                                logger_adapter=WanDBAdapterFactory(project="DT")
                        )

    # reset history
    actor.reset()
except Exception as exc:
    # Catch Exception only, so that KeyboardInterrupt / SystemExit still stop
    # the cell, and report why the FQE path failed instead of hiding it.
    print(f"FQE evaluation failed ({exc!r}); falling back to explicit rollouts")
    # do hardcoded test
    # Test model explicitly
    # Wrap only once: other cells may already have wrapped this environment.
    if not isinstance(test_env_unreachable, utils.ObservationFlattenerWrapper):
        test_env_unreachable = utils.ObservationFlattenerWrapper(test_env_unreachable)
    rewards = []
    # wrap as stateful actor for interaction
    actor = dt.as_stateful_wrapper(target_return=0)
    # explicitly evaluate the model, one episode per topology in the config
    for _ in range(len(test_config_unreachable["topologies"])):
        # Clear the actor's history so one episode does not leak into the
        # next (eval_model resets its policy per episode in the same way).
        actor.reset()
        # NOTE(review): the frame is rendered before reset(), so it shows the
        # state left by the previous episode — confirm this is intended.
        img = test_env_unreachable.render()
        imgs.append(img)
        observation, _ = test_env_unreachable.reset()
        reward = 0.0
        done = False
        while not done:
            action = actor.predict(observation, reward)
            observation, reward, terminated, truncated, _ = test_env_unreachable.step(action)
            done = terminated or truncated
            rewards.append(reward)

    actor.reset()

    print("Cumulative Reward: ", np.sum(rewards))
    imageio.mimsave('test_unreachable_rendered_episode.gif', [np.array(img) for img in imgs], duration=200)

# Testing with tutorial

In [None]:
# Tutorial-style smoke test: train a DQN on the CartPole dataset and then run
# fitted Q evaluation (FQE) on it. This cell relies on names from earlier
# cells: `device`, `DiscreteActionMatchEvaluator` and `WanDBAdapterFactory`.
# It also rebinds the notebook-level names `dataset`, `env` and `rewards`.
from d3rlpy.datasets import get_cartpole # CartPole-v1 dataset
dataset, env = get_cartpole()
from d3rlpy.algos import DQNConfig

# if you don't use GPU, set device=None instead.
dqn = DQNConfig().create(device=device)

# initialize neural networks with the given observation shape and action size.
# this is not necessary when you directly call fit or fit_online method.
dqn.build_with_dataset(dataset)
from d3rlpy.metrics import TDErrorEvaluator

# calculate metrics with training dataset
td_error_evaluator = TDErrorEvaluator(episodes=dataset.episodes)


from d3rlpy.metrics import EnvironmentEvaluator

# set environment in scorer function
env_evaluator = EnvironmentEvaluator(env)

# evaluate algorithm on the environment (before any training has happened)
rewards = env_evaluator(dqn, dataset=None)

# Short training run; all three evaluators are reported to the
# "random_test_runs" W&B project through the logger adapter.
dqn.fit(
    dataset,
    n_steps=10,
    n_steps_per_epoch=2,
    evaluators={
        'td_error': td_error_evaluator,
        'reward': env_evaluator,
        "metric_discrete_action_match": DiscreteActionMatchEvaluator()
    },
    logger_adapter=WanDBAdapterFactory(project="random_test_runs")
)

import d3rlpy

# prepare the trained algorithm (the `dqn` fitted above is reused)

# dataset to evaluate with (loaded again; rebinds `dataset` and `env`)
dataset, env = get_cartpole()

# off-policy evaluation algorithm
fqe = d3rlpy.ope.DiscreteFQE(algo=dqn, config=d3rlpy.ope.FQEConfig(), device=device)

# train estimators to evaluate the trained policy
# (reuses the evaluator objects that were created before the dataset reload)
fqe.fit(
   dataset,
   n_steps=10,
   n_steps_per_epoch=2,
   evaluators={
        'td_error': td_error_evaluator,
        'environment': env_evaluator,
        "metric_discrete_action_match": DiscreteActionMatchEvaluator()
    },
   logger_adapter=WanDBAdapterFactory(project="random_test_runs")
)

In [20]:
import d3rlpy

# Tutorial-style smoke test of the Decision Transformer on CartPole.
# Heads-up: this rebinds the notebook-level names `dataset`, `env`, `dt` and
# `actor`, replacing whatever earlier cells stored under those names.
dataset, env = d3rlpy.datasets.get_cartpole()

dt = d3rlpy.algos.DiscreteDecisionTransformerConfig().create(device=device)

# Very short offline training run with in-training environment evaluation;
# whatever fit() returns is printed.
print(dt.fit(dataset, n_steps=2, n_steps_per_epoch=2, eval_env=env, eval_target_return=0))

# Stateful actor that tracks the interaction history on its own.
actor = dt.as_stateful_wrapper(target_return=0)

# Roll out a single episode: the first element of reset()'s result is the
# observation, and the reward fed to the first predict() call is 0.0.
observation = env.reset()[0]
reward = 0.0
done = truncated = False
while not (done or truncated):
    action = actor.predict(observation, reward)
    observation, reward, done, truncated, _ = env.step(action)

# Clear the actor's history now that the episode is over.
actor.reset()

2024-05-16 08:58.11 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(4,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)])
2024-05-16 08:58.11 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2024-05-16 08:58.11 [info     ] Action size has been automatically determined. action_size=2
2024-05-16 08:58.11 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(4,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2024-05-16 08:58.11 [info     ] Directory is created at d3rlpy_logs\DiscreteDecisionTransformer_20240516085811
2024-05-16 08:58.11 [debug    ] Building mod

Epoch 1/1: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it, loss=0.733, learning_rate=0.000131]
  if not isinstance(terminated, (bool, np.bool8)):


2024-05-16 08:58.20 [info     ] DiscreteDecisionTransformer_20240516085811: epoch=1 step=2 epoch=1 metrics={'time_sample_batch': 0.014497637748718262, 'time_algorithm_update': 3.049818992614746, 'loss': 1.3235219717025757, 'learning_rate': 0.00019494140625, 'time_step': 3.0648165941238403, 'environment': 9.4} step=2
2024-05-16 08:58.21 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteDecisionTransformer_20240516085811\model_2.d3
None
