In [17]:
import minari
import d3rlpy
import numpy as np
import os
import torch
import logging
from datetime import datetime

# Report library versions and CUDA visibility so the run environment is on record
version_report = [
    f"d3rlpy version: {d3rlpy.__version__}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
print("\n".join(version_report))

d3rlpy version: 2.8.1
PyTorch version: 2.6.0
CUDA available: False


# Parameters of the experiments

In [18]:
# Experiment-wide settings; the batch size was chosen with an A4000 16GB GPU in mind
_TRAINING_SETTINGS = {
    'n_steps': 1_000_000,         # total gradient steps per training run
    'n_steps_per_epoch': 5_000,   # epoch length -> 200 epochs, i.e. frequent checkpoints
    'batch_size': 1024,           # sized for the A4000's memory
    'seed': 42,
}
_EVALUATION_SETTINGS = {
    'n_episodes': 10,             # few rollouts per evaluation to keep it fast
    'eval_interval': 5_000,       # equals n_steps_per_epoch; not read by the cells visible here
}
CONFIG = {
    'training': _TRAINING_SETTINGS,
    'evaluation': _EVALUATION_SETTINGS,
    'device': 'cuda',             # requested training device
    'mixed_precision': True,      # memory optimisation flag; not read by the cells visible here
}

# Seed NumPy and PyTorch from the single configured value for reproducibility
_seed = CONFIG['training']['seed']
np.random.seed(_seed)
torch.manual_seed(_seed)

<torch._C.Generator at 0x10948a7b0>

# Loading and preparation of datasets and environments

In [19]:
# Load the expert Minari datasets for the four Adroit manipulation tasks
pen_dataset, relocate_dataset, hammer_dataset, door_dataset = map(
    minari.load_dataset,
    (
        "D4RL/pen/expert-v2",
        "D4RL/relocate/expert-v2",
        "D4RL/hammer/expert-v2",
        "D4RL/door/expert-v2",
    ),
)

# Rebuild, from each dataset, the environment it was recorded in
pen_env, relocate_env, hammer_env, door_env = (
    loaded.recover_environment()
    for loaded in (pen_dataset, relocate_dataset, hammer_dataset, door_dataset)
)

In [20]:
def prepare_d3_dataset(minari_dataset):
    """Convert a Minari dataset into a d3rlpy ``MDPDataset`` with continuous actions.

    Every episode yielded by ``minari_dataset.iterate_episodes()`` is flattened
    into step-aligned arrays. The final observation of each episode is dropped
    so that observations, actions, rewards and flags all have one entry per step.

    Terminations and truncations are kept apart: ``terminations`` are passed as
    ``terminals`` (true end of the MDP, no bootstrapping), while ``truncations``
    (time-limit cut-offs) are passed as ``timeouts``. Merging both into
    ``terminals`` would teach the critics that a time-limit cut-off has zero
    future value.

    Args:
        minari_dataset: Object exposing ``iterate_episodes()``; each episode
            must provide ``observations``, ``actions``, ``rewards``,
            ``terminations`` and ``truncations``.

    Returns:
        The ``d3rlpy.datasets.MDPDataset`` built from all episodes.

    Raises:
        ValueError: If the dataset contains no steps at all.
    """
    observations = []
    actions = []
    rewards = []
    terminals = []
    timeouts = []

    for episode in minari_dataset.iterate_episodes():
        terminated = np.asarray(episode.terminations, dtype=bool)
        truncated = np.asarray(episode.truncations, dtype=bool)
        if len(terminated) == 0:
            # Nothing to learn from an episode without steps
            continue

        # d3rlpy requires that a step is never both terminal and timed out;
        # a real termination takes precedence over a simultaneous truncation.
        timed_out = truncated & ~terminated

        # Guarantee an episode boundary on the last step so that consecutive
        # episodes are never merged when neither flag was recorded.
        if not (terminated[-1] or timed_out[-1]):
            timed_out[-1] = True

        # Drop the trailing observation: it has no matching action/reward
        observations.append(episode.observations[:-1])
        actions.append(episode.actions)
        rewards.append(episode.rewards)
        terminals.append(terminated)
        timeouts.append(timed_out)

    if not terminals:
        raise ValueError("Cannot build an MDPDataset: the Minari dataset contains no steps.")

    # Merge all episodes into flat arrays and build d3rlpy's MDPDataset
    return d3rlpy.datasets.MDPDataset(
        observations=np.concatenate(observations),
        actions=np.concatenate(actions),
        rewards=np.concatenate(rewards),
        terminals=np.concatenate(terminals),
        timeouts=np.concatenate(timeouts),
        action_space=d3rlpy.constants.ActionSpace.CONTINUOUS,
    )

In [21]:
# Convert each Minari dataset into d3rlpy's MDPDataset format for training
pen_d3_dataset, relocate_d3_dataset, hammer_d3_dataset, door_d3_dataset = (
    prepare_d3_dataset(minari_ds)
    for minari_ds in (pen_dataset, relocate_dataset, hammer_dataset, door_dataset)
)

[2m2025-06-20 09:44.41[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(24,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(45,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m
[2m2025-06-20 09:44.41[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m24[0m
[2m2025-06-20 09:44.44[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(30,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m
[2m2025-06-20 09:44.44[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction

# Creation of folders for policies and logs

In [22]:
def _report_dir(path):
    """Create ``path`` when it is missing and print which of the two cases occurred."""
    if os.path.exists(path):
        print(f"Already exists: {path}")
    else:
        os.makedirs(path)
        print(f"Created: {path}")


# Folder that receives the trained policies: policies/offline
policies_path = os.path.join("policies", "offline")
_report_dir(policies_path)

# One log folder per task: training_logs/offline/{task}
training_base = os.path.join("training_logs", "offline")
task_dirs = ["pen", "relocate", "hammer", "door"]

for task in task_dirs:
    task_path = os.path.join(training_base, task)
    _report_dir(task_path)

Already exists: policies/offline
Already exists: training_logs/offline/pen
Already exists: training_logs/offline/relocate
Already exists: training_logs/offline/hammer
Already exists: training_logs/offline/door


# Training Algorithm

In [23]:
def train_offline_algorithm(config_class, dataset, env, filename, task):
    """Train one offline RL algorithm on ``dataset``, log the run and save the model.

    The config is instantiated from ``config_class``, overridden with the
    per-algorithm hyperparameters below, given a standard observation scaler
    and the batch size from ``CONFIG``. Training runs on the device requested
    in ``CONFIG['device']``; if a CUDA device is requested but CUDA is not
    available, the run falls back to the CPU with a printed notice.

    Args:
        config_class: d3rlpy config class (e.g. ``d3rlpy.algos.IQLConfig``).
            Its class name selects the hyperparameter overrides.
        dataset: Dataset passed to ``build_with_dataset`` and ``fit``.
        env: Environment used by the ``EnvironmentEvaluator`` during training.
        filename: Base name of the saved model; written to
            ``policies/offline/{filename}.d3``.
        task: Task name; logs go under ``training_logs/offline/{task}``.

    Returns:
        The trained algorithm object on success, or ``None`` if fitting or
        saving raised an exception (the error is printed, not re-raised).
    """
    config = config_class()
    config_name = type(config).__name__
    algo_name = config_name.replace('Config', '')

    # hyperparameters based on literature
    hyperparams = {
        'IQLConfig': {
            'expectile': 0.8,  # Higher for expert data
            'weight_temp': 10.0,  # Higher for more focused weighting
            'max_weight': 100.0,
            'actor_learning_rate': 3e-4,
            'critic_learning_rate': 3e-4
        },
        'CQLConfig': {
            'conservative_weight': 10.0,  # Higher for expert data
            'initial_alpha': 1.0,
            'soft_q_backup': True,
            'actor_learning_rate': 1e-4,
            'critic_learning_rate': 3e-4
        },
        'TD3PlusBCConfig': {
            'alpha': 2.5,
            'actor_learning_rate': 3e-4,  # Consistent learning rates
            'critic_learning_rate': 3e-4,
            'target_smoothing_sigma': 0.2,
            'target_smoothing_clip': 0.5
        },
        'BCConfig': {
            'learning_rate': 1e-3,  # Higher for BC
            'policy_type': 'stochastic'  # Better for expert data
        },
        'AWACConfig': {
            'lam': 1.0,  # Higher for expert data
            'actor_learning_rate': 3e-4,
            'critic_learning_rate': 3e-4
        }
    }

    # Config classes without an entry keep their library defaults
    for key, value in hyperparams.get(config_name, {}).items():
        setattr(config, key, value)

    config.observation_scaler = d3rlpy.preprocessing.StandardObservationScaler()
    config.batch_size = CONFIG['training']['batch_size']

    # Resolve the training device from CONFIG instead of hard-coding the CPU.
    # The device is chosen through ``create(device=...)``, not a config attribute.
    device = CONFIG['device']
    if device == 'cuda':
        device = 'cuda:0'
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print(f"CUDA requested but not available; training {algo_name} on {task} on CPU.")
        device = 'cpu'

    # The algorithm can now be created on the resolved device
    algo = config.create(device=device)

    algo.build_with_dataset(dataset)

    policy_path = f'policies/offline/{filename}.d3'
    try:
        algo.fit(
            dataset=dataset,
            n_steps=CONFIG['training']['n_steps'],
            n_steps_per_epoch=CONFIG['training']['n_steps_per_epoch'],
            evaluators={"environment": d3rlpy.metrics.EnvironmentEvaluator(env, n_trials=CONFIG['evaluation']['n_episodes'])},
            logger_adapter=d3rlpy.logging.FileAdapterFactory(root_dir=f"training_logs/offline/{task}"),
            show_progress=True
        )
        # Make sure a finished run is not lost because the folder is missing
        os.makedirs(os.path.dirname(policy_path), exist_ok=True)
        algo.save(policy_path)
    except Exception as e:
        # Best effort: report the failure so the remaining runs can still start
        print(f"Training failed for {algo_name} on {task}: {e}")
        return None

    return algo

# Policy training

### Door

In [28]:
# Train every offline RL algorithm on the Door task through the D3RLpy configuration interface
door_runs = (
    (d3rlpy.algos.IQLConfig, 'door_iql'),
    (d3rlpy.algos.CQLConfig, 'door_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'door_td3bc'),
    (d3rlpy.algos.BCConfig, 'door_bc'),
    (d3rlpy.algos.AWACConfig, 'door_awac'),
)
for config_class, policy_name in door_runs:
    train_offline_algorithm(config_class, door_d3_dataset, door_env, policy_name, 'door')

[2m2025-06-20 09:46.33[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float64')], shape=[(39,)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(28,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=28)[0m
[2m2025-06-20 09:46.33[0m [[32m[1mdebug    [0m] [1mFitting observation scaler... [0m [36mobservation_scaler[0m=[35mstandard[0m
[2m2025-06-20 09:46.37[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/offline/door/AWAC_20250620094637[0m
[2m2025-06-20 09:46.37[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [39], 'action_size': 28, 'config': {'type': 'awac', 'params': {'batch_size': 1024, 'gamma': 0.99, 'observation_scaler': {'type': 'standard', 'params': {'mean': [-0.0011907210252678387, -0.545

Epoch 1/200:   0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Relocate

In [None]:
# Train every offline RL algorithm on the Relocate task through the D3RLpy configuration interface
relocate_runs = (
    (d3rlpy.algos.IQLConfig, 'relocate_iql'),
    (d3rlpy.algos.CQLConfig, 'relocate_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'relocate_td3bc'),
    (d3rlpy.algos.BCConfig, 'relocate_bc'),
    (d3rlpy.algos.AWACConfig, 'relocate_awac'),
)
for config_class, policy_name in relocate_runs:
    train_offline_algorithm(config_class, relocate_d3_dataset, relocate_env, policy_name, 'relocate')

### Pen

In [None]:
# Train every offline RL algorithm on the Pen task through the D3RLpy configuration interface
pen_runs = (
    (d3rlpy.algos.IQLConfig, 'pen_iql'),
    (d3rlpy.algos.CQLConfig, 'pen_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'pen_td3bc'),
    (d3rlpy.algos.BCConfig, 'pen_bc'),
    (d3rlpy.algos.AWACConfig, 'pen_awac'),
)
for config_class, policy_name in pen_runs:
    train_offline_algorithm(config_class, pen_d3_dataset, pen_env, policy_name, 'pen')

### Hammer

In [None]:
# Train every offline RL algorithm on the Hammer task through the D3RLpy configuration interface
hammer_runs = (
    (d3rlpy.algos.IQLConfig, 'hammer_iql'),
    (d3rlpy.algos.CQLConfig, 'hammer_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'hammer_td3bc'),
    (d3rlpy.algos.BCConfig, 'hammer_bc'),
    (d3rlpy.algos.AWACConfig, 'hammer_awac'),
)
for config_class, policy_name in hammer_runs:
    train_offline_algorithm(config_class, hammer_d3_dataset, hammer_env, policy_name, 'hammer')