In [1]:
import minari
import d3rlpy
import os
import copy

# Parameters of the experiment

In [2]:
# Interaction budget for every online run, expressed in environment steps.
n_steps_per_epoch = 10  # an epoch (evaluation + checkpoint in the logs) closes every 10 steps
update_start_step = 50  # step from which parameter updates start
n_steps = 100           # total number of steps per training run

# Loading of environments

In [3]:
# Each Adroit environment is recovered from the metadata of its expert dataset.
# The datasets are loaded one after another, in the order pen, relocate, hammer, door.
pen_env, relocate_env, hammer_env, door_env = (
    minari.load_dataset(dataset_id).recover_environment()
    for dataset_id in (
        "D4RL/pen/expert-v2",
        "D4RL/relocate/expert-v2",
        "D4RL/hammer/expert-v2",
        "D4RL/door/expert-v2",
    )
)

# Creation of folders for policies and logs

In [2]:
def _ensure_dir(path):
    """Create `path` (including parents) if it is missing and report the outcome.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is no
    window between checking and creating. If `path` exists but is not a
    directory, ``os.makedirs`` raises ``FileExistsError`` instead of the
    problem being silently reported as "Already exists".
    """
    already_present = os.path.isdir(path)
    os.makedirs(path, exist_ok=True)
    if already_present:
        print(f"Already exists: {path}")
    else:
        print(f"Created: {path}")


# Folder that receives the saved policies: policies/online
policies_path = os.path.join("policies", "online")
_ensure_dir(policies_path)

# One log folder per task: training_logs/online/{task}
training_base = os.path.join("training_logs", "online")
task_dirs = ["pen", "relocate", "hammer", "door"]

for task in task_dirs:
    task_path = os.path.join(training_base, task)
    _ensure_dir(task_path)

Created: policies/online
Created: training_logs/online/pen
Created: training_logs/online/relocate
Created: training_logs/online/hammer
Created: training_logs/online/door


# Training Algorithm

In [5]:
def train_online_algorithm(config_class, env, filename, task):
    """Train one d3rlpy algorithm fully online and save the resulting policy.

    Reads the notebook-level settings ``n_steps``, ``n_steps_per_epoch`` and
    ``update_start_step``.

    Args:
        config_class: d3rlpy config class (e.g. ``d3rlpy.algos.IQLConfig``);
            it is instantiated without arguments, i.e. with its defaults.
        env: Environment used for data collection. A deep copy of it is used
            as the evaluation environment.
        filename: Base name (without extension) of the saved policy file.
        task: Task name; selects the ``training_logs/online/<task>`` folder.

    Returns:
        None. The trained model is written to ``policies/online/<filename>.d3``.
    """
    policy_dir = os.path.join("policies", "online")
    log_dir = os.path.join("training_logs", "online", task)

    # Make sure the output folders exist before training starts, so a run
    # cannot fail at save time because the folder-creation cell was skipped.
    os.makedirs(policy_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # FIFO replay buffer that stores the transitions collected online
    buffer = d3rlpy.dataset.create_fifo_replay_buffer(limit=100000, env=env)

    # The Adroit tasks have continuous action spaces (the training logs report
    # ActionSpace.CONTINUOUS), so exploration uses additive Gaussian noise.
    # d3rlpy's epsilon-greedy explorers are meant for discrete actions: their
    # random branch samples an integer action index, which is not a valid
    # continuous action vector.
    explorer = d3rlpy.algos.NormalNoise(mean=0.0, std=0.1)

    # Build the algorithm with its default configuration on CPU
    algo = config_class().create(device="cpu")

    # Train online; evaluation runs on a separate copy of the environment so
    # that it does not interfere with the data-collection episodes.
    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=n_steps,
        eval_env=copy.deepcopy(env),
        n_steps_per_epoch=n_steps_per_epoch,
        update_start_step=update_start_step,
        logger_adapter=d3rlpy.logging.FileAdapterFactory(root_dir=log_dir),
    )

    # Save the trained policy to file
    algo.save(os.path.join(policy_dir, f"{filename}.d3"))

# Policy training

### Pen

In [6]:
# Pen task: every algorithm is trained online from scratch, one after another
for config_class, policy_name in (
    (d3rlpy.algos.IQLConfig, 'pen_iql'),
    (d3rlpy.algos.CQLConfig, 'pen_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'pen_td3bc'),
    (d3rlpy.algos.AWACConfig, 'pen_awac'),
):
    train_online_algorithm(config_class, pen_env, policy_name, 'pen')

[2m2025-05-24 10:02.30[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(24,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(45,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:02.30[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:02.30[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m24[0m
[2m2025-05-24 10:02.30[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-24 10:02.31[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-24 10:02.31[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/pen/IQL_online_20250524100231[0m
[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.31[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/IQL_online_20250524100231/model_10.d3[0m
[2m2025-05-24 10:02.31[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100231: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0004892110824584961, 'time_environment_step': 0.0004047870635986328, 'time_step': 0.0009168624877929687, 'evaluation': 273.13161606365304}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.32[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/IQL_online_20250524100231/model_20.d3[0m
[2m2025-05-24 10:02.32[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100231: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001165628433227539, 'time_environment_step': 0.00028781890869140626, 'time_step': 0.00041322708129882814, 'evaluation': -9.301654342465714}[0m [36mstep[0m=[35m20[0m
[2m20

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/CQL_online_20250524100234/model_10.d3[0m
[2m2025-05-24 10:02.34[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100234: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00017976760864257812, 'time_environment_step': 0.0003695964813232422, 'time_step': 0.0005625724792480469, 'evaluation': 244.4638515782745}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.35[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/CQL_online_20250524100234/model_20.d3[0m
[2m2025-05-24 10:02.35[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100234: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001439809799194336, 'time_environment_step': 0.0002608299255371094, 'time_step': 0.0004144430160522461, 'evaluation': 787.9554209017721}[0m [36mstep[0m=[35m20[0m
[2m2025-

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/TD3PlusBC_online_20250524100237/model_10.d3[0m
[2m2025-05-24 10:02.37[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100237: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00021226406097412108, 'time_environment_step': 0.0003108978271484375, 'time_step': 0.0005338191986083984, 'evaluation': 40.50007692776695}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/TD3PlusBC_online_20250524100237/model_20.d3[0m
[2m2025-05-24 10:02.37[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100237: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011448860168457031, 'time_environment_step': 0.0002675771713256836, 'time_step': 0.0003910541534423828, 'evaluation': 417.5602188942351}[0m [36mstep

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.40[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/AWAC_online_20250524100240/model_10.d3[0m
[2m2025-05-24 10:02.40[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100240: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00024137496948242187, 'time_environment_step': 0.00031671524047851565, 'time_step': 0.0005704641342163086, 'evaluation': 65.46801105508864}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/AWAC_online_20250524100240/model_20.d3[0m
[2m2025-05-24 10:02.41[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100240: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00012390613555908204, 'time_environment_step': 0.000400543212890625, 'time_step': 0.0005357742309570312, 'evaluation': 41.98446826720032}[0m [36mstep[0m=[35m20[0m
[2m

### Relocate

In [7]:
# Relocate task: every algorithm is trained online from scratch, one after another
for config_class, policy_name in (
    (d3rlpy.algos.IQLConfig, 'relocate_iql'),
    (d3rlpy.algos.CQLConfig, 'relocate_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'relocate_td3bc'),
    (d3rlpy.algos.AWACConfig, 'relocate_awac'),
):
    train_online_algorithm(config_class, relocate_env, policy_name, 'relocate')

[2m2025-05-24 10:02.43[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(30,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:02.43[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:02.43[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m30[0m
[2m2025-05-24 10:02.43[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-24 10:02.43[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-24 10:02.43[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/relocate/IQL_online_20250524100243[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/IQL_online_20250524100243/model_10.d3[0m
[2m2025-05-24 10:02.44[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100243: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0003106117248535156, 'time_environment_step': 0.0002701044082641602, 'time_step': 0.0005921602249145507, 'evaluation': 4.341970731227002}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/IQL_online_20250524100243/model_20.d3[0m
[2m2025-05-24 10:02.44[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100243: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011472702026367187, 'time_environment_step': 0.00027444362640380857, 'time_step': 0.0003974437713623047, 'evaluation': 3.0640708008795565}[0m [36mstep[0m=[35m20[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/CQL_online_20250524100249/model_10.d3[0m
[2m2025-05-24 10:02.50[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100249: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002626657485961914, 'time_environment_step': 0.0003226280212402344, 'time_step': 0.0006015539169311523, 'evaluation': 4.283619014994538}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/CQL_online_20250524100249/model_20.d3[0m
[2m2025-05-24 10:02.50[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100249: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001955747604370117, 'time_environment_step': 0.00043745040893554686, 'time_step': 0.0006534576416015625, 'evaluation': 4.213105643286518}[0m [36mstep[0m=[35m20[0m

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:02.55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/TD3PlusBC_online_20250524100254/model_10.d3[0m
[2m2025-05-24 10:02.55[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100254: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002129077911376953, 'time_environment_step': 0.0003205776214599609, 'time_step': 0.0005481481552124023, 'evaluation': 4.908452616660396}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:02.55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/TD3PlusBC_online_20250524100254/model_20.d3[0m
[2m2025-05-24 10:02.55[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100254: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00014886856079101562, 'time_environment_step': 0.0003590106964111328, 'time_step': 0.00052032470703125, 'evaluation': 4.619340784274071}[0m [3

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/AWAC_online_20250524100259/model_10.d3[0m
[2m2025-05-24 10:03.00[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100259: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00027678012847900393, 'time_environment_step': 0.00037000179290771487, 'time_step': 0.0006643056869506836, 'evaluation': 3.5729346863505}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/AWAC_online_20250524100259/model_20.d3[0m
[2m2025-05-24 10:03.01[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100259: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00015649795532226562, 'time_environment_step': 0.00033388137817382815, 'time_step': 0.0005075931549072266, 'evaluation': 4.32446361278244}[0m [36mstep[0m=[35m20

### Hammer

In [8]:
# Hammer task: every algorithm is trained online from scratch, one after another
for config_class, policy_name in (
    (d3rlpy.algos.IQLConfig, 'hammer_iql'),
    (d3rlpy.algos.CQLConfig, 'hammer_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'hammer_td3bc'),
    (d3rlpy.algos.AWACConfig, 'hammer_awac'),
):
    train_online_algorithm(config_class, hammer_env, policy_name, 'hammer')

[2m2025-05-24 10:03.06[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(26,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(46,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:03.06[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:03.06[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m26[0m
[2m2025-05-24 10:03.06[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-24 10:03.06[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-24 10:03.06[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/hammer/IQL_online_20250524100306[0m

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/IQL_online_20250524100306/model_10.d3[0m
[2m2025-05-24 10:03.07[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100306: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0004920005798339844, 'time_environment_step': 0.0005228757858276367, 'time_step': 0.001033186912536621, 'evaluation': -232.1180375663177}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/IQL_online_20250524100306/model_20.d3[0m
[2m2025-05-24 10:03.07[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100306: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011985301971435547, 'time_environment_step': 0.00027914047241210936, 'time_step': 0.0004093408584594727, 'evaluation': -230.2959848448589}[0m [36mstep[0m=[35m20[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/CQL_online_20250524100314/model_10.d3[0m
[2m2025-05-24 10:03.15[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100314: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00037066936492919923, 'time_environment_step': 0.0005822896957397461, 'time_step': 0.0009709358215332031, 'evaluation': -233.98824440007053}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/CQL_online_20250524100314/model_20.d3[0m
[2m2025-05-24 10:03.15[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100314: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0003164529800415039, 'time_environment_step': 0.000534677505493164, 'time_step': 0.0008699893951416016, 'evaluation': -230.1890783324945}[0m [36mstep[0m=[35m20[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/TD3PlusBC_online_20250524100321/model_10.d3[0m
[2m2025-05-24 10:03.21[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100321: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.000247955322265625, 'time_environment_step': 0.0004309415817260742, 'time_step': 0.0006973743438720703, 'evaluation': -230.76406042923713}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/TD3PlusBC_online_20250524100321/model_20.d3[0m
[2m2025-05-24 10:03.22[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100321: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010483264923095703, 'time_environment_step': 0.0003906965255737305, 'time_step': 0.0005073308944702149, 'evaluation': -230.40906716381852}[0m [

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/AWAC_online_20250524100327/model_10.d3[0m
[2m2025-05-24 10:03.28[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100327: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001842021942138672, 'time_environment_step': 0.00030171871185302734, 'time_step': 0.0004950523376464844, 'evaluation': -232.7513458907776}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/AWAC_online_20250524100327/model_20.d3[0m
[2m2025-05-24 10:03.28[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100327: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010709762573242187, 'time_environment_step': 0.00030918121337890624, 'time_step': 0.0004243612289428711, 'evaluation': -232.27605519203183}[0m [36mstep[0m=[35m2

### Door

In [9]:
# Door task: every algorithm is trained online from scratch, one after another
for config_class, policy_name in (
    (d3rlpy.algos.IQLConfig, 'door_iql'),
    (d3rlpy.algos.CQLConfig, 'door_cql'),
    (d3rlpy.algos.TD3PlusBCConfig, 'door_td3bc'),
    (d3rlpy.algos.AWACConfig, 'door_awac'),
):
    train_online_algorithm(config_class, door_env, policy_name, 'door')

[2m2025-05-24 10:03.33[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(28,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:03.33[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:03.33[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m28[0m
[2m2025-05-24 10:03.34[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-24 10:03.34[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-24 10:03.34[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/door/IQL_online_20250524100334[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/IQL_online_20250524100334/model_10.d3[0m
[2m2025-05-24 10:03.34[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100334: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0004510164260864258, 'time_environment_step': 0.0005024909973144531, 'time_step': 0.0009673833847045898, 'evaluation': -43.83995888576616}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.35[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/IQL_online_20250524100334/model_20.d3[0m
[2m2025-05-24 10:03.35[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100334: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001222372055053711, 'time_environment_step': 0.0002108335494995117, 'time_step': 0.0003411054611206055, 'evaluation': -44.03152374496591}[0m [36mstep[0m=[35m20[0m
[2m20

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/CQL_online_20250524100338/model_10.d3[0m
[2m2025-05-24 10:03.39[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100338: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00020678043365478515, 'time_environment_step': 0.00036892890930175783, 'time_step': 0.0005890369415283203, 'evaluation': -44.59077279951282}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/CQL_online_20250524100338/model_20.d3[0m
[2m2025-05-24 10:03.39[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100338: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011394023895263671, 'time_environment_step': 0.0002567291259765625, 'time_step': 0.000380706787109375, 'evaluation': -44.957045744426125}[0m [36mstep[0m=[35m20[0m
[2

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/TD3PlusBC_online_20250524100343/model_10.d3[0m
[2m2025-05-24 10:03.44[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100343: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00017642974853515625, 'time_environment_step': 0.00022394657135009765, 'time_step': 0.0004101991653442383, 'evaluation': -45.0896305765717}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/TD3PlusBC_online_20250524100343/model_20.d3[0m
[2m2025-05-24 10:03.44[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100343: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010347366333007812, 'time_environment_step': 0.0002661228179931641, 'time_step': 0.0003803014755249023, 'evaluation': -44.65503785200754}[0m [36ms

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:03.49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/AWAC_online_20250524100348/model_10.d3[0m
[2m2025-05-24 10:03.49[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100348: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0003305673599243164, 'time_environment_step': 0.0003139972686767578, 'time_step': 0.0006565332412719726, 'evaluation': -44.46445604269353}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:03.49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/AWAC_online_20250524100348/model_20.d3[0m
[2m2025-05-24 10:03.49[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100348: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00017957687377929689, 'time_environment_step': 0.0002959728240966797, 'time_step': 0.0004939556121826172, 'evaluation': -44.25105896495791}[0m [36mstep[0m=[35m20[0m
