In [1]:
import minari
import d3rlpy
import os
import copy

# Parameters of the experiment

In [2]:
# Total number of online training steps per run; forwarded to fit_online as n_steps.
# NOTE(review): 100 steps looks like a smoke-test setting rather than a real training budget — confirm.
n_steps = 100
# Steps per epoch; forwarded to fit_online as n_steps_per_epoch.
# In the recorded runs a checkpoint and an evaluation score are logged after every epoch.
n_steps_per_epoch = 10
# Forwarded to fit_online as update_start_step; presumably the number of initial
# steps collected before parameter updates begin — confirm against the d3rlpy docs.
update_start_step = 50

# Loading of environments

In [3]:
def _recover_env(dataset_id):
    """Load the Minari dataset ``dataset_id`` and return the environment it was recorded in."""
    dataset = minari.load_dataset(dataset_id)
    return dataset.recover_environment()


# One environment per Adroit task; the datasets are loaded only to recover
# their environments.
pen_env = _recover_env("D4RL/pen/expert-v2")
relocate_env = _recover_env("D4RL/relocate/expert-v2")
hammer_env = _recover_env("D4RL/hammer/expert-v2")
door_env = _recover_env("D4RL/door/expert-v2")

# Training Algorithm

In [4]:
def train_online_algorithm(config_class, env, filename, task, explorer=None):
    """Train one d3rlpy algorithm online in ``env`` and save the resulting model.

    Every call builds a fresh FIFO replay buffer (capacity 100000) and a fresh
    CPU instance of the algorithm. The training length and schedule are read
    from the notebook-level ``n_steps``, ``n_steps_per_epoch`` and
    ``update_start_step`` settings.

    Args:
        config_class: d3rlpy config class (e.g. ``d3rlpy.algos.IQLConfig``);
            it is instantiated with its default arguments.
        env: Environment the agent interacts with. A deep copy of it is passed
            as the evaluation environment (presumably so evaluation rollouts
            do not disturb the training episode in progress).
        filename: Base name, without extension, of the model file written to
            ``policies/online/``.
        task: Sub-directory of ``training_logs/online/`` that receives the
            training logs.
        explorer: Optional d3rlpy explorer used to pick exploratory actions.
            When omitted, epsilon-greedy (epsilon 0.3) is used for discrete
            action spaces and Gaussian action noise for everything else.
    """
    buffer = d3rlpy.dataset.create_fifo_replay_buffer(limit=100000, env=env)

    if explorer is None:
        # ConstantEpsilonGreedy replaces the greedy action with a random integer
        # action index, which only makes sense for discrete action spaces
        # (those expose ``n``). For continuous control it would overwrite the
        # whole action vector with an out-of-range integer, so additive noise
        # is used there instead.
        if hasattr(env.action_space, "n"):
            explorer = d3rlpy.algos.ConstantEpsilonGreedy(0.3)
        else:
            explorer = d3rlpy.algos.NormalNoise(mean=0.0, std=0.1)

    # Initialize the algorithm on CPU
    algo = config_class().create(device="cpu")

    # Train through online interaction with the environment; d3rlpy evaluates
    # on the copied environment and writes the training history under
    # training_logs/online/{task}.
    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=n_steps,
        eval_env=copy.deepcopy(env),
        n_steps_per_epoch=n_steps_per_epoch,
        update_start_step=update_start_step,
        logger_adapter=d3rlpy.logging.FileAdapterFactory(root_dir=f"training_logs/online/{task}"),
    )

    # Make sure the target directory exists so that a long training run is not
    # lost at the very end because the folder-creation cell was skipped.
    policy_path = f'policies/online/{filename}.d3'
    os.makedirs(os.path.dirname(policy_path), exist_ok=True)
    algo.save(policy_path)

# Creation of folders for policies and logs

In [5]:
def _ensure_directory(path):
    """Create ``path`` (including parents) when it is missing and report the outcome."""
    if os.path.exists(path):
        print(f"Already exists: {path}")
    else:
        os.makedirs(path)
        print(f"Created: {path}")


# Directory for the saved policies: policies/online
policies_path = os.path.join("policies", "online")
_ensure_directory(policies_path)

# One log directory per task: training_logs/online/{task}
training_base = os.path.join("training_logs", "online")
task_dirs = ["pen", "relocate", "hammer", "door"]

for task in task_dirs:
    task_path = os.path.join(training_base, task)
    _ensure_directory(task_path)

Created: policies/online
Created: training_logs/online/pen
Created: training_logs/online/relocate
Created: training_logs/online/hammer
Created: training_logs/online/door


# Policy training

### Pen

In [6]:
# Saved-policy name -> algorithm config, trained in this order on the pen task.
pen_runs = {
    'pen_iql': d3rlpy.algos.IQLConfig,
    'pen_cql': d3rlpy.algos.CQLConfig,
    'pen_td3bc': d3rlpy.algos.TD3PlusBCConfig,
    'pen_awac': d3rlpy.algos.AWACConfig,
}
for policy_name, algo_config in pen_runs.items():
    train_online_algorithm(algo_config, pen_env, policy_name, 'pen')

[2m2025-05-18 19:30.12[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(24,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(45,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 19:30.12[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 19:30.12[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m24[0m
[2m2025-05-18 19:30.12[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-18 19:30.13[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-18 19:30.13[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/pen/IQL_online_20250518193013[0m
[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/IQL_online_20250518193013/model_10.d3[0m
[2m2025-05-18 19:30.13[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193013: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0004105091094970703, 'time_environment_step': 0.00033195018768310546, 'time_step': 0.0007580757141113281, 'evaluation': 297.9923724789885}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/IQL_online_20250518193013/model_20.d3[0m
[2m2025-05-18 19:30.13[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193013: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011453628540039062, 'time_environment_step': 0.0002645254135131836, 'time_step': 0.0003880500793457031, 'evaluation': 548.0928691930555}[0m [36mstep[0m=[35m20[0m
[2m2025

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/CQL_online_20250518193016/model_10.d3[0m
[2m2025-05-18 19:30.16[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193016: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00022077560424804688, 'time_environment_step': 0.0002650737762451172, 'time_step': 0.0004955053329467774, 'evaluation': 119.73935504340258}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/CQL_online_20250518193016/model_20.d3[0m
[2m2025-05-18 19:30.16[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193016: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011203289031982421, 'time_environment_step': 0.00021352767944335938, 'time_step': 0.0003337860107421875, 'evaluation': -37.22762810390058}[0m [36mstep[0m=[35m20[0m
[2m2

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/TD3PlusBC_online_20250518193018/model_10.d3[0m
[2m2025-05-18 19:30.18[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193018: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00019464492797851562, 'time_environment_step': 0.00024995803833007815, 'time_step': 0.00045535564422607424, 'evaluation': 276.72156030937947}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/TD3PlusBC_online_20250518193018/model_20.d3[0m
[2m2025-05-18 19:30.19[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193018: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.260177612304687e-05, 'time_environment_step': 0.00022430419921875, 'time_step': 0.0003264188766479492, 'evaluation': 154.5041674284832}[0m [36mstep

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/AWAC_online_20250518193021/model_10.d3[0m
[2m2025-05-18 19:30.21[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193021: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002124309539794922, 'time_environment_step': 0.0002768993377685547, 'time_step': 0.000498819351196289, 'evaluation': -2.7400050212726663}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/pen/AWAC_online_20250518193021/model_20.d3[0m
[2m2025-05-18 19:30.21[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193021: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010144710540771484, 'time_environment_step': 0.0002626180648803711, 'time_step': 0.00037274360656738283, 'evaluation': 254.22237950195648}[0m [36mstep[0m=[35m20[0m
[

### Relocate

In [7]:
# Saved-policy name -> algorithm config, trained in this order on the relocate task.
relocate_runs = {
    'relocate_iql': d3rlpy.algos.IQLConfig,
    'relocate_cql': d3rlpy.algos.CQLConfig,
    'relocate_td3bc': d3rlpy.algos.TD3PlusBCConfig,
    'relocate_awac': d3rlpy.algos.AWACConfig,
}
for policy_name, algo_config in relocate_runs.items():
    train_online_algorithm(algo_config, relocate_env, policy_name, 'relocate')

[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(30,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m30[0m
[2m2025-05-18 19:30.24[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-18 19:30.24[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/relocate/IQL_online_20250518193024[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/IQL_online_20250518193024/model_10.d3[0m
[2m2025-05-18 19:30.24[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193024: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002890110015869141, 'time_environment_step': 0.00026783943176269533, 'time_step': 0.0005694866180419922, 'evaluation': 3.2441628630113177}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.25[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/IQL_online_20250518193024/model_20.d3[0m
[2m2025-05-18 19:30.25[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193024: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010313987731933594, 'time_environment_step': 0.0002578258514404297, 'time_step': 0.00036966800689697266, 'evaluation': 4.07505456931554}[0m [36mstep[0m=[35m20[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.29[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/CQL_online_20250518193028/model_10.d3[0m
[2m2025-05-18 19:30.29[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193028: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001881122589111328, 'time_environment_step': 0.00024042129516601562, 'time_step': 0.00043871402740478513, 'evaluation': 4.006903850458192}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.29[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/CQL_online_20250518193028/model_20.d3[0m
[2m2025-05-18 19:30.29[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193028: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00012104511260986328, 'time_environment_step': 0.0002362966537475586, 'time_step': 0.0003666877746582031, 'evaluation': 3.872748751686428}[0m [36mstep[0m=[35m20[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/TD3PlusBC_online_20250518193033/model_10.d3[0m
[2m2025-05-18 19:30.34[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193033: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00016732215881347655, 'time_environment_step': 0.00025184154510498046, 'time_step': 0.00042862892150878905, 'evaluation': 4.665766110895869}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/TD3PlusBC_online_20250518193033/model_20.d3[0m
[2m2025-05-18 19:30.34[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193033: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00012030601501464844, 'time_environment_step': 0.00024745464324951174, 'time_step': 0.00037815570831298826, 'evaluation': 4.83721109061747}[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/AWAC_online_20250518193038/model_10.d3[0m
[2m2025-05-18 19:30.39[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193038: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00022344589233398439, 'time_environment_step': 0.00024228096008300782, 'time_step': 0.00047483444213867185, 'evaluation': 3.766323004197139}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/relocate/AWAC_online_20250518193038/model_20.d3[0m
[2m2025-05-18 19:30.39[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193038: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011188983917236328, 'time_environment_step': 0.0002591133117675781, 'time_step': 0.00037996768951416017, 'evaluation': 3.925611409620751}[0m [36mstep[0m=[3

### Hammer

In [8]:
# Saved-policy name -> algorithm config, trained in this order on the hammer task.
hammer_runs = {
    'hammer_iql': d3rlpy.algos.IQLConfig,
    'hammer_cql': d3rlpy.algos.CQLConfig,
    'hammer_td3bc': d3rlpy.algos.TD3PlusBCConfig,
    'hammer_awac': d3rlpy.algos.AWACConfig,
}
for policy_name, algo_config in hammer_runs.items():
    train_online_algorithm(algo_config, hammer_env, policy_name, 'hammer')

[2m2025-05-18 19:30.43[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(26,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(46,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 19:30.43[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 19:30.43[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m26[0m
[2m2025-05-18 19:30.43[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-18 19:30.43[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-18 19:30.43[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/hammer/IQL_online_20250518193043[0m

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/IQL_online_20250518193043/model_10.d3[0m
[2m2025-05-18 19:30.44[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193043: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002064228057861328, 'time_environment_step': 0.0003233194351196289, 'time_step': 0.0005400419235229492, 'evaluation': -231.23753439811475}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.45[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/IQL_online_20250518193043/model_20.d3[0m
[2m2025-05-18 19:30.45[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193043: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010442733764648438, 'time_environment_step': 0.0002877473831176758, 'time_step': 0.000400853157043457, 'evaluation': -231.5891623882299}[0m [36mstep[0m=[35m20[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/CQL_online_20250518193049/model_10.d3[0m
[2m2025-05-18 19:30.50[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193049: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00020475387573242186, 'time_environment_step': 0.0003032684326171875, 'time_step': 0.0005173206329345704, 'evaluation': -230.5710958793437}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.51[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/CQL_online_20250518193049/model_20.d3[0m
[2m2025-05-18 19:30.51[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193049: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010342597961425782, 'time_environment_step': 0.0002915143966674805, 'time_step': 0.00040242671966552737, 'evaluation': -230.24117719522854}[0m [36mstep[0m=[35m20[0

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:30.56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/TD3PlusBC_online_20250518193056/model_10.d3[0m
[2m2025-05-18 19:30.56[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193056: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00016562938690185548, 'time_environment_step': 0.000301051139831543, 'time_step': 0.00047605037689208985, 'evaluation': -232.2052589335799}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:30.57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/TD3PlusBC_online_20250518193056/model_20.d3[0m
[2m2025-05-18 19:30.57[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193056: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 7.674694061279296e-05, 'time_environment_step': 0.0003001928329467773, 'time_step': 0.00038478374481201174, 'evaluation': -233.67116832400956}[0m 

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:31.02[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/AWAC_online_20250518193102/model_10.d3[0m
[2m2025-05-18 19:31.02[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193102: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001902341842651367, 'time_environment_step': 0.0003495216369628906, 'time_step': 0.0005487680435180664, 'evaluation': -231.63075123682148}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:31.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/hammer/AWAC_online_20250518193102/model_20.d3[0m
[2m2025-05-18 19:31.03[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193102: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010724067687988282, 'time_environment_step': 0.0003195047378540039, 'time_step': 0.00043377876281738283, 'evaluation': -231.47461269321957}[0m [36mstep[0m=[35m2

### Door

In [9]:
# Saved-policy name -> algorithm config, trained in this order on the door task.
door_runs = {
    'door_iql': d3rlpy.algos.IQLConfig,
    'door_cql': d3rlpy.algos.CQLConfig,
    'door_td3bc': d3rlpy.algos.TD3PlusBCConfig,
    'door_awac': d3rlpy.algos.AWACConfig,
}
for policy_name, algo_config in door_runs.items():
    train_online_algorithm(algo_config, door_env, policy_name, 'door')

[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(28,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m28[0m
[2m2025-05-18 19:31.08[0m [[32m[1mdebug    [0m] [1mBuilding model...             [0m
[2m2025-05-18 19:31.08[0m [[32m[1mdebug    [0m] [1mModel has been built.         [0m
[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/online/door/IQL_online_20250518193108[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/IQL_online_20250518193108/model_10.d3[0m
[2m2025-05-18 19:31.08[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193108: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001968860626220703, 'time_environment_step': 0.00020558834075927733, 'time_step': 0.00041158199310302733, 'evaluation': -44.94519145197609}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:31.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/IQL_online_20250518193108/model_20.d3[0m
[2m2025-05-18 19:31.09[0m [[32m[1minfo     [0m] [1mIQL_online_20250518193108: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001054525375366211, 'time_environment_step': 0.00020258426666259767, 'time_step': 0.00031511783599853517, 'evaluation': -45.153707190683136}[0m [36mstep[0m=[35m20[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:31.13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/CQL_online_20250518193113/model_10.d3[0m
[2m2025-05-18 19:31.13[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193113: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0005377769470214844, 'time_environment_step': 0.0004079103469848633, 'time_step': 0.000973963737487793, 'evaluation': -45.00815998925763}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:31.14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/CQL_online_20250518193113/model_20.d3[0m
[2m2025-05-18 19:31.14[0m [[32m[1minfo     [0m] [1mCQL_online_20250518193113: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011081695556640625, 'time_environment_step': 0.00019676685333251952, 'time_step': 0.0003148555755615234, 'evaluation': -44.45012632238425}[0m [36mstep[0m=[35m20[0m
[2m2

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:31.19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/TD3PlusBC_online_20250518193118/model_10.d3[0m
[2m2025-05-18 19:31.19[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193118: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001642465591430664, 'time_environment_step': 0.00024068355560302734, 'time_step': 0.0004143476486206055, 'evaluation': -43.36801148568946}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:31.19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/TD3PlusBC_online_20250518193118/model_20.d3[0m
[2m2025-05-18 19:31.19[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518193118: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.780956268310547e-05, 'time_environment_step': 0.000209808349609375, 'time_step': 0.00030496120452880857, 'evaluation': -43.22618846013376}[0m [36mst

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-18 19:31.23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/AWAC_online_20250518193123/model_10.d3[0m
[2m2025-05-18 19:31.23[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193123: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00017855167388916015, 'time_environment_step': 0.00021989345550537108, 'time_step': 0.0004069805145263672, 'evaluation': -43.977673850329936}[0m [36mstep[0m=[35m10[0m
[2m2025-05-18 19:31.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/online/door/AWAC_online_20250518193123/model_20.d3[0m
[2m2025-05-18 19:31.24[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518193123: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010523796081542968, 'time_environment_step': 0.00020813941955566406, 'time_step': 0.0003220081329345703, 'evaluation': -44.12429605994741}[0m [36mstep[0m=[35m20[