In [1]:
import minari
import d3rlpy
import copy
import os

# Parameters of the experiments

In [2]:
# Online fine-tuning schedule: these three values are forwarded unchanged to
# `fit_online` by `finetune_algorithm` (total steps, steps per epoch, and how
# often, in epochs, a checkpoint is written).
n_steps_per_epoch = 10
n_steps = 100
save_interval = 1

# Number of evaluation episodes during testing.
# NOTE(review): `N` is not referenced by any cell visible in this part of the
# notebook -- confirm it is still needed.
N = 50

# Loading and preparation of datasets

In [3]:
# For each Adroit task, load its expert dataset and keep only the environment
# recovered from it (the dataset object itself is not bound to a name).
# The dataset ids are listed in the same order as the names they are unpacked into.
pen_env, relocate_env, hammer_env, door_env = (
    minari.load_dataset(dataset_id).recover_environment()
    for dataset_id in (
        "D4RL/pen/expert-v2",
        "D4RL/relocate/expert-v2",
        "D4RL/hammer/expert-v2",
        "D4RL/door/expert-v2",
    )
)

# Creation of folders for policies and logs

In [2]:
def _ensure_dir(path):
    """Create `path` (including parents) when it is missing and print what was done."""
    if os.path.exists(path):
        print(f"Already exists: {path}")
        return
    os.makedirs(path)
    print(f"Created: {path}")


# Folder that receives the fine-tuned policy files: policies/finetuning
policies_path = os.path.join("policies", "finetuning")
_ensure_dir(policies_path)

# One log folder per task: training_logs/finetuning/{task}
training_base = os.path.join("training_logs", "finetuning")
task_dirs = ["pen", "relocate", "hammer", "door"]

for task in task_dirs:
    task_path = os.path.join(training_base, task)
    _ensure_dir(task_path)

Created: policies/finetuning
Created: training_logs/finetuning/pen
Created: training_logs/finetuning/relocate
Created: training_logs/finetuning/hammer
Created: training_logs/finetuning/door


# Loading saved policies

### Pen

In [5]:
# Restore the offline-trained Pen policies from disk.
# Files are listed in the same order as the names they are unpacked into.
pen_iql, pen_cql, pen_bc, pen_td3bc, pen_awac = (
    d3rlpy.load_learnable(policy_file)
    for policy_file in (
        "policies/offline/pen_iql.d3",
        "policies/offline/pen_cql.d3",
        "policies/offline/pen_bc.d3",
        "policies/offline/pen_td3bc.d3",
        "policies/offline/pen_awac.d3",
    )
)

### Relocate

In [6]:
# Restore the offline-trained Relocate policies from disk.
# Files are listed in the same order as the names they are unpacked into.
relocate_iql, relocate_cql, relocate_bc, relocate_td3bc, relocate_awac = (
    d3rlpy.load_learnable(policy_file)
    for policy_file in (
        "policies/offline/relocate_iql.d3",
        "policies/offline/relocate_cql.d3",
        "policies/offline/relocate_bc.d3",
        "policies/offline/relocate_td3bc.d3",
        "policies/offline/relocate_awac.d3",
    )
)

### Hammer

In [7]:
# Restore the offline-trained Hammer policies from disk.
# Files are listed in the same order as the names they are unpacked into.
hammer_iql, hammer_cql, hammer_bc, hammer_td3bc, hammer_awac = (
    d3rlpy.load_learnable(policy_file)
    for policy_file in (
        "policies/offline/hammer_iql.d3",
        "policies/offline/hammer_cql.d3",
        "policies/offline/hammer_bc.d3",
        "policies/offline/hammer_td3bc.d3",
        "policies/offline/hammer_awac.d3",
    )
)

### Door

In [8]:
# Restore the offline-trained Door policies from disk.
# Files are listed in the same order as the names they are unpacked into.
door_iql, door_cql, door_bc, door_td3bc, door_awac = (
    d3rlpy.load_learnable(policy_file)
    for policy_file in (
        "policies/offline/door_iql.d3",
        "policies/offline/door_cql.d3",
        "policies/offline/door_bc.d3",
        "policies/offline/door_td3bc.d3",
        "policies/offline/door_awac.d3",
    )
)

# Finetuning Algorithm

In [9]:
def finetune_algorithm(algo, env, filename, task, buffer_limit=10000):
    """Fine-tune a pretrained policy online and save the result to disk.

    The schedule is read from the notebook-level settings `n_steps`,
    `n_steps_per_epoch` and `save_interval` at call time.

    Args:
        algo: Pretrained policy object; it must provide `fit_online` and
            `save`. It is trained in place.
        env: Environment used for online interaction. A deep copy of it is
            handed to `fit_online` as the evaluation environment.
        filename: Stem of the output file; the policy is written to
            `policies/finetuning/{filename}.d3`.
        task: Name of the sub-folder of `training_logs/finetuning` used as
            the logger root directory.
        buffer_limit: `limit` passed to the FIFO replay buffer. Defaults to
            10000, the value that was previously hard-coded.

    Returns:
        None. The fine-tuned policy is available through `algo` and the
        saved file.
    """
    log_dir = f"training_logs/finetuning/{task}"
    policy_file = f'policies/finetuning/{filename}.d3'

    # Create the output folders before training starts. The policy is only
    # saved after the whole online run, so a missing folder would otherwise
    # surface as an error once all the training time has already been spent.
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.dirname(policy_file), exist_ok=True)

    # Create a FIFO replay buffer for online interaction
    buffer = d3rlpy.dataset.create_fifo_replay_buffer(limit=buffer_limit, env=env)

    # Fine-tune the pretrained policy through online interaction with the environment
    algo.fit_online(
        env,
        buffer=buffer,
        n_steps=n_steps,
        n_steps_per_epoch=n_steps_per_epoch,
        save_interval=save_interval,
        eval_env=copy.deepcopy(env),  # use a separate copy for evaluation
        logger_adapter=d3rlpy.logging.FileAdapterFactory(root_dir=log_dir),
    )

    # Save the fine-tuned policy to file
    algo.save(policy_file)

# Online Finetuning

### Pen

In [10]:
# Online fine-tuning of the Pen policies, one after another in this order.
# The BC policy loaded earlier is not part of this list.
for policy, policy_name in (
    (pen_iql, 'pen_iql'),
    (pen_cql, 'pen_cql'),
    (pen_td3bc, 'pen_td3bc'),
    (pen_awac, 'pen_awac'),
):
    finetune_algorithm(policy, pen_env, policy_name, 'pen')

[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(24,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(45,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m24[0m
[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/pen/IQL_online_20250524100011[0m
[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [45], 'action_size': 24, 'config': {'type': 'iq

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/IQL_online_20250524100011/model_10.d3[0m
[2m2025-05-24 10:00.11[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100011: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0015054941177368164, 'time_environment_step': 0.0004440784454345703, 'time_step': 0.0019750356674194335, 'evaluation': 1093.0118582194439}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.12[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/IQL_online_20250524100011/model_20.d3[0m
[2m2025-05-24 10:00.12[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100011: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00018131732940673828, 'time_environment_step': 0.0003402233123779297, 'time_step': 0.0005390167236328125, 'evaluation': 117.92094856584697}[0m [36mstep[0m=[35m20[0m

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/CQL_online_20250524100015/model_10.d3[0m
[2m2025-05-24 10:00.15[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100015: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00030527114868164065, 'time_environment_step': 0.00037157535552978516, 'time_step': 0.0006926536560058593, 'evaluation': 14.676558529759925}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/CQL_online_20250524100015/model_20.d3[0m
[2m2025-05-24 10:00.16[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100015: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00015909671783447265, 'time_environment_step': 0.00029942989349365237, 'time_step': 0.00046911239624023435, 'evaluation': -10.386878946088114}[0m [36mstep[0m=[35m2

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/TD3PlusBC_online_20250524100020/model_10.d3[0m
[2m2025-05-24 10:00.20[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100020: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002110004425048828, 'time_environment_step': 0.0003468513488769531, 'time_step': 0.0005741596221923828, 'evaluation': -5.489987502955908}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/TD3PlusBC_online_20250524100020/model_20.d3[0m
[2m2025-05-24 10:00.20[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100020: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.988380432128906e-05, 'time_environment_step': 0.00030765533447265627, 'time_step': 0.0004082202911376953, 'evaluation': 6.484507168972047}[0m [

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/AWAC_online_20250524100023/model_10.d3[0m
[2m2025-05-24 10:00.23[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100023: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00028884410858154297, 'time_environment_step': 0.0003201484680175781, 'time_step': 0.0006247520446777344, 'evaluation': 575.7049645858141}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/AWAC_online_20250524100023/model_20.d3[0m
[2m2025-05-24 10:00.24[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100023: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00019788742065429688, 'time_environment_step': 0.0004449605941772461, 'time_step': 0.0006540536880493164, 'evaluation': -1.0066694091407284}[0m [36mstep[0m=[35m2

### Relocate

In [11]:
# Online fine-tuning of the Relocate policies, one after another in this order.
# The BC policy loaded earlier is not part of this list.
for policy, policy_name in (
    (relocate_iql, 'relocate_iql'),
    (relocate_cql, 'relocate_cql'),
    (relocate_td3bc, 'relocate_td3bc'),
    (relocate_awac, 'relocate_awac'),
):
    finetune_algorithm(policy, relocate_env, policy_name, 'relocate')

[2m2025-05-24 10:00.27[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(30,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:00.27[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:00.27[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m30[0m
[2m2025-05-24 10:00.27[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/relocate/IQL_online_20250524100027[0m
[2m2025-05-24 10:00.27[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [39], 'action_size': 30, 'config': {'type'

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/IQL_online_20250524100027/model_10.d3[0m
[2m2025-05-24 10:00.28[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100027: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.000938582420349121, 'time_environment_step': 0.0006072282791137696, 'time_step': 0.0015754222869873047, 'evaluation': 5.600986735907755}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/IQL_online_20250524100027/model_20.d3[0m
[2m2025-05-24 10:00.28[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100027: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00020456314086914062, 'time_environment_step': 0.00025594234466552734, 'time_step': 0.00047206878662109375, 'evaluation': 5.476295977972123}[0m [36mstep[0m=[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/CQL_online_20250524100033/model_10.d3[0m
[2m2025-05-24 10:00.33[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100033: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00035660266876220704, 'time_environment_step': 0.00030817985534667967, 'time_step': 0.0006773471832275391, 'evaluation': 12.449073735454814}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/CQL_online_20250524100033/model_20.d3[0m
[2m2025-05-24 10:00.34[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100033: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00016388893127441406, 'time_environment_step': 0.0002604484558105469, 'time_step': 0.0004350185394287109, 'evaluation': 13.54987992317935}[0m [36mstep[0m=

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/TD3PlusBC_online_20250524100038/model_10.d3[0m
[2m2025-05-24 10:00.38[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100038: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00014450550079345704, 'time_environment_step': 0.0002736568450927734, 'time_step': 0.0004289865493774414, 'evaluation': 20.422595437456206}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/TD3PlusBC_online_20250524100038/model_20.d3[0m
[2m2025-05-24 10:00.39[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100038: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00011870861053466796, 'time_environment_step': 0.0003371477127075195, 'time_step': 0.00046870708465576174, 'evaluation': 21.1422378012

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/AWAC_online_20250524100043/model_10.d3[0m
[2m2025-05-24 10:00.44[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100043: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00033416748046875, 'time_environment_step': 0.00027251243591308594, 'time_step': 0.0006201982498168946, 'evaluation': 5.932503767036418}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.44[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/AWAC_online_20250524100043/model_20.d3[0m
[2m2025-05-24 10:00.44[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100043: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00017485618591308593, 'time_environment_step': 0.00022921562194824218, 'time_step': 0.0004134178161621094, 'evaluation': 6.278545744352945}[0m [36mstep[0m

### Hammer

In [12]:
# Online fine-tuning of the Hammer policies, one after another in this order.
# The BC policy loaded earlier is not part of this list.
for policy, policy_name in (
    (hammer_iql, 'hammer_iql'),
    (hammer_cql, 'hammer_cql'),
    (hammer_td3bc, 'hammer_td3bc'),
    (hammer_awac, 'hammer_awac'),
):
    finetune_algorithm(policy, hammer_env, policy_name, 'hammer')

[2m2025-05-24 10:00.48[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(26,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(46,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:00.48[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:00.48[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m26[0m
[2m2025-05-24 10:00.49[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/hammer/IQL_online_20250524100049[0m
[2m2025-05-24 10:00.49[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [46], 'action_size': 26, 'config': {'type': 

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/IQL_online_20250524100049/model_10.d3[0m
[2m2025-05-24 10:00.49[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100049: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00035648345947265626, 'time_environment_step': 0.0005321502685546875, 'time_step': 0.0009084224700927734, 'evaluation': -232.31303438566397}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.50[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/IQL_online_20250524100049/model_20.d3[0m
[2m2025-05-24 10:00.50[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100049: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0002714395523071289, 'time_environment_step': 0.0006236553192138672, 'time_step': 0.0009125471115112305, 'evaluation': -230.14227023417737}[0m [36mstep[0m=[3

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:00.56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/CQL_online_20250524100055/model_10.d3[0m
[2m2025-05-24 10:00.56[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100055: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00023310184478759767, 'time_environment_step': 0.00031332969665527345, 'time_step': 0.000557851791381836, 'evaluation': -234.7212737286039}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:00.57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/CQL_online_20250524100055/model_20.d3[0m
[2m2025-05-24 10:00.57[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100055: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00019233226776123048, 'time_environment_step': 0.000358271598815918, 'time_step': 0.0005614519119262696, 'evaluation': -236.76669397842224}[0m [36mstep[0m=[35

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/TD3PlusBC_online_20250524100102/model_10.d3[0m
[2m2025-05-24 10:01.03[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100102: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00019876956939697267, 'time_environment_step': 0.0003216028213500977, 'time_step': 0.0005319595336914062, 'evaluation': -236.15581803619472}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/TD3PlusBC_online_20250524100102/model_20.d3[0m
[2m2025-05-24 10:01.03[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100102: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00010199546813964844, 'time_environment_step': 0.00033948421478271487, 'time_step': 0.0004523038864135742, 'evaluation': -235.50227967085

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/AWAC_online_20250524100109/model_10.d3[0m
[2m2025-05-24 10:01.09[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100109: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0002526998519897461, 'time_environment_step': 0.00031332969665527345, 'time_step': 0.0005773782730102539, 'evaluation': -230.95969963371212}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/AWAC_online_20250524100109/model_20.d3[0m
[2m2025-05-24 10:01.10[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100109: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.0001498699188232422, 'time_environment_step': 0.00032978057861328126, 'time_step': 0.0004888772964477539, 'evaluation': -233.74602199752712}[0m [36mstep[0

### Door

In [13]:
# Online fine-tuning of the Door policies, one after another in this order.
# The BC policy loaded earlier is not part of this list.
for policy, policy_name in (
    (door_iql, 'door_iql'),
    (door_cql, 'door_cql'),
    (door_td3bc, 'door_td3bc'),
    (door_awac, 'door_awac'),
):
    finetune_algorithm(policy, door_env, policy_name, 'door')

[2m2025-05-24 10:01.15[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(28,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-24 10:01.15[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-24 10:01.15[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m28[0m
[2m2025-05-24 10:01.15[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/door/IQL_online_20250524100115[0m
[2m2025-05-24 10:01.15[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [39], 'action_size': 28, 'config': {'type': 'i

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.16[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/IQL_online_20250524100115/model_10.d3[0m
[2m2025-05-24 10:01.16[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100115: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0004484891891479492, 'time_environment_step': 0.00036156177520751953, 'time_step': 0.0008266925811767578, 'evaluation': 353.6466501862734}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.17[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/IQL_online_20250524100115/model_20.d3[0m
[2m2025-05-24 10:01.17[0m [[32m[1minfo     [0m] [1mIQL_online_20250524100115: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00016930103302001954, 'time_environment_step': 0.00023064613342285156, 'time_step': 0.000409698486328125, 'evaluation': -43.84017355599464}[0m [36mstep[0m=[35m20[

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/CQL_online_20250524100121/model_10.d3[0m
[2m2025-05-24 10:01.21[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100121: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0003455400466918945, 'time_environment_step': 0.0002337932586669922, 'time_step': 0.0005905389785766602, 'evaluation': -36.716531613884314}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/CQL_online_20250524100121/model_20.d3[0m
[2m2025-05-24 10:01.22[0m [[32m[1minfo     [0m] [1mCQL_online_20250524100121: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00015037059783935548, 'time_environment_step': 0.0002069711685180664, 'time_step': 0.00036537647247314453, 'evaluation': -36.87627307503832}[0m [36mstep[0m=[35m20

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/TD3PlusBC_online_20250524100125/model_10.d3[0m
[2m2025-05-24 10:01.26[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100125: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0005808115005493164, 'time_environment_step': 0.0008336067199707031, 'time_step': 0.001435708999633789, 'evaluation': -30.86554349068043}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/TD3PlusBC_online_20250524100125/model_20.d3[0m
[2m2025-05-24 10:01.26[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250524100125: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.205341339111329e-05, 'time_environment_step': 0.00020551681518554688, 'time_step': 0.00030829906463623045, 'evaluation': -30.814479323555997}[0

  0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-05-24 10:01.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/AWAC_online_20250524100129/model_10.d3[0m
[2m2025-05-24 10:01.30[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100129: epoch=1 step=10[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00041117668151855467, 'time_environment_step': 0.0002482175827026367, 'time_step': 0.00067138671875, 'evaluation': -41.44465572327317}[0m [36mstep[0m=[35m10[0m
[2m2025-05-24 10:01.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/AWAC_online_20250524100129/model_20.d3[0m
[2m2025-05-24 10:01.30[0m [[32m[1minfo     [0m] [1mAWAC_online_20250524100129: epoch=2 step=20[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 0.00015668869018554689, 'time_environment_step': 0.0002121448516845703, 'time_step': 0.00037789344787597656, 'evaluation': -41.52749577891568}[0m [36mstep[0m=[35m20