In [9]:
import minari
import d3rlpy
import copy
import os

# Parameters of the experiments

In [2]:
# --- Testing / simulation settings ---
N = 50        # number of episodes during the testing phase
delay = 0.01  # delay between one step of an episode and the next in simulation

# --- Online fine-tuning schedule (forwarded to `fit_online`) ---
n_steps, n_steps_per_epoch, save_interval = 1_000, 100, 1

# Loading and preparation of datasets

In [3]:
# Each Minari dataset is loaded only to recover the environment attached to it;
# the datasets themselves are not kept around.
pen_env, relocate_env, hammer_env, door_env = (
    minari.load_dataset(dataset_id).recover_environment()
    for dataset_id in (
        "D4RL/pen/expert-v2",
        "D4RL/relocate/expert-v2",
        "D4RL/hammer/expert-v2",
        "D4RL/door/expert-v2",
    )
)

# Loading saved policies

### Pen

In [4]:
# Policies trained offline on the pen task, one per algorithm.
pen_iql, pen_cql, pen_bc, pen_td3bc, pen_awac = map(
    d3rlpy.load_learnable,
    (
        "policies/offline/pen_iql.d3",
        "policies/offline/pen_cql.d3",
        "policies/offline/pen_bc.d3",
        "policies/offline/pen_td3bc.d3",
        "policies/offline/pen_awac.d3",
    ),
)

### Relocate

In [5]:
# Policies trained offline on the relocate task, one per algorithm.
relocate_iql, relocate_cql, relocate_bc, relocate_td3bc, relocate_awac = map(
    d3rlpy.load_learnable,
    (
        "policies/offline/relocate_iql.d3",
        "policies/offline/relocate_cql.d3",
        "policies/offline/relocate_bc.d3",
        "policies/offline/relocate_td3bc.d3",
        "policies/offline/relocate_awac.d3",
    ),
)

### Hammer

In [6]:
# Policies trained offline on the hammer task, one per algorithm.
hammer_iql, hammer_cql, hammer_bc, hammer_td3bc, hammer_awac = map(
    d3rlpy.load_learnable,
    (
        "policies/offline/hammer_iql.d3",
        "policies/offline/hammer_cql.d3",
        "policies/offline/hammer_bc.d3",
        "policies/offline/hammer_td3bc.d3",
        "policies/offline/hammer_awac.d3",
    ),
)

### Door

In [7]:
# Policies trained offline on the door task, one per algorithm.
door_iql, door_cql, door_bc, door_td3bc, door_awac = map(
    d3rlpy.load_learnable,
    (
        "policies/offline/door_iql.d3",
        "policies/offline/door_cql.d3",
        "policies/offline/door_bc.d3",
        "policies/offline/door_td3bc.d3",
        "policies/offline/door_awac.d3",
    ),
)

# Creation of folders for policies and logs

In [10]:
def _ensure_dir(path):
    """Create ``path`` (including missing parents) and print what happened.

    Prints ``Created: <path>`` when the directory had to be created and
    ``Already exists: <path>`` when it was already there.
    """
    already_there = os.path.isdir(path)
    # exist_ok=True replaces the racy "exists() then makedirs()" sequence; it
    # still raises FileExistsError when `path` is an existing regular file.
    os.makedirs(path, exist_ok=True)
    if already_there:
        print(f"Already exists: {path}")
    else:
        print(f"Created: {path}")


# Create policies/finetuning
policies_path = os.path.join("policies", "finetuning")
_ensure_dir(policies_path)

# Create training_logs/finetuning/{task}
training_base = os.path.join("training_logs", "finetuning")
task_dirs = ["pen", "relocate", "hammer", "door"]

for task in task_dirs:
    task_path = os.path.join(training_base, task)
    _ensure_dir(task_path)

Created: policies/finetuning
Already exists: training_logs/finetuning/pen
Created: training_logs/finetuning/relocate
Created: training_logs/finetuning/hammer
Created: training_logs/finetuning/door


# Finetuning Algorithm

In [11]:
def finetune_algorithm(algo, env, filename, task, buffer_limit=10000):
    """Fine-tune a previously loaded policy online and save the result.

    The training schedule is read from the module-level ``n_steps``,
    ``n_steps_per_epoch`` and ``save_interval`` parameters.

    Parameters
    ----------
    algo
        Learnable object exposing ``fit_online`` and ``save``.
    env
        Environment the policy interacts with during fine-tuning.
    filename : str
        Base name of the saved policy; the file is written to
        ``policies/finetuning/{filename}.d3``.
    task : str
        Task name; training logs go under ``training_logs/finetuning/{task}``.
    buffer_limit : int, optional
        ``limit`` passed to the FIFO replay buffer (default 10000).

    Returns
    -------
    The ``algo`` object that was passed in, after fine-tuning.
    """
    # Make sure the destination exists before training, so a missing folder
    # is detected up front instead of when saving after the whole run.
    os.makedirs("policies/finetuning", exist_ok=True)

    buffer = d3rlpy.dataset.create_fifo_replay_buffer(limit=buffer_limit, env=env)

    # Evaluation runs on an independent copy of the environment
    # (presumably so it does not disturb the training episode — TODO confirm).
    eval_env = copy.deepcopy(env)
    try:
        algo.fit_online(
            env,
            buffer=buffer,
            n_steps=n_steps,
            n_steps_per_epoch=n_steps_per_epoch,
            save_interval=save_interval,
            eval_env=eval_env,
            logger_adapter=d3rlpy.logging.FileAdapterFactory(root_dir=f"training_logs/finetuning/{task}"),
        )
    finally:
        # Release the copy even if training fails; otherwise one environment
        # is leaked per call.
        eval_env.close()

    algo.save(f'policies/finetuning/{filename}.d3')
    return algo

# Online Finetuning

### Pen

In [12]:
# Fine-tune every pen policy in turn: IQL, CQL, TD3+BC, AWAC.
finetune_algorithm(algo=pen_iql, env=pen_env, filename='pen_iql', task='pen')
finetune_algorithm(algo=pen_cql, env=pen_env, filename='pen_cql', task='pen')
finetune_algorithm(algo=pen_td3bc, env=pen_env, filename='pen_td3bc', task='pen')
finetune_algorithm(algo=pen_awac, env=pen_env, filename='pen_awac', task='pen')

[2m2025-05-18 18:54.26[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(24,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(45,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 18:54.26[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 18:54.26[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m24[0m
[2m2025-05-18 18:54.26[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/pen/IQL_online_20250518185426[0m
[2m2025-05-18 18:54.26[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [45], 'action_size': 24, 'config': {'type': 'iq

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:54.27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/IQL_online_20250518185426/model_100.d3[0m
[2m2025-05-18 18:54.27[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185426: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00014053821563720704, 'time_environment_step': 0.00037099361419677733, 'time_step': 0.0005243635177612305, 'rollout_return': 55.45103317123636, 'evaluation': 1232.58613689694}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:54.27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/IQL_online_20250518185426/model_200.d3[0m
[2m2025-05-18 18:54.27[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185426: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.324789047241211e-05, 'time_environment_step': 0.0002753305435180664, 'time_step': 0.0003781723976135254, 'rollout_return': 81.4

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:54.36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/CQL_online_20250518185435/model_100.d3[0m
[2m2025-05-18 18:54.36[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185435: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 9.926319122314453e-05, 'time_environment_step': 0.00023224115371704102, 'time_step': 0.00034397363662719727, 'rollout_return': -8.31657113524442, 'evaluation': 805.7957231965231}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:54.36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/CQL_online_20250518185435/model_200.d3[0m
[2m2025-05-18 18:54.36[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185435: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.79526138305664e-05, 'time_environment_step': 0.00029860734939575196, 'time_step': 0.00039613962173461913, 'rollout_return': 22

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:55.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/TD3PlusBC_online_20250518185501/model_100.d3[0m
[2m2025-05-18 18:55.01[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185501: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 6.013631820678711e-05, 'time_environment_step': 0.0003427267074584961, 'time_step': 0.0004121232032775879, 'rollout_return': 51.33205030723591, 'evaluation': 25.376022546017488}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:55.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/TD3PlusBC_online_20250518185501/model_200.d3[0m
[2m2025-05-18 18:55.01[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185501: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 4.4758319854736326e-05, 'time_environment_step': 0.0002684617042541504, 'time_step': 0.000321145057678222

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:55.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/AWAC_online_20250518185509/model_100.d3[0m
[2m2025-05-18 18:55.09[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185509: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 9.359121322631836e-05, 'time_environment_step': 0.00032078981399536133, 'time_step': 0.0004260730743408203, 'rollout_return': -59.792762787795795, 'evaluation': 196.9165140040189}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:55.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/pen/AWAC_online_20250518185509/model_200.d3[0m
[2m2025-05-18 18:55.10[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185509: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.616924285888672e-05, 'time_environment_step': 0.00031558513641357424, 'time_step': 0.0004122614860534668, 'rollout_return

<d3rlpy.algos.qlearning.awac.AWAC at 0x16692a230>

### Relocate

In [13]:
# Fine-tune every relocate policy in turn: IQL, CQL, TD3+BC, AWAC.
finetune_algorithm(algo=relocate_iql, env=relocate_env, filename='relocate_iql', task='relocate')
finetune_algorithm(algo=relocate_cql, env=relocate_env, filename='relocate_cql', task='relocate')
finetune_algorithm(algo=relocate_td3bc, env=relocate_env, filename='relocate_td3bc', task='relocate')
finetune_algorithm(algo=relocate_awac, env=relocate_env, filename='relocate_awac', task='relocate')

[2m2025-05-18 18:55.13[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(30,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 18:55.13[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 18:55.13[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m30[0m
[2m2025-05-18 18:55.13[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/relocate/IQL_online_20250518185513[0m
[2m2025-05-18 18:55.13[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [39], 'action_size': 30, 'config': {'type'

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:55.14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/IQL_online_20250518185513/model_100.d3[0m
[2m2025-05-18 18:55.14[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185513: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00013909101486206054, 'time_environment_step': 0.0002622485160827637, 'time_step': 0.0004100847244262695, 'evaluation': 377.41896823557727}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:55.14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/IQL_online_20250518185513/model_200.d3[0m
[2m2025-05-18 18:55.14[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185513: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.415388107299805e-05, 'time_environment_step': 0.00022520065307617186, 'time_step': 0.0003280830383300781, 'rollout_return': 5.055437437045705, 'evaluatio

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:55.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/CQL_online_20250518185525/model_100.d3[0m
[2m2025-05-18 18:55.26[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185525: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 9.809255599975586e-05, 'time_environment_step': 0.00022925615310668947, 'time_step': 0.0003342556953430176, 'evaluation': 6.082309953601301}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:55.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/CQL_online_20250518185525/model_200.d3[0m
[2m2025-05-18 18:55.26[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185525: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.876012802124024e-05, 'time_environment_step': 0.00022796154022216797, 'time_step': 0.0003380274772644043, 'rollout_return': 5.13331458737354, 'evaluation'

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:55.54[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/TD3PlusBC_online_20250518185554/model_100.d3[0m
[2m2025-05-18 18:55.54[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185554: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 5.391836166381836e-05, 'time_environment_step': 0.00025336027145385744, 'time_step': 0.0003142356872558594, 'evaluation': 12.34686946599695}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:55.55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/TD3PlusBC_online_20250518185554/model_200.d3[0m
[2m2025-05-18 18:55.55[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185554: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 4.3303966522216796e-05, 'time_environment_step': 0.00020636558532714842, 'time_step': 0.00025862932205200195, 'rollout_return': 14.3

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:56.05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/AWAC_online_20250518185604/model_100.d3[0m
[2m2025-05-18 18:56.05[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185604: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.0001044464111328125, 'time_environment_step': 0.0002234053611755371, 'time_step': 0.0003343057632446289, 'evaluation': 6.458218036536872}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:56.05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/relocate/AWAC_online_20250518185604/model_200.d3[0m
[2m2025-05-18 18:56.05[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185604: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.417606353759766e-05, 'time_environment_step': 0.00021026134490966796, 'time_step': 0.00030308961868286133, 'rollout_return': 9.906085778106286, 'evalua

<d3rlpy.algos.qlearning.awac.AWAC at 0x166972740>

### Hammer

In [14]:
# Fine-tune every hammer policy in turn: IQL, CQL, TD3+BC, AWAC.
finetune_algorithm(algo=hammer_iql, env=hammer_env, filename='hammer_iql', task='hammer')
finetune_algorithm(algo=hammer_cql, env=hammer_env, filename='hammer_cql', task='hammer')
finetune_algorithm(algo=hammer_td3bc, env=hammer_env, filename='hammer_td3bc', task='hammer')
finetune_algorithm(algo=hammer_awac, env=hammer_env, filename='hammer_awac', task='hammer')

[2m2025-05-18 18:56.09[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(26,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(46,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 18:56.09[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 18:56.09[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m26[0m
[2m2025-05-18 18:56.09[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/hammer/IQL_online_20250518185609[0m
[2m2025-05-18 18:56.09[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [46], 'action_size': 26, 'config': {'type': 

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:56.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/IQL_online_20250518185609/model_100.d3[0m
[2m2025-05-18 18:56.10[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185609: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00010914325714111327, 'time_environment_step': 0.0002841925621032715, 'time_step': 0.00040043115615844726, 'evaluation': -236.3017042474453}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:56.11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/IQL_online_20250518185609/model_200.d3[0m
[2m2025-05-18 18:56.11[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185609: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 9.61923599243164e-05, 'time_environment_step': 0.0002687835693359375, 'time_step': 0.00037456512451171874, 'rollout_return': -240.32897045576604, 'evaluation'

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:56.23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/CQL_online_20250518185622/model_100.d3[0m
[2m2025-05-18 18:56.23[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185622: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00010489463806152343, 'time_environment_step': 0.00027507543563842773, 'time_step': 0.00038679122924804686, 'evaluation': -233.71239611302812}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:56.23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/CQL_online_20250518185622/model_200.d3[0m
[2m2025-05-18 18:56.23[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185622: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.993148803710937e-05, 'time_environment_step': 0.0002585196495056152, 'time_step': 0.0003575730323791504, 'rollout_return': -250.8519107604349, 'evaluation

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:56.52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/TD3PlusBC_online_20250518185651/model_100.d3[0m
[2m2025-05-18 18:56.52[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185651: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 5.342245101928711e-05, 'time_environment_step': 0.00028869152069091794, 'time_step': 0.00034843921661376954, 'evaluation': -235.2808138624394}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:56.52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/TD3PlusBC_online_20250518185651/model_200.d3[0m
[2m2025-05-18 18:56.52[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185651: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 4.691839218139648e-05, 'time_environment_step': 0.00028176069259643554, 'time_step': 0.0003374576568603516, 'rollout_return': -234.178

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:57.02[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/AWAC_online_20250518185702/model_100.d3[0m
[2m2025-05-18 18:57.02[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185702: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 9.731292724609376e-05, 'time_environment_step': 0.00026671409606933595, 'time_step': 0.0003706645965576172, 'evaluation': -234.01304771673858}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:57.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/hammer/AWAC_online_20250518185702/model_200.d3[0m
[2m2025-05-18 18:57.03[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185702: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.36944580078125e-05, 'time_environment_step': 0.0002572011947631836, 'time_step': 0.00034911394119262694, 'rollout_return': -231.1628832120137, 'evaluati

<d3rlpy.algos.qlearning.awac.AWAC at 0x166b4c250>

### Door

In [15]:
# Fine-tune every door policy in turn: IQL, CQL, TD3+BC, AWAC.
finetune_algorithm(algo=door_iql, env=door_env, filename='door_iql', task='door')
finetune_algorithm(algo=door_cql, env=door_env, filename='door_cql', task='door')
finetune_algorithm(algo=door_td3bc, env=door_env, filename='door_td3bc', task='door')
finetune_algorithm(algo=door_awac, env=door_env, filename='door_awac', task='door')

[2m2025-05-18 18:57.08[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(28,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(39,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[[1]])[0m
[2m2025-05-18 18:57.08[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.CONTINUOUS: 1>[0m
[2m2025-05-18 18:57.08[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m28[0m
[2m2025-05-18 18:57.08[0m [[32m[1minfo     [0m] [1mDirectory is created at training_logs/finetuning/door/IQL_online_20250518185708[0m
[2m2025-05-18 18:57.08[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [39], 'action_size': 28, 'config': {'type': 'i

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:57.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/IQL_online_20250518185708/model_100.d3[0m
[2m2025-05-18 18:57.09[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185708: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00010792016983032226, 'time_environment_step': 0.00021985054016113281, 'time_step': 0.00033441543579101565, 'evaluation': -43.34002097914485}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:57.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/IQL_online_20250518185708/model_200.d3[0m
[2m2025-05-18 18:57.09[0m [[32m[1minfo     [0m] [1mIQL_online_20250518185708: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.922338485717774e-05, 'time_environment_step': 0.00020647048950195312, 'time_step': 0.0003066325187683105, 'rollout_return': -43.828392410832166, 'evaluation': 

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:57.20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/CQL_online_20250518185719/model_100.d3[0m
[2m2025-05-18 18:57.20[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185719: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 9.454727172851563e-05, 'time_environment_step': 0.00021110773086547853, 'time_step': 0.0003121495246887207, 'evaluation': -45.11555327928964}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:57.20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/CQL_online_20250518185719/model_200.d3[0m
[2m2025-05-18 18:57.20[0m [[32m[1minfo     [0m] [1mCQL_online_20250518185719: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.62407684326172e-05, 'time_environment_step': 0.00020612239837646484, 'time_step': 0.00030120134353637694, 'rollout_return': -45.85980322780327, 'evaluation': -45

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:57.48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/TD3PlusBC_online_20250518185747/model_100.d3[0m
[2m2025-05-18 18:57.48[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185747: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 6.841421127319336e-05, 'time_environment_step': 0.00020370244979858398, 'time_step': 0.0002798271179199219, 'evaluation': -41.14474398517384}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:57.48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/TD3PlusBC_online_20250518185747/model_200.d3[0m
[2m2025-05-18 18:57.48[0m [[32m[1minfo     [0m] [1mTD3PlusBC_online_20250518185747: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 4.4515132904052736e-05, 'time_environment_step': 0.00016727924346923828, 'time_step': 0.00021957874298095704, 'rollout_return': -40.1870399

  0%|          | 0/1000 [00:00<?, ?it/s]

[2m2025-05-18 18:57.57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/AWAC_online_20250518185757/model_100.d3[0m
[2m2025-05-18 18:57.57[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185757: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_inference': 0.00010128259658813477, 'time_environment_step': 0.00020657062530517577, 'time_step': 0.0003148198127746582, 'evaluation': -39.62667935522752}[0m [36mstep[0m=[35m100[0m
[2m2025-05-18 18:57.58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to training_logs/finetuning/door/AWAC_online_20250518185757/model_200.d3[0m
[2m2025-05-18 18:57.58[0m [[32m[1minfo     [0m] [1mAWAC_online_20250518185757: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_inference': 8.721351623535157e-05, 'time_environment_step': 0.00019151687622070312, 'time_step': 0.0002866768836975098, 'rollout_return': -39.596034923549276, 'evaluation

<d3rlpy.algos.qlearning.awac.AWAC at 0x166b4d420>