# Section 2: Fine-Tuning an Existing Model


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter
import json
from assignment3.device import get_device
from assignment3.training_loop import training_loop
from models import PolicyNetwork, ValueNetwork
from assignment3.dim_alignment import ENV_ACT_DIM, max_input_dim, max_output_dim
from assignment3.optuna_search import OptunaSearch, StudyFloatParamRange

In [2]:
def reinitialize_output_layer(model):
    """
    Re-initialize the final layer's weights and biases of a given nn.Sequential model.
    Expects the final layer to be `nn.Linear(..., output_dim)`.
    """
    # model is typically `nn.Sequential([..., nn.Linear(prev_size, output_dim)])`
    # So we can directly access the last layer by indexing.
    last_layer = model[-1]
    if isinstance(last_layer, nn.Linear):
        # Re-initialize
        nn.init.xavier_uniform_(last_layer.weight)
        if last_layer.bias is not None:
            nn.init.zeros_(last_layer.bias)
    else:
        raise ValueError("The last layer of the model is not a Linear layer.")


In [3]:
def fine_tune_actor_critic(
    source_policy_network,
    source_value_network,
    env_name,
    input_dim,
    output_dim,
    hidden_sizes_theta,
    hidden_sizes_w,
    dropout_layers = None,
    alpha_theta=0.001,
    alpha_w=0.001,
    episodes=500,
    gamma=0.99,
    dropout_p=0.7,
    log_dir="runs/fine_tune"
):
    """
    Fine-tune a policy/value network that was trained on another task.
    Steps:
      1) Re-initialize the final output layer.
      2) Train on the target environment.

    Returns:
      fine_tuned_policy_network, fine_tuned_value_network, rewards_per_episode, train_time
    """
    device = get_device()
    env = gym.make(env_name)
    writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")

    # -- POLICY NETWORK --
    fine_tuned_policy_network = PolicyNetwork(input_dim, hidden_sizes_theta, output_dim, dropout_layers, dropout_p).to(device)
    fine_tuned_policy_network.load_state_dict(source_policy_network.state_dict())  # copy all weights
    reinitialize_output_layer(fine_tuned_policy_network.model) # Re-initialize the final layer of the policy network

    # -- VALUE NETWORK --
    fine_tuned_value_network = ValueNetwork(input_dim, hidden_sizes_w).to(device)
    fine_tuned_value_network.load_state_dict(source_value_network.state_dict())  # copy all weights
    reinitialize_output_layer(fine_tuned_value_network.model) # Re-initialize the final layer of the value network

    policy_optimizer = optim.Adam(fine_tuned_policy_network.parameters(), lr=alpha_theta)
    value_optimizer = optim.Adam(fine_tuned_value_network.parameters(), lr=alpha_w)

    rewards_per_episode = []

    actual_act_dim = ENV_ACT_DIM[env_name]
    
    train_time = training_loop(
        input_dim=input_dim,
        actual_act_dim=actual_act_dim,
        policy_network=fine_tuned_policy_network,
        value_network=fine_tuned_value_network,
        policy_optimizer=policy_optimizer,
        value_optimizer=value_optimizer,
        env=env,
        env_name=env_name,
        gamma=gamma,
        episodes=episodes,
        writer=writer,
        rewards_per_episode=rewards_per_episode,
    )
    
    writer.close()
    env.close()

    return fine_tuned_policy_network, fine_tuned_value_network, rewards_per_episode, train_time


In [4]:
# TODO: to find the best hyperparameters for each environment, initialize different ranges and params for each env separately

episodes = 2000
n_trials = 10
overall_results = {}

# Common hidden sizes options
hidden_sizes_theta_values = ["[16, 32, 16]", "[32, 64, 32]"]
hidden_sizes_w_values = ["[16, 32, 16]", "[32, 64, 32]"]
dropout_layers = [1]

# Define your search ranges
gamma_values = StudyFloatParamRange(low=0.95, high=0.99, step=0.01)
alpha_theta_values = StudyFloatParamRange(low=0.0005, high=0.0008, step=0.0001)
alpha_w_values = StudyFloatParamRange(low=0.0005, high=0.0008, step=0.0001)
dropout_p_values = StudyFloatParamRange(low=0.2, high=0.5, step=0.1)

In [5]:
def run_experiment(env_name, source_policy_network, source_value_network, fixed_hidden_theta, fixed_hidden_w):
    optuna_search = OptunaSearch(
        train_function=fine_tune_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        hidden_sizes_theta_values=hidden_sizes_theta_values,
        hidden_sizes_w_values=hidden_sizes_theta_values,
        dropout_layers=dropout_layers,
        gamma_values=gamma_values,
        alpha_theta_values=alpha_theta_values,
        alpha_w_values=alpha_w_values,
        dropout_p_values=dropout_p_values,
        episodes=episodes,
    )
    best_policy, best_value, best_params, best_reward, study = optuna_search.optuna_search_for_env(
        n_trials=n_trials,
        source_policy_network=source_policy_network,
        source_value_network=source_value_network,
        fixed_hidden_theta=fixed_hidden_theta,
        fixed_hidden_w=fixed_hidden_w,
    )

    print("\nDone! Best parameters found by Optuna:", best_params)
    print("Best reward from Optuna:", best_reward)


# save networks to pretrained_models
    torch.save(best_policy.state_dict(), f"pretrained_models/fine_tuned_{env_name}_policy.pth")
    torch.save(best_value.state_dict(), f"pretrained_models/fine_tuned_{env_name}_value.pth")

In [8]:
# Read hidden sizes from the best hyperparameters found by Optuna from json file
with open("best_params/acrobot-v1_actor_critic_study.json", "r") as f:
    acrobot_hyperparameters = json.load(f)
    acrobot_hidden_sizes_theta = eval(acrobot_hyperparameters["hidden_sizes_theta"])
    acrobot_hidden_sizes_w = eval(acrobot_hyperparameters["hidden_sizes_w"])
    acrobot_dropout_p = acrobot_hyperparameters["dropout_p"]
    print(acrobot_hidden_sizes_theta, acrobot_hidden_sizes_w)

[32, 64, 32] [16, 32, 16]


In [9]:
# TODO: annoying future warnings, fix them?
# Load the pre-trained models
policy_acrobot = PolicyNetwork(max_input_dim, acrobot_hidden_sizes_theta, max_output_dim, dropout_layers, acrobot_dropout_p)
policy_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_policy.pth"))
value_acrobot = ValueNetwork(max_input_dim, acrobot_hidden_sizes_w)
value_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_value.pth"))


  policy_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_policy.pth"))
  value_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_value.pth"))


<All keys matched successfully>

In [12]:
run_experiment("CartPole-v1", policy_acrobot, value_acrobot, acrobot_hidden_sizes_theta, acrobot_hidden_sizes_w)

[I 2024-12-29 21:59:40,750] A new study created in memory with name: CartPole-v1_study



[OPTUNA Trial 0] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.97, dropout_p=0.30000000000000004,
         alpha_theta=0.0008, alpha_w=0.0006000000000000001


Training: 100%|██████████| 2000/2000 [00:08<00:00, 233.05episode/s, Avg Reward(100)=9.37]
[I 2024-12-29 21:59:49,340] Trial 0 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 2000.0.



[OPTUNA Trial 1] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.95, dropout_p=0.30000000000000004,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training: 100%|██████████| 2000/2000 [02:14<00:00, 14.92episode/s, Avg Reward(100)=236.10]
[I 2024-12-29 22:02:03,406] Trial 1 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.95, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 2000.0.



[OPTUNA Trial 2] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.97, dropout_p=0.2,
         alpha_theta=0.0005, alpha_w=0.0007


Training: 100%|██████████| 2000/2000 [00:38<00:00, 52.02episode/s, Avg Reward(100)=9.20]  
[I 2024-12-29 22:02:41,854] Trial 2 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0005, 'alpha_w': 0.0007, 'dropout_p': 0.2}. Best is trial 0 with value: 2000.0.



[OPTUNA Trial 3] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.4,
         alpha_theta=0.0008, alpha_w=0.0006000000000000001


Training: 100%|██████████| 2000/2000 [00:14<00:00, 134.10episode/s, Avg Reward(100)=9.37]
[I 2024-12-29 22:02:56,771] Trial 3 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.96, 'alpha_theta': 0.0008, 'alpha_w': 0.0006000000000000001, 'dropout_p': 0.4}. Best is trial 0 with value: 2000.0.



[OPTUNA Trial 4] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.97, dropout_p=0.30000000000000004,
         alpha_theta=0.0008, alpha_w=0.0005


Training: 100%|██████████| 2000/2000 [00:10<00:00, 197.45episode/s, Avg Reward(100)=9.30]
[I 2024-12-29 22:03:06,903] Trial 4 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.97, 'alpha_theta': 0.0008, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}. Best is trial 0 with value: 2000.0.



[OPTUNA Trial 5] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.30000000000000004,
         alpha_theta=0.0005, alpha_w=0.0005


Training:  95%|█████████▌| 1900/2000 [01:31<00:04, 20.73episode/s, Avg Reward(100)=474.45]
[I 2024-12-29 22:04:38,571] Trial 5 finished with value: 1901.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0005, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}. Best is trial 5 with value: 1901.0.


Solved CartPole-v1 in 1901 episodes!

[OPTUNA Trial 6] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.5,
         alpha_theta=0.0006000000000000001, alpha_w=0.0008


Training:  99%|█████████▊| 1974/2000 [00:57<00:00, 34.29episode/s, Avg Reward(100)=316.02]
[I 2024-12-29 22:05:36,144] Trial 6 finished with value: 1975.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0006000000000000001, 'alpha_w': 0.0008, 'dropout_p': 0.5}. Best is trial 5 with value: 1901.0.


Solved CartPole-v1 in 1975 episodes!

[OPTUNA Trial 7] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.98, dropout_p=0.4,
         alpha_theta=0.0007, alpha_w=0.0008


Training: 100%|██████████| 2000/2000 [00:10<00:00, 189.33episode/s, Avg Reward(100)=48.27]
[I 2024-12-29 22:05:46,711] Trial 7 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.98, 'alpha_theta': 0.0007, 'alpha_w': 0.0008, 'dropout_p': 0.4}. Best is trial 5 with value: 1901.0.



[OPTUNA Trial 8] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.30000000000000004,
         alpha_theta=0.0008, alpha_w=0.0008


Training: 100%|██████████| 2000/2000 [00:08<00:00, 228.11episode/s, Avg Reward(100)=9.40]
[I 2024-12-29 22:05:55,482] Trial 8 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'dropout_p': 0.30000000000000004}. Best is trial 5 with value: 1901.0.



[OPTUNA Trial 9] Env=CartPole-v1:
        hidden_sizes_theta=[32, 64, 32], hidden_sizes_w=[16, 32, 16],
         gamma=0.96, dropout_p=0.5,
         alpha_theta=0.0008, alpha_w=0.0008


Training: 100%|██████████| 2000/2000 [00:41<00:00, 48.17episode/s, Avg Reward(100)=9.44]   
[I 2024-12-29 22:06:37,004] Trial 9 finished with value: 2000.0 and parameters: {'hidden_sizes_theta': '[32, 64, 32]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.96, 'alpha_theta': 0.0008, 'alpha_w': 0.0008, 'dropout_p': 0.5}. Best is trial 5 with value: 1901.0.



[OPTUNA] Best trial: trail 5
  Value (Reward): 1901.00
  Params: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[16, 32, 16]', 'gamma': 0.98, 'alpha_theta': 0.0005, 'alpha_w': 0.0005, 'dropout_p': 0.30000000000000004}


TypeError: eval() arg 1 must be a string, bytes or code object

In [13]:
# Read hidden sizes from the best hyperparameters found by Optuna from json file
with open("best_params/cartpole-v1_actor_critic_study.json", "r") as f:
    cartpole_hyperparameters = json.load(f)
    cartpole_hidden_sizes_theta = eval(cartpole_hyperparameters["hidden_sizes_theta"])
    cartpole_hidden_sizes_w = eval(cartpole_hyperparameters["hidden_sizes_w"])
    cartpole_dropout_p = cartpole_hyperparameters["dropout_p"]
    print(cartpole_hidden_sizes_theta, cartpole_hidden_sizes_w)

[16, 32, 16] [32, 64, 32]


In [14]:

policy_cartpole = PolicyNetwork(max_input_dim, cartpole_hidden_sizes_theta, max_output_dim, dropout_layers, cartpole_dropout_p)
policy_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_policy.pth"))
value_cartpole = ValueNetwork(max_input_dim, cartpole_hidden_sizes_w)
value_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_value.pth"))

  policy_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_policy.pth"))
  value_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_value.pth"))


<All keys matched successfully>

In [15]:
run_experiment("MountainCarContinuous-v0", policy_cartpole, value_cartpole, cartpole_hidden_sizes_theta, cartpole_hidden_sizes_w)

[I 2024-12-29 22:10:32,499] A new study created in memory with name: MountainCarContinuous-v0_study



[OPTUNA Trial 0] Env=MountainCarContinuous-v0:
        hidden_sizes_theta=[16, 32, 16], hidden_sizes_w=[32, 64, 32],
         gamma=0.95, dropout_p=0.30000000000000004,
         alpha_theta=0.0007, alpha_w=0.0007


Training:   2%|▏         | 36/2000 [00:14<13:30,  2.42episode/s]
[W 2024-12-29 22:10:47,358] Trial 0 failed with parameters: {'hidden_sizes_theta': '[16, 32, 16]', 'hidden_sizes_w': '[32, 64, 32]', 'gamma': 0.95, 'alpha_theta': 0.0007, 'alpha_w': 0.0007, 'dropout_p': 0.30000000000000004} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 153, in objective_wrapper
    return self.objective(
           ^^^^^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 116, in objective
    policy_network, value_network, rewards, train_time = self.train_function(**train_par

KeyboardInterrupt: 