# Section 2: Fine-Tuning an Existing Model


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter

from assignment3.training_loop import training_loop
from models import PolicyNetwork, ValueNetwork
from assignment3.dim_alignment import ENV_ACT_DIM, max_input_dim, max_output_dim
from assignment3.optuna_search import OptunaSearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def reinitialize_output_layer(model):
    """
    Re-initialize the final layer's weights and biases of a given nn.Sequential model.
    Expects the final layer to be `nn.Linear(..., output_dim)`.
    """
    # model is typically `nn.Sequential([..., nn.Linear(prev_size, output_dim)])`
    # So we can directly access the last layer by indexing.
    last_layer = model[-1]
    if isinstance(last_layer, nn.Linear):
        # Re-initialize
        nn.init.xavier_uniform_(last_layer.weight)
        if last_layer.bias is not None:
            nn.init.zeros_(last_layer.bias)
    else:
        raise ValueError("The last layer of the model is not a Linear layer.")


In [3]:
def fine_tune_actor_critic(
    source_policy_network,
    source_value_network,
    env_name,
    input_dim,
    output_dim,
    hidden_sizes_theta,
    hidden_sizes_w,
    alpha_theta=0.001,
    alpha_w=0.001,
    episodes=500,
    gamma=0.99,
    log_dir="runs/fine_tune"
):
    """
    Fine-tune a policy/value network that was trained on another task.
    Steps:
      1) Re-initialize the final output layer.
      2) Train on the target environment.

    Returns:
      fine_tuned_policy_network, fine_tuned_value_network, rewards_per_episode, train_time
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = gym.make(env_name)
    writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")

    # -- POLICY NETWORK --
    fine_tuned_policy_network = PolicyNetwork(input_dim, hidden_sizes_theta, output_dim).to(device)
    fine_tuned_policy_network.load_state_dict(source_policy_network.state_dict())  # copy all weights
    reinitialize_output_layer(fine_tuned_policy_network.model) # Re-initialize the final layer of the policy network

    # -- VALUE NETWORK --
    fine_tuned_value_network = ValueNetwork(input_dim, hidden_sizes_w).to(device)
    fine_tuned_value_network.load_state_dict(source_value_network.state_dict())  # copy all weights
    reinitialize_output_layer(fine_tuned_value_network.model) # Re-initialize the final layer of the value network

    policy_optimizer = optim.Adam(fine_tuned_policy_network.parameters(), lr=alpha_theta)
    value_optimizer = optim.Adam(fine_tuned_value_network.parameters(), lr=alpha_w)

    rewards_per_episode = []

    actual_act_dim = ENV_ACT_DIM[env_name]
    
    train_time = training_loop(
        input_dim=input_dim,
        actual_act_dim=actual_act_dim,
        policy_network=fine_tuned_policy_network,
        value_network=fine_tuned_value_network,
        policy_optimizer=policy_optimizer,
        value_optimizer=value_optimizer,
        env=env,
        env_name=env_name,
        gamma=gamma,
        episodes=episodes,
        device=device,
        writer=writer,
        rewards_per_episode=rewards_per_episode,
    )
    
    writer.close()
    env.close()

    return fine_tuned_policy_network, fine_tuned_value_network, rewards_per_episode, train_time


In [4]:
# TODO: to find the best hyperparameters for each environment, initialize different ranges and params for each env separately

# Common hidden sizes
hidden_sizes_theta = [16, 32, 16]
hidden_sizes_w = [16, 32, 16]
episodes = 2000
n_trials = 10
overall_results = {}

# Define your search ranges
gamma_values = [0.95, 0.99]
alpha_theta_values = [0.001, 0.0005]
alpha_w_values = [0.001, 0.0005]

In [5]:
def run_experiment(env_name, source_policy_network, source_value_network):
    optuna_search = OptunaSearch(
        train_function=fine_tune_actor_critic,
        env_name=env_name,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim,
        hidden_sizes_theta=hidden_sizes_theta,
        hidden_sizes_w=hidden_sizes_w,
        gamma_values=gamma_values,
        alpha_theta_values=alpha_theta_values,
        alpha_w_values=alpha_w_values,
        episodes=episodes,
        source_policy_network=source_policy_network,
        source_value_network=source_value_network,
    )
    best_policy, best_value, best_params, best_reward, study = optuna_search.optuna_search_for_env(n_trials=n_trials)

    print("\nDone! Best parameters found by Optuna:", best_params)
    print("Best reward from Optuna:", best_reward)


# save networks to pretrained_models
    torch.save(best_policy.state_dict(), f"pretrained_models/fine_tuned_{env_name}_policy.pth")
    torch.save(best_value.state_dict(), f"pretrained_models/fine_tuned_{env_name}_value.pth")

In [6]:
# TODO: annoying future warnings, fix them?
# Load the pre-trained models
policy_acrobot = PolicyNetwork(max_input_dim, hidden_sizes_theta, max_output_dim)
policy_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_policy.pth"))
value_acrobot = ValueNetwork(max_input_dim, hidden_sizes_w)
value_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_value.pth"))


  policy_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_policy.pth"))
  value_acrobot.load_state_dict(torch.load("pretrained_models/acrobot-v1_value.pth"))


<All keys matched successfully>

In [7]:
run_experiment("Acrobot-v1", policy_acrobot, value_acrobot)
# TODO: load_state_dict() is not working because of dim mismatch

[I 2024-12-28 16:24:13,280] A new study created in memory with name: no-name-6371784a-c3f5-4500-abf1-b964ed4e5df3
[W 2024-12-28 16:24:13,287] Trial 0 failed with parameters: {'gamma': 0.99, 'alpha_theta': 0.001, 'alpha_w': 0.001} because of the following error: RuntimeError('Error(s) in loading state_dict for ValueNetwork:\n\tsize mismatch for model.6.weight: copying a param with shape torch.Size([3, 16]) from checkpoint, the shape in current model is torch.Size([1, 16]).\n\tsize mismatch for model.6.bias: copying a param with shape torch.Size([3]) from checkpoint, the shape in current model is torch.Size([1]).').
Traceback (most recent call last):
  File "/opt/anaconda3/envs/DRLCourse/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/nadav/PycharmProjects/Deep-Reinforcement-Learning-Policy-Gradient-Methods/assignment3/optuna_search.py", line 108, in objective_wrapper
    r


[OPTUNA Trial] Env=Acrobot-v1 | gamma=0.99, alpha_theta=0.001, alpha_w=0.001


RuntimeError: Error(s) in loading state_dict for ValueNetwork:
	size mismatch for model.6.weight: copying a param with shape torch.Size([3, 16]) from checkpoint, the shape in current model is torch.Size([1, 16]).
	size mismatch for model.6.bias: copying a param with shape torch.Size([3]) from checkpoint, the shape in current model is torch.Size([1]).

In [None]:
# TODO: cant load model, fix it
policy_cartpole = PolicyNetwork(max_input_dim, hidden_sizes_theta, max_output_dim)
policy_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_policy.pth"))
value_cartpole = ValueNetwork(max_input_dim, hidden_sizes_w)
value_cartpole.load_state_dict(torch.load("pretrained_models/cartpole-v1_value.pth"))

In [7]:
run_experiment("CartPole-v1", policy_cartpole, value_cartpole)

NameError: name 'value_cartpole' is not defined