In [1]:
from pathlib import Path

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.MountainCarContinuous.models import UnifiedPolicyNetwork
from assignment3.Section2.actor_critic_finetune import actor_critic_finetune

In [2]:
# Hidden-layer widths used to build both the CartPole policy and the unified
# policy in the cells below, so the two networks line up layer for layer.
hidden_sizes = [32, 64, 32]

# Device every network in this notebook is moved to.
device = get_device()

# TensorBoard writer for this notebook; it is closed at the end of the
# training cell.
# NOTE(review): no random seed is set in the visible cells — confirm whether
# actor_critic_finetune seeds torch/the environment itself.
writer = SummaryWriter(log_dir='runs/fine_tuning')

In [3]:
def extract_hidden_layers(cartpole_model, unified_model):
    """
    Copy Linear-layer parameters from the CartPole model into the Unified model.

    The ``model`` containers of both networks are walked in lockstep (``zip``
    stops at the shorter of the two). For every position where both modules are
    ``nn.Linear`` and their weight shapes agree, the source weight — and the
    bias, when both layers have one — is copied in place into the destination
    layer. Positions holding other module types are ignored.

    A pair of Linear layers whose weight shapes differ is skipped with a
    ``UserWarning`` instead of being overwritten: replacing the parameter data
    with a differently shaped tensor would silently change the destination
    layer's dimensions and break it for the rest of the network.

    Args:
        cartpole_model: Source network exposing an iterable ``model`` attribute
            (e.g. an ``nn.Sequential``).
        unified_model: Destination network exposing an iterable ``model``
            attribute; modified in place.

    Returns:
        The same ``unified_model`` object that was passed in.
    """
    import warnings  # local import keeps this cell runnable on its own

    layer_pairs = zip(cartpole_model.model, unified_model.model)

    # no_grad: the in-place copies must not be recorded by autograd.
    with torch.no_grad():
        for index, (cp_layer, un_layer) in enumerate(layer_pairs):
            if not (isinstance(cp_layer, nn.Linear) and isinstance(un_layer, nn.Linear)):
                continue

            if cp_layer.weight.shape != un_layer.weight.shape:
                warnings.warn(
                    f"Skipping layer {index}: source weight shape "
                    f"{tuple(cp_layer.weight.shape)} does not match destination "
                    f"shape {tuple(un_layer.weight.shape)}."
                )
                continue

            # copy_ writes values into the existing parameter, so the
            # destination keeps its own storage, device and dtype.
            un_layer.weight.copy_(cp_layer.weight)

            # Linear layers built with bias=False have bias set to None.
            if cp_layer.bias is not None and un_layer.bias is not None:
                un_layer.bias.copy_(cp_layer.bias)

    return unified_model

In [4]:
# Rebuild the CartPole policy and restore its trained parameters.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs, and avoids executing arbitrary pickled objects
# from the checkpoint file.
cartpole_policy_network = PolicyNetwork(max_input_dim, hidden_sizes, max_output_dim).to(device)
cartpole_policy_network.load_state_dict(torch.load(
    '../Section1/CartPole_AcroBot/models/CartPole-v1/best/policy.pth',
    map_location=device,
    weights_only=True,
))
cartpole_policy_network.eval()  # source network is only read from in this notebook

# Build the unified policy with the same input/hidden/output sizes so its
# layers line up position by position with the CartPole policy.
unified_policy_network = UnifiedPolicyNetwork(
    input_dim=max_input_dim,
    hidden_sizes=hidden_sizes,
    output_dim=max_output_dim
).to(device)

# Copy the Linear-layer parameters from the CartPole policy into the unified
# policy, then switch the unified policy to training mode for fine-tuning.
unified_policy_network = extract_hidden_layers(cartpole_policy_network, unified_policy_network)
unified_policy_network.train()  # Set to training mode

  cartpole_policy_network.load_state_dict(torch.load(


UnifiedPolicyNetwork(
  (mean_layer): Linear(in_features=32, out_features=3, bias=True)
  (log_std_layer): Linear(in_features=32, out_features=3, bias=True)
  (model): Sequential(
    (0): Linear(in_features=6, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
  )
)

In [5]:
# Critic for fine-tuning: a freshly initialised ValueNetwork (no checkpoint is
# loaded for it), configured with hidden_sizes=[256] and put in training mode.
value_network = ValueNetwork(input_dim=max_input_dim, hidden_sizes=[256])
value_network = value_network.to(device)
value_network.train()

# Run configuration.
env_name = "MountainCarContinuous-v0"
episodes = 1000
log_dir = "runs/actor_critic"
model_save_path = "models"

# Optimisation hyperparameters. `alpha` is passed as both alpha_theta and
# alpha_w in the training call.
alpha = 0.000432860862913505
gamma = 0.9571389501004702
entropy_coeff = 0.0075942800574955

# Exploration-noise schedule.
# NOTE(review): start_noise_std is smaller than end_noise_std while
# noise_decay < 1, and the training log reports Noise STD=0.2508 from the
# first episode — confirm the schedule in actor_critic_finetune behaves as
# intended with these values.
start_noise_std = 0.12655319079849925
end_noise_std = 0.2507795275354682
noise_decay = 0.9952850823459259

In [6]:
# Run the actor-critic training loop, starting from the pre-initialized policy
# (CartPole weights transferred) and the freshly created value network.
(
    policy_network_fine_tuned,
    value_network_fine_tuned,
    rewards_per_episode,
    train_time,
    best_avg_reward_50,
) = actor_critic_finetune(
    env_name=env_name,
    input_dim=max_input_dim,
    output_dim=max_output_dim,
    alpha_theta=alpha,
    alpha_w=alpha,
    episodes=episodes,
    gamma=gamma,
    entropy_coeff=entropy_coeff,
    start_noise_std=start_noise_std,
    end_noise_std=end_noise_std,
    noise_decay=noise_decay,
    log_dir=log_dir,
    model_save_path=model_save_path,
    policy_network=unified_policy_network,  # Pass the pre-trained policy network
    value_network=value_network             # Pass the pre-initialized value network
)

print(f"Fine-tuning completed in {train_time:.2f} seconds.")
print(f"Best Average Reward over 50 episodes: {best_avg_reward_50:.2f}")
print(f"Total episodes trained: {len(rewards_per_episode)}")

# Save the fine-tuned models next to whatever the training loop wrote.
# The target directory is derived from model_save_path (instead of a second
# hard-coded 'models/' prefix) and created if missing, so torch.save cannot
# fail on a non-existent directory after a long training run.
save_dir = Path(model_save_path)
save_dir.mkdir(parents=True, exist_ok=True)
torch.save(policy_network_fine_tuned.state_dict(), save_dir / 'fine_tuned_cartpole_to_mountaincar.pth')
torch.save(value_network_fine_tuned.state_dict(), save_dir / 'fine_tuned_cartpole_to_mountaincar_value.pth')

# Release the TensorBoard writer opened in the setup cell.
writer.close()

Episode 1: Reward=59.00, Avg(100)=59.00, Avg(50)=59.00, Noise STD=0.2508
Episode 2: Reward=-92.14, Avg(100)=-16.57, Avg(50)=-16.57, Noise STD=0.2508
Episode 3: Reward=-92.89, Avg(100)=-42.01, Avg(50)=-42.01, Noise STD=0.2508
Episode 4: Reward=-91.52, Avg(100)=-54.39, Avg(50)=-54.39, Noise STD=0.2508
Episode 5: Reward=57.99, Avg(100)=-31.91, Avg(50)=-31.91, Noise STD=0.2508
Episode 6: Reward=53.61, Avg(100)=-17.66, Avg(50)=-17.66, Noise STD=0.2508
Episode 7: Reward=-93.71, Avg(100)=-28.52, Avg(50)=-28.52, Noise STD=0.2508
Episode 8: Reward=-92.45, Avg(100)=-36.51, Avg(50)=-36.51, Noise STD=0.2508
Episode 9: Reward=-92.42, Avg(100)=-42.73, Avg(50)=-42.73, Noise STD=0.2508
Episode 10: Reward=-92.81, Avg(100)=-47.73, Avg(50)=-47.73, Noise STD=0.2508
Episode 11: Reward=7.12, Avg(100)=-42.75, Avg(50)=-42.75, Noise STD=0.2508
Episode 12: Reward=-92.91, Avg(100)=-46.93, Avg(50)=-46.93, Noise STD=0.2508
Episode 13: Reward=41.05, Avg(100)=-40.16, Avg(50)=-40.16, Noise STD=0.2508
Episode 14: Rewa