In [1]:
import os
import time

import gymnasium
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.action_selector import ActionSelector
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.CartPole_AcroBot.training_loop import training_loop

In [2]:
# Device on which every network in this notebook is placed (project helper).
device = get_device()
# No SummaryWriter is created here: the training cell builds the writer it
# actually logs to and closes it. A second writer opened in this cell was
# never used or closed, leaking a file handle and leaving an empty
# 'runs/fine_tuning' event directory on every run.

In [3]:
def reinitialize_output_layer(model, out_features=3):
    """Reset every ``nn.Linear`` layer of a given output width, in place.

    Each matching layer gets Xavier-uniform weights and, if it has a bias,
    a zero bias. All other parameters are left untouched.

    Note that matching is done on ``out_features`` only, so *any* linear
    layer of that width is reset, not just the final one.

    Args:
        model: The ``nn.Module`` whose sub-modules are scanned.
        out_features: Output width that identifies the layer(s) to reset.
            Defaults to 3, the value previously hard-coded here.

    Returns:
        The same ``model`` object, modified in place.
    """
    # modules() is enough: the module names were never used.
    for module in model.modules():
        if isinstance(module, nn.Linear) and module.out_features == out_features:
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    return model

In [4]:
# Load the pre-trained Acrobot policy. The network is built with the same
# hidden sizes ([32, 64, 32]) as the checkpoint so the state dict keys and
# shapes line up.
acrobot_policy_network = PolicyNetwork(max_input_dim, [32, 64, 32], max_output_dim).to(device)
# weights_only=True restricts unpickling to tensors and plain containers.
# That is all a state dict needs, and it prevents arbitrary code execution
# from a tampered checkpoint file.
acrobot_policy_network.load_state_dict(torch.load(
    '../Section1/CartPole_AcroBot/models/Acrobot-v1/best/policy.pth',
    map_location=device,
    weights_only=True,
))

# Re-draw the output layer's parameters before fine-tuning; the earlier
# layers keep their loaded weights.
acrobot_policy_network = reinitialize_output_layer(acrobot_policy_network)
# Put the network in training mode; as the last expression this also
# displays the architecture.
acrobot_policy_network.train()

  acrobot_policy_network.load_state_dict(torch.load(


PolicyNetwork(
  (model): Sequential(
    (0): Linear(in_features=6, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=3, bias=True)
  )
)

In [5]:
# Best hyperparameters.
# The policy network's hidden sizes ([32, 64, 32]) are fixed where the
# pre-trained policy is constructed, so only the value network's sizes are
# configured here.
episodes = 1_000                                      # training episodes
gamma = 0.98                                          # passed to training_loop as `gamma`
alpha_theta, alpha_w = 0.0007, 0.0006000000000000001  # Adam lr: policy, value
hidden_sizes_w = [16, 32, 16]                         # value network hidden layers

In [6]:
# Fine-tune the re-initialised Acrobot policy on CartPole-v1.

# Create the checkpoint directory up front, so a missing 'models/' folder
# fails (or is fixed) before training rather than at torch.save afterwards.
os.makedirs('models', exist_ok=True)

# Fresh value network; it is trained from scratch alongside the policy.
value_network = ValueNetwork(max_input_dim, hidden_sizes_w).to(device)
value_network.train()

# One Adam optimizer per network, each with its own learning rate.
policy_optimizer = Adam(acrobot_policy_network.parameters(), lr=alpha_theta)
value_optimizer = Adam(value_network.parameters(), lr=alpha_w)

# Filled in place by training_loop; its length is reported below.
rewards_per_episode = []

action_selector = ActionSelector()

# TensorBoard writer and environment are released in the `finally` below.
writer = SummaryWriter(log_dir='fine_tuning_acrobot_to_cartpole')
env = gymnasium.make("CartPole-v1")

try:
    # training_loop returns the elapsed training time itself, so no separate
    # wall-clock timestamp is taken here.
    train_time, avg_reward = training_loop(
        input_dim=max_input_dim,
        actual_act_dim=env.action_space.n,  # CartPole has 2 actions
        policy_network=acrobot_policy_network,
        value_network=value_network,
        policy_optimizer=policy_optimizer,
        value_optimizer=value_optimizer,
        env=env,
        env_name="CartPole-v1",
        episodes=episodes,
        gamma=gamma,
        writer=writer,
        rewards_per_episode=rewards_per_episode,
        action_selector=action_selector
    )
finally:
    # Release resources even if training raises or is interrupted.
    # NOTE(review): assumes a second env.close() is harmless should
    # training_loop already close the environment; confirm.
    env.close()
    writer.close()

print(f"Fine-tuning completed in {train_time:.2f} seconds.")
print(f"Average Reward {avg_reward:.2f}.")
print(f"Total episodes trained: {len(rewards_per_episode)}")

# Save the fine-tuned model
torch.save(acrobot_policy_network.state_dict(), 'models/fine_tuned_acrobot_to_cartpole.pth')
torch.save(value_network.state_dict(), 'models/fine_tuned_acrobot_to_cartpole_value.pth')

Training: 100%|██████████| 1000/1000 [00:30<00:00, 32.92episode/s, Avg Reward(100)=9.42]

Fine-tuning completed in 30.38 seconds.
Average Reward 9.42.
Total episodes trained: 1000



