In [1]:
import time
import gymnasium
import numpy as np
import optuna
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from assignment3.Section1.CartPole_AcroBot.action_selector import ActionSelector
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
from assignment3.Section1.MountainCarContinuous.models import UnifiedPolicyNetwork

device = get_device()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_source_networks(device, max_input_dim, max_output_dim):
    """
    Loads pre-trained Acrobot and MountainCarContinuous networks
    and freezes them (requires_grad=False).
    """
    # ---- Source 1: Acrobot Policy (discrete) ----
    acrobot_policy = PolicyNetwork(
        input_dim=max_input_dim,
        hidden_sizes=[32, 64, 32],
        output_dim=max_output_dim
    ).to(device)
    acrobot_policy.load_state_dict(torch.load(
        '../Section1/CartPole_AcroBot/models/Acrobot-v1/best/policy.pth',
        map_location=device,
        weights_only=True  # Add this parameter
    ))
    acrobot_policy.eval()
    for param in acrobot_policy.parameters():
        param.requires_grad = False

    # ---- Source 2: MountainCarContinuous Policy (continuous) ----
    mountaincar_policy = UnifiedPolicyNetwork(
        input_dim=max_input_dim,
        hidden_sizes=[256],
        output_dim=max_output_dim
    ).to(device)
    mountaincar_policy.load_state_dict(torch.load(
        '../Section1/MountainCarContinuous/models/BestModel/policy.pth',
        map_location=device,
        weights_only=True  # Add this parameter
    ))
    mountaincar_policy.eval()
    for param in mountaincar_policy.parameters():
        param.requires_grad = False

    return acrobot_policy, mountaincar_policy

In [3]:
class ProgressiveCartPoleNetwork(nn.Module):
    def __init__(
            self,
            source_acrobot,  # has hidden [32, 64, 32]
            source_mountaincar,  # has hidden [256]
            target_hidden_sizes=[32, 64, 32],  # CartPole
            input_dim=6,
            output_dim=2
    ):
        super().__init__()

        # Freeze source networks & store them.
        self.source_acrobot = source_acrobot.eval()
        self.source_mountaincar = source_mountaincar.eval()
        for p in self.source_acrobot.parameters():
            p.requires_grad = False
        for p in self.source_mountaincar.parameters():
            p.requires_grad = False

        # Extract the layers from each source so we can get hidden activations
        self.acrobot_layers = nn.Sequential(*self.source_acrobot.model)
        self.mountaincar_layers = nn.Sequential(*self.source_mountaincar.model)

        # Build the target's hidden layers
        self.target_hidden_layers = nn.ModuleList()
        prev_size = input_dim
        for hs in target_hidden_sizes:  # e.g. [32, 64, 32]
            self.target_hidden_layers.append(nn.Linear(prev_size, hs))
            prev_size = hs

        # Final output layer
        self.final_layer = nn.Linear(prev_size, output_dim)

        # ADAPTERS:
        # Define an adapter for each "top" connection that has dimension mismatch
        # Acrobot top layer is 32 => CartPole top is 32 (same) => identity is fine
        # MountainCar top layer is 256 => CartPole top is 32 => need linear
        self.adapter_acrobot_3 = nn.Identity()
        self.adapter_mountaincar_1 = nn.Linear(256, 32, bias=False)

        self.adapter_acrobot_2 = nn.Identity()
        self.adapter_acrobot_1 = nn.Identity()
        # (MountainCar has no 2nd or 1st layer to connect, so skip.)

    def forward_source_acrobot(self, x):
        """
        Forward x through the Acrobot net, capturing hidden activations after each layer+ReLU.
        We'll store them in a list [act1, act2, act3].
        """
        activations = []
        current = x
        relu_count = 0

        for layer in self.acrobot_layers:
            current = layer(current)
            if isinstance(layer, nn.ReLU):
                activations.append(current.clone())
                relu_count += 1
                if relu_count == 3:
                    break
        return activations  

    def forward_source_mountaincar(self, x):
        """
        MountainCar net has 1 hidden layer of size 256 (plus final mean/std layers).
        We'll capture the output after that 256 -> ReLU, ignoring the final mean/std.
        """
        activations = []
        current = x
        for layer in self.mountaincar_layers:
            current = layer(current)
            if isinstance(layer, nn.ReLU):
                # This is the single hidden layer output (256)
                activations.append(current.clone())
                break
        return activations

    def forward(self, x):
        # get hidden states from each source
        acrobot_hiddens = self.forward_source_acrobot(x) 
        mountaincar_hiddens = self.forward_source_mountaincar(x) 

        # forward through target layers
        current = x
        # Target layer 1 (size 32)
        out_target_1 = self.target_hidden_layers[0](current)  # => shape(?,32)
        # Combine with acrobot's 1st hidden (32). MountainCar has no “1st hidden” beyond the single top layer
        out_ac_1 = self.adapter_acrobot_1(acrobot_hiddens[0])  # shape(?,32)
        current = out_target_1 + out_ac_1
        current = nn.functional.relu(current)

        # Target layer 2 (size 64)
        out_target_2 = self.target_hidden_layers[1](current)  # => shape(?,64)
        out_ac_2 = self.adapter_acrobot_2(acrobot_hiddens[1])  # shape(?,64)
        current = out_target_2 + out_ac_2
        current = nn.functional.relu(current)

        # Target layer 3 (size 32)
        out_target_3 = self.target_hidden_layers[2](current)  # => shape(?,32)
        out_ac_3 = self.adapter_acrobot_3(acrobot_hiddens[2])  # shape(?,32)

        # MountainCar only has 1 hidden layer (256), so connect it here with an adapter
        out_mc_1 = self.adapter_mountaincar_1(mountaincar_hiddens[0])  # shape(?,32)

        # Combine them
        current = out_target_3 + out_ac_3 + out_mc_1
        current = nn.functional.relu(current)

        # final linear -> logits -> softmax for discrete actions
        logits = self.final_layer(current)
        return torch.softmax(logits, dim=-1) + 1e-8


In [4]:
def pad_state(state, target_dim=6):
    """
    Given a 1D state vector (e.g., from CartPole with shape (4,)),
    return a zero-padded vector of length `target_dim` (e.g., 6).
    """
    state = np.array(state, dtype=np.float32)  # ensure numpy float32
    if state.shape[0] == target_dim:
        return state  # already the right size
    elif state.shape[0] < target_dim:
        # pad zeros at the end
        padded = np.zeros((target_dim,), dtype=np.float32)
        padded[: state.shape[0]] = state
        return padded
    else:
        raise ValueError(f"State has more dimensions ({state.shape[0]}) than target_dim ({target_dim}).")

In [5]:
def train_progressive_cartpole_actor_critic(
        device,
        source_acrobot,
        source_mountaincar,
        env_name="CartPole-v1",
        episodes=1000,
        gamma=0.98,
        alpha_t=1e-3,
        alpha_w=1e-3,
        hidden_sizes_t=[32, 64, 32],
        hidden_sizes_w=[32, 64, 32],
        writer=None
):
    """
    Trains the progressive CartPole network using an Actor-Critic approach:
      - 1-step TD error
      - Policy update via delta
      - Value update via delta
    """
    # Make the environment
    env = gymnasium.make(env_name)

    # Progressive policy network (frozen sources + new target)
    policy_network = ProgressiveCartPoleNetwork(
        source_acrobot=source_acrobot,
        source_mountaincar=source_mountaincar,
        target_hidden_sizes=hidden_sizes_t,
        input_dim=max_input_dim,  # we will pad CartPole(4) up to 6
        output_dim=2
    ).to(device)

    # Value network
    value_network = ValueNetwork(
        input_dim=max_input_dim,  # also 6
        hidden_sizes=hidden_sizes_w
    ).to(device)

    policy_optimizer = Adam(policy_network.parameters(), lr=alpha_t)
    value_optimizer = Adam(value_network.parameters(), lr=alpha_w)

    # Simple action-selector for discrete actions
    action_selector = ActionSelector()

    # Logging
    rewards_per_episode = []
    start_time = time.time()

    best_avg_reward = float('-inf')

    # Training loop (Actor-Critic)
    with tqdm(total=episodes, desc="Training", unit="episode") as pbar:
        for episode in range(episodes):
            state, _ = env.reset()
            done = False
            truncated = False
            total_reward = 0.0

            I = 1.0  # discount factor for policy updates

            while not (done or truncated):
                # Pad the state to 6D
                padded_state = pad_state(state, target_dim=max_input_dim)
                state_tensor = torch.tensor(
                    padded_state, dtype=torch.float32, device=device
                ).unsqueeze(0)  # shape [1,6]

                # Forward pass: policy network
                action_probs = policy_network(state_tensor)  # shape [1,2]
                # sample discrete action
                action, log_prob_action = action_selector.select_action(
                    action_probs, valid_action_dim=2
                )

                # Step in environment
                next_state, reward, done, truncated, info = env.step(action)
                total_reward += reward

                # Pad next state
                padded_next_state = pad_state(next_state, target_dim=max_input_dim)
                next_state_tensor = torch.tensor(
                    padded_next_state, dtype=torch.float32, device=device
                ).unsqueeze(0)

                # Current value
                value = value_network(state_tensor)  # shape [1,1]
                # Next value
                with torch.no_grad():
                    next_value = (
                        value_network(next_state_tensor)
                        if not (done or truncated)
                        else torch.tensor([[0.0]], device=device)
                    )

                # TD error
                delta = reward + gamma * next_value - value

                # Value loss: we do gradient ascent on negative loss => negative of (value * delta)
                value_loss = -value * delta.detach() * I
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()

                # Policy loss
                policy_loss = -log_prob_action * delta.detach() * I
                policy_optimizer.zero_grad()
                policy_loss.backward()
                policy_optimizer.step()

                # Update multiplier
                I *= gamma

                # Move on
                state = next_state

            # Logging
            rewards_per_episode.append(total_reward)
            if writer is not None:
                writer.add_scalar("ActorCritic/Episode_Reward", total_reward, episode)
                writer.add_scalar("ActorCritic/ValueLoss", value_loss.item(), episode)

            # Check average reward
            avg_reward_100 = np.mean(rewards_per_episode[-100:])

            if episode > 100:
                best_avg_reward = max(best_avg_reward, avg_reward_100)

            pbar.set_postfix({'Avg(100)': f"{avg_reward_100:.2f}"})
            pbar.update(1)

            # Possibly check if solved
            if env_name == "CartPole-v1" and avg_reward_100 >= 475.0:
                print(f"Solved CartPole in {episode + 1} episodes!")
                break

    train_time = time.time() - start_time
    env.close()
    print('Training finished with best avg. reward of:', best_avg_reward)
    return policy_network, value_network, rewards_per_episode, train_time, best_avg_reward

In [6]:
# acrobot_policy, mountaincar_policy = load_source_networks(device, max_input_dim, max_output_dim)
#
# # 2) Actor-Critic hyperparameters
# episodes = 3000
# gamma = 0.97
# alpha_t = 0.0007
# alpha_w = 0.0007
# hidden_sizes_t = [32, 64, 32]
# hidden_sizes_w = [16, 32, 16]
#
# writer = SummaryWriter(log_dir="runs/progressive_acrobot_mountaincar_to_cartpole_ac")
#
# # 3) Train progressive CartPole via Actor-Critic
# progressive_policy, progressive_value, rewards, duration = train_progressive_cartpole_actor_critic(
#     device=device,
#     source_acrobot=acrobot_policy,
#     source_mountaincar=mountaincar_policy,
#     env_name="CartPole-v1",
#     episodes=episodes,
#     gamma=gamma,
#     alpha_t=alpha_t,
#     alpha_w=alpha_w,
#     hidden_sizes_t=hidden_sizes_t,
#     hidden_sizes_w=hidden_sizes_w,
#     writer=writer
# )
#
# writer.close()
#
# print(f"\nTraining finished in {duration:.2f} seconds.")
# print(f"Final average reward (last 100 episodes): {np.mean(rewards[-100:]):.2f}")
#
# # 4) Save the trained progressive policy & value networks
# torch.save(progressive_policy.state_dict(), "models/progressive_acrobot_mountaincar_to_cartpole_policy.pth")
# torch.save(progressive_value.state_dict(), "models/progressive_acrobot_mountaincar_to_cartpole_value.pth")

In [7]:
def objective(trial):
    """
    Objective function for Optuna hyperparameter optimization.

    Args:
        trial (optuna.trial.Trial): A trial object for suggesting hyperparameters.

    Returns:
        float: The performance metric to minimize or maximize (e.g., average reward).
    """
    # Suggest hyperparameters
    gamma = trial.suggest_float('gamma', 0.90, 0.99, step=0.01)
    alpha_t = trial.suggest_float('alpha_t', 0.00001, 0.001, log=True)
    alpha_w = trial.suggest_float('alpha_w', 0.00001, 0.001, log=True)

    # Load source networks
    acrobot_policy, mountaincar_policy = load_source_networks(
        device=device,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim
    )

    # Initialize TensorBoard writer (optional, can be None)
    writer = SummaryWriter(log_dir=f"runs/progressive_acrobot_mountaincar_to_cartpole_optuna_trial_{trial.number}")

    # Train the model
    policy_net, value_net, rewards, duration, best_avg_reward = train_progressive_cartpole_actor_critic(
        device=device,
        source_acrobot=acrobot_policy,
        source_mountaincar=mountaincar_policy,
        env_name="CartPole-v1",
        episodes=3000,  # You might reduce this for quicker trials
        gamma=gamma,
        alpha_t=alpha_t,
        alpha_w=alpha_w,
        hidden_sizes_t=[32, 64, 32],
        hidden_sizes_w=[32, 64, 32],
        writer=writer
    )

    return best_avg_reward

In [8]:
def optimize_hyperparameters(n_trials=10):
    """
    Sets up and runs the Optuna study for hyperparameter optimization.

    Args:
        n_trials (int): Number of Optuna trials to run.

    Returns:
        study (optuna.study.Study): The completed Optuna study.
    """
    study = optuna.create_study(
        direction='maximize',  # Since we're maximizing average reward
        sampler=optuna.samplers.TPESampler(seed=42)  # Reproducible results
    )
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return study

In [10]:
study = optimize_hyperparameters(n_trials=10)

[I 2025-01-09 09:26:30,413] A new study created in memory with name: no-name-20948e57-8aba-4b4e-953a-cfc4b627378c
Training: 100%|██████████| 3000/3000 [05:38<00:00,  8.86episode/s, Avg(100)=9.33]   
[I 2025-01-09 09:32:09,175] Trial 0 finished with value: 191.23 and parameters: {'gamma': 0.93, 'alpha_t': 0.0007969454818643932, 'alpha_w': 0.000291063591313307}. Best is trial 0 with value: 191.23.


Training finished with best avg. reward of: 191.23


Training: 100%|██████████| 3000/3000 [03:07<00:00, 15.97episode/s, Avg(100)=40.70]
[I 2025-01-09 09:35:16,993] Trial 1 finished with value: 41.15 and parameters: {'gamma': 0.9500000000000001, 'alpha_t': 2.0513382630874486e-05, 'alpha_w': 2.0511104188433963e-05}. Best is trial 0 with value: 191.23.


Training finished with best avg. reward of: 41.15


Training: 100%|██████████| 3000/3000 [18:39<00:00,  2.68episode/s, Avg(100)=9.37]    
[I 2025-01-09 09:53:56,734] Trial 2 finished with value: 444.1 and parameters: {'gamma': 0.9, 'alpha_t': 0.0005399484409787432, 'alpha_w': 0.00015930522616241006}. Best is trial 2 with value: 444.1.


Training finished with best avg. reward of: 444.1


Training:  71%|███████▏  | 2143/3000 [34:17<13:42,  1.04episode/s, Avg(100)=476.91]
[I 2025-01-09 10:28:14,699] Trial 3 finished with value: 476.91 and parameters: {'gamma': 0.97, 'alpha_t': 1.0994335574766187e-05, 'alpha_w': 0.0008706020878304854}. Best is trial 3 with value: 476.91.


Solved CartPole in 2143 episodes!
Training finished with best avg. reward of: 476.91


Training: 100%|██████████| 3000/3000 [05:16<00:00,  9.48episode/s, Avg(100)=55.48]
[I 2025-01-09 10:33:31,130] Trial 4 finished with value: 55.99 and parameters: {'gamma': 0.98, 'alpha_t': 2.6587543983272695e-05, 'alpha_w': 2.3102018878452926e-05}. Best is trial 3 with value: 476.91.


Training finished with best avg. reward of: 55.99


Training:  96%|█████████▋| 2894/3000 [27:55<01:01,  1.73episode/s, Avg(100)=377.54]
[W 2025-01-09 11:01:26,518] Trial 5 failed with parameters: {'gamma': 0.91, 'alpha_t': 4.059611610484306e-05, 'alpha_w': 0.00011207606211860574} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\rusanov\.conda\envs\DRL\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\rusanov\AppData\Local\Temp\ipykernel_4080\1885987866.py", line 27, in objective
    policy_net, value_net, rewards, duration, best_avg_reward = train_progressive_cartpole_actor_critic(
                                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\rusanov\AppData\Local\Temp\ipykernel_4080\604106433.py", line 70, in train_progressive_cartpole_actor_critic
    action, log_prob_action = action_selector.select_action(
          

KeyboardInterrupt: 