In [9]:
import os
import time
import gymnasium
import numpy as np
import torch
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from assignment3.Section1.CartPole_AcroBot.device import get_device
from assignment3.Section1.CartPole_AcroBot.models import PolicyNetwork, ValueNetwork
from assignment3.Section1.CartPole_AcroBot.dim_alignment import max_input_dim, max_output_dim
import optuna

device = get_device()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_source_networks(device, max_input_dim, max_output_dim):
    """
    Loads pre-trained Acrobot and CartPole networks
    and freezes them (requires_grad=False).
    """
    # ---- Source 1: Acrobot Policy (discrete) ----
    acrobot_policy = PolicyNetwork(
        input_dim=max_input_dim,
        hidden_sizes=[32, 64, 32],
        output_dim=max_output_dim
    ).to(device)
    acrobot_policy.load_state_dict(torch.load(
        '../Section1/CartPole_AcroBot/models/Acrobot-v1/best/policy.pth',
        map_location=device
    ))
    acrobot_policy.eval()
    for param in acrobot_policy.parameters():
        param.requires_grad = False

    # ---- Source 2: CartPole Policy (discrete) ----
    cartpole_policy = PolicyNetwork(
        input_dim=max_input_dim,
        hidden_sizes=[32, 64, 32],
        output_dim=max_output_dim
    ).to(device)
    cartpole_policy.load_state_dict(torch.load(
        '../Section1/CartPole_AcroBot/models/CartPole-v1/best/policy.pth',
        map_location=device
    ))
    cartpole_policy.eval()
    for param in cartpole_policy.parameters():
        param.requires_grad = False

    return acrobot_policy, cartpole_policy

In [4]:
class ProgressiveMountainCarNetwork(nn.Module):
    """
    Progressive Network for MountainCarContinuous.
    Takes two *discrete* source networks (Acrobot, CartPole)
    each with hidden sizes [32, 64, 32]. Then
    builds a new "target" column [32, 64, 32], merges
    hidden activations at each layer, and finally produces
    (mean, std) for continuous actions.
    """

    def __init__(
            self,
            source_acrobot,  # PolicyNetwork [32,64,32]
            source_cartpole,  # PolicyNetwork [32,64,32]
            target_hidden_sizes=[32, 64, 32],
            input_dim=6,
            output_dim=3,  # We'll only actually use 1 dimension for MountainCarContinuous
    ):
        super().__init__()
        # Freeze sources
        self.source_acrobot = source_acrobot.eval()
        self.source_cartpole = source_cartpole.eval()
        for p in self.source_acrobot.parameters():
            p.requires_grad = False
        for p in self.source_cartpole.parameters():
            p.requires_grad = False

        # Extract the layers from each source so we can get hidden activations
        self.acrobot_layers = nn.Sequential(*self.source_acrobot.model)  # 32->64->32
        self.cartpole_layers = nn.Sequential(*self.source_cartpole.model)  # 32->64->32

        # Build the target's hidden layers [32,64,32]
        self.target_hidden_layers = nn.ModuleList()
        prev_size = input_dim
        for hs in target_hidden_sizes:
            self.target_hidden_layers.append(nn.Linear(prev_size, hs))
            prev_size = hs

        # After final hidden => we produce mean + log_std
        self.mean_layer = nn.Linear(prev_size, output_dim)
        self.log_std_layer = nn.Linear(prev_size, output_dim)
        # Initialize log_std to a reasonable range
        nn.init.constant_(self.log_std_layer.weight, 0.0)
        nn.init.constant_(self.log_std_layer.bias, -0.5)

    def forward_source(self, x, source_layers):
        """
        Pass x through a source net (which is discrete).
        We only want the hidden-layer outputs before the final linear.
        Because the last linear is the "logits" for discrete actions.
        We'll capture 3 hidden states (since [32,64,32]).
        """
        activations = []
        current = x
        relu_count = 0
        for layer in source_layers:
            current = layer(current)
            if isinstance(layer, nn.ReLU):
                activations.append(current.clone())
                relu_count += 1
                if relu_count == 3:
                    break
        return activations  # e.g. [ (batch,32), (batch,64), (batch,32) ]

    def forward(self, x):
        # get hidden states from each source
        acrobot_hiddens = self.forward_source(x, self.acrobot_layers)  # 3 hidden layers
        cartpole_hiddens = self.forward_source(x, self.cartpole_layers)  # 3 hidden layers

        # forward pass in the new target column
        # target has 3 layers: [32, 64, 32]
        current = x

        # layer 1 => 32
        out_1 = self.target_hidden_layers[0](current)  # shape (batch,32)
        # combine with source hidden #1 => both are shape (batch,32)
        # simple sum: out_1 + acrobot_hiddens[0] + cartpole_hiddens[0]
        current = out_1 + acrobot_hiddens[0] + cartpole_hiddens[0]
        current = F.relu(current)

        # layer 2 => 64
        out_2 = self.target_hidden_layers[1](current)  # shape (batch,64)
        current = out_2 + acrobot_hiddens[1] + cartpole_hiddens[1]
        current = F.relu(current)

        # layer 3 => 32
        out_3 = self.target_hidden_layers[2](current)  # shape (batch,32)
        current = out_3 + acrobot_hiddens[2] + cartpole_hiddens[2]
        current = F.relu(current)

        # 3) final: produce mean, log_std
        mean = self.mean_layer(current)
        log_std = self.log_std_layer(current)
        log_std = torch.clamp(log_std, min=-20, max=2)
        std = torch.exp(log_std)

        return mean, std

In [5]:
def pad_state(state, target_dim=6):
    """
    Given a 1D state vector (e.g., from CartPole with shape (4,)),
    return a zero-padded vector of length `target_dim` (e.g., 6).
    """
    state = np.array(state, dtype=np.float32)  # ensure numpy float32
    if state.shape[0] == target_dim:
        return state  # already the right size
    elif state.shape[0] < target_dim:
        # pad zeros at the end
        padded = np.zeros((target_dim,), dtype=np.float32)
        padded[: state.shape[0]] = state
        return padded
    else:
        raise ValueError(f"State has more dimensions ({state.shape[0]}) than target_dim ({target_dim}).")

In [6]:
def train_progressive_mountaincar(
        env_name="MountainCarContinuous-v0",
        input_dim=6,
        output_dim=3,
        hidden_sizes_t=[32, 64, 32],
        hidden_sizes_w=[32, 64, 32],
        alpha_theta=0.0007,
        alpha_w=0.0006,
        episodes=500,
        gamma=0.99,
        entropy_coeff=0.01,
        start_noise_std=0.2,
        end_noise_std=0.05,
        noise_decay=0.99,
        log_dir="runs/progressive_acrobot_cartpole_to_mountaincar",
        model_save_path="models",
        source_acrobot=None,
        source_cartpole=None,
):
    """
    Trains a ProgressiveMountainCarNetwork on MountainCarContinuous,
    using Actor-Critic with continuous actions, while combining
    hidden layers from Acrobot and CartPole networks (both discrete).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = gymnasium.make(env_name)
    writer = SummaryWriter(log_dir=f"{log_dir}_{env_name}")

    # Build the progressive policy network (continuous)
    policy_network = ProgressiveMountainCarNetwork(
        source_acrobot=source_acrobot,
        source_cartpole=source_cartpole,
        target_hidden_sizes=hidden_sizes_t,
        input_dim=input_dim,
        output_dim=output_dim
    ).to(device)

    # Build a value network
    value_network = ValueNetwork(
        input_dim=input_dim,
        hidden_sizes=hidden_sizes_w
    ).to(device)

    # Setup optimizers
    policy_optimizer = optim.Adam(policy_network.parameters(), lr=alpha_theta)
    value_optimizer = optim.Adam(value_network.parameters(), lr=alpha_w)

    rewards_per_episode = []
    start_time = time.time()

    best_avg_reward_50 = float('-inf')

    for episode in range(episodes):
        state, _ = env.reset()
        done = False
        truncated = False
        total_reward = 0.0

        # Zero-pad the state to dimension=6
        padded_state = np.zeros(input_dim, dtype=np.float32)
        padded_state[: len(state)] = state

        current_noise_std = max(end_noise_std, start_noise_std * (noise_decay ** episode))
        I = 1.0  # discount factor for policy updates

        while not (done or truncated):
            state_tensor = torch.tensor(
                padded_state, dtype=torch.float32, device=device
            ).unsqueeze(0)  # shape (1,6)

            # forward pass: progressive network => mean, std
            mean, std = policy_network(state_tensor)
            action_dist = torch.distributions.Normal(mean, std)

            # sample action
            action = action_dist.sample()
            log_prob_action = action_dist.log_prob(action).sum(dim=-1)

            # add noise for exploration
            noise = torch.randn_like(action) * current_noise_std
            noisy_action = action + noise

            # Only use the first dim for MountainCarContinuous
            clipped_dim0 = noisy_action[0, 0].clamp(env.action_space.low[0],
                                                    env.action_space.high[0])
            final_action = clipped_dim0.cpu().numpy().reshape(-1)

            # step in env
            next_state, reward, done, truncated, _info = env.step(final_action)
            total_reward += reward

            # zero-pad next_state
            padded_next_state = np.zeros(input_dim, dtype=np.float32)
            padded_next_state[: len(next_state)] = next_state

            next_state_tensor = torch.tensor(
                padded_next_state, dtype=torch.float32, device=device
            ).unsqueeze(0)

            # value estimate
            value = value_network(state_tensor)
            with torch.no_grad():
                next_value = value_network(next_state_tensor) if not (done or truncated) \
                    else torch.zeros_like(value)

            # TD error
            delta = reward + gamma * next_value - value

            # Update value
            value_loss = delta.pow(2).mean()
            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()

            # Entropy bonus
            entropy = action_dist.entropy().sum(dim=-1).mean()

            # Policy loss
            policy_loss = - (log_prob_action * delta.detach() * I).mean()
            policy_loss -= (entropy_coeff * entropy)

            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

            I *= gamma
            padded_state = padded_next_state

        rewards_per_episode.append(total_reward)
        writer.add_scalar("Episode Reward", total_reward, episode)
        writer.add_scalar("Value Loss", value_loss.item(), episode)
        writer.add_scalar("Noise STD", current_noise_std, episode)

        avg_reward_50 = np.mean(rewards_per_episode[-50:])
        avg_reward_100 = np.mean(rewards_per_episode[-100:])
        print(f"Episode {episode + 1}: Reward={total_reward:.2f}, "
              f"Avg(50)={avg_reward_50:.2f}, Avg(100)={avg_reward_100:.2f}, "
              f"Noise={current_noise_std:.3f}")

        # Save model if we see improvement
        if avg_reward_50 > best_avg_reward_50 and episode >= 49:
            best_avg_reward_50 = avg_reward_50
            os.makedirs(model_save_path, exist_ok=True)
            torch.save(policy_network.state_dict(),
                       os.path.join(model_save_path, "progressive_mc_policy.pth"))
            torch.save(value_network.state_dict(),
                       os.path.join(model_save_path, "progressive_mc_value.pth"))
            print(f"New best model saved @ episode {episode + 1} (Avg(50)={best_avg_reward_50:.2f})")

        # (Optional) if you consider it "solved" at some threshold
        if avg_reward_50 > 10:  # e.g. arbitrary threshold
            print(f"Solved MountainCarContinuous in {episode + 1} episodes!")
            break

    train_time = time.time() - start_time
    writer.close()
    env.close()

    return policy_network, value_network, rewards_per_episode, train_time, best_avg_reward_50

In [10]:
def objective(trial):
    # Hyperparameter search space
    alpha_theta = trial.suggest_loguniform('alpha_theta', 1e-5, 1e-2)
    alpha_w = trial.suggest_loguniform('alpha_w', 1e-5, 1e-2)
    gamma = trial.suggest_uniform('gamma', 0.90, 0.999)
    entropy_coeff = trial.suggest_uniform('entropy_coeff', 0.0, 0.1)
    start_noise_std = trial.suggest_uniform('start_noise_std', 0.05, 0.3)
    end_noise_std = trial.suggest_uniform('end_noise_std', 0.25, 0.5)
    noise_decay = trial.suggest_uniform('noise_decay', 0.90, 0.999)

    episodes = 1000  # You can adjust this or make it a hyperparameter

    # Define the unique log directory for this trial
    log_dir = f"runs/progressive_trial_{trial.number}"

    # Print the trial number and hyperparameters
    hyperparams = {
        'alpha_theta': alpha_theta,
        'alpha_w': alpha_w,
        'gamma': gamma,
        'entropy_coeff': entropy_coeff,
        'start_noise_std': start_noise_std,
        'end_noise_std': end_noise_std,
        'noise_decay': noise_decay
    }
    hyperparams_str = ', '.join([f"{key}={value:.6f}" for key, value in hyperparams.items()])
    print(f"\nStarting Trial {trial.number}: {hyperparams_str}\n")

    # Load the pre-trained source networks
    source_acrobot, source_cartpole = load_source_networks(
        device=device,
        max_input_dim=max_input_dim,
        max_output_dim=max_output_dim
    )

    # Train the model with the sampled hyperparameters
    policy_net, value_net, rewards, duration, best_avg_reward_50 = train_progressive_mountaincar(
        env_name="MountainCarContinuous-v0",
        input_dim=6,
        output_dim=3,  # Only the first dimension is used
        hidden_sizes_t=[32, 64, 32],
        hidden_sizes_w=[32, 64, 32],
        alpha_theta=alpha_theta,
        alpha_w=alpha_w,
        episodes=episodes,
        gamma=gamma,
        entropy_coeff=entropy_coeff,
        start_noise_std=start_noise_std,
        end_noise_std=end_noise_std,
        noise_decay=noise_decay,
        log_dir=log_dir,
        model_save_path=f"models/cartpole_acrobot_to_mountaincar_{trial.number}",
        source_acrobot=source_acrobot,
        source_cartpole=source_cartpole,
    )

    print(f"Trial {trial.number} finished with best Avg(50) reward: {best_avg_reward_50:.2f}")

    # Optuna tries to maximize the objective, so return the best average reward
    return best_avg_reward_50

In [11]:
def run_optuna_study(n_trials=50):
    # Create a study object
    study = optuna.create_study(
        direction='maximize',  # We aim to maximize the average reward
        sampler=optuna.samplers.TPESampler(seed=42)  # Set a seed for reproducibility
    )

    # Optimize the objective function
    study.optimize(objective, n_trials=n_trials, timeout=3600)  # e.g., 50 trials or 1 hour

    # Print study statistics
    print("\nStudy Statistics:")
    print(f"  Number of finished trials: {len(study.trials)}")
    print(f"  Best trial:")
    trial = study.best_trial

    print(f"    Value: {trial.value}")
    print(f"    Params: ")
    for key, value in trial.params.items():
        print(f"      {key}: {value}")

    return study

In [None]:
study = run_optuna_study(n_trials=10)

[I 2025-01-09 11:12:20,823] A new study created in memory with name: no-name-3fcee19a-0e5b-4c09-8438-b4e9d14351c1
  alpha_theta = trial.suggest_loguniform('alpha_theta', 1e-5, 1e-2)
  alpha_w = trial.suggest_loguniform('alpha_w', 1e-5, 1e-2)
  gamma = trial.suggest_uniform('gamma', 0.90, 0.999)
  entropy_coeff = trial.suggest_uniform('entropy_coeff', 0.0, 0.1)
  start_noise_std = trial.suggest_uniform('start_noise_std', 0.05, 0.3)
  end_noise_std = trial.suggest_uniform('end_noise_std', 0.25, 0.5)
  noise_decay = trial.suggest_uniform('noise_decay', 0.90, 0.999)
  acrobot_policy.load_state_dict(torch.load(
  cartpole_policy.load_state_dict(torch.load(



Starting Trial 0: alpha_theta=0.000133, alpha_w=0.007114, gamma=0.972467, entropy_coeff=0.059866, start_noise_std=0.089005, end_noise_std=0.288999, noise_decay=0.905750

Episode 1: Reward=-83.78, Avg(50)=-83.78, Avg(100)=-83.78, Noise=0.289
Episode 2: Reward=-92.16, Avg(50)=-87.97, Avg(100)=-87.97, Noise=0.289
Episode 3: Reward=-92.22, Avg(50)=-89.39, Avg(100)=-89.39, Noise=0.289


In [17]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
# # 1) Load source networks (Acrobot & CartPole, each with [32,64,32])
# acrobot_policy, cartpole_policy = load_source_networks(
#     device=device,
#     max_input_dim=max_input_dim,
#     max_output_dim=max_output_dim
# )
#
#
#
# alpha = 0.000432860862913505
# gamma = 0.9571389501004702
# entropy_coeff = 0.0075942800574955
# start_noise_std = 0.12655319079849925
# end_noise_std = 0.2507795275354682
# noise_decay = 0.9952850823459259
# episodes = 2000
#
# # 2) Train progressive net on MountainCarContinuous
# policy_net, value_net, rewards, duration, best_avg_50 = train_progressive_mountaincar(
#     env_name="MountainCarContinuous-v0",
#     input_dim=6,
#     output_dim=3,  # we only use the first dimension in the env
#     hidden_sizes_t=[32, 64, 32],
#     hidden_sizes_w=[32, 64, 32],
#     alpha_theta=alpha,
#     alpha_w=alpha,
#     episodes=episodes,
#     gamma=gamma,
#     entropy_coeff=entropy_coeff,
#     start_noise_std=start_noise_std,
#     end_noise_std=end_noise_std,
#     noise_decay=noise_decay,
#     log_dir="runs/progressive_cartpole_acrobot_to_mountaincar",
#     model_save_path="models",
#     source_acrobot=acrobot_policy,
#     source_cartpole=cartpole_policy,
# )
#
# print(f"Training finished in {duration:.2f} seconds.")
# print(f"Best Average Reward over 50 episodes: {best_avg_50:.2f}")

  acrobot_policy.load_state_dict(torch.load(
  cartpole_policy.load_state_dict(torch.load(


Episode 1: Reward=-75.61, Avg(50)=-75.61, Avg(100)=-75.61, Noise=0.251
Episode 2: Reward=-92.25, Avg(50)=-83.93, Avg(100)=-83.93, Noise=0.251
Episode 3: Reward=-92.86, Avg(50)=-86.91, Avg(100)=-86.91, Noise=0.251
Episode 4: Reward=-93.22, Avg(50)=-88.48, Avg(100)=-88.48, Noise=0.251
Episode 5: Reward=-92.78, Avg(50)=-89.34, Avg(100)=-89.34, Noise=0.251
Episode 6: Reward=-91.78, Avg(50)=-89.75, Avg(100)=-89.75, Noise=0.251
Episode 7: Reward=68.50, Avg(50)=-67.14, Avg(100)=-67.14, Noise=0.251
Episode 8: Reward=-92.73, Avg(50)=-70.34, Avg(100)=-70.34, Noise=0.251
Episode 9: Reward=11.64, Avg(50)=-61.23, Avg(100)=-61.23, Noise=0.251
Episode 10: Reward=-92.89, Avg(50)=-64.40, Avg(100)=-64.40, Noise=0.251
Episode 11: Reward=-91.87, Avg(50)=-66.90, Avg(100)=-66.90, Noise=0.251
Episode 12: Reward=17.09, Avg(50)=-59.90, Avg(100)=-59.90, Noise=0.251
Episode 13: Reward=-94.23, Avg(50)=-62.54, Avg(100)=-62.54, Noise=0.251
Episode 14: Reward=-93.03, Avg(50)=-64.72, Avg(100)=-64.72, Noise=0.251
Epis