In [None]:
# Install all required libraries
!pip install gymnasium[classic] torch wandb numpy
!apt-get install -y ffmpeg
!pip install pyvirtualdisplay
!apt-get install -y xvfb

# We'll also need to set up the virtual display
# You can put this at the top of your main training cells
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
import wandb

# Authenticate with Weights & Biases.
# Called without an explicit key, wandb.login() reuses stored credentials or
# the WANDB_API_KEY environment variable and otherwise prompts for the key.
# Do not paste the key into the notebook source: it would be saved (and
# shared) together with the notebook.
wandb.login()

In [None]:
import random

# Smoke test for the wandb setup (requested by the assignment): open a run,
# log five random "loss" values, then close the run again.
print("Running wandb Quickstart demo...")

quickstart_settings = {
    "learning_rate": 0.01,
    "epochs": 5,
}
run = wandb.init(project="cmps458_assignment2_quicktest", config=quickstart_settings)

for epoch in range(5):
    # The value is random on purpose; only the logging round-trip matters here.
    loss = random.random()
    metrics = {"epoch": epoch, "loss": loss}
    wandb.log(metrics)
    print(f"Epoch {epoch}, Loss: {loss:.4f}")

run.finish()
print("Quickstart demo finished. Check your wandb project.")

In [None]:
import random
from collections import deque, namedtuple

# Define the 'Transition' record used to store one step of experience.
# Field order is (state, action, next_state, reward) -- note that next_state
# comes BEFORE reward, so positional construction must follow that order.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """
    Fixed-capacity experience replay buffer.

    Transitions are kept in arrival order. Once `capacity` entries are stored,
    every new entry evicts the oldest one. Batches are drawn at random,
    without replacement.
    """

    def __init__(self, capacity):
        # A bounded deque silently drops its oldest element on overflow.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a Transition from (state, action, next_state, reward) and store it."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return a list of `batch_size` distinct stored transitions picked at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """
    Feed-forward Q-network.

    Maps a batch of state vectors to one Q-value per action:
    input  -> tensor whose last dimension is `n_observations`
    output -> tensor whose last dimension is `n_actions`
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 128
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.layer1 = nn.Linear(n_observations, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Run two ReLU-activated hidden layers followed by a linear output layer."""
        hidden = self.layer1(x)
        hidden = F.relu(hidden)
        hidden = F.relu(self.layer2(hidden))
        # No activation on the output: Q-values are unbounded real numbers.
        q_values = self.layer3(hidden)
        return q_values

In [None]:
import math

# Place tensors and models on CUDA when PyTorch reports it as available;
# otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def select_action(state, policy_net, n_actions, epsilon):
    """
    Epsilon-greedy action selection.

    One uniform draw from [0, 1) is compared with `epsilon`:
    - draw > epsilon  -> exploit: index of the largest Q-value from `policy_net(state)`
    - otherwise       -> explore: a uniformly random action index in [0, n_actions)

    Returns a tensor of shape (1, 1) holding the chosen action index.
    """
    draw = random.random()
    explore = not (draw > epsilon)

    if explore:
        random_action = random.randrange(n_actions)
        return torch.tensor([[random_action]], device=device, dtype=torch.long)

    # Greedy branch: no gradients are needed just to pick an action.
    with torch.no_grad():
        q_values = policy_net(state)
        # max over dim 1 yields (values, indices); the index is the action.
        best_action = q_values.max(1)[1]
        return best_action.view(1, 1)

In [None]:
import gymnasium as gym
import numpy as np

class DiscretizeActionWrapper(gym.ActionWrapper):
    """
    Expose a discrete action space on top of an environment whose action
    space is a one-dimensional continuous range (written for Pendulum-v1).

    The `num_discrete_actions` indices are mapped to values evenly spaced
    between the wrapped action space's `low[0]` and `high[0]` (inclusive).
    For example, if the range is [-2.0, 2.0], 3 actions map to
    {-2.0, 0.0, 2.0} and 5 actions to {-2.0, -1.0, 0.0, 1.0, 2.0}.

    Requires `numpy` to be imported as `np` at module level.
    """

    def __init__(self, env, num_discrete_actions=3):
        super().__init__(env)
        self.num_actions = num_discrete_actions
        # Advertise a discrete action space to the agent.
        self.action_space = gym.spaces.Discrete(self.num_actions)
        # Lookup table: discrete index -> continuous value. `self.env` is the
        # wrapped environment, so its action_space is still the continuous one.
        self.action_map = np.linspace(
            self.env.action_space.low[0],
            self.env.action_space.high[0],
            self.num_actions,
        )

    def action(self, action_index):
        """Translate a discrete action index into a 1-element float32 action array."""
        continuous_value = self.action_map[action_index]
        return np.array([continuous_value], dtype=np.float32)

In [None]:
def optimize_model(is_ddqn, policy_net, target_net, optimizer, memory, batch_size, gamma, loss_fn):
    """
    Perform one gradient step on `policy_net` from a replayed mini-batch.

    Args:
        is_ddqn: if True, use the Double-DQN target (action chosen by
            `policy_net`, evaluated by `target_net`); otherwise the standard
            DQN target (max over `target_net` outputs).
        policy_net: network being trained; called with the batched states.
        target_net: network used to evaluate next states.
        optimizer: optimizer over `policy_net`'s parameters.
        memory: object supporting `len()` and `.sample(batch_size)`, returning
            items with `.state`, `.action`, `.next_state`, `.reward`
            attributes. `next_state` is None for terminal transitions.
        batch_size: number of transitions per update.
        gamma: discount factor applied to the next-state value.
        loss_fn: callable `(prediction, target) -> scalar loss`.

    Returns:
        None. Does nothing if `memory` holds fewer than `batch_size` items.
    """
    if len(memory) < batch_size:
        return  # Not enough experience collected yet.

    transitions = memory.sample(batch_size)

    # Build the batch tensors field by field (each stored tensor has a leading
    # batch dimension of 1, so concatenation stacks them).
    state_batch = torch.cat([t.state for t in transitions])
    action_batch = torch.cat([t.action for t in transitions])
    reward_batch = torch.cat([t.reward for t in transitions])
    # Use the device the data actually lives on rather than a module global.
    batch_device = state_batch.device

    # 1. Q(s, a) for the actions that were actually taken.
    Q_current = policy_net(state_batch).gather(1, action_batch)

    # 2. Value of the next state; stays 0 for terminal transitions.
    Q_target_next = torch.zeros(batch_size, device=batch_device, dtype=Q_current.dtype)

    non_final_next = [t.next_state for t in transitions if t.next_state is not None]
    # torch.cat([]) raises, so skip the bootstrap step entirely when every
    # sampled transition is terminal.
    if non_final_next:
        non_final_mask = torch.tensor(
            [t.next_state is not None for t in transitions],
            device=batch_device,
            dtype=torch.bool,
        )
        non_final_next_states = torch.cat(non_final_next)
        with torch.no_grad():
            if is_ddqn:
                # Double DQN: pick a* with the policy net, score it with the target net.
                a_star = policy_net(non_final_next_states).argmax(dim=1).unsqueeze(1)
                Q_target_next[non_final_mask] = (
                    target_net(non_final_next_states).gather(1, a_star).squeeze(1)
                )
            else:
                # Standard DQN: max over the target net's Q-values.
                Q_target_next[non_final_mask] = target_net(non_final_next_states).max(1)[0]

    # TD target: r + gamma * V(s'). Rewards are cast to the network dtype so a
    # float64 reward (e.g. one built from a NumPy scalar) cannot promote the
    # target to double while the prediction is float32.
    Q_target = (Q_target_next * gamma) + reward_batch.to(Q_current.dtype)

    # 3. Loss between prediction and target, both shaped (batch_size, 1).
    loss = loss_fn(Q_current, Q_target.unsqueeze(1))

    # 4. Backpropagate, clip each gradient element to [-100, 100], and step.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [None]:
import time
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from collections import namedtuple
import itertools # Used for counting steps

# --- Main Training Function (MODIFIED to accept all HPs) ---
def run_experiment(
    env_name,
    is_ddqn,
    num_episodes=1000,
    # --- Hyperparameters to tune ---
    batch_size=128,
    gamma=0.99,
    eps_start=0.9,
    eps_end=0.05,
    eps_decay=1000,
    tau=0.005,               # Soft update param
    lr=1e-3,
    memory_size=10000,
    # --- Wandb tracking ---
    wandb_run_name=None,
    wandb_group=None
):
    """
    Train one DQN / Double-DQN agent and log the run to Weights & Biases.

    Args:
        env_name: Gymnasium environment id. "Pendulum-v1" is wrapped in
            DiscretizeActionWrapper with 5 discrete actions.
        is_ddqn: True for the Double-DQN target, False for standard DQN.
        num_episodes: number of training episodes.
        batch_size: replay mini-batch size.
        gamma: discount factor.
        eps_start, eps_end, eps_decay: exploration schedule,
            epsilon = eps_end + (eps_start - eps_end) * exp(-steps / eps_decay).
        tau: soft-update rate for the target network.
        lr: AdamW learning rate.
        memory_size: replay buffer capacity.
        wandb_run_name: run name; defaults to "<env>_<model>_<unix time>".
            Also used as the stem of the saved weights file.
        wandb_group: optional wandb group for the run.

    Returns:
        (model_path, config): path of the saved policy weights
        ("<wandb_run_name>.pth") and the config dict that was logged.

    The environment is closed and the wandb run finished even if training
    raises; the exception is then propagated.
    """
    # 1. Initialize wandb
    model_type = "DDQN" if is_ddqn else "DQN"

    if wandb_run_name is None:
        wandb_run_name = f"{env_name}_{model_type}_{int(time.time())}"

    # All config parameters are logged to wandb
    config = {
        "env_name": env_name, "model_type": model_type, "num_episodes": num_episodes,
        "batch_size": batch_size, "gamma": gamma, "eps_start": eps_start,
        "eps_end": eps_end, "eps_decay": eps_decay, "tau": tau, "lr": lr,
        "memory_size": memory_size,
    }

    run = wandb.init(
        project="cmps458_final_assignment2",
        name=wandb_run_name,
        group=wandb_group,  # This groups runs in the wandb dashboard
        config=config
    )

    env = None
    try:
        # 2. Setup Environment
        env = gym.make(env_name)
        if env_name == "Pendulum-v1":
            env = DiscretizeActionWrapper(env, num_discrete_actions=5)

        n_actions = env.action_space.n
        state, info = env.reset()
        n_observations = len(state)

        # 3. Initialize Models and Optimizer
        policy_net = DQN(n_observations, n_actions).to(device)
        target_net = DQN(n_observations, n_actions).to(device)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        optimizer = torch.optim.AdamW(policy_net.parameters(), lr=lr, amsgrad=True)
        memory = ReplayMemory(memory_size)
        loss_fn = nn.SmoothL1Loss()

        print(f"--- Starting Training for {wandb_run_name} (Group: {wandb_group}) ---")
        print(f"Config: {config}")

        # 4. Training Loop
        steps_done = 0
        start_time = time.time()

        for i_episode in range(num_episodes):
            state, info = env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

            episode_duration = 0
            episode_reward = 0
            for t in itertools.count():
                # Exponentially decaying exploration rate, driven by total steps.
                epsilon = eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)
                action = select_action(state, policy_net, n_actions, epsilon)
                steps_done += 1

                observation, reward, terminated, truncated, _ = env.step(action.item())
                episode_duration += 1
                episode_reward += float(reward)
                # Force float32: some environments return NumPy float64
                # rewards, which would otherwise yield a float64 tensor and a
                # dtype mismatch against the float32 network output.
                reward = torch.tensor([reward], dtype=torch.float32, device=device)
                done = terminated or truncated

                # Only true termination removes the bootstrap target; a
                # time-limit truncation still has a valid next state.
                if terminated:
                    next_state = None
                else:
                    next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

                memory.push(state, action, next_state, reward)
                state = next_state

                optimize_model(is_ddqn, policy_net, target_net, optimizer, memory, batch_size, gamma, loss_fn)
                _soft_update_target(policy_net, target_net, tau)

                if done:
                    break

            # Log episode results to wandb
            wandb.log({"episode": i_episode, "duration": episode_duration, "episode_reward": episode_reward, "epsilon": epsilon})

        training_duration_seconds = time.time() - start_time

        print(f"--- Training Complete for {wandb_run_name} ---")
        print(f"Total training time: {training_duration_seconds:.2f} seconds")

        model_path = f"{wandb_run_name}.pth"
        torch.save(policy_net.state_dict(), model_path)

        # Log the training time before the run is finished.
        wandb.log({"training_time_seconds": training_duration_seconds})
    finally:
        # Release the environment and close the wandb run on success and on error.
        try:
            if env is not None:
                env.close()
        finally:
            run.finish()

    return model_path, config


def _soft_update_target(policy_net, target_net, tau):
    """Blend policy weights into the target net: target <- tau * policy + (1 - tau) * target."""
    target_net_state_dict = target_net.state_dict()
    policy_net_state_dict = policy_net.state_dict()
    for key in policy_net_state_dict:
        target_net_state_dict[key] = policy_net_state_dict[key] * tau + target_net_state_dict[key] * (1 - tau)
    target_net.load_state_dict(target_net_state_dict)


# --- Test & Record Function (accepts a wandb run name and group) ---
def test_and_record(
    env_name,
    model_type,
    model_path,
    num_tests=100,
    wandb_run_name=None,
    wandb_group=None,
    config=None
):
    """
    Evaluate a trained agent greedily and log the results to Weights & Biases.

    Args:
        env_name: Gymnasium environment id. "Pendulum-v1" is wrapped in
            DiscretizeActionWrapper with 5 discrete actions.
        model_type: label used in console output and as the video file prefix.
        model_path: path to the saved `state_dict` of a DQN model.
        num_tests: number of evaluation episodes to run.
        wandb_run_name: run name; defaults to "test_<model_type>". Also used
            as the sub-folder of "videos/" for the recording.
        wandb_group: optional wandb group for the run.
        config: config dict logged with the run; a minimal one is built if None.

    The first evaluation episode is recorded to "videos/<wandb_run_name>".
    Per-episode duration and reward and their averages are logged to wandb.
    The environment is closed (which finalizes the video) and the wandb run
    finished even if evaluation raises.
    """
    print(f"\n--- Testing {model_type} on {env_name} ---")

    if wandb_run_name is None:
        wandb_run_name = f"test_{model_type}"

    if config is None:
        config = {"env_name": env_name, "model_type": model_type, "num_tests": num_tests}

    run = wandb.init(
        project="cmps458_final_assignment2_tests",
        name=wandb_run_name,
        group=wandb_group,
        config=config,
        reinit=True  # Allow repeated init when called in a loop
    )

    video_folder = f"videos/{wandb_run_name}"
    env = None
    try:
        # 2. Setup Environment
        env = gym.make(env_name, render_mode="rgb_array")
        if env_name == "Pendulum-v1":
            env = DiscretizeActionWrapper(env, num_discrete_actions=5)

        # 3. Wrap for Video Recording (episode 0 only)
        env = RecordVideo(
            env, video_folder=video_folder,
            episode_trigger=lambda e: e == 0,
            name_prefix=f"{model_type}"
        )

        # 4. Load Model
        n_actions = env.action_space.n
        # Read the observation size from the space instead of calling
        # env.reset(): an extra reset on the recording wrapper can consume
        # "episode 0" (depending on the gymnasium version), leaving a video
        # that contains only the reset frame.
        n_observations = env.observation_space.shape[0]

        model = DQN(n_observations, n_actions).to(device)
        # map_location lets weights saved on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        # 5. Run the evaluation episodes
        test_durations = []
        test_rewards = []
        for i_episode in range(num_tests):
            state, info = env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

            episode_duration = 0
            episode_reward = 0
            for t in itertools.count():
                action = select_action(state, model, n_actions, epsilon=0.0)  # Always greedy
                observation, reward, terminated, truncated, _ = env.step(action.item())
                episode_duration += 1
                episode_reward += reward
                if terminated or truncated:
                    break
                state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

            wandb.log({"test_episode": i_episode, "test_duration": episode_duration, "test_reward": episode_reward})
            test_durations.append(episode_duration)
            test_rewards.append(episode_reward)

        print(f"Testing complete. Video saved in '{video_folder}'")
        # Guard against num_tests == 0, which would otherwise divide by zero.
        episodes_run = len(test_durations)
        avg_duration = sum(test_durations) / episodes_run if episodes_run else 0.0
        avg_reward = sum(test_rewards) / episodes_run if episodes_run else 0.0
        print(f"Average duration: {avg_duration:.2f}, Average reward: {avg_reward:.2f}")

        wandb.log({"avg_test_duration": avg_duration, "avg_test_reward": avg_reward})
    finally:
        # Closing the environment finalizes the video recording; always do it,
        # and always finish the wandb run afterwards.
        try:
            if env is not None:
                env.close()
        finally:
            run.finish()

In [None]:
# --- DDQN hyperparameter sweep over all environments ---
#
# Each sweep varies ONE hyperparameter and leaves the others at the defaults
# of run_experiment():
#   lr          learning rate of the AdamW optimizer
#   gamma       discount factor
#   eps_decay   rate of the exponential epsilon decay (higher = slower decay)
#   memory_size capacity of the replay buffer
#   batch_size  number of transitions sampled from the replay buffer per update

lr_tests = [1e-3, 1e-2, 3e-4, 1e-5]
gamma_tests = [0.99, 0.9, 0.5, 0.1]
eps_decay_tests = [1000, 500, 2500, 5000]  # Fast vs. patient exploration
memory_tests = [10000, 500, 50000]
batch_size_tests = [128, 64, 256]  # Noisy/fast vs. stable/slow

environments = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0", "Pendulum-v1"]

# One row per sweep, in execution order:
# (run_experiment keyword, wandb group label, run-name tag, values to try)
sweep_plan = [
    ("lr", "LR", "lr", lr_tests),
    ("gamma", "Gamma", "gamma", gamma_tests),
    ("eps_decay", "EpsDecay", "eps", eps_decay_tests),
    ("memory_size", "Memory", "mem", memory_tests),
    ("batch_size", "BatchSize", "batch", batch_size_tests),
]


def _is_skipped(environment, hp_name, value):
    """
    Return True for configurations that are deliberately not run.

    CartPole-v1 skips every lr and gamma value and eps_decay 1000 / 500
    (presumably because those runs were completed earlier -- TODO confirm).
    """
    if environment != "CartPole-v1":
        return False
    if hp_name in ("lr", "gamma"):
        return True
    return hp_name == "eps_decay" and value in (1000, 500)


def run_sweep_case(environment, hp_name, group_label, tag, value):
    """Train one DDQN agent with `hp_name=value`, then evaluate and record it."""
    group_name = f"{environment}_{group_label}_Test"
    run_name = f"{environment}_DDQN_{tag}-{value}"

    model_path, config = run_experiment(
        env_name=environment, is_ddqn=True,
        wandb_run_name=run_name, wandb_group=group_name,
        **{hp_name: value}  # <-- the single variable under test
    )
    # The run name doubles as the model_type label / video prefix.
    test_and_record(
        environment, run_name, model_path,
        wandb_run_name=f"test_{run_name}", wandb_group=group_name, config=config
    )


for environment in environments:
    for hp_name, group_label, tag, values in sweep_plan:
        for value in values:
            if _is_skipped(environment, hp_name, value):
                continue
            run_sweep_case(environment, hp_name, group_label, tag, value)