# Deep Reinforcement Learning Laboratory

In this laboratory session we will work on getting more advanced versions of Deep Reinforcement Learning algorithms up and running. Deep Reinforcement Learning is **hard**, and getting agents to stably train can be frustrating and requires quite a bit of subtlety in analysis of intermediate results. We will start by refactoring (a bit) my implementation of `REINFORCE` on the [Cartpole environment](https://gymnasium.farama.org/environments/classic_control/cart_pole/).

## Exercise 1: Improving my `REINFORCE` Implementation (warm up)

In this exercise we will refactor a bit and improve some aspects of my `REINFORCE` implementation.

**First Things First**: Spend some time playing with the environment to make sure you understand how it works.

In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.distributions import Categorical
import wandb
from collections import deque

# Instantiate a rendering and a non-rendering environment.
# `env_render` is created with render_mode='human' (interactive display);
# `env` has no render mode and is the one inspected in the next cell.
env_render = gym.make('CartPole-v1', render_mode='human')
env = gym.make('CartPole-v1')

In [2]:
# Reset the environment and print the initial observation together with the
# declared observation space (bounds, shape and dtype).
observation, info = env.reset()
print("Observation after reset:", observation)
print("Observation space:", env.observation_space)

Observation after reset: [-0.00895654 -0.00242876 -0.04069757  0.03738886]
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)


**Next Things Next**: Now get your `REINFORCE` implementation working on the environment. You can import my (probably buggy and definitely inefficient) implementation here. Or even better, refactor an implementation into a separate package from which you can `import` the stuff you need here.

In [3]:
def select_action(env, obs, policy):
    """Sample an action from pi(a | obs) and report its log-probability.

    Args:
        env: Environment handle. Accepted for interface compatibility; it is
            not used inside this function.
        obs: Observation that is passed straight to `policy`.
        policy: Callable mapping `obs` to a vector of action probabilities.

    Returns:
        A `(action, log_prob)` pair: the sampled action as a Python scalar and
        its log-probability reshaped to a one-element tensor (needed for the
        policy-gradient loss).
    """
    action_probs = policy(obs)
    distribution = Categorical(action_probs)
    sampled = distribution.sample()
    return (sampled.item(), distribution.log_prob(sampled).reshape(1))

def compute_returns(rewards, gamma):
    """Compute the discounted reward-to-go G_t for every step of an episode.

    G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...

    The previous implementation discounted every reward from the start of the
    episode, which yields gamma^(t+1) * G_t instead of G_t. That extra,
    time-dependent factor does not appear in the update equations used in this
    notebook and makes the values unsuitable as regression targets for a
    state-value baseline.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        gamma: Discount factor.

    Returns:
        A contiguous float64 numpy array with one return per step (empty when
        `rewards` is empty).
    """
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    # Walk backwards so each return reuses the one that follows it.
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def run_episode(env, policy, maxlen=500):
    """Roll out a single episode with `policy`, for at most `maxlen` steps.

    Args:
        env: Environment exposing `reset()` and `step(action)`.
        policy: Policy handed to `select_action` at every step.
        maxlen: Upper bound on the number of steps taken.

    Returns:
        `(observations, actions, log_probs, rewards)` where `observations` is a
        list of tensors, `actions` and `rewards` are lists, and `log_probs` is
        the concatenation of the per-step log-probability tensors.
    """
    obs_history, action_history, logp_history, reward_history = [], [], [], []

    # Start a fresh episode.
    current, _ = env.reset()
    for _step in range(maxlen):
        # Convert the raw observation and let the policy pick an action.
        state = torch.tensor(current)
        chosen, logp = select_action(env, state, policy)
        obs_history.append(state)
        action_history.append(chosen)
        logp_history.append(logp)

        # Apply the action; stop as soon as the episode terminates or is truncated.
        current, reward, terminated, truncated, _ = env.step(chosen)
        reward_history.append(reward)
        if terminated or truncated:
            break

    return (obs_history, action_history, torch.cat(logp_history), reward_history)

In [4]:
class PolicyNet(nn.Module):
    """Single-hidden-layer policy network that outputs action probabilities.

    Layer sizes are read from `env.observation_space.shape[0]` (input) and
    `env.action_space.n` (output); the hidden layer has 128 units.
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.fc1 = nn.Linear(n_inputs, 128)
        self.fc2 = nn.Linear(128, n_actions)

    def forward(self, s):
        """Return softmax probabilities over actions (last dimension) for `s`."""
        hidden = torch.relu(self.fc1(s))
        return torch.softmax(self.fc2(hidden), dim=-1)

In [5]:
# The REINFORCE algorithm implementation.
def reinforce(policy, env, gamma=0.99, num_episodes=10, eval_interval=100, eval_episodes=10, use_standardize_baseline=True, policy_lr=1e-2, checkpoint_path="CartPole_REINFORCE_no_baseline.pt", solved_reward=500):
    """Train `policy` on `env` with REINFORCE, evaluating it periodically.

    Args:
        policy: Policy network; optimized in place with Adam.
        env: Environment used for both training and evaluation episodes.
        gamma: Discount factor passed to `compute_returns`.
        num_episodes: Maximum number of training episodes.
        eval_interval: Run an evaluation every `eval_interval` training episodes.
        eval_episodes: Number of episodes averaged in each evaluation.
        use_standardize_baseline: If true, standardize the returns of each
            episode (subtract the mean, divide by the standard deviation).
        policy_lr: Learning rate of the policy optimizer.
        checkpoint_path: File the policy `state_dict` is saved to once an
            evaluation reaches `solved_reward`.
        solved_reward: Average evaluation reward at which training stops early.

    Returns:
        `(eval_avg_rewards, eval_avg_lengths)`: one entry per evaluation round.

    Side effects: logs per-episode and evaluation metrics to wandb, prints
    progress, may write `checkpoint_path`, and leaves `policy` in eval mode.
    """
    opt = torch.optim.Adam(policy.parameters(), lr=policy_lr)

    # Rolling window of the last 100 training episode scores.
    scores_on_100_episodes = deque(maxlen=100)
    eval_avg_rewards = []
    eval_avg_lengths = []

    # The main training loop.
    policy.train()
    for episode in range(num_episodes):
        # Run an episode of the environment, collect everything needed for policy update.
        (_, _, log_probs, rewards) = run_episode(env, policy)

        # Compute the discounted reward for every step of the episode.
        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)

        episode_reward = sum(rewards)
        scores_on_100_episodes.append(episode_reward)
        average_score = np.mean(scores_on_100_episodes)

        # Standardize returns if the option is enabled.
        if use_standardize_baseline:
            # torch's std() is NaN for a single element and can be 0 for
            # constant returns; guard both so one short episode cannot inject
            # NaN/inf gradients into the policy.
            std = returns.std() if returns.numel() > 1 else torch.zeros(())
            returns = (returns - returns.mean()) / (std + 1e-8)

        # Log episode reward and average score over last 100 episodes
        wandb.log({'episode_reward': episode_reward, 'average_score_100': average_score})

        # Make an optimization step
        opt.zero_grad()
        loss = (-log_probs * returns).mean()
        loss.backward()
        opt.step()

        print(f'\rEpisode {episode}\tAverage Score: {average_score:.2f}', end="")

        # Perform evaluation periodically
        if (episode + 1) % eval_interval == 0:
            policy.eval()
            avg_reward = 0
            avg_length = 0
            # No gradients are needed while evaluating; skip graph construction.
            with torch.no_grad():
                for _ in range(eval_episodes):
                    (_, _, _, eval_rewards) = run_episode(env, policy)
                    avg_reward += sum(eval_rewards)
                    avg_length += len(eval_rewards)
            avg_reward /= eval_episodes
            avg_length /= eval_episodes
            eval_avg_rewards.append(avg_reward)
            eval_avg_lengths.append(avg_length)
            # Log evaluation metrics
            wandb.log({'eval_avg_reward': avg_reward, 'eval_avg_length': avg_length})
            print(f'\nEpisode {episode + 1}: Evaluation average reward: {avg_reward}')
            print(f'Episode {episode + 1}: Evaluation average length: {avg_length}')
            if avg_reward >= solved_reward:
                # Solved: keep the weights and stop training early.
                torch.save(policy.state_dict(), checkpoint_path)
                break
            policy.train()

    # Return the average evaluation rewards and average evaluation lengths.
    policy.eval()
    return eval_avg_rewards, eval_avg_lengths

In [6]:
def reinforce_train():
    """Run one wandb trial of plain REINFORCE on CartPole-v1.

    Hyperparameters (gamma, num_episodes, eval_interval, eval_episodes,
    use_standardize_baseline, policy_lr) are read from `wandb.config`, so this
    is meant to be invoked by a wandb sweep agent.
    """
    wandb.init(group="cartpole_REINFORCE_No_Baseline", name="REINFORCE_No_Baseline")
    config = wandb.config

    seed = 2000
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = gym.make('CartPole-v1', render_mode='rgb_array')
    try:
        env.reset(seed=seed)

        policy = PolicyNet(env)

        reinforce(
            policy=policy,
            env=env,
            gamma=config.gamma,
            num_episodes=config.num_episodes,
            eval_interval=config.eval_interval,
            eval_episodes=config.eval_episodes,
            use_standardize_baseline=config.use_standardize_baseline,
            policy_lr=config.policy_lr
        )
    finally:
        # Release the environments even if training raises.
        env.close()
        env_render.close()

    wandb.finish()


In [None]:
# Random-search sweep for plain REINFORCE: gamma and the policy learning rate
# are sampled from the given ranges, every other setting is held fixed.
sweep_config = {
    'method': 'random',
    # Sweep objective; 'eval_avg_reward' is the key logged by reinforce() after each evaluation.
    'metric': {'name': 'eval_avg_reward', 'goal': 'maximize'},
    'parameters': {
        'gamma': {'min': 0.98, 'max': 0.999},
        'policy_lr': {'min': 1e-4, 'max': 1e-2},
        'num_episodes': {'value': 2000},
        # Standardization is switched off for this baseline-free run.
        'use_standardize_baseline': {'value': False},
        'eval_interval': {'value': 100},
        'eval_episodes': {'value': 10},
    }
}

# Register the sweep and run 5 trials with reinforce_train.
sweep_id = wandb.sweep(sweep_config, project="Laboratory-2")
wandb.agent(sweep_id, function=reinforce_train, count=5)

In [12]:
# Replay the saved policy for 5 episodes while recording a video of each one.
env = gym.make("CartPole-v1", render_mode="rgb_array")
# episode_trigger always returns True, so every episode is recorded.
env = gym.wrappers.RecordVideo(env, video_folder="./cartpole_REINFORCE_no_baseline_videos", episode_trigger=lambda x: True)
policy = PolicyNet(env)
# Checkpoint written by reinforce() once an evaluation reached the solve threshold.
policy.load_state_dict(torch.load("CartPole_REINFORCE_no_baseline.pt"))
policy.eval()
for _ in range(5):
    # Actions are still sampled from the policy distribution (see select_action).
    (eval_observations, eval_actions, eval_log_probs, eval_rewards) = run_episode(env, policy)
    print(f'Evaluation episode reward: {sum(eval_rewards)}')
env.close()



  logger.warn(


Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0


**Last Things Last**: My implementation does a **super crappy** job of evaluating the agent performance during training. The running average is not a very good metric. Modify my implementation so that every $N$ iterations (make $N$ an argument to the training function) the agent is run for $M$ episodes in the environment. Collect and return: (1) The average **total** reward received over the $M$ iterations; and (2) the average episode length. Analyze the performance of your agents with these new metrics.

N.B.: for the "vanilla" CartPole environment the total reward and the length of the episode are the same.

-----
## Exercise 2: `REINFORCE` with a Value Baseline (warm up)

In this exercise we will augment my implementation (or your own) of `REINFORCE` to subtract a baseline from the target in the update equation in order to stabilize (and hopefully speed-up) convergence. For now we will stick to the Cartpole environment.



**First Things First**: Recall from the slides on Deep Reinforcement Learning that we can **subtract** any function that doesn't depend on the current action from the q-value without changing the (maximum of our) objective function $J$:  

$$ \nabla J(\boldsymbol{\theta}) \propto \sum_{s} \mu(s) \sum_a \left( q_{\pi}(s, a) - b(s) \right) \nabla \pi(a \mid s, \boldsymbol{\theta}) $$

In `REINFORCE` this means we can subtract from our target $G_t$:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha (G_t - b(S_t)) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

Since we are only interested in the **maximum** of our objective, we can also **rescale** our target by any function that also doesn't depend on the action. A **simple baseline** which is even independent of the state -- that is, it is **constant** for each episode -- is to just **standardize returns within the episode**. So, we **subtract** the average return and **divide** by the standard deviation of the returns:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha \left(\frac{G_t - \bar{G}}{\sigma_G}\right) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

This baseline is **already** implemented in my implementation of `REINFORCE`. Experiment with and without this standardization baseline and compare the performance. We are going to do something more interesting.

In [8]:
def reinforce_standardized_baseline_train():
    """Run one wandb trial of REINFORCE with standardized returns on CartPole-v1.

    Hyperparameters (gamma, num_episodes, eval_interval, eval_episodes,
    use_standardize_baseline, policy_lr) are read from `wandb.config`, so this
    is meant to be invoked by a wandb sweep agent.
    """
    wandb.init(group="Standardized_Baseline_cartpole_REINFORCE", name="REINFORCE_Standardized_Baseline")
    config = wandb.config

    seed = 2000
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = gym.make('CartPole-v1', render_mode='rgb_array')
    try:
        env.reset(seed=seed)

        policy = PolicyNet(env)

        reinforce(
            policy=policy,
            env=env,
            gamma=config.gamma,
            num_episodes=config.num_episodes,
            eval_interval=config.eval_interval,
            eval_episodes=config.eval_episodes,
            use_standardize_baseline=config.use_standardize_baseline,
            policy_lr=config.policy_lr
        )
    finally:
        # Release the environments even if training raises.
        env.close()
        env_render.close()

    wandb.finish()


In [None]:
# Random-search sweep for REINFORCE with standardized returns: gamma and the
# policy learning rate are sampled, every other setting is held fixed.
sweep_config = {
    'method': 'random',
    # Sweep objective; 'eval_avg_reward' is the key logged by reinforce() after each evaluation.
    'metric': {'name': 'eval_avg_reward', 'goal': 'maximize'},
    'parameters': {
        'gamma': {'min': 0.98, 'max': 0.999},
        'policy_lr': {'min': 1e-4, 'max': 1e-2},
        'num_episodes': {'value': 2000},
        'use_standardize_baseline': {'value': True}, # Use standardized returns as baseline
        'eval_interval': {'value': 100},
        'eval_episodes': {'value': 10},
    }
}

# Register the sweep and run 5 trials with reinforce_standardized_baseline_train.
sweep_id = wandb.sweep(sweep_config, project="Laboratory-2")
wandb.agent(sweep_id, function=reinforce_standardized_baseline_train, count=5)

In [5]:
# Replay a saved policy for 5 episodes while recording a video of each one.
# NOTE(review): this cell sits in the standardized-baseline section but reuses
# the "no_baseline" video folder and checkpoint name from the earlier playback
# cell. reinforce() saves to that file name regardless of
# use_standardize_baseline, so this presumably loads whichever run saved last
# -- confirm that this is the intended checkpoint.
env = gym.make("CartPole-v1", render_mode="rgb_array")
# episode_trigger always returns True, so every episode is recorded.
env = gym.wrappers.RecordVideo(env, video_folder="./cartpole_REINFORCE_no_baseline_videos", episode_trigger=lambda x: True)
policy = PolicyNet(env)
policy.load_state_dict(torch.load("CartPole_REINFORCE_no_baseline.pt"))
policy.eval()
for _ in range(5):
    # Actions are still sampled from the policy distribution (see select_action).
    (eval_observations, eval_actions, eval_log_probs, eval_rewards) = run_episode(env, policy)
    print(f'Evaluation episode reward: {sum(eval_rewards)}')
env.close()

  logger.warn(
  from pkg_resources import resource_stream, resource_exists


Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0
Evaluation episode reward: 500.0


**The Real Exercise**: Standard practice is to use the state-value function $v(s)$ as a baseline. This is intuitively appealing -- we are more interested in updating our policy for returns that the current **value** estimate predicts poorly. Our new update becomes:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha (G_t - \tilde{v}(S_t \mid \mathbf{w})) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

where $\tilde{v}(s \mid \mathbf{w})$ is a **deep neural network** with parameters $w$ that estimates $v_\pi(s)$. What neural network? Typically, we use the **same** network architecture as that of the Policy.

**Your Task**: Modify your implementation to fit a second, baseline network to estimate the value function and use it as **baseline**.

In [5]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.distributions import Categorical
import wandb


class ValueNet(nn.Module):
    """Single-hidden-layer network that outputs one scalar value per observation.

    The input size is read from `env.observation_space.shape[0]`; the hidden
    layer has 128 units and the output layer has a single unit.
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        self.fc1 = nn.Linear(n_inputs, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, s):
        """Return the unbounded value estimate for `s` (trailing dimension of size 1)."""
        hidden = torch.relu(self.fc1(s))
        return self.fc2(hidden)


In [None]:
# The REINFORCE algorithm with baseline implementation, it is possible to choose whether to use the value baseline or standardized returns.
def reinforce_with_baseline(policy, value_net, env, gamma=0.99, num_episodes=10, eval_interval=100, eval_episodes=10, use_value_baseline=True, policy_lr=1e-2, value_lr=1e-2, checkpoint_path="best_policy_with_baseline.pt", solved_reward=500):
    """Train `policy` with REINFORCE using a learned value baseline.

    Args:
        policy: Policy network; optimized in place with Adam.
        value_net: Value network fitted to the episode returns with an MSE
            loss. It is updated every episode, even when it is not used as
            the baseline.
        env: Environment used for both training and evaluation episodes.
        gamma: Discount factor passed to `compute_returns`.
        num_episodes: Maximum number of training episodes.
        eval_interval: Run an evaluation every `eval_interval` training episodes.
        eval_episodes: Number of episodes averaged in each evaluation.
        use_value_baseline: If true, advantages are `returns - value_net(obs)`;
            otherwise the returns are standardized within the episode.
        policy_lr: Learning rate of the policy optimizer.
        value_lr: Learning rate of the value-network optimizer.
        checkpoint_path: File the policy `state_dict` is saved to once an
            evaluation reaches `solved_reward`.
        solved_reward: Average evaluation reward at which training stops early.

    Returns:
        `(eval_avg_rewards, eval_avg_lengths)`: one entry per evaluation round.

    Side effects: logs metrics to wandb, prints evaluation results, may write
    `checkpoint_path`, and leaves `policy` in eval mode.
    """
    opt_policy = torch.optim.Adam(policy.parameters(), lr=policy_lr)
    opt_value = torch.optim.Adam(value_net.parameters(), lr=value_lr)
    value_criterion = nn.MSELoss()

    # Rolling window of the last 100 training episode scores.
    scores_on_100_episodes = deque(maxlen=100)
    eval_avg_rewards = []
    eval_avg_lengths = []

    # The main training loop.
    policy.train()
    for episode in range(num_episodes):

        (observations, _, log_probs, rewards) = run_episode(env, policy)

        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)
        observations_tensor = torch.stack(observations)

        episode_reward = sum(rewards)
        scores_on_100_episodes.append(episode_reward)
        average_score = np.mean(scores_on_100_episodes)

        # Compute value loss and update value network.
        # reshape(-1) keeps one prediction per step even for a one-step
        # episode, where a bare squeeze() would produce a 0-d tensor that no
        # longer matches the shape of `returns`.
        predicted_values = value_net(observations_tensor).reshape(-1)
        value_loss = value_criterion(predicted_values, returns)
        opt_value.zero_grad()
        value_loss.backward()
        opt_value.step()

        # Compute advantages, if value baseline is not used, use standardized returns
        if use_value_baseline:
            # Detach so the policy loss does not backpropagate into value_net.
            advantages = returns - predicted_values.detach()
        else:
            # torch's std() is NaN for a single element and can be 0 for
            # constant returns; guard both to avoid NaN/inf gradients.
            std = returns.std() if returns.numel() > 1 else torch.zeros(())
            advantages = (returns - returns.mean()) / (std + 1e-8)

        # Log episode reward and average score over last 100 episodes
        wandb.log({'episode_reward': episode_reward, 'average_score_100': average_score})

        # Make an optimization step for policy network
        opt_policy.zero_grad()
        policy_loss = (-log_probs * advantages).mean()
        policy_loss.backward()
        opt_policy.step()

        # Perform evaluation periodically
        if (episode + 1) % eval_interval == 0:
            policy.eval()
            avg_reward = 0
            avg_length = 0
            # No gradients are needed while evaluating; skip graph construction.
            with torch.no_grad():
                for _ in range(eval_episodes):
                    (_, _, _, eval_rewards) = run_episode(env, policy)
                    avg_reward += sum(eval_rewards)
                    avg_length += len(eval_rewards)
            avg_reward /= eval_episodes
            avg_length /= eval_episodes
            eval_avg_rewards.append(avg_reward)
            eval_avg_lengths.append(avg_length)
            # Log evaluation metrics
            wandb.log({'eval_avg_reward': avg_reward, 'eval_avg_length': avg_length})
            print(f'Episode {episode + 1}: Evaluation average reward: {avg_reward}')
            print(f'Episode {episode + 1}: Evaluation average length: {avg_length}')
            if avg_reward >= solved_reward:
                # Solved: keep the weights and stop training early.
                torch.save(policy.state_dict(), checkpoint_path)
                break
            policy.train()

    # Return the average evaluation rewards and average evaluation lengths.
    policy.eval()
    return eval_avg_rewards, eval_avg_lengths

In [7]:
def reinforce_value_baseline_train():
    """Run one wandb trial of REINFORCE with a value baseline on CartPole-v1.

    Hyperparameters (gamma, num_episodes, eval_interval, eval_episodes,
    use_value_baseline, policy_lr, value_lr) are read from `wandb.config`, so
    this is meant to be invoked by a wandb sweep agent.
    """
    wandb.init(group="REINFORCE_Value_Baseline", name="Value_Baseline_cartpole_REINFORCE")
    config = wandb.config

    seed = 2000
    torch.manual_seed(seed)
    np.random.seed(seed)

    # A video is recorded for every 500th episode run through this wrapper.
    # NOTE(review): the folder name says "no_baseline" although this run uses
    # the value baseline -- confirm whether the videos should go elsewhere.
    env = gym.wrappers.RecordVideo(gym.make('CartPole-v1', render_mode='rgb_array'), video_folder='./reinforce_cartpole_no_baseline_videos', episode_trigger=lambda x: x % 500 == 0)
    try:
        env.reset(seed=seed)

        policy = PolicyNet(env)
        value_net = ValueNet(env)

        reinforce_with_baseline(
            policy=policy,
            value_net=value_net,
            env=env,
            gamma=config.gamma,
            num_episodes=config.num_episodes,
            eval_interval=config.eval_interval,
            eval_episodes=config.eval_episodes,
            use_value_baseline=config.use_value_baseline,
            policy_lr=config.policy_lr,
            value_lr=config.value_lr
        )
    finally:
        # Release the environments (and flush any open recording) even if training raises.
        env.close()
        env_render.close()

    wandb.finish()


In [None]:
# Random-search sweep for REINFORCE with a value baseline: gamma and both
# learning rates are sampled, every other setting is held fixed.
sweep_config = {
    'method': 'random',
    # Sweep objective; 'eval_avg_reward' is the key logged by reinforce_with_baseline() after each evaluation.
    'metric': {'name': 'eval_avg_reward', 'goal': 'maximize'},
    'parameters': {
        'gamma': {'min': 0.98, 'max': 0.999},
        'policy_lr': {'min': 1e-4, 'max': 1e-2},
        'value_lr': {'min': 1e-4, 'max': 1e-2},
        'num_episodes': {'value': 2000},
        'use_value_baseline': {'value': True}, # Use value baseline
        'eval_interval': {'value': 100},
        'eval_episodes': {'value': 10},
    }
}

# Register the sweep and run 5 trials with reinforce_value_baseline_train.
sweep_id = wandb.sweep(sweep_config, project="Laboratory-2")
wandb.agent(sweep_id, function=reinforce_value_baseline_train, count=5)

-----
## Exercise 3: Going Deeper

As usual, pick **AT LEAST ONE** of the following exercises to complete.

### Exercise 3.1: Solving Lunar Lander with `REINFORCE` (easy)

Use my (or even better, improve on my) implementation of `REINFORCE` to solve the [Lunar Lander Environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). This environment is a little bit harder than Cartpole, but not much. Make sure you perform the same types of analyses we did during the lab session to quantify and qualify the performance of your agents.

### Exercise 3.2: Solving Cartpole and Lunar Lander with `Deep Q-Learning` (harder)

On policy Deep Reinforcement Learning tends to be **very unstable**. Write an implementation (or adapt an existing one) of `Deep Q-Learning` to solve our two environments (Cartpole and Lunar Lander). To do this you will need to implement a **Replay Buffer** and use a second, slow-moving **target Q-Network** to stabilize learning.

### Exercise 3.3: Solving the OpenAI CarRacing environment (hardest)

Use `Deep Q-Learning` -- or even better, an off-the-shelf implementation of **Proximal Policy Optimization (PPO)** -- to train an agent to solve the [OpenAI CarRacing](https://github.com/andywu0913/OpenAI-GYM-CarRacing-DQN) environment. This will be the most *fun*, but also the most *difficult*. Some tips:

1. Make sure you use the `continuous=False` argument to the environment constructor. This ensures that the action space is **discrete** (we haven't seen how to work with continuous action spaces).
2. Your Q-Network will need to be a CNN. A simple one should do, with two convolutional + maxpool layers, followed by two dense layers. You will **definitely** want to use a GPU to train your agents.
3. The observation space of the environment is a single **color image** (a single frame of the game). Most implementations stack multiple frames (e.g. 3) after converting them to grayscale images as an observation.



### Exercise 3.2: Solving Cartpole and Lunar Lander with `Deep Q-Learning`

In [1]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple
import gymnasium as gym
import wandb

In [2]:
class Network(nn.Module):
  """Fully connected Q-network: state -> 64 -> 64 -> one value per action."""

  def __init__(self, state_size, action_size, seed = 42):
    super().__init__()
    # torch.manual_seed reseeds the global RNG, so two networks built with the
    # same seed start from identical weights.
    self.seed = torch.manual_seed(seed)
    hidden_units = 64
    self.fc1 = nn.Linear(state_size, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.fc3 = nn.Linear(hidden_units, action_size)

  def forward(self, state):
    """Return the raw (unnormalized) action values for `state`."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)
     

In [3]:
class ReplayMemory(object):
  """Fixed-capacity FIFO buffer of `(state, action, reward, next_state, done)` events."""

  def __init__(self, capacity):
    # Keep using the second GPU when there is one, but fall back to the first
    # GPU (or the CPU) instead of failing on hosts without a "cuda:1" device.
    if torch.cuda.is_available():
      gpu_index = 1 if torch.cuda.device_count() > 1 else 0
      self.device = torch.device(f"cuda:{gpu_index}")
    else:
      self.device = torch.device("cpu")
    self.capacity = capacity
    # A bounded deque drops the oldest event in O(1); deleting index 0 of a
    # list shifts the whole buffer on every push once it is full.
    self.memory = deque(maxlen=capacity)

  def push(self, event):
    """Append `event`, evicting the oldest one when the buffer is full."""
    self.memory.append(event)

  def sample(self, batch_size):
    """Draw `batch_size` distinct events uniformly at random.

    Returns:
      `(states, next_states, actions, rewards, dones)` as tensors on
      `self.device`, each with one row per sampled event. Note the order:
      `next_states` comes second.
    """
    experiences = [e for e in random.sample(self.memory, k = batch_size) if e is not None]

    def column(index):
      # Stack one field of every sampled event into a 2-D array.
      return np.vstack([e[index] for e in experiences])

    states = torch.from_numpy(column(0)).float().to(self.device)
    actions = torch.from_numpy(column(1)).long().to(self.device)
    rewards = torch.from_numpy(column(2)).float().to(self.device)
    next_states = torch.from_numpy(column(3)).float().to(self.device)
    dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)
    return states, next_states, actions, rewards, dones

In [5]:
class Agent():
  """Deep Q-Learning agent with a replay buffer and a soft-updated target network."""

  def __init__(self, state_size, action_size, learning_rate , replay_buffer_size, minibatch_size, discount_factor=0.99, interpolation_parameter=1e-3):
    """Build the local/target Q-networks, the optimizer and the replay buffer.

    Args:
      state_size: Number of input features of the Q-networks.
      action_size: Number of discrete actions.
      learning_rate: Adam learning rate for the local Q-network.
      replay_buffer_size: Capacity of the replay buffer.
      minibatch_size: Number of transitions sampled per learning step.
      discount_factor: Discount applied to the bootstrapped target.
      interpolation_parameter: Soft-update rate of the target network.
    """
    # Keep using the second GPU when there is one, but fall back to the first
    # GPU (or the CPU) instead of failing on hosts without a "cuda:1" device.
    if torch.cuda.is_available():
      gpu_index = 1 if torch.cuda.device_count() > 1 else 0
      self.device = torch.device(f"cuda:{gpu_index}")
    else:
      self.device = torch.device("cpu")
    self.state_size = state_size
    self.action_size = action_size
    self.local_qnetwork = Network(state_size, action_size).to(self.device)
    self.target_qnetwork = Network(state_size, action_size).to(self.device)
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    # Sampled batches must live on the same device as the networks.
    self.memory.device = self.device
    self.t_step = 0
    self.minibatch_size = minibatch_size
    self.discount_factor = discount_factor
    self.interpolation_parameter = interpolation_parameter

    # For wandb logging
    self.last_loss = 0
    self.last_q_values = 0

  def step(self, state, action, reward, next_state, done):
    """Store one transition and run a learning step on every 4th call.

    Learning only happens once the buffer holds more than `minibatch_size`
    transitions.
    """
    self.memory.push((state, action, reward, next_state, done))
    self.t_step = (self.t_step + 1) % 4
    if self.t_step == 0 and len(self.memory.memory) > self.minibatch_size:
      experiences = self.memory.sample(self.minibatch_size)
      self.learn(experiences, self.discount_factor)

  def act(self, state, epsilon = 0.):
    """Pick an action epsilon-greedily for a single numpy `state`.

    With probability `epsilon` a uniformly random action is returned,
    otherwise the argmax of the local Q-network's action values.
    """
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
    # Evaluate in eval mode, then restore whatever mode the caller had set
    # instead of unconditionally switching back to train mode.
    was_training = self.local_qnetwork.training
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train(was_training)

    # For logging
    self.last_q_values = action_values.cpu().numpy()

    if random.random() > epsilon:
      return np.argmax(action_values.cpu().data.numpy())
    else:
      return random.choice(np.arange(self.action_size))

  def learn(self, experiences, discount_factor):
    """Run one TD update on a sampled batch, then soft-update the target network.

    `experiences` is unpacked as `(states, next_states, actions, rewards, dones)`.
    """
    states, next_states, actions, rewards, dones = experiences
    # The bootstrapped target comes from the target network and carries no gradient.
    with torch.no_grad():
      next_q_targets = self.target_qnetwork(next_states).max(1)[0].unsqueeze(1)
    # (1 - dones) removes the bootstrap term for transitions flagged as done.
    q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
    q_expected = self.local_qnetwork(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.last_loss = loss.item()  # For logging
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, self.interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter):
    """Blend the target parameters towards the local ones in place.

    target <- interpolation_parameter * local + (1 - interpolation_parameter) * target
    """
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
      target_param.data.copy_(interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data)

In [6]:
def evaluate_policy(agent, env, n_episodes=10, max_steps=1000, solve_score=200):
    """
    Evaluate the agent's greedy policy over n_episodes.

    Each episode runs until the environment reports termination or truncation,
    or until `max_steps` steps have been taken. Previously the truncation flag
    was ignored, so a truncated episode kept being stepped (and kept
    accumulating reward) without a reset.

    Args:
        agent: Object exposing `act(state, epsilon=...)`.
        env: Environment exposing `reset()` and `step(action)` with the
            five-value Gymnasium step result.
        n_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the steps taken per episode.
        solve_score: Minimum episode reward that counts as solved.

    Returns:
        `(all_solved, rewards_list)`: `all_solved` is True if no episode scored
        below `solve_score`; `rewards_list` holds the per-episode total rewards.
    """
    rewards_list = []

    for _ in range(n_episodes):
        state, _ = env.reset()
        episode_reward = 0
        for _ in range(max_steps):
            # Use greedy policy
            action = agent.act(state, epsilon=0.0)
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            if terminated or truncated:
                break
        rewards_list.append(episode_reward)

    # Solved only if no episode fell below the threshold.
    all_solved = not any(r < solve_score for r in rewards_list)

    print(f"Evaluation rewards: {rewards_list}")
    return all_solved, rewards_list

In [7]:
def trainDQNLunarLander():
    """Run one wandb trial of DQN training on the environment named in the config.

    All hyperparameters (env, learning_rate, replay_buffer_size,
    minibatch_size, discount_factor, interpolation_parameter, episodes,
    max_timesteps, epsilon_start, epsilon_end, epsilon_decay) are read from
    `wandb.config`. Once the 100-episode average reaches 200, the greedy
    policy is evaluated; if every evaluation episode is solved, the local
    Q-network is saved and training stops.
    """
    wandb.init(group="DQN_LunarLander", name="LunarLander_DQN")
    config = wandb.config

    env = gym.make(config.env)
    try:
        state_size = env.observation_space.shape[0]
        number_actions = env.action_space.n

        agent = Agent(
            state_size,
            number_actions,
            learning_rate=config.learning_rate,
            replay_buffer_size=config.replay_buffer_size,
            minibatch_size=config.minibatch_size,
            discount_factor=config.discount_factor,
            interpolation_parameter=config.interpolation_parameter
        )

        episodes = config.episodes
        max_steps = config.max_timesteps

        epsilon = config.epsilon_start
        epsilon_min = config.epsilon_end
        epsilon_decay = config.epsilon_decay

        scores_100 = deque(maxlen=100)

        for episode in range(1, episodes + 1):

            state, _ = env.reset()
            score = 0

            for t in range(max_steps):
                action = agent.act(state, epsilon)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Only true termination is stored as `done`, so a time-limit
                # truncation still bootstraps from the next state.
                agent.step(state, action, reward, next_state, terminated)

                state = next_state
                score += reward
                # Stop on truncation as well: stepping a truncated episode
                # without a reset is not valid.
                if terminated or truncated:
                    break

            # Multiplicative epsilon decay, floored at epsilon_min.
            epsilon = max(epsilon_min, epsilon_decay * epsilon)

            scores_100.append(score)
            avg_score = np.mean(scores_100)

            wandb.log({
                "episode_reward": score,
                "epsilon": epsilon,
                "avg_score_100": avg_score
            })

            print(f"\rEpisode {episode}  Avg Score: {avg_score:.2f}", end="")
            if episode % 100 == 0:
                print()

            if avg_score >= 200:
                # Confirm the running average with a greedy evaluation.
                all_solved, rewards_list = evaluate_policy(agent, env, n_episodes=10, max_steps=max_steps)
                if all_solved:
                    print(f"\nSolved environment in {episode - 100} episodes (all evaluation episodes ≥ 200)!")
                    torch.save(agent.local_qnetwork.state_dict(),
                               f'checkpointLunarLander_solved_in_{episode-100}_episodes.pth')
                    break
                else:
                    print("\nEvaluation failed: not all episodes reached 200 reward. Continuing training...")
    finally:
        # Release the environment even if training raises.
        env.close()

    wandb.finish()


In [None]:
# Grid sweep for DQN on LunarLander: every combination of the four two-valued
# hyperparameters below is tried; the remaining settings are fixed.
sweep_config = {
    "method": "grid",
    # Sweep objective; "avg_score_100" is logged once per episode by trainDQNLunarLander.
    "metric": {
        "name": "avg_score_100",
        "goal": "maximize"
    },
    "parameters": {
        "learning_rate": {
            "values": [5e-4, 1e-3]
        },
        "discount_factor": {
            "values": [0.98, 0.99]
        },
        # Per-episode multiplicative decay applied to epsilon.
        "epsilon_decay": {
            "values": [0.995, 0.998]
        },
        # Soft-update rate of the target Q-network.
        "interpolation_parameter": {
            "values": [1e-3, 1e-2]
        },
        # fixed parameters
        "replay_buffer_size": {"value": 100000},
        "minibatch_size": {"value": 128},
        "env": {"value": "LunarLander-v3"},
        "episodes": {"value": 1500},
        "max_timesteps": {"value": 1000},
        "epsilon_start": {"value": 1.0},
        "epsilon_end": {"value": 0.01}
    }
}
# Register the sweep and let the agent run the grid (no trial count limit is given).
sweep_id = wandb.sweep(sweep_config, project="Laboratory-2")
wandb.agent(sweep_id, function=trainDQNLunarLander)

In [29]:
import gymnasium as gym
import torch
import numpy as np
import wandb
import glob

# --- Init W&B ---
wandb.init(
    project="Laboratory-2",
    name="dqn-eval-video",
    config={"num_eval_episodes": 5}
)

# --- Environment with video recording of every episode ---
env = gym.make(
    "LunarLander-v3",
    render_mode="rgb_array"
)

env = gym.wrappers.RecordVideo(
    env,
    video_folder="LunarLander_videos",
    name_prefix="dqn-eval",
    episode_trigger=lambda episode_id: True
)

# --- Agent: built only to host the Q-network ---
# This cell never calls agent.step(), so no learning happens here and the
# optimizer/replay settings below are just constructor arguments.
agent = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
    learning_rate=5e-3,
    replay_buffer_size=int(1e5),
    minibatch_size=64
)

# Load the trained weights into the network used by agent.act().
agent.local_qnetwork.load_state_dict(
    torch.load("DQL_LunarLander_checkpoints/checkpointLunarLander_solved_in_503_episodes.pth")
)
agent.local_qnetwork.eval()

# NOTE(review): this local value is used for the loop; the copy stored in the
# wandb config above is not read back.
num_eval_episodes = 5

# --- Greedy evaluation episodes ---
for ep in range(num_eval_episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        # epsilon=0.0 -> always take the highest-valued action.
        action = agent.act(state, epsilon=0.0)
        # `done` receives the termination flag; truncation is reported separately.
        next_state, reward, done, truncated, info = env.step(action)

        total_reward += reward
        state = next_state

        # Leave the episode on either termination or truncation.
        if done or truncated:
            print(f"Episode {ep+1}: reward = {total_reward}")
            wandb.log({f"episode_{ep+1}_reward": total_reward})
            break

# Closing the wrapped environment ends the recording.
env.close()

# --- Log videos to W&B ---
# NOTE(review): the files are .mp4 but are logged with format="gif" -- confirm
# this is the intended format for wandb.Video when given a file path.
video_files = sorted(glob.glob("LunarLander_videos/*.mp4"))
for vf in video_files:
    wandb.log({"eval/video": wandb.Video(vf, fps=30, format="gif")})

wandb.finish()


  logger.warn(


Episode 1: reward = 264.93888747506355
Episode 2: reward = 241.35734634056504
Episode 3: reward = 266.1384300416239
Episode 4: reward = 254.9475277884772
Episode 5: reward = 230.78907007580221




0,1
episode_1_reward,▁
episode_2_reward,▁
episode_3_reward,▁
episode_4_reward,▁
episode_5_reward,▁

0,1
episode_1_reward,264.93889
episode_2_reward,241.35735
episode_3_reward,266.13843
episode_4_reward,254.94753
episode_5_reward,230.78907


In [6]:
def trainDQNCartPole():
    """Train a DQN agent for one W&B run (used as the sweep entry point).

    Reads all hyperparameters from ``wandb.config``: ``env``,
    ``learning_rate``, ``replay_buffer_size``, ``minibatch_size``,
    ``discount_factor``, ``interpolation_parameter``, ``episodes``,
    ``max_timesteps``, ``epsilon_start``, ``epsilon_end`` and ``tau``.

    Per episode it logs ``episode_reward``, ``epsilon`` and ``avg_score_100``
    (mean over the last up-to-100 episodes). Training stops early, and the
    local Q-network weights are saved, once a full 100-episode window
    averages at least 500.
    """
    wandb.init(group="DQN_CartPole", name="CartPole_DQN")
    config = wandb.config

    env = gym.make(config.env)
    try:
        state_size = env.observation_space.shape[0]
        number_actions = env.action_space.n

        agent = Agent(
            state_size,
            number_actions,
            learning_rate=config.learning_rate,
            replay_buffer_size=config.replay_buffer_size,
            minibatch_size=config.minibatch_size,
            discount_factor=config.discount_factor,
            interpolation_parameter=config.interpolation_parameter
        )

        episodes = config.episodes
        max_steps = config.max_timesteps

        epsilon = config.epsilon_start
        epsilon_min = config.epsilon_end
        tau = config.tau

        scores_100 = deque(maxlen=100)

        for episode in range(1, episodes + 1):

            state, _ = env.reset()
            score = 0

            for _ in range(max_steps):
                action = agent.act(state, epsilon)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Only the termination flag is handed to the agent, exactly as
                # before; truncation is a time limit, not a terminal state.
                agent.step(state, action, reward, next_state, terminated)

                state = next_state
                score += reward
                # Stop on truncation as well, so the environment is never
                # stepped again after the episode has ended.
                if terminated or truncated:
                    break

            # Exponential decay from epsilon_start towards epsilon_end, with
            # `tau` acting as the decay constant measured in episodes.
            epsilon = epsilon_min + (config.epsilon_start - epsilon_min) * np.exp(-1. * episode / tau)

            scores_100.append(score)
            avg_score = np.mean(scores_100)

            wandb.log({
                "episode_reward": score,
                "epsilon": epsilon,
                "avg_score_100": avg_score
            })

            print(f"\rEpisode {episode}  Avg Score: {avg_score:.2f}", end="")

            # Require a full window before declaring success; otherwise the
            # average covers fewer than 100 episodes and `episode - 100`
            # would be negative.
            window_full = len(scores_100) == scores_100.maxlen
            if window_full and avg_score >= 500:
                print(f'\nEnvironment solved in {episode - 100:d} episodes!\tAverage Score: {avg_score:.2f}')
                torch.save(agent.local_qnetwork.state_dict(), f'checkpointCartPole_solved_in_{episode-100}_episodes.pth')
                break
    finally:
        # Release the environment and close the W&B run even when training
        # raises, so the next sweep trial starts from a clean state.
        env.close()
        wandb.finish()


In [None]:
# Hyperparameters that vary across the sweep (several candidate values each).
swept_parameters = {
    "learning_rate": {"values": [5e-4, 1e-3]},
    "tau": {"values": [200, 300]},
    "interpolation_parameter": {"values": [5e-3, 1e-2]},
    "epsilon_end": {"values": [0.01, 0.05]},
}

# Hyperparameters held constant for every run of the sweep.
fixed_parameters = {
    "replay_buffer_size": {"value": 100000},
    "minibatch_size": {"value": 128},
    "env": {"value": "CartPole-v1"},
    "episodes": {"value": 1500},
    "max_timesteps": {"value": 500},
    "epsilon_start": {"value": 1.0},
    "discount_factor": {"value": 0.99},
}

# Grid sweep whose target metric is the running average logged during training.
sweep_config = {
    "method": "grid",
    "metric": {"name": "avg_score_100", "goal": "maximize"},
    "parameters": {**swept_parameters, **fixed_parameters},
}

# Register the sweep, then run its trials with the CartPole training function.
sweep_id = wandb.sweep(sweep_config, project="Laboratory-2")
wandb.agent(sweep_id, function=trainDQNCartPole)

In [27]:
import gymnasium as gym
import torch
import numpy as np
import wandb
import glob

# ---- Evaluation settings ----
# Defined once so the value stored in the W&B config and the value driving the
# loop cannot drift apart.
num_eval_episodes = 4
video_folder = "videos"
video_prefix = "dqn-eval"
checkpoint_path = "DQL_CartPole_checkpoints/checkpointCartPole_solved_in_1180_episodes.pth"

# ---- Init W&B ----
# Using the run as a context manager finishes it even if evaluation raises.
# NOTE(review): the other cells log to project "Laboratory-2" (capital L);
# confirm whether this run is meant to land in the same project.
with wandb.init(
    project="laboratory-2",
    name="dqn-cartpole-eval-video",
    config={"num_eval_episodes": num_eval_episodes},
):
    env = gym.make(
        "CartPole-v1",
        render_mode="rgb_array"
    )

    env = gym.wrappers.RecordVideo(
        env,
        video_folder=video_folder,
        name_prefix=video_prefix,
        episode_trigger=lambda episode_id: True  # record ALL episodes
    )

    try:
        # Read the network dimensions from the environment instead of
        # hard-coding them, matching how the LunarLander evaluation builds
        # its agent.
        agent = Agent(
            state_size=env.observation_space.shape[0],
            action_size=env.action_space.n,
            learning_rate=5e-3,
            replay_buffer_size=int(1e5),
            minibatch_size=64
        )

        # Load onto the CPU first so a checkpoint written during a GPU run can
        # still be opened on a CPU-only machine; load_state_dict then copies
        # the values into the network's existing parameters.
        agent.local_qnetwork.load_state_dict(
            torch.load(checkpoint_path, map_location="cpu")
        )
        agent.local_qnetwork.eval()

        for ep in range(num_eval_episodes):
            state, _ = env.reset()
            total_reward = 0
            terminated = truncated = False

            # An episode ends either by termination or by truncation.
            while not (terminated or truncated):
                # deterministic action (no epsilon)
                action = agent.act(state, epsilon=0.0)

                next_state, reward, terminated, truncated, info = env.step(action)
                total_reward += reward
                state = next_state

            print(f"Episode {ep+1}: reward = {total_reward}")
            wandb.log({f"episode_{ep+1}_reward": total_reward})
    finally:
        # Always release the environment; this runs before the video files
        # are collected below.
        env.close()

    # ---- Log videos to W&B ----
    # Only pick up files carrying this run's prefix so unrelated or stale
    # recordings sitting in the same folder are not uploaded.
    video_files = sorted(glob.glob(f"{video_folder}/{video_prefix}-*.mp4"))

    for vf in video_files:
        # The matched files are MP4, so declare that format rather than "gif".
        wandb.log({"eval/video": wandb.Video(vf, fps=30, format="mp4")})


  logger.warn(


Episode 1: reward = 500.0
Episode 2: reward = 500.0
Episode 3: reward = 500.0
Episode 4: reward = 500.0




0,1
episode_1_reward,▁
episode_2_reward,▁
episode_3_reward,▁
episode_4_reward,▁

0,1
episode_1_reward,500
episode_2_reward,500
episode_3_reward,500
episode_4_reward,500
