# Reinforcement learning project


This is a reinforcement learning project on deep Q-learning and policy gradient methods.

In [1]:
import gymnasium as gym
import numpy as np
import torch
from torch import nn as nn
from torch import optim as optim
from torch.distributions import Categorical
from torch.nn import functional as F

## 1) Lunar Lander environment

Your objective is to understand how the Lunar Lander environment works and what problem we want to solve.
Describe the states, the rewards and the actions.
Write down the Markov decision process associated with the problem.

In [2]:
"""Run a random policy."""

# Create the environment. "rgb_array" runs headless; use render_mode="human"
# to watch the episode. No explicit env.render() call is made: in "rgb_array"
# mode it only returns a frame (which was discarded here), and in "human" mode
# the window is refreshed by env.step() itself.
env = gym.make("LunarLander-v3", continuous=False, render_mode="rgb_array")
total_reward = 0.0

try:
    # Start a new episode
    observation, info = env.reset(seed=None)

    # While the episode is not finished
    finished = False
    while not finished:

        # Select a random action
        action = env.action_space.sample()

        # One step forward
        observation, reward, terminated, truncated, info = env.step(action)
        finished = terminated or truncated

        # Accumulate the reward collected during the episode
        total_reward += reward
finally:
    # Always release the environment, even if the episode is interrupted
    env.close()

# Print reward
print("total_reward = {}".format(total_reward))

total_reward = -235.50002544373228


## 2) Deep neural Q-network

We aim to build a deep Q-network $ Q_\theta(s, a) $, which estimates the Q-value for each action $ a $ given a state $ s $.
This network is parameterized by weights $ \theta $ and replaces the classical Q-table used in tabular methods.

During learning, the network is updated to minimize the temporal difference (TD) error: $\delta = r + \gamma \max_{a'} Q_\theta(s', a') - Q_\theta(s, a)$.
This leads to the Q-learning update rule: $Q_\theta(s, a) \leftarrow Q_\theta(s, a) + \alpha \, \delta$.
In practice, we minimize the squared TD error using gradient descent.

Below are 3 code samples.
- A class implementing the Q-network. You must specify `input_size` and `nb_actions`.
- A script to train the Q-network. Complete the missing parts of the code.
- A script to test the Q-network. Determine how to select an action from the Q-values.

In [3]:
class QNetwork(nn.Module):
    """Deep neural Q-network.

    A fully connected network with two hidden layers of 128 ReLU units that
    maps a state vector to one Q-value per action.

    Args:
        input_size: Number of features in a state vector. The default (8) is
            the value this notebook uses for LunarLander-v3.
        nb_actions: Number of discrete actions, i.e. the number of Q-values
            returned. The default (4) is the value this notebook uses for
            LunarLander-v3.
    """

    def __init__(self, input_size=8, nb_actions=4):
        """Initialize the layers."""
        super().__init__()

        # Layers (the attribute names are the keys of the saved state_dict,
        # so they must stay unchanged for existing checkpoints to load)
        self.layer_a = nn.Linear(input_size, 128)
        self.layer_b = nn.Linear(128, 128)
        self.layer_c = nn.Linear(128, nb_actions)

    def forward(self, x):
        """Compute the Q-values of every action.

        Args:
            x: Float tensor whose last dimension has size `input_size`.

        Returns:
            Tensor of Q-values whose last dimension has size `nb_actions`.
        """
        x = F.relu(self.layer_a(x))
        x = F.relu(self.layer_b(x))
        return self.layer_c(x)

In [30]:
"""Run deep Q-learning."""

# Hyperparameters
discount_factor = 0.99   # because rewards come late (safe landings) and need long-term planning
learning_rate = 0.001    # to avoid unstable Q-updates in a noisy, continuous state space
epsilon_start = 1.0      # to explore all kinds of landings and failures at the beginning
epsilon_end = 0.01       # to keep minimal exploration for edge cases after policy stabilizes
epsilon_decay = 0.995    # to slowly shift from random moves to confident decisions over time
log_frequency = 5        # save the network and print the last reward every N episodes

# Create environment (it is reset at the start of every episode below)
env = gym.make("LunarLander-v3", continuous=False, render_mode="rgb_array")

# Create Q-network and enable train mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
q_network = QNetwork().to(device)
q_network.train()

# Create optimizer
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

# Launch training
running_reward = 0.0
training_iteration = 0
epsilon = epsilon_start
try:
    while True:

        # Reset the environment
        observation, info = env.reset()
        episode_total_reward = 0.0

        # Sample a trajectory
        while True:

            # Current state as a (1, state_size) float tensor; it is used for
            # both the greedy action selection and the TD prediction
            x = torch.from_numpy(np.expand_dims(observation, 0)).float().to(device)

            # Epsilon-greedy action selection (random)
            if np.random.rand() < epsilon:
                action = env.action_space.sample()

            # Epsilon-greedy action selection (best action)
            else:
                with torch.no_grad():
                    action = torch.argmax(q_network(x), dim=1).item()

            # Take the action
            observation_next, reward, terminated, truncated, info = env.step(
                action)

            # Check if episode is done and save reward
            done = terminated or truncated
            episode_total_reward += reward

            # Compute the TD target (no gradient flows through the target)
            with torch.no_grad():
                x_next = np.expand_dims(observation_next, 0)
                x_next = torch.from_numpy(x_next).float().to(device)
                q_next_max = q_network(x_next).max(dim=1).values.item()

                # Bootstrap from the next state unless it is terminal. An
                # episode cut by the time limit (truncated) did not reach a
                # terminal state, so its next-state value is kept.
                bootstrap = 0.0 if terminated else discount_factor * q_next_max
                target = reward + bootstrap

            # TD prediction for the action that was actually taken
            q_pred = q_network(x)[0, action]

            # Squared TD error
            target_tensor = torch.tensor(
                target, dtype=torch.float32, device=device)
            loss = F.mse_loss(q_pred, target_tensor)

            # Reset gradients to 0.0
            optimizer.zero_grad()

            # Compute the gradients of the loss (backpropagation)
            loss.backward()

            # Update the Q-network parameters (gradient descent on the loss)
            optimizer.step()

            # Transition
            observation = observation_next

            # End episode
            if done:
                break

        # Exponential moving average of the episode rewards and epsilon decay
        running_reward = 0.1 * episode_total_reward + 0.9 * running_reward
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        # Log results
        training_iteration += 1
        if training_iteration % log_frequency == 0:

            # Save neural network
            torch.save(q_network.state_dict(), "q_network.pt")

            # Print results
            print("iteration {} - last reward: {:.2f}".format(
                training_iteration, episode_total_reward))

            # Exit condition (checked right after saving, so the file on disk
            # holds the network that met the condition)
            if running_reward >= 200:
                break
finally:
    # Close environment, even if training is interrupted
    env.close()

iteration 5 - last reward: -375.65
iteration 10 - last reward: -197.60
iteration 15 - last reward: -162.55
iteration 20 - last reward: -71.14
iteration 25 - last reward: -165.36
iteration 30 - last reward: -395.71
iteration 35 - last reward: -98.30
iteration 40 - last reward: -53.32
iteration 45 - last reward: -182.16
iteration 50 - last reward: -79.38
iteration 55 - last reward: -85.73
iteration 60 - last reward: -96.40
iteration 65 - last reward: -141.19
iteration 70 - last reward: -65.75
iteration 75 - last reward: -86.09
iteration 80 - last reward: -200.82
iteration 85 - last reward: -235.93
iteration 90 - last reward: -319.33
iteration 95 - last reward: -64.09
iteration 100 - last reward: -76.29
iteration 105 - last reward: -7.73
iteration 110 - last reward: -135.60
iteration 115 - last reward: -21.64
iteration 120 - last reward: -150.45
iteration 125 - last reward: -70.00
iteration 130 - last reward: -242.78
iteration 135 - last reward: -265.27
iteration 140 - last reward: -71.79

In [31]:
"""Test Q-network."""

# Load trained Q-network and enable test mode. map_location remaps tensors
# that were saved from another device (e.g. CUDA during training) onto the CPU.
device = torch.device("cpu")
q_network = QNetwork().to(device)
q_network.load_state_dict(
    torch.load("q_network.pt", map_location=device, weights_only=True))
q_network.eval()

# Create environment. In "human" mode the window is refreshed by env.step(),
# so no explicit env.render() call is needed.
env = gym.make("LunarLander-v3", continuous=False, render_mode="human")
total_reward = 0.0

try:
    # Start a new episode
    observation, info = env.reset(seed=None)

    # While the episode is not finished
    finished = False
    while not finished:

        # Add batch dimension and transform to tensor
        x = np.expand_dims(observation, 0)
        x = torch.from_numpy(x).float().to(device)

        # Compute the Q-values from the Q-network (inference only, so no
        # autograd graph is built)
        with torch.no_grad():
            q_values = q_network(x)

        # Greedy policy: pick the action with the highest Q-value
        action = torch.argmax(q_values, dim=1).item()

        # One step forward
        observation, reward, terminated, truncated, info = env.step(action)
        finished = terminated or truncated

        # Accumulate the reward collected during the episode
        total_reward += reward
finally:
    # Always release the environment (and its window)
    env.close()

# Print reward
print("total_reward = {}".format(total_reward))

total_reward = 262.46410197990696


## 3) REINFORCE algorithm

We want to build a policy $\pi_\theta(a | s) = P(a | s, \theta)$ that gives the probability of choosing an action $a$ in state $s$.
The policy is a deep neural network parameterized by some weights $\theta$.
The policy is also referred to as "actor".

We want to find the parameters $\theta$ that maximize the performance measure $J(\theta) = \mathbb{E}_{\pi_\theta}[ G_0 ]$ with $G_t = \sum_{k=0}^{\infty} \gamma^k r_{t+k+1}$ and $\gamma \in [0, 1]$ being a discount factor.
To do so, we use the gradient ascent method: $\theta_{k+1} = \theta_{k} + \alpha \nabla_{\theta_k} J(\theta_k)$ with $\alpha$ being the learning rate.
The performance measure depends on both the action selection and the distribution of states.
Both are affected by the policy parameters, which makes the computation of the gradient challenging.

The policy gradient theorem gives an expression for $\nabla_\theta J(\theta)$ that does not involve the derivative of the state distribution.
The expectation is over all possible state-action trajectories generated by the policy $\pi_\theta$:
$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}[ \sum_{t=0}^{\infty} G_t \nabla_\theta \ln \pi_\theta(a_t | s_t) ]$.
In the REINFORCE algorithm, we use a Monte-Carlo estimate over one episode, i.e., one trajectory:
$\nabla_\theta J(\theta) \approx \sum_{t=0}^{\infty} G_t \nabla_\theta \ln \pi_\theta(a_t | s_t)$.

Your objective is to complete the REINFORCE algorithm to train the policy until convergence. To solve the problem, you need to achieve a cumulative reward of at least 200 when training the policy. Below are: the code of the REINFORCE algorithm and a script to test your policy once it is trained.

Below are 3 code samples.
- A class implementing the policy. You must specify `input_size` and `nb_actions`.
- A script to train the policy using REINFORCE. Complete the missing parts of the code.
- A script to test the policy. Determine how to select an action from the output.

In [6]:
class ActorModel(nn.Module):
    """Deep neural network policy.

    A fully connected network with two hidden layers of 128 ReLU units
    followed by a softmax, mapping a state vector to a probability for each
    action.

    Args:
        input_size: Number of features in a state vector. The default (8) is
            the value this notebook uses for LunarLander-v3.
        nb_actions: Number of discrete actions, i.e. the number of
            probabilities returned. The default (4) is the value this notebook
            uses for LunarLander-v3.
    """

    def __init__(self, input_size=8, nb_actions=4):
        """Initialize the layers."""
        super().__init__()

        # Layers (the attribute names are the keys of the saved state_dict,
        # so they must stay unchanged for existing checkpoints to load)
        self.layer_a = nn.Linear(input_size, 128)
        self.layer_b = nn.Linear(128, 128)
        self.policy = nn.Linear(128, nb_actions)

    def forward(self, x):
        """Compute the action probabilities.

        Args:
            x: Float tensor whose last dimension has size `input_size`.

        Returns:
            Tensor of probabilities whose last dimension has size
            `nb_actions` and sums to 1.
        """
        x = F.relu(self.layer_a(x))
        x = F.relu(self.layer_b(x))
        return F.softmax(self.policy(x), dim=-1)

In [7]:
"""Run REINFORCE."""

# Hyperparameters
discount_factor = 0.99  # encourages long-term reward, fits LunarLander
learning_rate = 0.001   # to avoid unstable updates from high-variance policy gradients in REINFORCE
log_frequency = 5       # save the policy and print the last reward every N episodes

# Create environment (it is reset at the start of every episode below)
env = gym.make("LunarLander-v3", continuous=False, render_mode="rgb_array")

# Create policy and enable train mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy = ActorModel().to(device)
policy.train()

# Create optimizer
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Launch training
running_reward = 0.0
training_iteration = 0
try:
    while True:

        # Experience
        # ------------------------------------------

        # Reset the environment
        observation, info = env.reset()

        # During experience, we will save:
        # - the log-probability of the chosen action at each time step,
        #   ln pi(at|st)
        # - the rewards received at each time step ri
        saved_log_probabilities = []
        saved_rewards = []

        # Sample a trajectory
        while True:

            # Add batch dimension and transform to tensor
            x = torch.from_numpy(np.expand_dims(observation, 0)).float()

            # Create a categorical distribution over the list of probabilities
            # of actions (given by the policy) and sample an action from it
            probabilities = policy(x.to(device))
            distribution = Categorical(probabilities)
            action = distribution.sample()

            # Take the action
            observation, reward, terminated, truncated, info = env.step(
                action.item())

            # Save the log-probability of the chosen action and the reward.
            # log_prob is used instead of torch.log(probability) so that a
            # probability that underflows to 0 does not produce -inf.
            saved_log_probabilities.append(distribution.log_prob(action))
            saved_rewards.append(float(reward))

            # End episode
            if terminated or truncated:
                break

        # Compute discounted sum of rewards
        # ------------------------------------------

        # Walk the episode backwards, G_t = r_t + gamma * G_{t+1}, appending
        # and reversing once at the end (linear time in the episode length)
        discounted_reward = 0.0
        discounted_rewards = []
        for r in reversed(saved_rewards):
            discounted_reward = r + discount_factor * discounted_reward
            discounted_rewards.append(discounted_reward)
        discounted_rewards.reverse()

        # Eventually normalize for stability purposes. The standard deviation
        # of a single value is NaN, so one-step episodes are left unnormalized.
        discounted_rewards = torch.tensor(
            discounted_rewards, dtype=torch.float32, device=device)
        if discounted_rewards.numel() > 1:
            mean, std = discounted_rewards.mean(), discounted_rewards.std()
            discounted_rewards = (discounted_rewards - mean) / (std + 1e-7)

        # Update policy parameters
        # ------------------------------------------

        # Policy loss: sum over time steps of -ln pi(at|st) * G_t
        log_probabilities = torch.cat(saved_log_probabilities)
        actor_loss = -(log_probabilities * discounted_rewards).sum()

        # Reset gradients to 0.0
        optimizer.zero_grad()

        # Compute the gradients of the loss (backpropagation)
        actor_loss.backward()

        # Update the policy parameters (minimizing the loss is gradient ascent
        # on the expected return)
        optimizer.step()

        # Logging
        # ------------------------------------------

        # Episode total reward and its exponential moving average
        episode_total_reward = sum(saved_rewards)
        running_reward = 0.1 * episode_total_reward + 0.9 * running_reward

        # Log results
        training_iteration += 1
        if training_iteration % log_frequency == 0:

            # Save neural network
            torch.save(policy.state_dict(), "policy.pt")

            # Print results
            print("iteration {} - last reward: {:.2f}".format(
                training_iteration, episode_total_reward))

            # Exit condition (checked right after saving, so the file on disk
            # holds the policy that met the condition)
            if running_reward >= 200:
                break
finally:
    # Close environment, even if training is interrupted
    env.close()

iteration 5 - last reward: -141.72
iteration 10 - last reward: -188.17
iteration 15 - last reward: -146.14
iteration 20 - last reward: -117.40
iteration 25 - last reward: -117.63
iteration 30 - last reward: -19.22
iteration 35 - last reward: -139.01
iteration 40 - last reward: -187.32
iteration 45 - last reward: -126.62
iteration 50 - last reward: -192.32
iteration 55 - last reward: -68.22
iteration 60 - last reward: -85.34
iteration 65 - last reward: -214.87
iteration 70 - last reward: -145.13
iteration 75 - last reward: -112.09
iteration 80 - last reward: -56.32
iteration 85 - last reward: -82.82
iteration 90 - last reward: -109.25
iteration 95 - last reward: -348.72
iteration 100 - last reward: -184.89
iteration 105 - last reward: -203.10
iteration 110 - last reward: -316.28
iteration 115 - last reward: -160.72
iteration 120 - last reward: -395.96
iteration 125 - last reward: -81.24
iteration 130 - last reward: -63.47
iteration 135 - last reward: -60.63
iteration 140 - last reward: 

In [8]:
"""Test policy."""
# Load trained policy and enable test mode. map_location remaps tensors that
# were saved from another device (e.g. CUDA during training) onto the CPU.
device = torch.device("cpu")
policy = ActorModel().to(device)
policy.load_state_dict(
    torch.load("policy.pt", map_location=device, weights_only=True))
policy.eval()

# Create environment. In "human" mode the window is refreshed by env.step(),
# so no explicit env.render() call is needed.
env = gym.make("LunarLander-v3", continuous=False, render_mode="human")
total_reward = 0.0

try:
    # Start a new episode
    observation, info = env.reset(seed=None)

    # While the episode is not finished
    finished = False
    while not finished:

        # Single forward pass, without building an autograd graph
        with torch.no_grad():

            # Add batch dimension and transform to tensor
            x = torch.from_numpy(np.expand_dims(observation, 0)).float().to(device)
            probabilities = policy(x)

            # Select action with highest probability
            action = torch.argmax(probabilities, dim=1).item()

        # One step forward
        observation, reward, terminated, truncated, info = env.step(action)
        finished = terminated or truncated

        # Accumulate the reward collected during the episode
        total_reward += reward
finally:
    # Always release the environment (and its window)
    env.close()

# Print reward
print("total_reward = {}".format(total_reward))

total_reward = 105.0579070455011
