In [None]:
%%bash
# Provision the system packages and the Box2D bindings used by the notebook.
# Stop at the first failing command so that a failed step is not silently
# followed by a build running in the wrong directory.
set -e

# SWIG is needed to build the pybox2d bindings below. -y answers the
# confirmation prompt, which cannot be answered from a notebook cell.
apt-get install -y swig

# Build and install pybox2d from source. The clone is skipped when the
# checkout already exists so the cell can be re-run on the same runtime.
[ -d pybox2d ] || git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

# Xvfb is the X server backend that pyvirtualdisplay starts later on.
apt-get install -y xvfb

# pip install \
#     gym==0.21 \
#     gym[box2d]==0.21 \
#     pytorch-lightning==1.6.0 \
#     optuna==2.7.0 \
#     pyglet==1.5.27 \
#     pyvirtualdisplay

In [None]:
!pip install gym==0.21
# !pip install gym[box2d]==0.21
!pip install pytorch-lightning==1.6.0
!pip install optuna==2.7.0
!pip install pyglet==1.5.27
!pip install pyvirtualdisplay

In [None]:
!pip install gym[box2d]

In [None]:
pip install pyvirtualdisplay



---



---



---



---



---





---



---



---



---



---



#### Set up the virtual display

In [52]:
from pyvirtualdisplay import Display
# Start an invisible 1400x900 virtual display.
# Presumably required so the environment can render frames for the video
# recordings on a machine without a physical screen — confirm for your runtime.
Display(visible=False, size=(1400, 900)).start()

<pyvirtualdisplay.display.Display at 0x79c7821dc850>

#### Import the necessary code libraries

In [53]:
import copy
import statistics
import gym
import torch
import optuna

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics

from optuna.integration import PyTorchLightningPruningCallback

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = 'cuda:0'
else:
  device = 'cpu'

# Number of CUDA devices visible to torch (0 on a CPU-only machine).
num_gpus = torch.cuda.device_count()

  and should_run_async(code)


In [54]:
def display_video(episode=0, video_folder='/content/videos'):
  """
  Build an inline HTML5 video player for one recorded episode.

  Parameters:
  -----------
  - `episode`: index of the recorded episode; selects the file
    `rl-video-episode-{episode}.mp4` inside `video_folder`.
  - `video_folder`: directory that contains the recordings.

  Returns:
    An `IPython.display.HTML` object that embeds the video as a base64 data URL.

  Raises:
    FileNotFoundError: propagated from `open` when the episode has no recording.
  """
  video_path = f'{video_folder}/rl-video-episode-{episode}.mp4'
  # Read-only binary mode is sufficient, and the context manager makes sure
  # the file handle is closed once the bytes have been read.
  with open(video_path, "rb") as video_file:
    video_bytes = video_file.read()
  video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

#### Create the Deep Q-Network

In [55]:
class DQN(nn.Module):
  """
  The Deep Q-network.

  A fully connected network that maps an observation to a vector of
  Q-values, one Q-value for each action that can be taken.
  The Q-value Q(s, a) is the expected cumulative return of taking
  action `a` in state `s`.

  Parameters:
  -----------
  - `hidden_size`: width of each of the two hidden layers.
  - `obs_size`: number of features in an observation (input width).
  - `n_actions`: number of available actions (output width).
  """

  def __init__(self, hidden_size, obs_size, n_actions):
    super().__init__()
    self.net = nn.Sequential(
        # Input: observation size (the state)
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        # Output: Q-values (one Q-value for each action)
        nn.Linear(hidden_size, n_actions),
    )

  def forward(self, x):
    """
    Compute the Q-values for a batch of observations.

    The input is cast to float before it is passed through the layers,
    so integer or double-precision observations are accepted as well.

    Returns:
      A tensor whose last dimension has one Q-value per action.
    """
    return self.net(x.float())

#### Create the policy

In [56]:
def epsilon_greedy(state, env, net, epsilon=0.0):
  """
  The epsilon-greedy policy.

  Balances exploration and exploitation through `epsilon`.

  Parameters:
  -----------
  - `state`: a single observation (array-like, without a batch dimension).
  - `env`: the environment; only `env.action_space.sample()` is used.
  - `net`: the Q-network that maps a batch of states to Q-values.
  - `epsilon`: probability of taking a random action instead of the greedy one.

  Returns:
    The selected action: a sample from the action space when exploring,
    otherwise the index (a Python int) of the highest Q-value.
  """
  # Exploration: with probability `epsilon` take a random action.
  if np.random.random() < epsilon:
    return env.action_space.sample()

  # Exploitation: choose the action with the highest predicted Q-value.
  # The input is placed on the device that holds the network's weights, so the
  # function does not depend on a module-level device setting.
  first_param = next(net.parameters(), None)
  net_device = first_param.device if first_param is not None else torch.device('cpu')

  # No gradients are needed for action selection.
  with torch.no_grad():
    # Convert through a single ndarray (building a tensor from a list of
    # arrays is slow) and add the batch dimension expected by the network.
    batch = torch.as_tensor(np.asarray(state), device=net_device).unsqueeze(0)
    q_values = net(batch)
    _, best_action = torch.max(q_values, dim=1)

  return int(best_action.item())

#### Create the replay buffer
> __Implement the functionality that will allow us to turn our environment into a dataset.__

- The replay buffer is a crucial component in training Deep Q-Networks (DQN) and other off-policy reinforcement learning algorithms.
- The replay buffer stores experiences (state transitions) observed by the __agent during interactions with the environment__ and allows for randomly sampling batches of experiences during the training process.

In [57]:
# Define the names of each element in the tuple.
# One environment transition. The field order matters: experiences are
# created and unpacked positionally elsewhere in the notebook.
Experience = namedtuple(
    "Experience",
    ("state", "action", "reward", "done", "new_state"),
)

In [58]:
class ReplayBuffer:
  """
  Replay Buffer: a bounded FIFO store of experiences.

  Parameters:
  ----------
  - `capacity`: maximum number of experiences that the buffer can store.
    Once it is full, appending a new experience discards the oldest one.
  """

  def __init__(self, capacity):
    """
    Initialize the (empty) replay buffer storage.
    """
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    """
    Return the number of experiences currently stored.
    """
    return len(self.buffer)

  def append(self, experience):
    """
    Add a new experience to the buffer.

    `experience` must unpack into
    (state, action, reward, done, new_state).
    """
    self.buffer.append(experience)

  def sample(self, batch_size):
    """
    Randomly sample a batch of experiences from the replay buffer.

    Random sampling helps in decorrelating the training data and
    breaking the temporal correlations present in consecutive experiences.

    Parameters:
    ----------
    - `batch_size`: the number of experiences to sample (without replacement).

    Returns:
      A tuple of arrays `(states, actions, rewards, dones, next_states)`,
      each with `batch_size` entries; `rewards` is float32 and `dones` is bool.

    Raises:
      ValueError: raised by `np.random.choice` when `batch_size` is larger
      than the number of stored experiences.
    """
    # Choose `batch_size` distinct positions from the stored experiences.
    indices = np.random.choice(len(self.buffer), batch_size, replace=False)
    # Regroup the sampled experiences into one sequence per component.
    states, actions, rewards, dones, next_states = zip(*(self.buffer[idx] for idx in indices))

    return (
        np.array(states),
        np.array(actions),
        np.array(rewards, dtype=np.float32),
        # The builtin `bool` is used because the `np.bool` alias was removed
        # from NumPy and raises AttributeError on current versions.
        np.array(dones, dtype=bool),
        np.array(next_states),
    )

In [59]:
class RLDataset(IterableDataset):
  """
  PyTorch `IterableDataset` view over a replay buffer.

  Every pass over the dataset draws one fresh sample from the buffer and
  then hands out the sampled experiences one at a time, which makes the
  buffer usable with a regular `DataLoader`.

  Parameters:
  -----------
  - `buffer`: object exposing `sample(n)` that returns the five component
    arrays (states, actions, rewards, dones, new states).
  - `sample_size`: number of experiences drawn from the buffer per pass.
  """

  def __init__(self, buffer, sample_size=200):
    self.buffer, self.sample_size = buffer, sample_size

  def __iter__(self):
    """
    Draw `sample_size` experiences and yield them as 5-tuples.
    """
    sampled = self.buffer.sample(self.sample_size)
    states, actions, rewards, dones, new_states = sampled

    # The `dones` array determines how many experiences are yielded.
    total = len(dones)
    yield from (
        (states[k], actions[k], rewards[k], dones[k], new_states[k])
        for k in range(total)
    )

#### Create the environment

In [60]:
def create_environment(name, video_folder='./videos', record_every=50):
  """
  Create a gym environment that records videos and episode statistics.

  Parameters:
    - name: String, the name of the gym environment.
    - video_folder: String, directory the video recordings are written to.
    - record_every: Integer >= 1; an episode is recorded when its index is a
      multiple of this value (episodes 0, record_every, 2 * record_every, ...).

  Returns:
    The environment wrapped in RecordVideo and RecordEpisodeStatistics.

  Raises:
    ValueError: if record_every is smaller than 1.
  """
  # Fail early: a zero period would otherwise only surface as a
  # ZeroDivisionError inside the trigger, in the middle of an episode.
  if record_every < 1:
    raise ValueError(f"record_every must be >= 1, got {record_every}")

  # Create the gym environment with the specified name.
  env = gym.make(name)

  # Record a video of every `record_every`-th episode.
  env = RecordVideo(
      env,
      video_folder=video_folder,
      episode_trigger=lambda episode_id: episode_id % record_every == 0,
  )

  # Keep per-episode statistics (e.g. returns) available on the environment.
  env = RecordEpisodeStatistics(env)

  return env


#### Create the test/sampling function

In [61]:
  @torch.no_grad()
  def play_episode(env, q_net, buffer, policy=None, epsilon=0.):
    """
    Play a single episode and store every transition in `buffer`.

    Actions come from `policy(obs, env, q_net, epsilon=epsilon)` when a
    policy is given; otherwise they are sampled from the action space.
    The function returns nothing; its effect is the experiences appended
    to the buffer.
    """
    state = env.reset()  # initial observation

    while True:
      # Select the action: policy-driven if available, random otherwise.
      if policy:
        chosen = policy(state, env, q_net, epsilon=epsilon)
      else:
        chosen = env.action_space.sample()

      # Advance the environment by one step and record the transition.
      following, reward, finished, _info = env.step(chosen)
      buffer.append(Experience(state, chosen, reward, finished, following))

      if finished:
        break
      # Continue from the observation the step produced.
      state = following

#### Create the Deep Q-Learning algorithm

In [80]:
class DeepQLearning(LightningModule):
  """
  Deep Q-learning agent implemented as a PyTorch Lightning module.

  The module owns the environment, the online Q-network, a target Q-network
  and the replay buffer. Each epoch trains the online network on experiences
  sampled from the buffer; at the end of every epoch one more episode is
  played with the current network and its transitions are added to the buffer.

  Parameters
  -----------
  - `env_name`: The name of the gym environment that the agent will interact with.
  - `policy`: The action-selection function used when collecting episodes at the
    end of each epoch. Default is `epsilon_greedy`.
  - `sample_fn`: The function used for collecting experiences. Default is
    `play_episode`, which plays one episode and appends its transitions to the buffer.
  - `capacity`: The capacity of the replay buffer, determining how many experiences it can store.
  - `batch_size`: The batch size of the training DataLoader.
  - `lr`: The learning rate for the optimizer.
  - `hidden_size`: The size of the hidden layers in the Q-network.
  - `gamma`: The discount factor for future rewards in the Q-learning update.
  - `loss_fn`: The loss function used for training the Q-network. Default is the Huber loss (F.smooth_l1_loss).
  - `optim`: The optimizer class used for updating the Q-network weights. Default is AdamW.
  - `eps_start`: The initial epsilon value for epsilon-greedy exploration.
  - `eps_end`: The minimum epsilon value for epsilon-greedy exploration.
  - `eps_last_episode`: Divisor of the linear epsilon decay: epsilon is
    `eps_start - current_epoch / eps_last_episode`, clipped from below at `eps_end`.
    NOTE(review): epsilon therefore reaches `eps_end` at epoch
    `(eps_start - eps_end) * eps_last_episode`, which equals `eps_last_episode`
    only for `eps_start=1.0` and `eps_end=0.0`.
  - `samples_per_epoch`: Minimum number of experiences collected before training
    starts; also the number of experiences sampled from the buffer per epoch.
  - `sync_rate`: The target Q-network is overwritten with the Q-network weights
    at the end of every epoch whose index is a multiple of `sync_rate`.
  """

  def __init__(self, env_name, policy=epsilon_greedy, sample_fn=play_episode,
               capacity=100_000, batch_size=256, lr=1e-3, hidden_size=128, gamma=0.99,
               loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=1.0, eps_end=0.15,
               eps_last_episode=100, samples_per_epoch=10_000, sync_rate=10):

    super().__init__()
    self.env = create_environment(env_name) # create the environment

    obs_size = self.env.observation_space.shape[0] # Input size of the DQN: state size
    n_actions = self.env.action_space.n # Output size of the DQN: number of actions

    ## DQN -> the online Q-network.
    # Input: a state.
    # Output: a vector of Q-values, one element per action.
    # - Each Q-value estimates the expected cumulative reward of taking
    #   that action from the given state.
    self.q_net = DQN(hidden_size, obs_size, n_actions).to(device)

    ## Target DQN -> improves the stability and convergence of the learning process.
    # - It starts as a copy of the online network and is only refreshed
    #   periodically (see `training_epoch_end`), so the training targets
    #   stay fixed between refreshes.
    self.target_q_net = copy.deepcopy(self.q_net)

    # The action-selection function (epsilon-greedy by default).
    # Epsilon-greedy balances:
    # - exploration (take a random action)
    # - exploitation (take the best action according to the Q-network)
    self.policy = policy

    # The memory (contains the collected experiences).
    self.buffer = ReplayBuffer(capacity=capacity)

    # Store the constructor arguments; they are read back through `self.hparams` below.
    self.save_hyperparameters()

    # Make sure the replay buffer holds at least `samples_per_epoch`
    # experiences before training begins.
    while len(self.buffer) < self.hparams.samples_per_epoch:
      print(f"{len(self.buffer)} samples in experience buffer. Filling...")
      # Each call to `sample_fn` appends new experiences to the buffer.
      # NOTE: no `policy` is passed here, so with the default `play_episode`
      # the actions are sampled randomly from the action space and the
      # `epsilon` argument has no effect during this initial fill.
      self.hparams.sample_fn(
          self.env, # the environment
          self.q_net, # the Q-network
          self.buffer, # the replay buffer that receives the experiences
          epsilon=self.hparams.eps_start # initial epsilon (see NOTE above)
        )

  def forward(self, x):
    """
    Forward pass of the online Q-network: returns the Q-value of each action.
    """
    output = self.q_net(x)
    return output

  def configure_optimizers(self):
    """
    Configure the optimizer for updating the Q-network weights.

    Only the parameters of the online Q-network are optimized; the target
    network is not passed to the optimizer.
    """
    q_net_optimizer = self.hparams.optim(self.q_net.parameters(), lr=self.hparams.lr)
    return [q_net_optimizer]

  def train_dataloader(self):
    """
    Create the DataLoader used for training.

    The dataset samples `samples_per_epoch` experiences from the replay
    buffer; the loader groups them into batches of `batch_size`.
    """
    # Create the dataset
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    # Create the data loader
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=self.hparams.batch_size,
    )
    return dataloader

  def training_step(self, batch, batch_idx):
    """
    Execute one training step on a batch of experiences.

    Parameters:
    -----------
    - `batch`: tuple `(states, actions, rewards, dones, next_states)`.
    - `batch_idx`: index of the batch (unused).

    Returns:
      The loss between the predicted Q-values of the taken actions and the
      targets computed with the target network.
    """
    # 1. Unpack the components of the batch (past transitions of the agent).
    states, actions, rewards, dones, next_states = batch
    # 2. Add a trailing dimension to actions, rewards and dones so they line up
    #    with the column-shaped outputs of `gather` and `max(..., keepdim=True)`.
    actions = actions.unsqueeze(1)
    rewards = rewards.unsqueeze(1)
    dones = dones.unsqueeze(1)

    # 3. Q-value estimation:
    # predict the Q-values of all actions and keep, for each state, the
    # Q-value of the action that was actually taken (selected with `gather`).
    state_action_values = self.q_net(states).gather(1, actions)

    # 4. Target Q-value estimation:
    # the target network is not updated by the optimizer; it is only
    # refreshed from the online network in `training_epoch_end`.
    # Using it to build the targets keeps them stable while the online
    # network changes on every step.
    with torch.no_grad():
      # Highest target Q-value of each next state.
      next_state_values, _ = self.target_q_net(next_states).max(dim=1, keepdim=True)
      # No future value after the end of an episode.
      # `dones` is used as a mask here, so it must be a boolean tensor.
      next_state_values[dones] = 0.0

    # 5. Bellman target and loss:
    # target = reward + gamma * (best target Q-value of the next state).
    expected_state_action_values = rewards + self.hparams.gamma * next_state_values
    # Compare the predicted Q-values with the targets.
    loss = self.hparams.loss_fn(state_action_values, expected_state_action_values)

    # Log the loss, aggregated per epoch.
    # NOTE(review): the key says "MSE Loss" but the value is whatever `loss_fn`
    # computes (the Huber loss by default).
    self.log('episode/MSE Loss', loss, on_step=False, on_epoch=True)

    return loss


  def training_epoch_end(self, training_step_outputs):
    """
    Hook executed at the end of every training epoch.

    Plays one more episode with the current network, logs its return and
    periodically refreshes the target network.
    """
    # 1. Current epsilon: starts at `eps_start`, decreases by
    #    `1 / eps_last_episode` per epoch and never drops below `eps_end`,
    #    so the agent explores less and exploits more over time.
    epsilon = max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode
    )

    # 2. Collect new experiences with the updated network, so the buffer
    #    also contains transitions generated by the current policy.
    self.hparams.sample_fn(self.env,   # environment
                           self.q_net, # the online Q-network
                           self.buffer,# the replay buffer
                           policy=self.policy, # the action-selection function (epsilon-greedy)
                           epsilon=epsilon)    # exploration probability for this episode

    # Log the most recent entry of the environment's return queue
    # (presumably the return of the episode that was just played).
    self.log("episode/Return", self.env.return_queue[-1])

    # Log `hp_metric` (mean of the returns held in the return queue) once,
    # at epoch index 199, i.e. at the end of the 200th epoch.
    # NOTE(review): hard-coded; matches the `max_epochs=200` used by the
    # hyperparameter search in this notebook.
    if self.current_epoch == 199:
      returns = list(self.env.return_queue)
      self.log("hp_metric", statistics.mean(returns))

    ## Copy the online network weights into the target network whenever the
    ## epoch index is a multiple of `sync_rate`.
    if self.current_epoch % self.hparams.sync_rate == 0:
      self.target_q_net.load_state_dict(self.q_net.state_dict())

#### Create the objective function

In [63]:
def objective(trial):
  """
  Optuna objective for tuning the hyperparameters of the DQN.

  Trains a `DeepQLearning` agent on LunarLander-v2 with the hyperparameters
  suggested by `trial` and returns the last logged "episode/Return".
  Higher is better: the study in this notebook is created with
  `direction="maximize"`.

  Parameters:
  -----------
  - `trial`: the `optuna` trial that suggests the hyperparameter values.

  Returns:
    The final "episode/Return" value as a Python float.
  """
  # 1. Sample one value per hyperparameter from its search range.
  hyperparameters = dict(
      lr=trial.suggest_float("lr", 1e-5, 1e-1, log=True),
      gamma=trial.suggest_float("gamma", 0.0, 1.0),
      hidden_size=trial.suggest_categorical("hidden_size", [32, 64, 128, 256]),
      eps_end=trial.suggest_float("eps_end", 0.0, 0.3),
  )

  # 2. Create the Deep Q-learning agent with the suggested hyperparameters.
  algo = DeepQLearning('LunarLander-v2', **hyperparameters)

  try:
    # Pruning callback: lets optuna stop unpromising trials early, based on
    # the monitored metric, to save computational resources.
    callback = PyTorchLightningPruningCallback(trial, monitor="episode/Return")

    trainer = Trainer(
        gpus=num_gpus,
        max_epochs=200,
        track_grad_norm=2,
        callbacks=[callback]
    )
    # Log the suggested hyperparameters alongside the run.
    trainer.logger.log_hyperparams(hyperparameters)

    # Train the agent.
    trainer.fit(algo)

    return trainer.callback_metrics["episode/Return"].item()
  finally:
    # Every trial creates its own recording environment; release it even when
    # the trial is pruned or fails, so environments do not pile up across trials.
    algo.env.close()

#### Create the optimization study

In [64]:
# create the pruner (for stop the trails that not promising)
pruner = optuna.pruners.SuccessiveHalvingPruner()
# create the study
study = optuna.create_study(direction="maximize", # maximze the value that we monitoring in the objective function. (episode/Return)
                            pruner=pruner)

[32m[I 2023-12-08 06:47:39,705][0m A new study created in memory with name: no-name-0a67e0f6-c498-4d91-8552-e1df319cb438[0m


#### Purge logs and run the visualization tool (Tensorboard)

In [None]:
# Start tensorboard.
!rm -r /content/lightning_logs/
!rm -r /content/videos/
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

#### Run the hyperparameter search

In [None]:
# Run 20 trials of the objective; each trial trains a new agent.
study.optimize(objective, n_trials=20)

#### Select and use the best hyperparameters

In [67]:
# Hyperparameters of the best trial found by the study.
study.best_params

  and should_run_async(code)


{'lr': 0.004077209276002187,
 'gamma': 0.21771064621450942,
 'hidden_size': 32,
 'eps_end': 0.22327832664379624}

In [None]:
# Start tensorboard.
!rm -r /content/lightning_logs/
!rm -r /content/videos/
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

In [None]:
# Now lets train the DQN with the best hyperparameters from our study
algo = DeepQLearning('LunarLander-v2', **study.best_params)

trainer = Trainer(
  gpus=num_gpus,
  max_epochs=1000,
  track_grad_norm=2,
)

trainer.fit(algo)

#### Check the resulting policy

In [84]:
# Show the recording of episode 800.
# NOTE(review): a recording only exists for episode indices that satisfy the
# environment's recording trigger (multiples of 50 here) — confirm the file exists.
display_video(episode=800)