# REINFORCE

> Implement the REINFORCE algorithm.


#### Install the libraries

In [None]:
!apt-get install -y xvfb

!pip install \
  pygame \
  gym==0.23.1 \
  pytorch-lightning==1.6 \
  pyvirtualdisplay

#### Setup virtual display
> It allows us to render the environment.

In [None]:
from pyvirtualdisplay import Display
Display(visible=False, size=(1400, 900)).start()

#### Import the necessary code libraries

In [None]:
import copy
import torch
import random
import gym
import matplotlib

import numpy as np
import matplotlib.pyplot as plt

import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, \
  NormalizeObservation, NormalizeReward


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
num_gpus = torch.cuda.device_count()

#### Utils functions

In [None]:
def plot_policy(policy):
  """
  Plot the policy as a heatmap.
  ------------------------------
  Computes the probability of taking the "left" action (action index 0)
  for a grid of states and visualizes these probabilities as a heatmap
  over cart position (x-axis) and pole angle (y-axis).

  Parameters:
  ----------
  - `policy`: callable that receives an array of states with one row per
              state and four columns (cart position, cart velocity,
              pole angle, pole angular velocity) and returns a tensor of
              action probabilities with one row per state.

  Notes:
  ------
  - Velocity and angular velocity are filled with small random values,
    so two calls with the same policy give slightly different plots.
  - NOTE(review): the grid holds raw state values. A policy trained on
    normalized observations presumably expects the same normalization
    here -- confirm before interpreting the plot of a trained policy.
  """

  # 1. Grid Creation
  # One axis for the cart position and one for the pole angle; the two
  # velocities get one small random value per grid point.
  n_points = 100
  pos = np.linspace(-4.8, 4.8, n_points)
  vel = np.random.random(size=(n_points * n_points, 1)) * 0.1
  ang = np.linspace(-0.418, 0.418, n_points)
  ang_vel = np.random.random(size=(n_points * n_points, 1)) * 0.1

  # 2. State Grid Construction:
  # meshgrid(pos, ang) yields arrays whose rows follow `ang` and whose
  # columns follow `pos`. After flattening, the columns are interleaved so
  # that each row reads (position, velocity, angle, angular velocity),
  # which is the order of the environment's state vector.
  g1, g2 = np.meshgrid(pos, ang)
  grid = np.stack((g1, g2), axis=-1).reshape(-1, 2)
  grid = np.hstack((grid[:, :1], vel, grid[:, 1:], ang_vel))

  # 3. Policy Evaluation:
  # The policy returns one probability per action for every state.
  # `.cpu()` is required before `.numpy()` when the policy runs on a GPU.
  probs = policy(grid).detach().cpu().numpy()
  probs_left = probs[:, 0]

  # 4. Data Reshaping:
  # Back to (angle, position). imshow draws row 0 at the top, so the rows
  # are reversed to put the largest angle at the top; the columns already
  # run from the smallest to the largest position.
  probs_left = probs_left.reshape(n_points, n_points)
  probs_left = np.flip(probs_left, axis=0)

  # 5. Plotting
  last = n_points - 1  # index of the last pixel along each axis
  plt.figure(figsize=(8, 8))
  plt.imshow(probs_left, cmap='coolwarm')
  plt.colorbar()
  plt.clim(0, 1)
  plt.title("P(left | s)", size=20)
  plt.xlabel("Cart Position", size=14)
  plt.ylabel("Pole angle", size=14)
  plt.xticks(ticks=[0, last / 2, last], labels=['-4.8', '0', '4.8'])
  plt.yticks(ticks=[last, last / 2, 0], labels=['-0.418', '0', '0.418'])


In [None]:
def test_env(env_name, policy, obs_rms):
  """
  Run a policy for 10 episodes on an environment and record every episode
  as a video in the `videos` folder.


  Parameters:
  ----------
  - `env_name`: the name of the env.
  - `policy`: the policy of the agent (given a state, it returns the
              probability of each action).
  - `obs_rms`: the running observation statistics used to normalize the
               observations (the `obs_rms` attribute of a
               `NormalizeObservation` wrapper). It is not modified.

  """
  env = gym.make(env_name) # create the env
  env = RecordVideo(env, 'videos', episode_trigger=lambda e: True) # record every episode
  env = NormalizeObservation(env) # normalize the observations of the env
  # Work on a private copy: the wrapper keeps updating its running
  # statistics while stepping (see gym's NormalizeObservation), and an
  # evaluation run should not alter the object the caller passed in.
  env.obs_rms = copy.deepcopy(obs_rms)

  try:
    with torch.no_grad(): # evaluation only, no gradients needed
      for episode in range(10): # loop over the episodes
        done = False
        obs = env.reset() # get the initial state (first observation)
        while not done: # play until the episode ends
          # sample an action from the probabilities given by the policy
          action = policy(obs).multinomial(1).cpu().item()
          obs, _, done, _ = env.step(action) # perform the action in the env
  finally:
    # release the environment's resources even if an episode fails
    env.close()

In [None]:
def display_video(episode=0):
  """
  Display a recorded episode as an inline HTML video.

  Parameters:
  ----------
  - `episode`: index of the recorded episode; the file
               `/content/videos/rl-video-episode-{episode}.mp4` is read.

  Returns:
  -------
  An `HTML` object with the video embedded as a base64 data URL.
  """
  video_path = f'/content/videos/rl-video-episode-{episode}.mp4'
  # read-only binary mode; the context manager closes the file handle
  with open(video_path, "rb") as video_file:
    video_bytes = video_file.read()
  video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

#### Create the policy

In [None]:
class GradientPolicy(nn.Module):
  """
  A policy gradient class
  -----------------------
  In DRL, the policy is a neural network.
  The network receives a state and outputs a probability distribution
  over a set of discrete actions.

  With this policy we are going to solve the cart pole problem.

  Parameters:
  -----------
  - `in_features`: number of features in a state.
  - `n_actions`: number of discrete actions.
  - `hidden_size`: number of units in each of the two hidden layers.
  """

  def __init__(self, in_features, n_actions, hidden_size=128):
    super().__init__()
    self.fc1 = nn.Linear(in_features, hidden_size) # input: states
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.fc3 = nn.Linear(hidden_size, n_actions) # output: actions

  def forward(self, x):
    """
    Return the action probabilities for the state(s) `x`.

    `x` may be a tensor, a numpy array or a nested list; it is converted
    to a float32 tensor on the device where the network's weights live.
    """
    # Follow the module's own device instead of a global one, so the
    # network works both before and after it has been moved
    # (e.g. a freshly created CPU policy on a machine that has a GPU).
    # `as_tensor` avoids an extra copy when `x` already is a suitable tensor.
    x = torch.as_tensor(x, dtype=torch.float32, device=self.fc1.weight.device)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    # apply softmax to get a probability distribution over the actions.
    # e.g., the probabilities of taking the actions left and right:
    # -> ([0.44,0.56]) -> ([probability of L, probability of R])
    x = F.softmax(self.fc3(x), dim=-1)
    return x

#### Plot the untrained policy
> These are literally the initial (random) weights of the `GradientPolicy` network.

In [None]:
policy = GradientPolicy(4, 2) # 4 state features in, 2 actions out
# NOTE: `plot_policy` has no return statement, so `grid` is None here.
grid = plot_policy(policy)

# Interpretation:
# we plot the probability of taking the action `left` given the state `s`.
# - on the y-axis you have the angle of the pole.
# - on the x-axis you have the position (location) of the cart
#   (the pole is on the cart).
# - the color map shows the probability of taking the action `left`.
#   - if the probability is 1, the action is `left` 100% of the time.
#   - if the probability is 0, the action is `right` 100% of the time.

# you can see that, without training, our network chooses actions almost at random.

In [None]:
grid

#### Create the environment (several copies of env)
> Creating multiple environments in parallel can be useful for training reinforcement learning agents more efficiently, especially when using vectorized environments.

In [None]:
# Create `num_envs` environments (to run them in parallel)
env = gym.vector.make("CartPole-v1", num_envs=2)

In [None]:
# as you can see, we have two rows.
# the first row -> is the initial state of the first env
# the second row -> is the initial state of the second env
env.reset()

# states:
# Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity

In [None]:
# see the observation space:
# it shows us the minimum values
# and the maximum values that the states can take.
print(env.observation_space)
print()

# the action space: 2 actions in env_1, 2 actions in env_2.
print(env.action_space)

In [None]:
# Lets test the interaction with the envs:

# create two actions for the two envs:
# take left in env_1, take left in env_2.
actions = np.array([0, 0])

# let's perform the actions in the environments
next_obs, rewards, dones, infos = env.step(actions)

In [None]:
# this is the next state in the enviroments:
next_obs

In [None]:
# the rewards:
rewards

In [None]:
# if the game is done:
dones

In [None]:
# additional information
infos

In [None]:
def create_env(env_name, num_envs):
  """
  Build a vectorized environment wrapped for training
  with the `torch lightning` library.

  Parameters:
  ------------
  - `env_name`: the name of the environment.
  - `num_envs`: how many copies of the environment to create.

  Returns:
  --------
  The vectorized environment wrapped, from the inside out, with
  `RecordEpisodeStatistics`, `NormalizeObservation` and `NormalizeReward`.
  """
  # Wrappers in the order they are applied (innermost first):
  # 1. keep the statistics of the finished episodes so they can be read later
  # 2. normalize the observations (neural networks prefer normalized data)
  # 3. normalize the rewards as well
  wrappers = (RecordEpisodeStatistics, NormalizeObservation, NormalizeReward)

  wrapped = gym.vector.make(env_name, num_envs=num_envs)
  for wrap in wrappers:
    wrapped = wrap(wrapped)
  return wrapped

#### Create the dataset


In [None]:
class RLDataset(IterableDataset):
  """
  Iterable dataset that produces training samples by running the policy
  in the (vectorized) environment.

  - Every pass over the dataset plays `steps_per_epoch` steps in all the
    environments and turns them into samples.
  - Each sample is a tuple (observation, action, return), where the return
    is the discounted sum of the rewards collected from that step onwards.
  - The samples are yielded one by one in a random order.

  Parameters:
  -----------
  - `env`: The environment for which the dataset is generated.
  - `policy`: The policy used to generate actions in the environment.
  - `steps_per_epoch`: The number of environment steps played per pass.
  - `gamma`: The discount factor used when accumulating the rewards.

  Attributes:
  -----------
  - `obs`: The current observation, initialized by resetting the
           environment and carried over from one pass to the next.
  """

  def __init__(self, env, policy, steps_per_epoch, gamma):
    self.env = env
    self.policy = policy
    self.steps_per_epoch = steps_per_epoch
    self.gamma = gamma
    self.obs = env.reset()

  @torch.no_grad()
  def _collect_rollout(self):
    """
    Play `steps_per_epoch` steps and return the stacked observations,
    actions, rewards and done flags (first axis = time step).
    """
    history = []
    for _ in range(self.steps_per_epoch):
      # sample one action per environment from the policy's probabilities
      sampled = self.policy(self.obs).multinomial(1).cpu().numpy()
      # flatten(): [[a],[a]] -> [a,a], the layout the env expects
      following_obs, reward, done, _info = self.env.step(sampled.flatten())
      history.append((self.obs, sampled, reward, done))
      self.obs = following_obs
    # zip(*history) regroups the transitions field by field,
    # np.stack turns every field into a single array.
    return map(np.stack, zip(*history))

  def _discounted_returns(self, rewards, dones):
    """
    Compute, for every step and environment, the discounted sum of the
    rewards from that step onwards. The sum restarts after a step whose
    done flag is set.
    """
    returns = np.zeros_like(rewards)
    # discounted sum of everything that comes after the current step
    tail = np.zeros(self.env.num_envs, dtype=np.float32)
    for t in reversed(range(self.steps_per_epoch)):
      tail = rewards[t] + (1 - dones[t]) * self.gamma * tail
      returns[t] = tail
    return returns

  def __iter__(self):
    """
    Generate a fresh batch of experience and yield its samples
    (observation, action, return) in a shuffled order.
    """
    obs_b, action_b, reward_b, done_b = self._collect_rollout()

    # the algorithm learns from returns, not from the raw rewards
    return_b = self._discounted_returns(reward_b, done_b)

    # merge the time and environment axes: one row per sample
    total = self.env.num_envs * self.steps_per_epoch
    flat_obs = obs_b.reshape(total, -1)
    flat_actions = action_b.reshape(total, -1)
    flat_returns = return_b.reshape(total, -1)

    # visit the samples in a random order
    order = list(range(total))
    random.shuffle(order)
    for i in order:
      yield flat_obs[i], flat_actions[i], flat_returns[i]



# Now our dataset is ready: every time the algorithm loads observations
# into batches for training, it calls this __iter__ method and receives
# the collected samples in a random order.



# Now our dataset is ready to be used every time that our algorithm
# has to load the observations into batches to undergo training,
# it will call this __iter__ method and it will get number of
# observations in a random order.

#### Create the REINFORCE algorithm

In [None]:
class Reinforce(LightningModule):
  """
  PyTorch Lightning module for training a policy using the REINFORCE algorithm.

  Parameters:
  -----------
   - `env_name`: the name of the environment
   - `num_envs`: number of environments run in parallel
   - `samples_per_epoch`: number of steps played in every environment per
     epoch (it is passed to `RLDataset` as `steps_per_epoch`), so one epoch
     holds `samples_per_epoch * num_envs` samples.
   - `batch_size`: The size of the batches used during training. It determines how many samples will be processed together in each iteration.
   - `hidden_size`: number of units in each hidden layer of the policy network.
   - `policy_lr`: The learning rate used for optimizing the policy network.
   - `gamma`: discount factor used in the computation of the cumulative discounted sum of rewards.
   - `entropy_coef`: The coefficient for the entropy regularization term in the loss function. It controls the amount of entropy regularization applied to the policy.
   - `optim`: the optimizer class used to update the policy network
  """
  def __init__(self, env_name, num_envs=8, samples_per_epoch=1000,
               batch_size=1024, hidden_size=64, policy_lr=0.001,
               gamma=0.99, entropy_coef=0.001, optim=AdamW):

    super().__init__()

    self.env = create_env(env_name, num_envs=num_envs) # create the environments

    # number of features in one observation (input size of the policy network)
    obs_size = self.env.single_observation_space.shape[0]
    # number of available actions (output size of the policy network)
    n_actions = self.env.single_action_space.n

    # Create the policy network with its input size, output size
    # and the number of hidden units.
    self.policy = GradientPolicy(obs_size, n_actions, hidden_size)

    # Create the dataset that plays the policy in the environments.
    self.dataset = RLDataset(self.env, self.policy, samples_per_epoch, gamma)

    # Store the constructor arguments in `self.hparams` so that
    # they can be referenced anywhere in the class.
    self.save_hyperparameters()

  # Configure optimizers.
  def configure_optimizers(self):
    """
    Create the optimizer (`optim`, AdamW by default) over the parameters
    (weights and biases) of the policy network, with learning rate
    `policy_lr`. It is used to update the policy during training.
    """
    return self.hparams.optim(self.policy.parameters(),
                              lr=self.hparams.policy_lr)


  def train_dataloader(self):
    """
    DataLoader that will be used to iterate over batches of
    training data during the training process.

    - self.dataset: the dataset that generates the training samples
    - batch_size: how many samples are grouped together before they are passed to the training step
    """
    return DataLoader(dataset=self.dataset, batch_size=self.hparams.batch_size)

  # Training step.
  def training_step(self, batch, batch_idx):
    """
    Execute a training step and return the loss to minimize.

    Parameters:
    ----------
    - `batch`: tuple (observations, actions, returns).
    - `batch_idx`: the index of this batch.
    """

    # extract the observations, actions and returns from the batch
    obs, actions, returns = batch

    # given the states (obs), compute the action probabilities
    # with the policy network
    probs = self.policy(obs)

    # natural logarithm of the action probabilities; the small constant
    # avoids log(0)
    log_probs = torch.log(probs + 1e-6)
    # keep the log-probability of the action that was actually taken
    action_log_prob = log_probs.gather(1, actions)

    # entropy of the action distribution in every state
    entropy = -torch.sum(probs * log_probs, dim=-1, keepdim=True)

    # policy-gradient term: minimizing it raises the log-probability
    # of actions in proportion to their return
    pg_loss = -action_log_prob * returns
    # total loss: subtracting the weighted entropy rewards policies
    # that keep exploring
    loss = (pg_loss - self.hparams.entropy_coef * entropy).mean()

    # log the two terms (for the plots)
    self.log("episode/PG Loss", pg_loss.mean())
    self.log("episode/Entropy", entropy.mean())

    # return the total loss
    return loss

  def training_epoch_end(self, training_step_outputs):
    """
    Log the return of the most recently finished episode.
    """
    # The queue is empty until at least one episode has finished;
    # skip the log in that case instead of failing on an empty queue.
    if len(self.env.return_queue) > 0:
      self.log("episode/Return", self.env.return_queue[-1])

#### Purge logs and run the visualization tool (Tensorboard)

In [None]:
!rm -r /content/lightning_logs/
!rm -r /content/videos/
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

#### Train the policy

In [None]:
# create the REINFORCE module for CartPole with its default hyperparameters
algo = Reinforce('CartPole-v1')

# train the agent using the `Trainer` class
trainer = Trainer(
  gpus=num_gpus, # number of GPUs to use (`num_gpus` is 0 when none is available)
  max_epochs=100,
  log_every_n_steps=1 # write to the log after every step.
)

# run the training
trainer.fit(algo)

#### Check the resulting policy

In [None]:
import warnings
warnings.filterwarnings('ignore') # silence all warnings from here on
# test the agent, normalizing observations with the statistics gathered in training
test_env('CartPole-v1', algo.policy, algo.env.obs_rms)

In [None]:
display_video(episode=1)

#### Plot the trained policy

In [None]:
plot_policy(algo.policy)