# Chapter 2: REINFORCE

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions

import matplotlib.pyplot as plt
import numpy as np

import gym

In [None]:
!pip install pygame
!apt-get install python-opengl -y

!apt install xvfb -y

!pip install pyvirtualdisplay

!pip install piglet


from pyvirtualdisplay import Display
Display().start()

import gym
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.11).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/si

In [None]:
class DiscretePolicy(nn.Module):
  """
  REINFORCE policy for discrete action spaces.

  A small MLP maps a state vector to one probability per action. While an
  episode runs, the log-probability of every sampled action is stored in
  `self.log_probs`; the caller appends rewards to `self.rewards`. Both
  lists are cleared by `onpolicy_reset`.
  """

  def __init__(self, in_dim=10, out_dim=10):
    """
    Args:
      in_dim: size of the state vector fed to the network.
      out_dim: number of discrete actions (size of the output distribution).
    """
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(in_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, out_dim),
        # Normalise over the action axis explicitly; leaving `dim` unset is
        # deprecated and picks the wrong axis for some input ranks.
        nn.Softmax(dim=-1),
    )
    self.onpolicy_reset()

  def onpolicy_reset(self):
    """Discard the log-probs and rewards collected for the current episode."""
    self.log_probs = []
    self.rewards = []

  def forward(self, x):
    """Return the action probabilities (softmax output) for state tensor `x`."""
    pdparam = self.model(x)
    return pdparam

  def action(self, state):
    """
    Sample an action for `state` and record its log-probability.

    Args:
      state: state vector as a NumPy array, tensor or sequence; it is
        converted to a float32 tensor.

    Returns:
      The sampled action as a Python int.
    """
    x = torch.as_tensor(state, dtype=torch.float32)
    probs = self.forward(x)  # Generate action pd wrt state
    # The network already applies softmax, so its output must be passed as
    # `probs`. Passing it as `logits` would soft-max it a second time and
    # flatten the policy towards uniform.
    pd = torch.distributions.Categorical(probs=probs)
    action = pd.sample()  # Choose action according to generated pd
    self.log_probs.append(pd.log_prob(action))  # Record log prob of this action
    return action.item()

In [None]:
def train(policy, optimizer, gamma=0.99):
  """
  Perform one REINFORCE gradient step from the episode stored on `policy`.

  Args:
    policy: object exposing `rewards` (per-step rewards) and `log_probs`
      (per-step log-probability tensors) for a single episode.
    optimizer: optimizer whose `zero_grad`/`step` are called around backward.
    gamma: discount factor applied to future rewards.

  Returns:
    The scalar loss value as a Python float.
  """
  # Walk the rewards backwards so each return reuses the one that follows it.
  discounted = []
  running = 0.
  for reward in reversed(policy.rewards):
    running = reward + gamma*running
    discounted.append(running)
  discounted.reverse()
  returns = torch.tensor(np.asarray(discounted, dtype=np.float32))

  stacked_log_probs = torch.stack(policy.log_probs)
  # Negated because the optimizer minimises while the objective is maximised.
  loss = (-stacked_log_probs*returns).sum()

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  return loss.item()

In [None]:
def main(env_name='CartPole-v0', episodes=3000, max_steps=200, lr=0.0001, log_every=100):
  """
  Train a DiscretePolicy with REINFORCE on a gym environment.

  Each episode is rolled out with the current policy, then `train` performs
  one update from the collected rewards and log-probs before they are reset.

  Args:
    env_name: id passed to `gym.make`.
    episodes: number of training episodes.
    max_steps: maximum number of environment steps per episode.
    lr: learning rate for the Adam optimizer.
    log_every: print progress every this many episodes.
  """
  env = gym.make(env_name)
  try:
    # The spaces are read straight from the environment; no reset is needed
    # here because every episode below starts with its own reset.
    in_dim = env.observation_space.shape[0]
    out_dim = env.action_space.n
    pi = DiscretePolicy(in_dim, out_dim)
    optimizer = optim.Adam(pi.parameters(), lr=lr)
    for epi in range(episodes):
      state = env.reset()
      for _step in range(max_steps):
        action = pi.action(state)
        state, reward, done, _ = env.step(action)
        pi.rewards.append(reward)
        if done:
          break
      loss = train(pi, optimizer)
      # Read the total before the reset clears the reward buffer.
      total_reward = sum(pi.rewards)
      pi.onpolicy_reset()
      if epi % log_every == 0:
        print(f'Episode {epi}, loss: {loss}, total reward: {total_reward}')
  finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

In [None]:
# Start training only when this code is executed as the top-level program.
if __name__ == "__main__":
  main()

In [3]:
class ContinuousPolicy(nn.Module):
  """
  REINFORCE policy for a one-dimensional continuous action.

  A small MLP maps a state vector to two numbers: the mean of a normal
  distribution and a raw scale value that `action` turns into a positive
  standard deviation. The log-probability of every sampled action is stored
  in `self.log_probs`; `self.rewards` is a list for the caller to fill.
  Both are cleared by `onpolicy_reset`.
  """

  def __init__(self, in_dim):
    """
    Args:
      in_dim: size of the state vector fed to the network.
    """
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(in_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 2),  # mean and raw (unconstrained) scale of the normal distribution
    )
    self.onpolicy_reset()

  def forward(self, state):
    """Return the raw two-element distribution parameters for `state`."""
    pdparam = self.model(state)
    return pdparam

  def onpolicy_reset(self):
    """Discard the log-probs and rewards collected for the current episode."""
    self.log_probs = []
    self.rewards = []

  def action(self, state):
    """
    Sample an action for `state` and record its log-probability.

    Args:
      state: state vector as a NumPy array, tensor or sequence; it is
        converted to a float32 tensor.

    Returns:
      The sampled action as a Python float.
    """
    # Handles arrays and tensors of any dtype; the previous dtype check
    # called `.astype` on tensors that were not float32 and failed.
    state = torch.as_tensor(state, dtype=torch.float32)
    pdparam = self.forward(state)
    loc = pdparam[..., 0]
    # The last layer is unconstrained, but Normal needs a strictly positive
    # standard deviation: softplus maps it to (0, inf) and the small
    # constant keeps it away from zero.
    scale = nn.functional.softplus(pdparam[..., 1]) + 1e-5
    pd = torch.distributions.Normal(loc=loc, scale=scale)
    action = pd.sample()
    self.log_probs.append(pd.log_prob(action))
    return action.item()
  