# Chapter 2: REINFORCE

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions

import matplotlib.pyplot as plt
import numpy as np

import gym

In [4]:
# Environment setup cell: installs packages via pip/apt, starts a
# pyvirtualdisplay Display, then imports gym and the plotting helpers.
# Presumably this exists so gym rendering works on a headless host such as
# Colab — confirm before reusing elsewhere.
!pip install pygame
!apt-get install python-opengl -y

!apt install xvfb -y

!pip install pyvirtualdisplay

# NOTE(review): `piglet` looks like a typo for `pyglet` (the windowing library
# gym's classic-control rendering has used) — confirm which package is intended.
!pip install piglet


from pyvirtualdisplay import Display
# Start the virtual display before any environment is created.
Display().start()

# gym and matplotlib.pyplot are already imported by the first cell; they are
# repeated here, so this cell does not depend on that one for these names.
import gym
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.11).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/si

In [5]:
class Policy(nn.Module):
  """
  REINFORCE policy that generates probability distributions of action wrt state.

  The network is a small MLP (three hidden layers of 64 ReLU units) whose
  output is passed through a softmax over the last dimension, so `forward`
  returns action probabilities rather than logits.

  Besides the network, the object stores the on-policy trajectory of the
  current episode:
    log_probs: list of log-probabilities of the sampled actions (tensors
               that are still attached to the autograd graph).
    rewards:   list that the caller appends per-step rewards to.

  Args:
    in_dim:  size of the state vector.
    out_dim: number of discrete actions.
  """

  def __init__(self, in_dim=10, out_dim=10):
    super(Policy, self).__init__()
    self.model = nn.Sequential(
        nn.Linear(in_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, out_dim),
        # Normalise over the action dimension explicitly. Omitting `dim`
        # raises a deprecation warning and silently picks dim 0 for some
        # input ranks.
        nn.Softmax(dim=-1),
    )
    self.onpolicy_reset()

  def onpolicy_reset(self):
    """Discard the stored trajectory (call after each policy update)."""
    self.log_probs = []
    self.rewards = []

  def forward(self, x):
    """Return action probabilities for state tensor `x` (last dim = in_dim)."""
    pdparam = self.model(x)
    return pdparam

  def action(self, state):
    """
    Sample an action for `state` and record its log-probability.

    Args:
      state: numpy array holding the state; it is cast to float32.

    Returns:
      The sampled action as a Python int.
    """
    x = torch.from_numpy(state.astype(np.float32))
    pdparam = self.forward(x)  # Generate action pd wrt state
    pd = torch.distributions.Categorical(pdparam)
    action = pd.sample()  # Choose action according to generated pd
    log_prob = pd.log_prob(action)
    self.log_probs.append(log_prob)  # Record log prob of this action
    return action.item()

In [6]:
def train(policy, optimizer, gamma=0.99):
  """
  Perform one REINFORCE update from the episode stored on `policy`.

  Discounted returns are accumulated backwards over `policy.rewards`,
  multiplied element-wise with the stacked `policy.log_probs`, negated and
  summed into a scalar loss; a single optimizer step is then taken.

  Args:
    policy:    object exposing `rewards` (one number per step) and
               `log_probs` (one tensor per step).
    optimizer: object providing `zero_grad()` and `step()`.
    gamma:     discount factor applied to future rewards.

  Returns:
    The scalar loss as a Python float.
  """
  horizon = len(policy.rewards)
  returns = np.empty(horizon, dtype=np.float32)
  discounted = 0.
  # Walk the episode from its last step so each return reuses the next one.
  for step in range(horizon - 1, -1, -1):
    discounted = policy.rewards[step] + gamma*discounted
    returns[step] = discounted
  returns_t = torch.tensor(returns)
  stacked_log_probs = torch.stack(policy.log_probs)
  # Negated because the optimizer minimises while the objective is maximised.
  loss = torch.sum(-stacked_log_probs*returns_t)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  return loss.item()

In [11]:
def _reset_env(env):
  """Reset `env` and return only the initial observation."""
  out = env.reset()
  # Newer gym releases return (observation, info); older ones return the
  # observation alone.
  return out[0] if isinstance(out, tuple) else out


def _step_env(env, action):
  """Step `env` and return (state, reward, done) for either gym step signature."""
  out = env.step(action)
  if len(out) == 5:
    # Newer API: (obs, reward, terminated, truncated, info).
    state, reward, terminated, truncated, _ = out
    return state, reward, terminated or truncated
  # Legacy API: (obs, reward, done, info).
  state, reward, done, _ = out
  return state, reward, done


def main(num_episodes=3000, max_steps=200, lr=0.0001, gamma=0.99):
  """
  Train a REINFORCE policy on CartPole-v0 and print per-episode statistics.

  For each episode the policy acts until the environment reports the
  episode finished or `max_steps` is reached, then a single policy update
  is made from the collected trajectory.

  Args:
    num_episodes: number of training episodes.
    max_steps:    maximum number of environment steps per episode.
    lr:           learning rate of the Adam optimizer.
    gamma:        discount factor passed to `train`.
  """
  env = gym.make('CartPole-v0')
  try:
    in_dim = env.observation_space.shape[0]
    out_dim = env.action_space.n
    pi = Policy(in_dim, out_dim)
    optimizer = optim.Adam(pi.parameters(), lr=lr)
    for epi in range(num_episodes):
      state = _reset_env(env)
      for _ in range(max_steps):
        action = pi.action(state)
        state, reward, done = _step_env(env, action)
        pi.rewards.append(reward)
        if done:
          break
      loss = train(pi, optimizer, gamma)
      # Read the total before the trajectory buffers are cleared.
      total_reward = sum(pi.rewards)
      pi.onpolicy_reset()
      print(f'Episode {epi}, loss: {loss}, total reward: {total_reward}')
  finally:
    # Release the environment's resources even if training is interrupted.
    env.close()

In [12]:
# Start training only when this code runs as the main program (which is the
# case when the cell is executed in a notebook kernel), not when imported.
if __name__ == '__main__':
  main()

Episode 0, loss: 89.24971008300781, total reward: 16.0
Episode 1, loss: 43.56621551513672, total reward: 11.0
Episode 2, loss: 276.3490905761719, total reward: 29.0
Episode 3, loss: 89.01139831542969, total reward: 16.0
Episode 4, loss: 29.560367584228516, total reward: 9.0
Episode 5, loss: 60.17000198364258, total reward: 13.0
Episode 6, loss: 311.99810791015625, total reward: 31.0
Episode 7, loss: 78.84321594238281, total reward: 15.0
Episode 8, loss: 331.21087646484375, total reward: 32.0
Episode 9, loss: 68.99488067626953, total reward: 14.0
Episode 10, loss: 331.8008728027344, total reward: 32.0
Episode 11, loss: 111.7855224609375, total reward: 18.0
Episode 12, loss: 124.60955810546875, total reward: 19.0
Episode 13, loss: 43.37836837768555, total reward: 11.0
Episode 14, loss: 411.1186828613281, total reward: 36.0
Episode 15, loss: 138.0559844970703, total reward: 20.0
Episode 16, loss: 500.53680419921875, total reward: 40.0
Episode 17, loss: 59.742984771728516, total reward: 13