<a href="https://colab.research.google.com/github/RLWH/reinforcement-learning-notebook/blob/master/6.%20Policy%20Gradient/Solving_Cartpole_by_A2C_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Solving Cartpole by A2C Algorithm

In [3]:
#@title Install dependencies

#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
# !pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (41.0.1)


In [4]:
#@title Wrapper function for openai gym rendering
import gym
import matplotlib
import matplotlib.pyplot as plt
import glob
import io
import base64

from gym import logger as gymlogger
from gym.wrappers import Monitor
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

%matplotlib inline
# Raise gym's logger threshold to level 40 so that only errors are reported.
gymlogger.set_level(40) #error only
# Create and start a pyvirtualdisplay Display (visible=0, 1400x900).
# NOTE(review): presumably needed so gym can render frames for video recording
# on a machine without a physical screen — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of a gym environment and to display the result.
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``video/*.mp4`` relative to the current working directory. The
  matches are sorted so the same file is picked on every run. The chosen file
  is base64-encoded and displayed inline as an HTML ``<video>`` element.
  Prints "Could not find video" when no recording exists.
  """
  mp4list = sorted(glob.glob('video/*.mp4'))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is always
    # closed and no write permission on the recording is required.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor that records into ./video.

  The Monitor is created with force=True; the wrapped environment is returned
  and the caller is expected to use it in place of the original.
  """
  return Monitor(env, './video', force=True)

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


In [0]:
# Delete everything currently inside ./video so recordings from earlier runs do not linger.
!rm -rf ./video/*

## Setup the environment

In [0]:
# Build the CartPole-v0 environment and pass it through wrap_env (the Monitor wrapper writing to ./video).
env = wrap_env(gym.make("CartPole-v0"))

In [22]:
# Reset the environment to begin a new episode; the call's return value is echoed as the cell output.
env.reset()

AttributeError: ignored

In [7]:
# Report the length of the observation vector and the number of discrete actions.
print("Observation space: %s" % env.observation_space.shape[0])
print("Action space: %s" % env.action_space.n)

Observation space: 4
Action space: 2


In [0]:
# Per-episode step limit, read from the spec of the environment underneath the wrapper.
GLOBAL_STEP = env.env.spec.max_episode_steps
# NOTE(review): a negative score threshold looks unusual for this task and the
# value is not referenced in the visible cells — confirm the intended value.
SCORE_REQUIREMENT = -198
# Presumably the number of training episodes to run.
NUM_EPISODES = 10000

## Setup the network

In [0]:
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

In [0]:
class PolicyNetwork(nn.Module):
    """
    Policy network (actor): maps a state to a probability distribution over actions.

    Architecture: Linear(state_size, fc1_units) -> ReLU -> Dropout(0.5)
                  -> Linear(fc1_units, fc2_units) -> ReLU -> Dropout(0.5)
                  -> Linear(fc2_units, action_size) -> softmax.

    Args:
        state_size: number of features in one state vector.
        action_size: number of discrete actions.
        fc1_units: width of the first hidden layer.
        fc2_units: width of the second hidden layer.
    """
    
    def __init__(self, state_size, action_size, fc1_units=128, fc2_units=64):
        super().__init__()
        
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        
        self.dropout1 = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.5)
        
    def forward(self, x):
        """
        Forward pass.

        Args:
            x: float tensor whose last dimension has size `state_size`; either
               a single state `(state_size,)` or a batch `(..., state_size)`.

        Returns:
            Tensor of action probabilities with the same leading dimensions as
            `x` and a last dimension of size `action_size`; every slice along
            the last dimension sums to 1.
        """
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        # Normalise over the action dimension (the last one). dim=0 is only
        # correct for an unbatched 1-D state; for a batch it would normalise
        # across samples instead of across actions.
        x = F.softmax(self.fc3(x), dim=-1)
        
        return x

In [0]:
class ValueNetwork(nn.Module):
    """
    Value network (critic): a three-layer MLP producing `action_size` raw
    outputs per state (one scalar by default).

    Layers: fc1 -> ReLU -> dropout1 -> fc2 -> ReLU -> dropout2 -> fc3,
    with no activation on the final layer.
    """

    def __init__(self, state_size, action_size=1, fc1_units=128, fc2_units=64):
        super().__init__()

        # Linear layers are created in fc1, fc2, fc3 order.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

        # Each hidden activation is followed by dropout with p=0.5.
        self.dropout1 = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.5)

    def forward(self, x):
        """
        Run the input through both hidden layers (ReLU + dropout each) and
        return the un-activated output of the final linear layer.
        """
        hidden = self.dropout1(F.relu(self.fc1(x)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [0]:
# Check GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


### Algorithm
---
```
Input: a differentiable policy parameterization pi(a|s, theta)                   [Policy Network]
Input: a differentiable state-value function parameterization V(s, w)            [Value Network]
Parameters: step sizes alpha_theta > 0; alpha_w > 0; discount factor gamma in [0, 1]

Initialise theta and w once, before the first episode

Loop forever for each episode:

        Initialise S (first state of the episode)
        
        Loop while S is not terminal for each time step:
                A ~ pi(.|S, theta) [sample an action from policy(state)]
                Take action A, observe S', R
                delta = R + gamma * V(S', w) - V(S, w)  [TD(0) error, used as the advantage estimate; V(S', w) = 0 if S' is terminal]
                theta = theta + alpha_theta * delta * grad_theta log pi(A|S, theta)     [policy gradient update]
                w = w + alpha_w * delta * grad_w V(S, w)    [TD(0) update of the critic]
                S = S'
```
---

In [0]:
class A2CAgent:
    """Actor Critic Agent.

    Holds the policy network (actor) and samples actions from the
    distribution it outputs.

    Args:
        n_state: size of the state vector.
        n_action: number of discrete actions.
        policy_network: module mapping a float state tensor to action
            probabilities.
        env: optional environment to keep a reference to. It is passed in
            explicitly rather than read from a notebook-level global, so the
            class can be constructed without one.
    """
    
    def __init__(self, n_state, n_action, policy_network, env=None):
        
        self.env = env
        
        self.n_state = n_state
        self.n_action = n_action
        
        # Initialise the model
        self.policy_network = policy_network
    
    def act(self, state):
        """Sample an action for `state` from the current policy.

        Args:
            state: a single state as a numpy array, a sequence of numbers or
                a tensor; it is converted to a float32 tensor before being
                fed to the policy network.

        Returns:
            (log_prob, action): the log-probability tensor of the sampled
            action (still attached to the policy network's graph, so it can
            be used in a loss) and the action as a plain Python int.
        """
        # gym hands back numpy arrays, which have no .float(); as_tensor also
        # accepts tensors, so callers may pass either.
        state = torch.as_tensor(state, dtype=torch.float32)
        probs = self.policy_network(state)
        m = Categorical(probs)
        action = m.sample()
        log_probs = m.log_prob(action)

        return log_probs, action.item()

In [20]:
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n

policy_network = PolicyNetwork(observation_space, action_space)
value_network = ValueNetwork(observation_space)

# Hyper-parameters of the one-step actor-critic update.
GAMMA = 0.99      # discount factor applied to the bootstrapped next-state value
ACTOR_LR = 1e-3   # step size alpha_theta for the policy network
CRITIC_LR = 1e-3  # step size alpha_w for the value network

# One optimizer per network so actor and critic are updated independently.
actor_optimizer = torch.optim.Adam(policy_network.parameters(), lr=ACTOR_LR)
critic_optimizer = torch.optim.Adam(value_network.parameters(), lr=CRITIC_LR)

agent = A2CAgent(observation_space, action_space, policy_network)

# Returns of the most recent 100 episodes, used for the running average.
reward_list = deque(maxlen=100)

# NOTE(review): both networks contain Dropout(p=0.5) and stay in training mode
# here, so action probabilities and value estimates are noisy on every call.

for i in range(NUM_EPISODES + 1):
    
    R = 0  # undiscounted return of the current episode
    
    state = env.reset()
    
    while True:
        
        # The environment yields numpy arrays; the networks need float tensors.
        state_t = torch.as_tensor(state, dtype=torch.float32)
        
        # Select and take action
        log_probs, action = agent.act(state_t)
        
        # Sample R_t+1 and S_t+1
        next_state, reward, done, _ = env.step(action)
        next_state_t = torch.as_tensor(next_state, dtype=torch.float32)
        
        # V(S): gradients must flow through this estimate to train the critic.
        current_state_value = value_network(state_t)
        
        # TD target R + gamma * V(S'): treated as a constant (no gradient),
        # with no bootstrapping from a terminal state.
        with torch.no_grad():
            next_state_value = value_network(next_state_t)
            td_target = reward + GAMMA * next_state_value * (0.0 if done else 1.0)
        
        # Signed TD error, used as the advantage estimate for the actor.
        td_error = (td_target - current_state_value).detach()
        
        critic_loss = F.mse_loss(current_state_value, td_target, reduction="sum")
        actor_loss = (-log_probs * td_error).sum()
        
        # Optimize
        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        
        critic_loss.backward()
        actor_loss.backward()
        
        actor_optimizer.step()
        critic_optimizer.step()
        
        R += reward
        
        if done:
            reward_list.append(R)
            break
            
        # Update state
        state = next_state
        
    if i % 1000 == 0:
        print("\rEpisode %s \t Average Score: %s" % (i, np.mean(reward_list)))

AttributeError: ignored