In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import gym
import math
import time
import matplotlib.pyplot as plt
from multiprocessing_env import SubprocVecEnv

In [2]:
env_name = 'Pendulum-v0'
env = gym.make(env_name)

In [3]:
def make_env():
    def _thunk():
        env=gym.make(env_name)
        return env
    return _thunk

In [4]:
def normalize(x):
    x-=x.mean()
    x /= (x.std()+1e-8)
    return x

In [5]:
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]
print("State size ",num_inputs," State info : ",env.observation_space)

print("Action size ",num_outputs," Action info : ",env.action_space)

State size  3  State info :  Box(3,)
Action size  1  Action info :  Box(1,)


In [6]:
def t(x):
    return torch.from_numpy(x).float()

In [7]:
class ActorCritic(nn.Module):
    def __init__(self,input_size,output_size,std=0.0):
        super().__init__()
        
        self.actor = nn.Sequential(nn.Linear(input_size,32),
                                  nn.ReLU(),
                                  nn.Linear(32,16) ,
                                  nn.Linear(16,output_size))
        self.critic = nn.Sequential(nn.Linear(input_size,32),
                                    nn.Linear(32,16),
                                    nn.Linear(16,1))
        self.log_std = nn.Parameter(torch.ones(1,output_size)*std)
    def forward(self,state):
        value = self.critic(state)
        mu = self.actor(state)
        std = self.log_std.exp()
        dist = Normal(mu,std)
        return dist,value

## Hyperparameters

In [8]:
n_env = 8 
gamma = 0.9
lr = 1e-4
gae_lambda = 0.95
epsilon = 0.2
entropy_beta = 0.001
critic_discount = 0.5
PPO_STEPS = 256
mini_batch = 64
ppo_epochs = 10

In [9]:
def compute_gae(next_val,rewards,masks,values,gamma=gamma,lam=gae_lambda):
    values = values+[next_value]
    gae = 0 
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma*values[step+1]*masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] *gae
        returns.insert(0,gae + values[step])
    return returns 

In [10]:
def normalize(x):
    x -= x.mean()
    x /= (x.std() + 1e-8)
    return x

In [11]:
def test_env(env, model, device, deterministic=True):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.mean.detach().cpu().numpy()[0] if deterministic \
            else dist.sample().cpu().numpy()[0]
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
    return total_reward

In [12]:
def ppo_iter(states, actions, log_probs, returns, advantage):
    batch_size = states.size(0)
    # generates random mini-batches until we have covered the full batch
    for _ in range(batch_size // mini_batch):
        rand_ids = np.random.randint(0, batch_size, mini_batch)
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

In [13]:
def ppo_update(states,actions,log_probs,returns,advantages,clip=epsilon):
    for ep in range(ppo_epochs):
        for states,action,old_log_prob,return_,advantage in ppo_iter(states,actions,log_probs,returns,advantages):
            dist,value = model(states)
            entropy = dist.entropy().mean()
            new_log_prob = dist.log_prob(action)
            
            ratio = (new_log_prob-old_log_prob).exp()
            surr1 = ratio*advantage
            surr2 = torch.clamp(ratio,1-clip,1+clip)*advantage
            
            actor_loss = -torch.min(surr1,surr2).mean()
            critic_loss = (return_-value).pow(2).mean()
            
            loss = 0.5*critic_loss + actor_loss - entropy_beta*entropy
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

## The PPO Algorithm
____

In [14]:
envs = [make_env() for i in range(n_env)]
envs = SubprocVecEnv(envs)
env = gym.make(env_name)
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]
model = ActorCritic(num_inputs,num_outputs)
optimizer = optim.Adam(model.parameters(),lr=lr)

In [15]:
env.action_space

Box(1,)

In [16]:
n_episodes = 1000
state = envs.reset()
for ep in range(n_episodes):
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    
    for _ in range(PPO_STEPS):
        state = torch.FloatTensor(state)
        dist,value = model(state)
        
        action = dist.sample()
        
        next_state,reward,done,_ = envs.step(action.numpy())
        log_prob = dist.log_prob(action)
        
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1))
        masks.append(torch.FloatTensor(1-done).unsqueeze(1))
        states.append(state)
        actions.append(action)
        
        state = next_state
    
    next_state = torch.FloatTensor(next_state)
    _,next_value = model(next_state)
    returns = compute_gae(next_value,rewards,masks,values)
    
    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values
    advantage = normalize(advantage)
    
    ppo_update(states,actions,log_probs,returns,advantage)

    if ep%50 == 0:
        test_reward = np.mean([test_env(env,model,'cpu') for _ in range(10)])
        print("Test Reward : ",test_reward)

Test Reward :  -1201.3785819726484
Test Reward :  -1267.536772756382
Test Reward :  -1176.4746206341297
Test Reward :  -1227.9544301076044
Test Reward :  -1228.3721415976838
Test Reward :  -1197.1015071108657
Test Reward :  -1291.0571909412981
Test Reward :  -1176.598561789001


KeyboardInterrupt: 

In [None]:
state = env.reset()
deterministic = False
done = False
total_reward = 0
while not done:
    state = torch.FloatTensor(state).unsqueeze(0)
    env.render()
    dist, _ = model(state)
    action = dist.mean.detach().cpu().numpy()[0] if deterministic \
        else dist.sample().cpu().numpy()[0]
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward
print(total_reward)