In [1]:
import gym
gym.logger.set_level(40) 
import numpy as np
import copy
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.distributions import Categorical
import random
from torch import multiprocessing
torch.manual_seed(121) 
%matplotlib inline

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
env = gym.make('Pendulum-v0').unwrapped
env.seed(121)
n_state=env.observation_space.shape[0]
n_actions=env.action_space.shape[0]
print(f'state space {n_state} \naction space {n_actions}')

state space 3 
action space 1


In [4]:
Transition = namedtuple('Transition', ('state', 'action', 'mask', 'next_state',
                                      'reward'))
class ReplayMemory(object):
    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))
    def sample(self, batch_size=None):
        if batch_size is None:
            return Transition(*zip(*self.memory))
        else:
            random_batch = random.sample(self.memory, batch_size)
            return Transition(*zip(*random_batch))
    def append(self, new_memory):
        self.memory += new_memory.memory

    def __len__(self):
        return len(self.memory)

In [20]:
# test buffer
"""Random Agent"""
Num_episode=1
Time_step=10
def run_episodes(env):
    for episode in range(Num_episode):
        env.reset()
        returns=0
        for t in range(Time_step):
            action=env.action_space.sample()
            print(action)
            next_state,reward,done,_=env.step(action) 
            returns+=reward
            if done:
                print(f"Episode {episode} finished after {t} timesteps")
                break
        print(f"Episode {episode} - total rewards obtained {returns}")
      
run_episodes(env)   


[1.9144734]
[1.1966343]
[-0.15408255]
[1.1221167]
[-1.5269023]
[0.5596841]
[-1.4265869]
[1.7786757]
[0.08739328]
[-0.34135225]
Episode 0 - total rewards obtained -90.62275300130331


<img src="ppo_adv.png">

<img src="ppo.png">

In [21]:
def normal_entropy(std):
    var = std.pow(2)
    entropy = 0.5 + 0.5 * torch.log(2 * var * np.pi)
    return entropy.sum(1, keepdim=True)
def normal_log_density(x, mean, log_std):
    std = torch.exp(log_std)
    var = std.pow(2)
    log_density = - torch.pow(x - mean, 2) / (2 * var) - 0.5 * np.log(2 * np.pi) - log_std
    return log_density.sum(1, keepdim=True)

In [22]:
class Policy(nn.Module):    
    def __init__(self, s_dim, a_dim):
        super(Policy, self).__init__()
        self.net = nn.Sequential(nn.Linear(s_dim, 64),
                     nn.ReLU(),
                     nn.Linear(64, 64),
                     nn.ReLU(),
                     nn.Linear(64, a_dim))
        self.a_log_std = nn.Parameter(torch.zeros(1, a_dim))

    def forward(self, s):
        a_mean = self.net(s)
        a_log_std = self.a_log_std.expand_as(a_mean)
        return a_mean, a_log_std

    def select_action(self, s):
        a_mean, a_log_std = self.forward(s)
        a = torch.normal(a_mean, torch.exp(a_log_std))
        return a

    def get_log_prob(self, s, a):
        a_mean, a_log_std = self.forward(s)
        log_prob = normal_log_density(a, a_mean, a_log_std)
        return log_prob

In [23]:
class Value(nn.Module):
    def __init__(self, s_dim):
        super(Value, self).__init__()
        self.net = nn.Sequential(nn.Linear(s_dim, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 1))

    def forward(self, s):
        value = self.net(s)
        return value

In [24]:
def sampler(pid, queue, env, policy, batchsz):
    buff = ReplayMemory()
    sampled_num = 0
    sampled_traj_num = 0
    traj_len = 200
    real_traj_len = 0
    avg_reward = []

    while sampled_num < batchsz:
        traj_reward = 0
        s = env.reset()
        for t in range(traj_len):
            a = policy.select_action(Variable(torch.Tensor(s).unsqueeze(0))).data[0].numpy()
            next_s, reward, done, _ = env.step(a)
            mask = 0 if done else 1
            buff.push(s, a, mask, next_s, reward)
            s = next_s
            traj_reward += reward
            real_traj_len = t
            if done:
                break
        sampled_num += real_traj_len
        sampled_traj_num += 1
        avg_reward.append(traj_reward)
    avg_reward = sum(avg_reward) / len(avg_reward)
    queue.put([pid, buff, avg_reward])


In [25]:
class PPO:
    gamma = 0.99
    l2_reg = 0
    lr = 3e-4
    epsilon = 0.2
    tau = 0.95
    def __init__(self, env_cls, thread_num):
        self.thread_num = thread_num
        self.env_cls = env_cls
        dummy_env = env_cls()
        self.s_dim = dummy_env.observation_space.shape[0]
        is_discrete_action = len(dummy_env.action_space.shape)
        if is_discrete_action == 0:
            self.a_dim = dummy_env.action_space.n
            self.is_discrete_action = True
        else:
            self.a_dim = dummy_env.action_space.shape[0]
            self.is_discrete_action = False
        self.env_list = []
        for _ in range(thread_num):
            self.env_list.append(env_cls())
        self.policy = Policy(self.s_dim, self.a_dim)
        self.value = Value(self.s_dim)

        self.policy_optim = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.value_optim = optim.Adam(self.value.parameters(), lr=self.lr)
            
    def est_adv(self, r, v, mask):
        batch = v.size(0)
        v_target = torch.Tensor(batch)
        delta = torch.Tensor(batch)
        A_sa = torch.Tensor(batch)
        prev_v_target = 0
        prev_v = 0
        prev_A_sa = 0
        for t in reversed(range(batch)):
            v_target[t] = r[t] + self.gamma * prev_v_target * mask[t]
            delta[t] = r[t] + self.gamma * prev_v * mask[t] - v[t]
            A_sa[t] = delta[t] + self.gamma * self.tau * prev_A_sa * mask[t]
            prev_v_target = v_target[t]
            prev_v = v[t]
            prev_A_sa = A_sa[t]
        A_sa = (A_sa - A_sa.mean()) / A_sa.std()
        return A_sa, v_target
    
    def update(self, batchsz):
        batch = self.sample(batchsz)
        s = torch.from_numpy(np.stack(batch.state))
        a = torch.from_numpy(np.stack(batch.action))
        r = torch.Tensor(np.stack(batch.reward))
        mask = torch.Tensor(np.stack(batch.mask))
        batchsz = s.size(0)
        v = self.value(Variable(s)).data.squeeze()
        log_pi_old_sa = self.policy.get_log_prob(Variable(s), Variable(a)).data
        A_sa, v_target = self.est_adv(r, v, mask)
        v_target = Variable(v_target)
        A_sa = Variable(A_sa)
        s = Variable(s)
        a = Variable(a)
        log_pi_old_sa = Variable(log_pi_old_sa)

        for _ in range(5):

            # 4.1 shuffle current batch
            perm = torch.randperm(batchsz)
            # shuffle the variable for mutliple optimize
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = v_target[perm], A_sa[perm], s[perm], a[perm], \
                                                                           log_pi_old_sa[perm]

            # 4.2 get mini-batch for optimizing
            optim_batchsz = 4096
            optim_chunk_num = int(np.ceil(batchsz / optim_batchsz))
            # chunk the optim_batch for total batch
            v_target_shuf, A_sa_shuf, s_shuf, a_shuf, log_pi_old_sa_shuf = torch.chunk(v_target_shuf, optim_chunk_num), \
                                                                           torch.chunk(A_sa_shuf, optim_chunk_num), \
                                                                           torch.chunk(s_shuf, optim_chunk_num), \
                                                                           torch.chunk(a_shuf, optim_chunk_num), \
                                                                           torch.chunk(log_pi_old_sa_shuf,
                                                                                       optim_chunk_num)
            # 4.3 iterate all mini-batch to optimize
            for v_target_b, A_sa_b, s_b, a_b, log_pi_old_sa_b in zip(v_target_shuf, A_sa_shuf, s_shuf, a_shuf,
                                                                     log_pi_old_sa_shuf):
                # print('optim:', batchsz, v_target_b.size(), A_sa_b.size(), s_b.size(), a_b.size(), log_pi_old_sa_b.size())
                # 1. update value network
                v_b = self.value(s_b)
                loss = torch.pow(v_b - v_target_b, 2).mean()
                self.value_optim.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm(self.value.parameters(), 4)
                self.value_optim.step()

                # 2. update policy network by clipping
                # [b, 1]
                log_pi_sa = self.policy.get_log_prob(s_b, a_b)
                # ratio = exp(log_Pi(a|s) - log_Pi_old(a|s)) = Pi(a|s) / Pi_old(a|s)
                # we use log_pi for stability of numerical operation
                # [b, 1] => [b]
                ratio = torch.exp(log_pi_sa - log_pi_old_sa_b).squeeze(1)
                surrogate1 = ratio * A_sa_b
                surrogate2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * A_sa_b
                # this is element-wise comparing.
                # we add negative symbol to convert gradient ascent to gradient descent.
                surrogate = - torch.min(surrogate1, surrogate2).mean()
                self.policy_optim.zero_grad()
                surrogate.backward(retain_graph=True)
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 10)
                self.policy_optim.step()
   
    def sample(self, batchsz):
    
        thread_batchsz = np.ceil(batchsz / self.thread_num).astype(np.int32)
        queue = multiprocessing.Queue()
        evt = multiprocessing.Event()
        threads = []
        for i in range(self.thread_num):
            thread_args = (i, queue, self.env_list[i], self.policy, thread_batchsz)
            threads.append(multiprocessing.Process(target=sampler, args=thread_args))
        for t in threads:
            t.daemon = True
            t.start()

        # we need to get the first ReplayMemory object and then merge others ReplayMemory use its append function.
        pid0, buff0, avg_reward0 = queue.get()
        avg_reward = [avg_reward0]
        for _ in range(1, self.thread_num):
            pid, buff_, avg_reward_ = queue.get()
            buff0.append(buff_) # merge current ReplayMemory into buff0
            avg_reward.append(avg_reward_)

        # now buff saves all the sampled data and avg_reward is the average reward of current sampled data
        buff = buff0
        avg_reward = np.array(avg_reward).mean()

        print('avg reward:', avg_reward)
        return buff.sample()
    
    def render(self, interval=8):
        thread = multiprocessing.Process(target=self.render_, args=(interval,))
        thread.start()
        
    def render_(self, interval):
        env = self.env_cls()
        s = env.reset()

        while True:
            s = Variable(torch.Tensor(s)).unsqueeze(0)
            a = self.policy.select_action(s).squeeze().data.numpy()
            print(a)
            s, r, done, _ = env.step(a)

            env.render()

            if done:
                s = env.reset()
                time.sleep(interval)
    
    def save(self, filename='ppo'):

        torch.save(self.value.state_dict(), filename + '.val.mdl')
        torch.save(self.policy.state_dict(), filename + '.pol.mdl')

        print('saved network to mdl')

    def load(self, filename='ppo'):
        value_mdl = filename + '.val.mdl'
        policy_mdl = filename + '.pol.mdl'
        if os.path.exists(value_mdl):
            self.value.load_state_dict(torch.load(value_mdl))
            print('loaded checkpoint from file:', value_mdl)
        if os.path.exists(policy_mdl):
            self.policy.load_state_dict(torch.load(policy_mdl))
        print('loaded checkpoint from file:', policy_mdl)
    
    

In [26]:
def make_env():
    env=gym.make('Pendulum-v0').unwrapped
    return env

In [28]:



def main():
    torch.set_default_tensor_type('torch.DoubleTensor')

    batchsz = 32
    ppo = PPO(make_env, 1)

    # load model from checkpoint
    ppo.load()
    # comment this line to close evaluaton thread, to speed up training process.
    ppo.render(1)

    for i in range(10000):

        ppo.update(batchsz)

        if i % 100 == 0 and i:
            ppo.save()


if __name__ == '__main__':
    main()

loaded checkpoint from file: ppo.val.mdl
loaded checkpoint from file: ppo.pol.mdl
3.7529173492021135


Process Process-818:
Traceback (most recent call last):
  File "/home/satty/anaconda3/envs/rl/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/satty/anaconda3/envs/rl/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-25-1f1e7293597b>", line 224, in render_
    s, r, done, _ = env.step(a)
  File "/home/satty/anaconda3/envs/rl/lib/python3.6/site-packages/gym/envs/classic_control/pendulum.py", line 37, in step
    u = np.clip(u, -self.max_torque, self.max_torque)[0]
IndexError: invalid index to scalar variable.


avg reward: -1364.7605578718028
avg reward: -1392.1997531627449
avg reward: -1393.683563335906
avg reward: -1385.2519553157417
avg reward: -1406.2934864401461
avg reward: -1369.020550786967
avg reward: -1381.8208466611973
avg reward: -1371.4209307721264
avg reward: -1379.8438413363497
avg reward: -1390.3604794734376
avg reward: -1373.026315030896
avg reward: -1396.386007607114
avg reward: -1385.6732747503597
avg reward: -1343.2822575281998
avg reward: -1376.2344226038972
avg reward: -1368.644922168544
avg reward: -1374.6841486120827
avg reward: -1352.856089768198
avg reward: -1361.8026275794382
avg reward: -1342.0493130179846
avg reward: -1346.5701871863605
avg reward: -1356.2482045708932
avg reward: -1320.4126012996699
avg reward: -1316.4213216208157
avg reward: -1345.9538290502567
avg reward: -1342.1614412239637
avg reward: -1329.529651367656
avg reward: -1377.7027715109139
avg reward: -1363.0959499602857
avg reward: -1377.7075998102011
avg reward: -1346.9099247144422
avg reward: -13

Process Process-943:
Traceback (most recent call last):
  File "/home/satty/anaconda3/envs/rl/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/satty/anaconda3/envs/rl/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: 

  File "<ipython-input-24-05ac8c38086d>", line 25, in sampler
    while sampled_num < batchsz:
KeyboardInterrupt


Reference:
- https://github.com/dragen1860/PPO-Pytorch
- https://github.com/ikostrikov/pytorch-a2c-ppo-acktr
- https://github.com/dai-dao/PPO-Pytorch