In [None]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
import core

from spinup.utils.logx import EpochLogger

In [None]:
class ReplayBuffer:
  """
  Fixed-capacity FIFO replay buffer.

  Transitions are written into preallocated float32 NumPy arrays at a write
  pointer that wraps around, so once the buffer is full the oldest
  transition is the next one to be overwritten.
  """

  def __init__(self, obs_dim, act_dim, size):
    """
    params:
      obs_dim: observation shape, forwarded to core.combined_shape
      act_dim: action shape, forwarded to core.combined_shape
      size: capacity of the buffer (number of transitions)
    """
    # presumably core.combined_shape(size, shape) yields (size, *shape) -- confirm in core
    self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
    self.obs_next_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
    self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
    # reward and done flag are one scalar per stored transition
    self.rew_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.float32)
    # ptr: next slot to write, size: number of valid slots, max_size: capacity
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    """Write one transition at the current pointer, then advance the pointer (wrapping)."""
    self.obs_buf[self.ptr] = obs
    self.obs_next_buf[self.ptr] = next_obs
    self.act_buf[self.ptr] = act
    self.rew_buf[self.ptr] = rew
    # write into the slot; assigning to self.done_buf itself would replace the whole array
    self.done_buf[self.ptr] = done
    self.ptr = (self.ptr + 1) % self.max_size
    self.size = min(self.max_size, self.size + 1)

  def sample_batch(self, batch_size=32):
    """
    Draw `batch_size` stored transitions uniformly at random, with replacement.

    Returns a dict of float32 torch tensors with keys
    'obs', 'obs_next', 'act', 'rew' and 'done'.
    Raises ValueError (from np.random.randint) if nothing has been stored yet.
    """
    # only the first self.size slots hold real data
    idxs = np.random.randint(0, self.size, size=batch_size)
    batch = dict(obs=self.obs_buf[idxs],
                 obs_next=self.obs_next_buf[idxs],
                 act=self.act_buf[idxs],
                 rew=self.rew_buf[idxs],
                 done=self.done_buf[idxs])
    return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}
    

In [None]:
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000,
         epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3,
         batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1,
         num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
  """
  Deep Deterministic Policy Gradient (DDPG) training loop.

  params:
    env_fn: zero-argument callable returning an environment; it is called
      twice, once for the training env and once for the test env

    actor_critic: callable invoked as
      actor_critic(observation_space, action_space, **ac_kwargs); the result
      must expose `pi`, `q`, `act(obs_tensor)` and `parameters()`

    ac_kwargs: extra keyword args forwarded to actor_critic

    seed: Random number seed (torch and numpy)

    steps_per_epoch: num of environment steps in one epoch

    epochs: num of epochs; total env steps = steps_per_epoch * epochs

    replay_size: capacity of the replay buffer

    gamma: Discount factor

    polyak: Interpolation factor in polyak averaging
      (targ <- polyak * targ + (1 - polyak) * main)

    pi_lr: learning rate for policy network

    q_lr: learning rate for Q-network

    batch_size: minibatch size for each gradient step

    start_steps: num of initial steps that use env.action_space.sample()
      instead of the policy

    update_after: num of env steps to collect before any gradient update

    update_every: num of steps between gradient descent updates; each round
      performs `update_every` gradient steps

    act_noise: Stddev for Gaussian exploration noise added to policy

    num_test_episodes: num of episodes to test at the end of epoch

    max_ep_len: max len of episode

    logger_kwargs: keyword args for EpochLogger

    save_freq: freq (in epochs) to save policy and q func
  """
  # created up front: the end-of-epoch checkpoint below depends on it
  logger = EpochLogger(**logger_kwargs)

  torch.manual_seed(seed)
  np.random.seed(seed)

  # load env
  env, test_env = env_fn(), env_fn()
  obs_dim = env.observation_space.shape
  act_dim = env.action_space.shape
  # the upper bound of act space (only the first component is used, applied symmetrically)
  act_limit = env.action_space.high[0]

  ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
  # target network
  ac_targ = deepcopy(ac)

  # freeze target network (only update using polyak)
  for p in ac_targ.parameters():
    p.requires_grad = False

  replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

  # count num of params
  var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
  logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

  def compute_loss_q(data):
    # s, a, r, s', terminal
    # ReplayBuffer.sample_batch stores the next observation under 'obs_next'
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs_next'], data['done']

    q = ac.q(o, a)

    # Bellman backup for Q
    # Q(s', pi_tar(s'))
    with torch.no_grad():
      q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
      backup = r + gamma * (1 - d) * q_pi_targ

    loss_q = ((q - backup)**2).mean()

    return loss_q

  # pi: max Q
  def compute_loss_pi(data):
    o = data['obs']
    q_pi = ac.q(o, ac.pi(o))
    return -q_pi.mean()

  pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
  q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

  # register the model so logger.save_state also saves the networks
  logger.setup_pytorch_saver(ac)

  def update(data):
    # gradient descent for Q
    q_optimizer.zero_grad()
    loss_q = compute_loss_q(data)
    loss_q.backward()
    q_optimizer.step()

    # freeze Q-network to save computational effort
    for p in ac.q.parameters():
      p.requires_grad = False

    # gradient descent for policy
    pi_optimizer.zero_grad()
    loss_pi = compute_loss_pi(data)
    loss_pi.backward()
    pi_optimizer.step()

    for p in ac.q.parameters():
      p.requires_grad = True

    # update target network
    with torch.no_grad():
      for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
        # use mul_, add_ instead of mul, add (which make new tensors)
        p_targ.data.mul_(polyak)
        p_targ.data.add_((1 - polyak) * p.data)

  def get_action(o, noise_scale):
    # presumably ac.act returns a NumPy array (it is fed to np.clip / env.step) -- confirm in core
    a = ac.act(torch.as_tensor(o, dtype=torch.float32))
    # act_dim is a shape tuple; randn takes the dimensions as separate ints
    a += noise_scale * np.random.randn(*act_dim)
    return np.clip(a, -act_limit, act_limit)

  def test_agent():
    # NOTE(review): ep_ret / ep_len are accumulated here but not reported anywhere
    for j in range(num_test_episodes):
      o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
      while not (d or (ep_len == max_ep_len)):
        # noise=0 in test
        o, r, d, _ = test_env.step(get_action(o, 0))
        ep_ret += r
        ep_len += 1

  total_steps = steps_per_epoch * epochs
  o, ep_ret, ep_len = env.reset(), 0, 0

  # main loop
  for t in range(total_steps):
    # randomly sample action at first
    if t > start_steps:
      a = get_action(o, act_noise)
    else:
      a = env.action_space.sample()

    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # hitting the length cap is a time limit, not a real terminal state,
    # so the stored done flag must not cut off bootstrapping
    d = False if ep_len == max_ep_len else d

    replay_buffer.store(o, a, r, o2, d)

    o = o2

    if d or (ep_len == max_ep_len):
      o, ep_ret, ep_len = env.reset(), 0, 0

    # update
    if t >= update_after and t % update_every == 0:
      # one gradient step per env step collected since the last round
      for _ in range(update_every):
        batch = replay_buffer.sample_batch(batch_size)
        update(data=batch)

    # End of epoch
    if (t + 1) % steps_per_epoch == 0:
      epoch = (t + 1) // steps_per_epoch

      if (epoch % save_freq == 0) or (epoch == epochs):
        logger.save_state({'env': env}, None)

      test_agent()

In [None]:
# Command-line style entry point: parse hyperparameters and launch DDPG training.
if __name__ == '__main__':
  import argparse
  parser = argparse.ArgumentParser()
  parser.add_argument('--env', type=str, default='HalfCheetah-v2')
  parser.add_argument('--hid', type=int, default=256)  # units per hidden layer
  parser.add_argument('--l', type=int, default=2)  # hidden layers
  parser.add_argument('--gamma', type=float, default=0.99)
  parser.add_argument('--seed', '-s', type=int, default=0)
  parser.add_argument('--epochs', type=int, default=50)
  parser.add_argument('--exp_name', type=str, default='ddpg')
  # parse_known_args ignores argv entries this parser does not define, e.g. the
  # "-f <connection file>" a Jupyter kernel is started with; parse_args would
  # exit with "unrecognized arguments" when this cell runs inside a notebook
  args, _ = parser.parse_known_args()

  ddpg(lambda: gym.make(args.env), actor_critic=core.MLPActorCritic,
       ac_kwargs=dict(hidden_sizes=[args.hid]*args.l),
       gamma=args.gamma, seed=args.seed, epochs=args.epochs,
       # forward the experiment name to the logger instead of dropping it
       logger_kwargs=dict(exp_name=args.exp_name))