In [1]:
import numpy as np
import gym, random, copy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


class Actor(nn.Module):
    """Policy network mapping states to bounded continuous actions.

    Two hidden ReLU layers (400 and 300 units) feed a tanh output layer whose
    result is multiplied by ``action_bound``, so every action component lies
    in ``[-action_bound, action_bound]``. The network owns its own Adam
    optimizer (learning rate 1e-4), exposed as ``self.optimizer``.

    Args:
        n_dim_states: size of the state vector (input features).
        n_dim_actions: size of the action vector (output features).
        action_bound: scale applied to the tanh output.
    """

    def __init__(self, n_dim_states, n_dim_actions, action_bound):
        super(Actor, self).__init__()
        self.action_bound = action_bound
        self.lin1 = nn.Linear(n_dim_states, 400)
        self.lin2 = nn.Linear(400, 300)
        self.lin3 = nn.Linear(300, n_dim_actions)
        # Created after the layers so that self.parameters() already sees them.
        self.optimizer = optim.Adam(self.parameters(), lr=1e-4)

    def forward(self, X):
        """Return the action(s) for the state batch ``X`` (last dim = n_dim_states)."""
        X = F.relu(self.lin1(X))
        X = F.relu(self.lin2(X))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return self.action_bound * torch.tanh(self.lin3(X))


class Critic(nn.Module):
    """Action-value network: scores a concatenated (state, action) vector.

    The input has ``n_dim_states + n_dim_actions`` features; it passes through
    two ReLU hidden layers (400 and 300 units) and a linear head producing a
    single value per row. An Adam optimizer (learning rate 1e-3) over the
    network's parameters is stored on ``self.optimizer``.
    """

    def __init__(self, n_dim_states, n_dim_actions):
        super().__init__()
        n_inputs = n_dim_states + n_dim_actions
        self.lin1 = nn.Linear(n_inputs, 400)
        self.lin2 = nn.Linear(400, 300)
        self.lin3 = nn.Linear(300, 1)
        # Built last so every layer above is already registered.
        self.optimizer = optim.Adam(self.parameters(), lr=1e-3)

    def forward(self, X):
        """Return one value per row of ``X`` (rows are state and action concatenated)."""
        hidden = X
        for layer in (self.lin1, self.lin2):
            hidden = F.relu(layer(hidden))
        return self.lin3(hidden)


class Agent:
    """Actor-critic agent with a replay memory and slowly tracking target networks.

    The agent keeps an actor and a critic plus a deep copy of each ("target"
    networks). Transitions are stored as ``(state, action, reward, next_state)``
    tuples; each call to :meth:`experience_replay` samples a mini-batch,
    performs one critic step and one actor step, then moves the target
    networks a fraction ``target_lr`` towards the trained networks.

    Args:
        n_dim_states: size of the state vector.
        n_dim_actions: size of the action vector.
        action_bound: scale of the actor's tanh output.
        gamma: discount factor used in the critic targets.
        batch_size: maximum number of transitions sampled per update.
        mem_capacity: maximum number of transitions kept in memory.
        target_lr: interpolation factor of the soft target update.
    """

    def __init__(self, n_dim_states, n_dim_actions, action_bound, gamma=0.99,
                 batch_size=64, mem_capacity=1e6, target_lr=1e-3):
        self.n_dim_states = n_dim_states
        self.n_actions = n_dim_actions
        self.gamma = gamma
        self.batch_size = batch_size
        self.target_lr = target_lr

        self.actor = Actor(n_dim_states, n_dim_actions, action_bound)
        self.critic = Critic(n_dim_states, n_dim_actions)
        self.target_actor = copy.deepcopy(self.actor)
        self.target_critic = copy.deepcopy(self.critic)

        # List of past samples (state, action, reward, next_state)
        self.memory = []
        self.mem_capacity = mem_capacity
        # Index of the oldest sample once the memory is full (ring buffer).
        self._mem_pos = 0

    def policy(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state_tensor = torch.as_tensor(
            np.asarray(state, dtype=np.float32).reshape(1, -1))
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            action = self.actor(state_tensor)
        return action.numpy().reshape(-1)

    def observe(self, sample):
        """Store ``sample``; once capacity is reached, overwrite the oldest one.

        Overwriting in place keeps eviction O(1), unlike ``list.pop(0)``.
        The stored order is no longer chronological, which does not matter
        because batches are drawn uniformly at random.
        """
        if len(self.memory) + 1 <= self.mem_capacity:
            self.memory.append(sample)
        elif self.memory:
            self.memory[self._mem_pos] = sample
            self._mem_pos = (self._mem_pos + 1) % len(self.memory)
        # With a capacity below 1 nothing can be stored; the sample is dropped.

    def experience_replay(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing when the memory is empty.
        """
        if not self.memory:
            return

        # Sample a batch from memory uniformly at random
        batch_size = min(self.batch_size, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        # Stack each field separately. Packing the mixed-shape tuples into a
        # single np.array would create a ragged array, which recent NumPy
        # versions reject.
        columns = list(zip(*batch))

        def to_tensor(column):
            return torch.as_tensor(np.vstack(column), dtype=torch.float32)

        states = to_tensor(columns[0])
        actions = to_tensor(columns[1])
        rewards = to_tensor(columns[2])
        next_states = to_tensor(columns[3])

        # Improve critic
        Q_values = self.critic(torch.cat((states, actions), dim=1))
        # Targets are constants for the critic loss: compute them without a graph.
        with torch.no_grad():
            next_actions = self.target_actor(next_states)
            next_Q_values = self.target_critic(
                torch.cat((next_states, next_actions), dim=1))
            targets = rewards + self.gamma * next_Q_values

        critic_loss = nn.MSELoss()(Q_values, targets)
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Improve actor: ascend the critic's value of the actor's own actions.
        actions = self.actor(states)
        Q_values = self.critic(torch.cat((states, actions), dim=1))

        actor_loss = -Q_values.mean()
        self.actor.optimizer.zero_grad()
        # This also accumulates gradients in the critic; they are cleared by
        # the critic's zero_grad() at the start of the next critic update.
        actor_loss.backward()
        self.actor.optimizer.step()

        # Update target networks towards current networks
        self._update_target_networks()

    def _update_target_networks(self):
        """Move each target parameter a fraction ``target_lr`` towards its current counterpart."""
        current_params = list(self.critic.parameters()) + list(self.actor.parameters())
        target_params = list(self.target_critic.parameters()) + list(self.target_actor.parameters())
        with torch.no_grad():
            for current, target in zip(current_params, target_params):
                target.copy_(self.target_lr * current + (1. - self.target_lr) * target)

In [None]:
class Noise(object):
    """Source of temporally correlated noise based on a discretised
    Ornstein-Uhlenbeck process.

    Attributes:
        delta: time-step factor used by both the drift and the random increment.
        sigma: scale of the random increment.
        ou_a: strength of the pull towards ``ou_mu``.
        ou_mu: level the process reverts to.
    """

    def __init__(self, delta=0.5, sigma=0.5, ou_a=3., ou_mu=0.):
        self.delta, self.sigma = delta, sigma
        self.ou_a, self.ou_mu = ou_a, ou_mu

    def brownian_motion_log_returns(self):
        """
        Draw a single Wiener-process (Brownian motion) increment: a normal sample
        with mean 0 and standard deviation sqrt(delta) * sigma.
        See http://en.wikipedia.org/wiki/Wiener_process
        :return: one random increment
        """
        increment_std = np.sqrt(self.delta) * self.sigma
        return np.random.normal(0, increment_std)

    def ornstein_uhlenbeck_level(self, prev_ou_level):
        """
        Advance the mean-reverting process by one step from ``prev_ou_level``.
        :return: the next Ornstein-Uhlenbeck level
        """
        # Deterministic pull towards ou_mu, proportional to the distance from it.
        pull_to_mean = self.ou_a * (self.ou_mu - prev_ou_level) * self.delta
        shock = self.brownian_motion_log_returns()
        return prev_ou_level + pull_to_mean + shock
    
    
class Environment:
    """Runs training episodes of an agent inside a gym environment.

    For the first ``noise_max_ep`` episodes an Ornstein-Uhlenbeck noise level
    is added to every action for exploration. The noise level is kept on the
    instance and is not reset between episodes.

    Args:
        environment: gym environment id passed to ``gym.make``.
        render: whether to call ``env.render()`` on every step (default True).
    """

    def __init__(self, environment, render=True):
        self.env = gym.make(environment)
        self.n_episodes = 0
        self.ou_level = 0.  # Initialize noise
        self.noise_max_ep = 150
        self.noise = Noise()
        self.render = render

    def run_episode(self, agent):
        """Play one episode, training ``agent`` after every step.

        The agent must provide ``policy(state)``, ``observe(sample)`` and
        ``experience_replay()``.

        Returns:
            The sum of the rewards collected during the episode.
        """
        self.n_episodes += 1
        state = self.env.reset()
        total_reward = 0
        action_space = self.env.action_space
        while True:
            if self.render:
                self.env.render()
            action = agent.policy(state)
            # For the first noise_max_ep episodes, add random noise to the action
            if self.n_episodes < self.noise_max_ep:
                self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                # Keep the noisy action inside the action space so that the
                # action stored in the replay memory is a valid one.
                action = np.clip(action + self.ou_level,
                                 action_space.low, action_space.high)
            next_state, reward, done, info = self.env.step(action)
            if done:
                # NOTE(review): the stored sample carries no `done` flag, so the
                # agent still bootstraps from this zeroed state; consider
                # storing `done` and masking the target instead.
                next_state = np.zeros(self.env.observation_space.shape[0])
            agent.observe((state, action, reward, next_state))
            agent.experience_replay()
            state = next_state
            total_reward += reward
            if done:
                break
        print("Episode {}, total reward: {}".format(self.n_episodes, total_reward))
        return total_reward

In [None]:
# --- Configuration ---------------------------------------------------------
ENV_NAME = 'Pendulum-v0'
#ENV_NAME = 'BipedalWalker-v2'
N_EPISODES = 200  # bounded so the cell terminates under "Restart & Run All"
SEED = 0

# Seed every random number generator used by the agent, the noise and the env.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = Environment(ENV_NAME)
env.env.seed(SEED)

print(env.env.action_space)
print(env.env.action_space.low)
print(env.env.action_space.high)

n_dim_states = env.env.observation_space.shape[0]
n_actions = env.env.action_space.shape[0]
# The actor scales its tanh output by a single bound: use the first action's upper limit.
action_bound = float(env.env.action_space.high[0])
agent = Agent(n_dim_states, n_actions, action_bound)

try:
    for _ in range(N_EPISODES):
        env.run_episode(agent)
finally:
    # Release the render window / simulator even if training is interrupted.
    env.env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Box(1,)
[-2.]
[2.]
Episode 1, total reward: -1533.4360590776994
Episode 2, total reward: -1517.0047584667063
Episode 3, total reward: -1665.288999792265
Episode 4, total reward: -779.9624288557235
Episode 5, total reward: -1520.8008022947545
Episode 6, total reward: -1674.6734097273395
Episode 7, total reward: -1784.0274073041755
Episode 8, total reward: -1601.151930761113
Episode 9, total reward: -1682.0645905398578
Episode 10, total reward: -1716.1563392706562
Episode 11, total reward: -1602.0125888253792
Episode 12, total reward: -1672.9329571427959
Episode 13, total reward: -1667.4845747271097
Episode 14, total reward: -1762.8801299257789
Episode 15, total reward: -1721.4609523922593
Episode 16, total reward: -1674.6462977256915
Episode 17, total reward: -1594.462865088