In [1]:
import numpy as np
import gym
from collections import deque
import random
import torch.autograd
import os
import time
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F 
from torch.autograd import Variable
import sys
import pickle
import matplotlib.pyplot as plt
import pybullet as p 
import pybullet 
import pybullet_envs.gym_pendulum_envs 
import pybullet_envs.gym_locomotion_envs
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

Running on the GPU


In [2]:
# Ornstein-Ulhenbeck Process
# Taken from #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py
class OUNoise(object):
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)


# https://github.com/openai/gym/blob/master/gym/core.py
class NormalizedEnv(gym.ActionWrapper):
    """ Wrap action """

    def _action(self, action):
        act_k = (self.action_space.high - self.action_space.low)/ 2.
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k * action + act_b

    def _reverse_action(self, action):
        act_k_inv = 2./(self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k_inv * (action - act_b)
        

class Memory:
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = deque(maxlen=max_size)
    
    def push(self, state, action, reward, next_state, done):
        experience = (state, action, np.array([reward]), next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        state_batch = []
        action_batch = []
        reward_batch = []
        next_state_batch = []
        done_batch = []

        batch = random.sample(self.buffer, batch_size)

        for experience in batch:
            state, action, reward, next_state, done = experience
            state_batch.append(state)
            action_batch.append(action)
            reward_batch.append(reward)
            next_state_batch.append(next_state)
            done_batch.append(done)
        
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        return len(self.buffer)

In [7]:
class QValue(nn.Module):
    def __init__(
        self,
        input_size_state,
        input_size_action,
        hidden_sizes
    ):
        super(QValue, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.input_size = input_size_action + input_size_state
        
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(self.input_size, hidden_sizes[0]))
        self.layers.append(nn.ReLU())
        
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            self.layers.append(nn.ReLU())
        
        self.layers.append(nn.Linear(hidden_sizes[len(hidden_sizes) - 1], 1))
        
    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        
        for layer in self.layers:
            x = layer(x)
            
        return x
    
class Policy(nn.Module):
    def __init__(
        self,
        input_size_state,
        hidden_sizes,
        output_size
    ):
        super(Policy, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.input_size = input_size_state
        
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(self.input_size, hidden_sizes[0]))
        self.layers.append(nn.ReLU())
        
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            self.layers.append(nn.ReLU())
        
        self.layers.append(nn.Linear(hidden_sizes[len(hidden_sizes) - 1], output_size))
        self.layers.append(nn.Tanh())
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


In [18]:
class DDPGagent:
    def __init__(self, env, q_hidden_sizes=[32, 64, 128, 64, 32], p_hidden_sizes=[32, 64, 128, 64, 32], actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, max_memory_size=int(1e5)):
        # Params
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau

        # Networks

        self.actor = Policy(self.num_states, p_hidden_sizes, self.num_actions).to(device)
        self.actor_target = Policy(self.num_states, p_hidden_sizes, self.num_actions).to(device)
        self.critic = QValue(self.num_states, self.num_actions, q_hidden_sizes).to(device)
        self.critic_target = QValue(self.num_states, self.num_actions, q_hidden_sizes).to(device)
        
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        
        # Training
        self.memory = Memory(max_memory_size)
        self.critic_criterion  = nn.MSELoss()
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_lr_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer, step_size=10, gamma=0.8)
        self.critic_lr_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer, step_size=10, gamma=0.8)

    def get_action(self, state):
        state = torch.from_numpy(state).float().to(device)
        action = self.actor.forward(state)
        return action.cpu().detach().numpy()
    
    def get_latest_lr(self):
        return self.critic_lr_scheduler.get_last_lr()
    
    def update_lr(self):
        self.critic_lr_scheduler.step()
        self.actor_lr_scheduler.step()
    
    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.memory.sample(batch_size)
        states = torch.FloatTensor(states).to(device)
        actions = torch.FloatTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
    
        # Critic loss        
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Actor loss
        policy_loss = -self.critic.forward(states, self.actor.forward(states)).mean()
        
        # update networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward() 
        self.critic_optimizer.step()
        
        # update target networks 
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
            
    def save_agent_networks(self, prefix):
        torch.save(self.actor, prefix + '-actor.pt')
        torch.save(self.critic, prefix + '-critic.pt')
        
    def load_agent_networks(self, prefix):
        self.actor = torch.load(prefix + '-actor.pt').to(device)
        self.critic = torch.load(prefix + '-critic.pt').to(device)
        self.actor_target = self.actor
        self.critic_target = self.critic
        

In [19]:
env = gym.make('HalfCheetahBulletEnv-v0')
agent = DDPGagent(env, gamma=0.9999)
noise = OUNoise(env.action_space)
batch_size = 4096
rewards = []
avg_rewards = []
learningRates = []
episodeLengths = []
for episode in range(100):
    state = env.reset()
    noise.reset()
    episode_reward = 0
    step = 0
    done = False
    
    for i in range(300):
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        new_state, reward, done, _ = env.step(action) 
        agent.memory.push(state, action, reward, new_state, done)
        
        if len(agent.memory) > batch_size:
            agent.update(batch_size)
        
        
        state = new_state
        episode_reward += reward
        step += 1
        
        if done:
            break
        
    print("episode: {} || reward for episode: {} || average reward: {} || episode length: {}\n".format(episode, np.round(episode_reward, decimals=2), np.mean(rewards[-25:]), step))
    
    episodeLengths.append(step)
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-25:]))
    learningRates.append(agent.get_latest_lr())
    agent.update_lr()



episode: 0 || reward for episode: -369.98 || average reward: nan || episode length: 300

episode: 1 || reward for episode: -421.75 || average reward: -369.97779971624885 || episode length: 300

episode: 2 || reward for episode: -263.49 || average reward: -395.8649411113156 || episode length: 300

episode: 3 || reward for episode: -152.01 || average reward: -351.73867340926364 || episode length: 300

episode: 4 || reward for episode: -401.4 || average reward: -301.8054987290412 || episode length: 300

episode: 5 || reward for episode: -317.95 || average reward: -321.72375164210104 || episode length: 300

episode: 6 || reward for episode: -135.4 || average reward: -321.09461911074226 || episode length: 300

episode: 7 || reward for episode: -274.02 || average reward: -294.5662105529726 || episode length: 300

episode: 8 || reward for episode: -290.83 || average reward: -291.9975526751362 || episode length: 300

episode: 9 || reward for episode: -286.39 || average reward: -291.86743500855

episode: 79 || reward for episode: -447.31 || average reward: -330.71671432116904 || episode length: 300

episode: 80 || reward for episode: -435.87 || average reward: -331.1240189699578 || episode length: 300

episode: 81 || reward for episode: -448.82 || average reward: -329.59504629049593 || episode length: 300

episode: 82 || reward for episode: -441.17 || average reward: -329.0516299674213 || episode length: 300

episode: 83 || reward for episode: -460.38 || average reward: -328.62082544283277 || episode length: 300

episode: 84 || reward for episode: -443.74 || average reward: -329.1339986552921 || episode length: 300

episode: 85 || reward for episode: -188.3 || average reward: -328.7488050241988 || episode length: 300

episode: 86 || reward for episode: -161.51 || average reward: -330.77451432813956 || episode length: 300

episode: 87 || reward for episode: -253.5 || average reward: -337.39955368027864 || episode length: 300

episode: 88 || reward for episode: -329.89 || averag

In [None]:
%matplotlib notebook
fig, ax = plt.subplots(3, sharex=True)
ax[0].plot(range(len(rewards)), rewards)
ax[0].plot(range(len(avg_rewards)), avg_rewards)
ax[0].set(ylabel='Reward')
ax[1].plot(range(len(episodeLengths)), episodeLengths)
ax[2].plot(range(len(learningRates)), np.log10(learningRates))
ax[1].set(ylabel='Episode Length')
ax[2].set(ylabel='Log Learning Rate')
ax[0].legend(["Episode", "Last 25 Avg"])
plt.xlabel('Episode')
plt.show()
plt.savefig('training-InvertedDoublePendulum')

In [None]:
agent.save_agent_networks('doublePendulum')

In [None]:
agent1 = agent

In [None]:
env = gym.make("InvertedDoublePendulumBulletEnv-v0")
agent = DDPGagent(env)
agent.load_agent_networks('doublePendulum')
# env.render()
states, actions, rewards, next_states, dones = [], [], [], [], []
collectedSamples = 0
samplesToPrint = 10000
while collectedSamples < 1e6:
    state = env.reset()
    done = False
    
    step = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(np.array([action]))
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        dones.append(done)
        
        state = next_state
        step += 1
        collectedSamples += 1
        samplesToPrint -= 1
    
    if step != 1000:
        print(step)
        
    if samplesToPrint <= 0:
        print(collectedSamples)
        samplesToPrint = 10000

In [None]:
env.close()

In [None]:
actions_array = np.array(actions)
states_array = np.array(states)
next_states_array = np.array(next_states)
rewards_array = np.array(rewards)
dones_array = np.array(dones)

In [None]:
np.save("doubleInvertedPendulumDataset/actions_array", actions_array)
np.save("doubleInvertedPendulumDataset/states_array", states_array)
np.save("doubleInvertedPendulumDataset/next_states_array", next_states_array)
np.save("doubleInvertedPendulumDataset/rewards_array", rewards_array)
np.save("doubleInvertedPendulumDataset/dones_array", dones_array)

In [None]:
np.std(actions_array)