In [1]:
from collections import namedtuple, deque
import random
import gym
import numpy as np
import copy

import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.optim import Adam

In [2]:
# Environment shared by the training and evaluation cells below.
env = gym.make('CartPole-v1')

In [3]:
# Number of actions and shape of a single observation; they size the
# Q-network's output and input layers further down.
action_n = env.action_space.n
observation_shape = env.observation_space.shape

In [4]:
def clip_grads(net, low=-10, high=10):
    """Clamp every gradient of ``net`` element-wise into ``[low, high]``.

    The clamp is applied in place. Parameters that currently have no
    gradient (``.grad is None``) are skipped.

    Args:
        net: object exposing ``parameters()`` (e.g. an ``nn.Module``).
        low: lower bound for each gradient element.
        high: upper bound for each gradient element.
    """
    for weight in net.parameters():
        if weight.grad is None:
            continue
        weight.grad.data.clamp_(low, high)
        
# Pick the implementation of ``to_var`` once, based on whether CUDA is
# available at the time this cell runs.
if not torch.cuda.is_available():
    def to_var(x, requires_grad=False, gpu=None):
        """Wrap tensor ``x`` in a ``Variable`` (CPU build; ``gpu`` is ignored)."""
        wrapped = Variable(x, requires_grad=requires_grad)
        return wrapped
else:
    def to_var(x, requires_grad=False, gpu=None):
        """Move tensor ``x`` to CUDA device ``gpu`` and wrap it in a ``Variable``.

        ``gpu=None`` is forwarded unchanged to ``Tensor.cuda``.
        """
        on_device = x.cuda(gpu)
        return Variable(on_device, requires_grad=requires_grad)

In [5]:
class DoubleDQN:
    """Double DQN agent with experience replay and epsilon-greedy exploration.

    The online network (``model``) chooses the greedy next action, while a
    separately held copy (``target_model``) supplies the value of that action
    when the TD target is built. ``update_target_model`` copies the online
    weights into the target copy.
    """

    def __init__(self, model, gamma=0.95, learning_rate=1.e-4, memory_size=20000, action_n=action_n, batch_size=64):
        """Set up networks, replay memory, loss and optimizer.

        Args:
            model: network mapping a batch of observations to one Q-value per
                action. When CUDA is available it is moved to the GPU in place.
            gamma: discount factor applied to the bootstrapped next-state value.
            learning_rate: Adam step size.
            memory_size: maximum number of transitions kept in replay memory.
            action_n: number of available actions.
            batch_size: number of transitions sampled per training step.
        """
        # to_var() places every input tensor on the GPU when CUDA is available,
        # so the network must live there as well or the first forward pass
        # fails with a device mismatch. Move it before it is copied and before
        # the optimizer captures its parameters.
        if torch.cuda.is_available():
            model = model.cuda()
        self.model = model  # online (actor) network
        self.target_model = copy.deepcopy(model)
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.action_n = action_n
        self.batch_size = batch_size
        # Huber loss; nn.MSELoss() is a drop-in alternative.
        self.loss_fn = nn.SmoothL1Loss()
        self.optimizer = Adam(model.parameters(), lr=learning_rate)
        # Exploration probability per episode index: starts at 0.95 and decays
        # exponentially towards 0.05.
        self.episode2thresh = lambda i: 0.05 + 0.9 * np.exp(-1. * i / 100)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def select_action(self, obs, episode=np.inf):
        """Return an epsilon-greedy action for observation ``obs``.

        Args:
            obs: a single observation as a NumPy array.
            episode: episode index fed to the exploration schedule; the default
                of infinity yields the schedule's floor value.

        Returns:
            The chosen action as a plain Python ``int``.
        """
        thresh = self.episode2thresh(episode)
        if np.random.random() < thresh:
            return int(np.random.randint(self.action_n))
        return self.play(obs)

    def play(self, obs):
        """Return the greedy action (as a Python ``int``) for observation ``obs``."""
        state = to_var(torch.from_numpy(obs).float().unsqueeze(0))
        q_values = self.get_q_value(state)
        # .item() yields a Python scalar; indexing with [0] would return a
        # 0-dim tensor on current PyTorch versions.
        return int(q_values.max(1)[1].item())

    def get_q_value(self, state):
        """Evaluate the online network on ``state`` without tracking gradients."""
        with torch.no_grad():
            values = self.model(state)
        return values

    def get_target_q_value(self, state):
        """Evaluate the target network on ``state`` without tracking gradients."""
        with torch.no_grad():
            values = self.target_model(state)
        return values

    def memorize(self, state, action, next_state, reward):
        """Append one transition to replay memory.

        ``next_state`` is ``None`` for a transition that ended the episode.
        """
        self.memory.append((state, action, next_state, reward))

    def replay(self, batch_size):
        """Sample ``batch_size`` transitions and return the Double DQN loss.

        The target for each transition is ``reward + gamma * Q_target(s', a*)``
        with ``a* = argmax_a Q_online(s', a)``; transitions whose next state is
        ``None`` use a bootstrapped value of zero.
        """
        batch = random.sample(self.memory, batch_size)

        state_batch = to_var(torch.from_numpy(
            np.asarray([b[0] for b in batch], dtype=np.float32)))
        # int() also accepts actions that were stored as 0-dim tensors or
        # NumPy integers.
        action_batch = to_var(torch.tensor([[int(b[1])] for b in batch], dtype=torch.long))
        reward_batch = to_var(torch.tensor([[float(b[3])] for b in batch], dtype=torch.float32))

        curr_values = self.model(state_batch).gather(1, action_batch)

        # Positions in the batch that have a real next state.
        non_final_idx = [i for i, b in enumerate(batch) if b[2] is not None]
        next_values = to_var(torch.zeros(batch_size, 1).float())
        # Guard: a batch made only of terminal transitions has nothing to
        # bootstrap from (and stacking an empty list would raise).
        if non_final_idx:
            next_state_batch = to_var(torch.from_numpy(
                np.asarray([batch[i][2] for i in non_final_idx], dtype=np.float32)))
            # Online network selects the action, target network scores it.
            next_action_batch = self.get_q_value(next_state_batch).max(1)[1].unsqueeze(-1)
            next_state_target_q = self.get_target_q_value(next_state_batch)
            index = to_var(torch.tensor(non_final_idx, dtype=torch.long))
            next_values[index] = next_state_target_q.gather(1, next_action_batch)

        expected_values = next_values * self.gamma + reward_batch

        return self.loss_fn(curr_values, expected_values)

    def train(self):
        """Run one optimisation step on a sampled batch.

        Returns:
            The loss as a Python ``float``, or ``None`` (after printing a
            notice) while replay memory holds no more than ``batch_size``
            transitions.
        """
        if len(self.memory) > self.batch_size:
            loss = self.replay(self.batch_size)
            self.optimizer.zero_grad()
            loss.backward()
            # Optional: clip_grads(self.model, -5, 5) before stepping.
            self.optimizer.step()
            return loss.item()
        print("Not enough experience.")
        return None

In [6]:
# Q-network: observation -> 64 -> 32 -> one Q-value per action.
net = nn.Sequential(
    nn.Linear(observation_shape[0], 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, action_n),
)
# Agent wrapping the network; hyper-parameters are set explicitly here.
agent = DoubleDQN(
    model=net,
    gamma=0.8,
    learning_rate=1.e-4,
    memory_size=10000,
    batch_size=64,
)

In [7]:
# for episode in range(200):
#     obs = env.reset()
#     for _ in range(10000): # cap on steps per episode
#         action = agent.select_action(obs,episode)
#         next_obs, reward, done, _  = env.step(action)
#         if done:
#             agent.memorize(obs, action, None, reward)
#             break
#         else:
#             agent.memorize(obs, action, next_obs, reward)
#             obs = next_obs

In [8]:
# Train until the smoothed episode return exceeds the reward threshold from
# the environment spec, or the episode budget is used up.
# NOTE(review): written against the older gym API where reset() returns only
# the observation and step() returns four values - confirm the installed version.
count_gamma = 0.5  # weight of the previous value in the running-reward average
running_reward = 0
for episode in range(1000):
    obs = env.reset()
    total_reward = 0
    for _ in range(10000):  # hard cap on steps per episode
        # Coerce to a plain int so env.step() and the replay memory never see
        # a 0-dim tensor or NumPy scalar.
        action = int(agent.select_action(obs, episode))
        next_obs, reward, done, _ = env.step(action)
        # env.render()  # uncomment to watch training
        total_reward += reward
        if done:
            # A next state of None marks the transition as terminal.
            agent.memorize(obs, action, None, reward)
            # Sync the target network once per finished episode.
            agent.update_target_model()
            break
        agent.memorize(obs, action, next_obs, reward)
        obs = next_obs
        train_loss = agent.train()
    running_reward = count_gamma * running_reward + (1 - count_gamma) * total_reward
    print(episode, total_reward, running_reward)
    if running_reward > env.spec.reward_threshold:
        break
print("Finished: %s@%s" % (running_reward, episode))

Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough experience.
Not enough 

In [9]:
# Write the network's weights (state dict only, not the whole module) to disk.
torch.save(net.state_dict(), "double_dqn.pth")

In [10]:
# Roll out the greedy policy (no exploration) for 20 rendered episodes and
# print each episode's total reward.
for i in range(20):
    obs = env.reset()
    total_reward = 0
    for _ in range(10000):  # hard cap on steps per episode
        # Coerce to a plain int so env.step() never receives a 0-dim tensor.
        action = int(agent.play(obs))
        obs, reward, done, _ = env.step(action)
        env.render()
        total_reward += reward
        if done:
            break
    print("Encore: %s" % total_reward)

Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 492.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 500.0
Encore: 484.0
Encore: 500.0
Encore: 500.0
Encore: 500.0


# Double DQN is much better than vanilla DQN.