# First DQN

Implementing https://arxiv.org/abs/1312.5602 algorithm

* Try one of the simplest environment: Cart pole

In [1]:
import numpy as np
import gym
env = gym.make('CartPole-v0')

[2017-02-08 16:24:11,495] Making new env: CartPole-v0


* Set all the seeds to 42:

In [2]:
env.seed(42)
np.random.seed(42)
gym.spaces.seed(42)

* Play one randome game with random actions:

In [3]:
observation = env.reset()

done = False
total_reward = 0
while not done:
    env.render(close=True)
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    
print 'total reward:', total_reward



total reward: 12.0


In [4]:
def clone_model(model):
    # Requires Keras 1.0.7 since get_config has breaking changes.
    config = {
        'class_name': model.__class__.__name__,
        'config': model.get_config(),
    }
    clone = model_from_config(config)
    clone.set_weights(model.get_weights())
    return clone

* Make an agent:

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Reshape
from keras.layers import Activation
from keras.layers import Dropout
from keras.optimizers import SGD
from keras import backend as K

def make_model():
    model = Sequential()
    model.add( Dense(output_dim = 20, input_shape = observation.shape ) )
    model.add( Activation('sigmoid') )
    model.add( Dense(output_dim = env.action_space.n ) )
    model.add( Activation('linear') )

    model.compile(loss='mse', optimizer=SGD(lr=0.01) )
    
    return model

class Agent:
    def __init__(self):
        #Possible actions
        self.actions = np.arange(env.action_space.n)
        
        #NN
        self.nn = make_model()
        self.nn_target = make_model()
        
    def Q(self, observation, target = False):
        model = self.nn
        if target : model = self.nn_target
        return model.predict(observation[None,:])[0]
    
    def Q_batch(self, observation, target = False):
        model = self.nn
        if target : model = self.nn_target
        return model.predict_on_batch(observation)
    
    def sample_action(self, observation):
        return np.argmax( self.Q(observation) )
    
    def update_target(self):
        self.nn_target.set_weights(self.nn.get_weights())
        

Using Theano backend.


* Define a function to play one game:

In [6]:
import copy

def play(agent, env, show = False, save = False): 
    if save : env.monitor.start('cartpole-experiment', force=True)

    done = False
    observation = env.reset()
    total_reward = 0.0
    count = 0
    while not done and count<2000: 
        q = agent.Q(observation)
        action = np.argmax( q )
        
        if show: 
            env.render("human")
        
        observation, reward, done, info = env.step(action)
        
        total_reward += reward
        count += 1
    
    if save : env.monitor.close()
            
    return total_reward

* Test the untrained agent:

In [7]:
agent = Agent()
print 'total reward:', play(agent, env)

total reward: 10.0


* Train the agent:

In [8]:
%%time

env.seed(42)
np.random.seed(42)
gym.spaces.seed(42)

from collections import deque

N_steps = 10000
N = 50000
batch_size = 32
gamma = 0.99
C = 20

D = deque(maxlen=N)

step = 0
while step<N_steps:
    eps = max( 0.1, 1.0 - 3.0*step/float(N_steps) )
    
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done and step<N_steps: 
        if np.random.binomial(1, p=eps) == 1:
            action = np.random.choice( env.action_space.n )
        else:
            action = agent.sample_action(obs)
            
        obs_p, reward, done, info = env.step( action )
        total_reward += reward

        D.append( {'obs':obs, 'action':action, 'reward':reward, 'obs_p':obs_p, 'done':done} )
            
        obs = obs_p

        batch = np.random.choice( D , size=min(batch_size, len(D)), replace=False )
        
        batch_obs = []
        batch_obs_p = []
        for b in batch:
            batch_obs.append(b['obs'])
            batch_obs_p.append(b['obs_p'])
        
        batch_obs = np.array(batch_obs)
        batch_obs_p = np.array(batch_obs_p)
        
        Q_pred = np.max( agent.Q_batch(batch_obs_p, target=True), axis = 1 )
        
        #So cheap, same trick as in keras-rl
        y = agent.Q_batch(batch_obs)
        for i, b in enumerate(batch):            
            y[i, b['action'] ] = b['reward']
            if not b['done'] :
                y[i, b['action'] ] += gamma*Q_pred[i]
            
        agent.nn.train_on_batch( batch_obs , y )
        
        if step%C == 0:
            agent.update_target()
        
        step += 1
        
    print 'step:', step, 'eps:', eps, 'total reward:', total_reward

step: 27 eps: 1.0 total reward: 27.0
step: 39 eps: 0.9919 total reward: 12.0
step: 61 eps: 0.9883 total reward: 22.0
step: 79 eps: 0.9817 total reward: 18.0
step: 91 eps: 0.9763 total reward: 12.0
step: 100 eps: 0.9727 total reward: 9.0
step: 113 eps: 0.97 total reward: 13.0
step: 125 eps: 0.9661 total reward: 12.0
step: 137 eps: 0.9625 total reward: 12.0
step: 148 eps: 0.9589 total reward: 11.0
step: 161 eps: 0.9556 total reward: 13.0
step: 176 eps: 0.9517 total reward: 15.0
step: 192 eps: 0.9472 total reward: 16.0
step: 202 eps: 0.9424 total reward: 10.0
step: 239 eps: 0.9394 total reward: 37.0
step: 272 eps: 0.9283 total reward: 33.0
step: 283 eps: 0.9184 total reward: 11.0
step: 323 eps: 0.9151 total reward: 40.0
step: 346 eps: 0.9031 total reward: 23.0
step: 368 eps: 0.8962 total reward: 22.0
step: 391 eps: 0.8896 total reward: 23.0
step: 404 eps: 0.8827 total reward: 13.0
step: 418 eps: 0.8788 total reward: 14.0
step: 433 eps: 0.8746 total reward: 15.0
step: 474 eps: 0.8701 total

* Test the trained agent:

In [10]:
print 'total reward:', play(agent, env, save =  False, show = True)

total reward: 759.0
