# First DQN

Implementing the DQN algorithm from "Playing Atari with Deep Reinforcement Learning" (https://arxiv.org/abs/1312.5602).

* Try one of the simplest environments: CartPole

In [1]:
import numpy as np
import gym
# Build the environment instance that every later cell operates on.
ENV_ID = 'CartPole-v0'
env = gym.make(ENV_ID)

[2017-02-08 13:13:27,366] Making new env: CartPole-v0


* Set all the seeds to 42:

In [2]:
# Use one seed value for the environment, NumPy's global RNG and gym's spaces.
SEED = 42
env.seed(SEED)
np.random.seed(SEED)
gym.spaces.seed(SEED)

* Play one game with random actions:

In [3]:
observation = env.reset()

done = False
total_reward = 0
while not done:
    env.render(close=True)
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    
print 'total reward:', total_reward



total reward: 12.0


* Make an agent:

In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Reshape
from keras.layers import Activation
from keras.layers import Dropout
from keras.optimizers import SGD
from keras import backend as K

class Agent:
    """Q-value approximator for the module-level ``env``.

    The network is a small MLP: one 20-unit sigmoid hidden layer followed by a
    linear output layer with one unit per discrete action. It is compiled with
    mean-squared-error loss and plain SGD (lr=0.01).
    """

    def __init__(self):
        # Possible actions
        self.actions = np.arange(env.action_space.n)

        # NN
        self.nn = Sequential()

        # The input shape is taken from the environment's declared observation
        # space, so building the agent does not depend on an `observation`
        # variable left over from a previously executed cell.
        self.nn.add( Dense(output_dim = 20, input_shape = env.observation_space.shape ) )
        self.nn.add( Activation('sigmoid') )
        self.nn.add( Dense(output_dim = env.action_space.n ) )
        self.nn.add( Activation('linear') )

        self.nn.compile(loss='mse', optimizer=SGD(lr=0.01) )

    def Q(self, observation):
        """Return the predicted Q-values (one per action) for a single observation."""
        # Add a leading batch axis for the network, then strip it from the result.
        return self.nn.predict(observation[None,:])[0]

    def Q_batch(self, observation):
        """Return the predicted Q-values for a batch of observations (one row each)."""
        return self.nn.predict_on_batch(observation)

    def sample_action(self, observation):
        """Return the greedy action, i.e. the index of the largest predicted Q-value."""
        return np.argmax( self.Q(observation) )
        

Using Theano backend.


* Define a function to play one game:

In [5]:
import copy

def play(agent, env, show = False, save = False, max_steps = 2000):
    """Play one episode greedily with respect to ``agent.Q`` and return the total reward.

    Parameters
    ----------
    agent : object exposing ``Q(observation)`` that returns one value per action.
    env : environment exposing ``reset()``, ``step(action)``, ``render(mode)``
        and, when ``save`` is true, ``monitor.start(...)`` / ``monitor.close()``.
    show : if true, render every step in "human" mode.
    save : if true, record the episode with the environment monitor into
        the 'cartpole-experiment' directory (overwriting previous recordings).
    max_steps : hard cap on the number of environment steps, so that an
        episode that never reports ``done`` still terminates.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    if save:
        env.monitor.start('cartpole-experiment', force=True)

    total_reward = 0.0
    try:
        observation = env.reset()
        for _ in range(max_steps):
            action = np.argmax( agent.Q(observation) )

            if show:
                env.render("human")

            observation, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                break
    finally:
        # Always close the monitor, even if the rollout raised, so that the
        # recording is finalised and the monitor is not left open.
        if save:
            env.monitor.close()

    return total_reward

* Test the untrained agent:

In [6]:
agent = Agent()
print 'total reward:', play(agent, env)

total reward: 10.0


* Train the agent:

In [7]:
%%time

# Train the global `agent` with epsilon-greedy exploration and experience replay:
# every environment step stores one transition, samples a minibatch from the
# replay memory and performs one gradient update on the agent's network.

# Re-seed so that the training run starts from the same random state each time
# this cell is executed.
env.seed(42)
np.random.seed(42)
gym.spaces.seed(42)

from collections import deque

N_steps = 10000     # total number of environment steps (= number of training updates)
N = 50000           # replay memory capacity; larger than N_steps, so nothing is evicted in this run
batch_size = 32     # maximum minibatch size (smaller while the memory holds fewer transitions)
gamma = 0.99        # discount factor applied to the bootstrapped next-state value

# Replay memory: once `N` transitions are stored, appending drops the oldest one.
D = deque(maxlen=N)

step = 0
while step<N_steps:
    # Exploration rate: decays linearly from 1.0 and is floored at 0.1 (reached at 30% of N_steps).
    # It is recomputed only here, at the start of an episode, and held fixed during the episode.
    eps = max( 0.1, 1.0 - 3.0*step/float(N_steps) )
    
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done and step<N_steps: 
        # Epsilon-greedy: with probability eps take a uniformly random action,
        # otherwise take the agent's greedy (argmax-Q) action.
        if np.random.binomial(1, p=eps) == 1:
            action = np.random.choice( env.action_space.n )
        else:
            action = agent.sample_action(obs)
            
        obs_p, reward, done, info = env.step( action )
        total_reward += reward

        # Store the transition (observation, action, reward, next observation, terminal flag).
        D.append( {'obs':obs, 'action':action, 'reward':reward, 'obs_p':obs_p, 'done':done} )
            
        obs = obs_p

        # Sample a minibatch of stored transitions without replacement.
        batch = np.random.choice( D , size=min(batch_size, len(D)), replace=False )
        
        batch_obs = []
        batch_obs_p = []
        for b in batch:
            batch_obs.append(b['obs'])
            batch_obs_p.append(b['obs_p'])
        
        batch_obs = np.array(batch_obs)
        batch_obs_p = np.array(batch_obs_p)
        
        # Bootstrap value: max over actions of the predicted Q-values for the next
        # observations, computed with the same network that is being trained.
        Q_pred = np.max( agent.Q_batch(batch_obs_p), axis = 1 )
        
        # Start the targets from the network's current predictions so that only the
        # entry of the action actually taken differs from the prediction (the other
        # actions contribute zero error to the MSE loss). Same trick as in keras-rl.
        y = agent.Q_batch(batch_obs)
        for i, b in enumerate(batch):            
            # Target for the taken action: reward, plus the discounted bootstrap
            # value unless the transition ended the episode.
            y[i, b['action'] ] = b['reward']
            if not b['done'] :
                y[i, b['action'] ] += gamma*Q_pred[i]
            
        # One gradient update on this minibatch.
        agent.nn.train_on_batch( batch_obs , y )
        
        step += 1
        
    # Per-episode log: cumulative step count, the eps used for this episode, and its return.
    print 'step:', step, 'eps:', eps, 'total reward:', total_reward

step: 27 eps: 1.0 total reward: 27.0
step: 39 eps: 0.973 total reward: 12.0
step: 61 eps: 0.961 total reward: 22.0
step: 78 eps: 0.939 total reward: 17.0
step: 95 eps: 0.922 total reward: 17.0
step: 110 eps: 0.905 total reward: 15.0
step: 120 eps: 0.89 total reward: 10.0
step: 146 eps: 0.88 total reward: 26.0
step: 162 eps: 0.854 total reward: 16.0
step: 184 eps: 0.838 total reward: 22.0
step: 199 eps: 0.816 total reward: 15.0
step: 216 eps: 0.801 total reward: 17.0
step: 240 eps: 0.784 total reward: 24.0
step: 266 eps: 0.76 total reward: 26.0
step: 280 eps: 0.734 total reward: 14.0
step: 366 eps: 0.72 total reward: 86.0
step: 391 eps: 0.634 total reward: 25.0
step: 435 eps: 0.609 total reward: 44.0
step: 522 eps: 0.565 total reward: 87.0
step: 611 eps: 0.478 total reward: 89.0
step: 738 eps: 0.389 total reward: 127.0
step: 854 eps: 0.262 total reward: 116.0
step: 1008 eps: 0.146 total reward: 154.0
step: 1132 eps: 0.1 total reward: 124.0
step: 1164 eps: 0.1 total reward: 32.0
step: 11

* Test the trained agent:

In [8]:
print 'total reward:', play(agent, env, save =  True, show = True)

total reward: 240.0
