# First DQN

Implementing the DQN algorithm from "Playing Atari with Deep Reinforcement Learning" (https://arxiv.org/abs/1312.5602).

* Try one of the simplest environments: CartPole

In [1]:
import numpy as np
import gym
# Create the CartPole-v0 environment; the cells below all use this module-level `env`.
env = gym.make('CartPole-v0')

[2017-02-08 11:23:57,896] Making new env: CartPole-v0


* Set all the seeds to 42:

In [2]:
# Seed the environment, NumPy's global generator and gym.spaces with one shared value.
SEED = 42
env.seed(SEED)
np.random.seed(SEED)
gym.spaces.seed(SEED)

* Play one game with random actions:

In [3]:
observation = env.reset()

done = False
total_reward = 0
while not done:
    env.render(close=True)
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward
    
print 'total reward:', total_reward



total reward: 12.0


* Make an agent:

In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Reshape
from keras.layers import Activation
from keras.layers import Dropout
from keras.optimizers import SGD
from keras import backend as K

class Agent:
    """Greedy Q-value agent backed by a small fully connected Keras network.

    The network maps one observation to one Q-value per discrete action: a
    20-unit sigmoid hidden layer followed by a linear output layer, compiled
    with mean-squared-error loss and SGD (lr=0.1).

    Construction reads the module-level ``env`` for both the number of actions
    and the observation shape.
    """

    def __init__(self):
        # Possible actions: 0 .. n-1
        self.actions = np.arange(env.action_space.n)

        # Size the input layer from the environment itself. Reading the shape
        # from a module-level `observation` would make construction depend on
        # whichever cell happened to assign that name last.
        input_shape = env.observation_space.shape

        # NN
        self.nn = Sequential()

        self.nn.add( Dense(output_dim = 20, input_shape = input_shape ) )
        self.nn.add( Activation('sigmoid') )
        self.nn.add( Dense(output_dim = env.action_space.n ) )
        self.nn.add( Activation('linear') )

        self.nn.compile(loss='mse', optimizer=SGD(lr=0.1) )

    def Q(self, observation):
        """Return the network's Q-values for a single observation.

        A leading batch axis is added before calling ``predict`` and the first
        (only) row of the prediction is returned.
        """
        return self.nn.predict(observation[None,:])[0]

    def Q_batch(self, observation):
        """Return the network's Q-values for a batch of observations.

        ``observation`` is handed to ``predict_on_batch`` unchanged
        (assumes one observation per row -- TODO confirm with callers).
        """
        return self.nn.predict_on_batch(observation)

    def sample_action(self, observation):
        """Return the index of the action with the largest predicted Q-value.

        Despite the name, the choice is purely greedy; no randomness is
        involved here.
        """
        return np.argmax( self.Q(observation) )
        

Using Theano backend.


* Define a function to play one game:

In [5]:
import copy

def play(agent, env, first_observation, show = False, max_steps = 2000):
    """Run one greedy episode on a private copy of ``env`` and return its reward.

    Args:
        agent: object exposing ``Q(observation)``; at every step the action
            with the largest returned value is taken.
        env: environment to deep-copy. Only the copy is stepped and rendered,
            so ``env`` itself is left where the caller put it.
        first_observation: observation the episode starts from (presumably
            the value returned by the caller's preceding ``env.reset()``).
        show: when true, ``render()`` is called on the copy before each step.
        max_steps: upper bound on the number of steps taken, so an agent that
            never fails still terminates. Defaults to 2000.

    Returns:
        The sum of the rewards collected, accumulated from 0.0.
    """
    sandbox = copy.deepcopy(env)

    observation = first_observation
    total_reward = 0.0
    for _ in range(max_steps):
        action = np.argmax(agent.Q(observation))

        if show:
            sandbox.render()

        observation, reward, done, info = sandbox.step(action)
        total_reward += reward

        if done:
            break

    return total_reward

* Test the untrained agent:

In [6]:
agent = Agent()
first_observation = env.reset()
print 'Q for the initial state:', agent.Q( first_observation )
print 'total reward:', play(agent, env, first_observation, show = False)

Q for the initial state: [ 0.40602553 -0.40782896]
total reward: 10.0


* Train the agent:

In [7]:
%%time

from collections import deque

N_steps = 3000
N = 50000
batch_size = 32
gamma = 0.99

D = deque(maxlen=N)

step = 0
while step<N_steps:
    eps = max( 0.1, 1.0 - 5.0*step/float(N_steps) )
    
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done and step<N_steps: 
        if np.random.binomial(1, p=eps) == 1:
            action = np.random.choice( env.action_space.n )
        else:
            action = agent.sample_action(obs)
            
        obs_p, reward, done, info = env.step( action )
        total_reward += reward

        D.append( {'obs':obs, 'action':action, 'reward':reward, 'obs_p':obs_p, 'done':done} )
            
        obs = obs_p

        batch = np.random.choice( D , size=min(batch_size, len(D)), replace=False )
        
        batch_obs = []
        batch_obs_p = []
        for b in batch:
            batch_obs.append(b['obs'])
            batch_obs_p.append(b['obs_p'])
        
        batch_obs = np.array(batch_obs)
        batch_obs_p = np.array(batch_obs_p)
        
        Q_pred = np.max( agent.Q_batch(batch_obs_p), axis = 1 )
        
        #So cheap, same trick as in keras-rl
        y = agent.Q_batch(batch_obs)
        for i, b in enumerate(batch):            
            y[i, b['action'] ] = b['reward']
            if not b['done'] :
                y[i, b['action'] ] += gamma*Q_pred[i]
            
        agent.nn.train_on_batch( batch_obs , y )
        
        step += 1
        
    print 'step:', step, 'eps:', eps, 'total reward:', total_reward

step: 15 eps: 1.0 total reward: 15.0
step: 31 eps: 0.975 total reward: 16.0
step: 69 eps: 0.948333333333 total reward: 38.0
step: 81 eps: 0.885 total reward: 12.0
step: 92 eps: 0.865 total reward: 11.0
step: 106 eps: 0.846666666667 total reward: 14.0
step: 115 eps: 0.823333333333 total reward: 9.0
step: 196 eps: 0.808333333333 total reward: 81.0
step: 213 eps: 0.673333333333 total reward: 17.0
step: 248 eps: 0.645 total reward: 35.0
step: 279 eps: 0.586666666667 total reward: 31.0
step: 324 eps: 0.535 total reward: 45.0
step: 356 eps: 0.46 total reward: 32.0
step: 434 eps: 0.406666666667 total reward: 78.0
step: 533 eps: 0.276666666667 total reward: 99.0
step: 778 eps: 0.111666666667 total reward: 245.0
step: 1104 eps: 0.1 total reward: 326.0
step: 1275 eps: 0.1 total reward: 171.0
step: 1603 eps: 0.1 total reward: 328.0
step: 1905 eps: 0.1 total reward: 302.0
step: 2226 eps: 0.1 total reward: 321.0
step: 2248 eps: 0.1 total reward: 22.0
step: 2351 eps: 0.1 total reward: 103.0
step: 25

* Test the trained agent:

In [8]:
first_observation = env.reset()
print 'Q for the initial state:', agent.Q( first_observation )
print 'total reward:', play(agent, env, first_observation, show = True)

Q for the initial state: [ 214.53590393  218.30377197]
total reward: 2000.0
