In [1]:
from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Shape of every preprocessed frame; compared against the numpy array shape
# (height, width) after resizing in AtariProcessor.process_observation.
INPUT_SHAPE = (84, 84)
# Number of consecutive frames stacked together; used for both the network's
# input shape and the replay memory's window length.
WINDOW_LENGTH = 4

In [3]:
class AtariProcessor(Processor):
    """Preprocessing hooks applied to Atari frames, state batches and rewards.

    Frames are reduced to grayscale images of shape ``INPUT_SHAPE`` and kept as
    8-bit integers; batches are converted to float32 and divided by 255 only
    when they are handed to the network; rewards are clipped to [-1, 1].
    """

    def process_observation(self, observation):
        """Resize a raw frame to ``INPUT_SHAPE`` and convert it to grayscale.

        Args:
            observation: 3-dimensional array, treated as (height, width, channel).

        Returns:
            A ``uint8`` array whose shape equals ``INPUT_SHAPE``.

        Raises:
            AssertionError: if the input is not 3-dimensional or the processed
                frame does not have shape ``INPUT_SHAPE``.
        """
        assert observation.ndim == 3                        # Expect (height, width, channel)

        # PIL's resize() takes (width, height), while INPUT_SHAPE is checked below
        # against a numpy shape, i.e. (height, width). Reversing the tuple keeps the
        # two conventions consistent for non-square shapes (no change for 84x84).
        target_size = tuple(INPUT_SHAPE[::-1])

        img = Image.fromarray(observation)                  # Array -> PIL image
        img = img.resize(target_size).convert('L')          # Resize, then convert to grayscale

        processed_observation = np.array(img)               # PIL image -> array

        assert processed_observation.shape == INPUT_SHAPE   # Guard the expected frame shape

        # Return 8-bit integers so the frames held in experience memory stay small.
        return processed_observation.astype('uint8')

    def process_state_batch(self, batch):
        """Convert a batch of frames to float32 and divide by 255.

        Args:
            batch: array of stacked frames (must support ``astype``).

        Returns:
            The batch as ``float32`` values divided by 255.
        """
        processed_batch = batch.astype('float32') / 255.

        return processed_batch

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

In [4]:
# Create the Space Invaders environment from OpenAI Gym (Atari dependency required)
# and seed both NumPy and the environment with the same fixed value.

env = gym.make('SpaceInvaders-v0')
np.random.seed(123)                  # Seed NumPy's global random generator
env.seed(123)                        # Seed the environment itself
nb_actions = env.action_space.n      # Number of discrete actions; sizes the network's output layer

In [19]:
# Network input: WINDOW_LENGTH stacked frames, each of shape INPUT_SHAPE
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

# Build the convolutional Q-network
model = Sequential()

# Move the frame axis to the end, (window, height, width) -> (height, width, window),
# so the stacked frames act as the channel axis of the convolutions.
model.add(Permute((2, 3, 1), input_shape=input_shape))

model.add(Convolution2D(32, (8, 8), strides=(4, 4), activation='relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2), activation='relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1), activation='relu'))

model.add(Flatten())
model.add(Dense(512, activation='relu'))

# Output layer: one linear unit per action in the action space
model.add(Dense(nb_actions, activation='linear'))

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_2 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               1606144   
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 3078      
Total para

In [20]:
# Replay memory for experience replay: capped at `limit` entries and configured
# with the same window length as the network input (WINDOW_LENGTH frames).

memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

In [21]:
# Epsilon-greedy exploration policy with a linearly annealed epsilon (Mnih et al., 2015).
# Alternative to try: Boltzmann Q policy.
# Argument meanings below follow the keyword names — confirm against the keras-rl docs.

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 
                              attr='eps',          # Attribute of the wrapped policy that is annealed
                              value_max=1.,        # Upper end of the annealing range
                              value_min=.1,        # Lower end of the annealing range
                              value_test=.05,      # Presumably the value used while testing
                              nb_steps=1000000)    # Presumably the number of steps the annealing spans

In [26]:
# Instantiate the AtariProcessor class

processor = AtariProcessor()

# Build the DQN agent from the network, policy, memory and processor created above


dqn = DQNAgent(model=model,                 # Keras Q-network (not compiled here; the agent is compiled later via dqn.compile)
               nb_actions=nb_actions,       # Number of discrete actions
               policy=policy,               # Exploration policy (alternative to try: Boltzmann Q policy)
               memory=memory,               # Replay memory (alternative to try: episode parameter memory)
               processor=processor,         # AtariProcessor instance for frames, batches and rewards
               nb_steps_warmup=50000,       # Warmup steps before learning starts — presumably to pre-fill the replay memory; confirm
               gamma=.99,                   # Discount factor
               train_interval=4,            # Training interval, presumably in steps — confirm against keras-rl
               delta_clip=1.,               # NOTE(review): not reward clipping (rewards are clipped in AtariProcessor.process_reward); presumably bounds the error term of the loss — confirm
              )




In [16]:
# Double DQN
#
# This agent gets its own annealed exploration policy. keras-rl binds a policy
# object to the agent it is assigned to, so passing the shared `policy` here
# would re-point it at this agent and the epsilon schedule used while training
# `dqn` would no longer follow `dqn`'s own step counter.
# NOTE(review): `model` and `memory` are still shared with the other agents;
# give this agent its own copies before training it next to them.

double_dqn_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                         attr='eps',
                                         value_max=1.,
                                         value_min=.1,
                                         value_test=.05,
                                         nb_steps=1000000)

double_dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=double_dqn_policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50000,
               gamma=.99, 
               target_model_update=1e-2,
               train_interval=4,
               delta_clip=1.,
               enable_double_dqn=True,
              )

In [22]:
# Dueling DQN
#
# This agent gets its own annealed exploration policy. keras-rl binds a policy
# object to the agent it is assigned to, so passing the shared `policy` here
# would re-point it at this agent and the epsilon schedule used while training
# `dqn` would no longer follow `dqn`'s own step counter.
# NOTE(review): `model` and `memory` are still shared with the other agents;
# give this agent its own copies before training it next to them.

dueling_dqn_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                          attr='eps',
                                          value_max=1.,
                                          value_min=.1,
                                          value_test=.05,
                                          nb_steps=1000000)

dueling_dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=dueling_dqn_policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50000,
               gamma=.99, 
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.,
               enable_dueling_network=True,
               dueling_type='avg'
              )

In [27]:
# Compile the DQN agent with the Adam optimizer (learning rate 0.00025),
# reporting mean absolute error as an additional metric.

dqn.compile(optimizer=Adam(lr=.00025), metrics=['mae'])

In [None]:
# Train the agent on the environment for 1,750,000 steps.
# NOTE(review): no callbacks are passed and nothing is saved after training, even
# though FileLogger and ModelIntervalCheckpoint are imported — consider checkpointing
# and saving weights so this long run does not have to be repeated.

dqn.fit(env, nb_steps=1750000)   # optionally add visualize=True

Training for 1750000 steps ...
Interval 1 (0 steps performed)

In [25]:
# Evaluate the agent for 10 episodes with visualization enabled.
# NOTE(review): this cell's execution count (25) is lower than those of the cells that
# build and compile `dqn` (26, 27), so the recorded output presumably belongs to an
# earlier `dqn` object — re-run top to bottom to confirm.

dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 3.000, steps: 654
Episode 2: reward: 11.000, steps: 807
Episode 3: reward: 8.000, steps: 812
Episode 4: reward: 3.000, steps: 475
Episode 5: reward: 4.000, steps: 625
Episode 6: reward: 9.000, steps: 688
Episode 7: reward: 5.000, steps: 652
Episode 8: reward: 12.000, steps: 826
Episode 9: reward: 2.000, steps: 632
Episode 10: reward: 3.000, steps: 643


<keras.callbacks.History at 0x24280aadc50>

##  || DQN algorithm ||



    initialize replay memory
    initialize Q-Value function with random weights
    sample initial state from environment

    Keep repeating:

        choose an action to perform:

            with probability ε select a random action
            otherwise select a = argmax_a Q(s, a)

        execute chosen action
        collect reward and next state
        save experience <s, a, r, s'> in replay memory

        sample random transitions <s, a, r, s'> from replay memory
        compute target variable for each mini-batch transition:

            if s' is terminal state then target = r
            otherwise target = r + γ max_a' Q(s', a')

        train the network with loss (target - Q(s, a))^2

        s = s'

    until done
            
    