In [1]:
# imports

import gym
import random as rd
from collections import deque
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# implementing DQN class

class DQN:
    """Deep Q-Network agent for Gym environments with a 1-D observation space.

    The agent keeps two Keras networks with the same architecture: ``model``
    (trained at every learning step) and ``target_model`` (a periodically
    refreshed copy used to compute the bootstrap targets). Transitions are
    stored in a bounded replay buffer and sampled uniformly for learning.

    Typical usage::

        dqn = DQN('CartPole-v0')
        dqn.initialize_network()
        dqn.train()
        dqn.playPolicy()
    """

    REPLAY_MEMORY_SIZE = 5000           # number of tuples in experience replay
    REPLAY_START_SIZE = REPLAY_MEMORY_SIZE  # transitions collected before learning starts
    EPSILON = 1                         # initial epsilon of epsilon-greedy exploration
    EPSILON_DECAY = 0.999               # exponential decay multiplier for epsilon
    EPSILON_MIN = 0.05                  # epsilon is no longer decayed once it is at or below this
    HIDDEN1_SIZE = 16                   # size of hidden layer 1
    HIDDEN2_SIZE = 16                   # size of hidden layer 2
    EPISODES_NUM = 2000                 # number of episodes to train on
    MAX_STEPS = 200                     # maximum number of steps in an episode
    LEARNING_RATE = 0.001               # learning rate for Adam
    MINIBATCH_SIZE = 10                 # size of minibatch sampled from the experience replay
    DISCOUNT_FACTOR = 0.9               # MDP's gamma
    TARGET_UPDATE_FREQ = 50             # number of steps (not episodes) between target-network syncs
    LOG_DIR = './logs'                  # directory wherein logging takes place

    def __init__(self, env):
        """Create the environment and record its observation/action sizes.

        Args:
            env: Gym environment id passed to ``gym.make`` (e.g. 'CartPole-v0').

        Raises:
            AssertionError: if the observation space is not one-dimensional.
        """
        self.env = gym.make(env)
        assert len(self.env.observation_space.shape) == 1
        self.input_size = self.env.observation_space.shape[0]   # In case of cartpole, 4 state features
        self.output_size = self.env.action_space.n              # In case of cartpole, 2 actions (right/left)
        self.model = None
        self.target_model = None
        self.replay_buffer = None
        self.epsilon = self.EPSILON

    def initialize_network(self):
        """Build and compile the online Q-network and its target-network clone.

        The network maps an observation to one Q-value per action through two
        ReLU hidden layers. It is trained with mean squared error and Adam.
        """
        self.model = keras.Sequential([
            layers.InputLayer(input_shape=(self.input_size,)),
            layers.Dense(self.HIDDEN1_SIZE, activation='relu', name='hidden1', kernel_initializer='RandomNormal'),
            layers.Dense(self.HIDDEN2_SIZE, activation='relu', name='hidden2', kernel_initializer='RandomNormal'),
            layers.Dense(self.output_size, activation='linear', name='output', kernel_initializer='RandomNormal'),
        ])
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model.compile(
            loss=keras.losses.MeanSquaredError(),
            optimizer=keras.optimizers.Adam(learning_rate=self.LEARNING_RATE),
        )
        self.model.summary()

        # The target network starts as an exact copy of the online network.
        self.target_model = keras.models.clone_model(self.model)
        self.target_model.build((None, self.input_size))
        self.target_model.compile(
            loss=keras.losses.MeanSquaredError(),
            optimizer=keras.optimizers.Adam(learning_rate=self.LEARNING_RATE),
        )
        self.target_model.set_weights(self.model.get_weights())

    def _select_action(self, state):
        """Return an epsilon-greedy action for a batched ``(1, n)`` state."""
        if rd.random() < self.epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def _learn_from_replay(self):
        """Run one Q-learning update per transition of a sampled minibatch."""
        minibatch = rd.sample(self.replay_buffer, self.MINIBATCH_SIZE)
        for st, action, reward, nst, terminal in minibatch:
            if terminal:
                y = reward
            else:
                # Bootstrap from the frozen target network.
                y = reward + self.DISCOUNT_FACTOR * np.max(self.target_model.predict(nst, verbose=0)[0])
            # Start from the current estimates for the *sampled* state so that
            # only the taken action's Q-value contributes to the loss.
            target = self.model.predict(st, verbose=0)
            target[0][action] = y
            self.model.fit(st, target, epochs=1, verbose=0)

    def train(self, episodes_num=EPISODES_NUM):
        """Train the Q-network with experience replay and a target network.

        ``initialize_network`` must have been called first. The replay buffer
        is re-created on every call. The length of each episode is printed and
        written as a TensorBoard scalar under ``LOG_DIR``.

        Args:
            episodes_num: number of episodes to run.
        """
        summary_writer = tf.summary.create_file_writer(self.LOG_DIR)

        # deque(maxlen=...) silently drops the oldest transition when full.
        self.replay_buffer = deque(maxlen=self.REPLAY_MEMORY_SIZE)
        total_steps = 0

        # Learning needs at least one minibatch, and the warm-up can never
        # exceed what the buffer is able to hold.
        warmup = max(self.MINIBATCH_SIZE, min(self.REPLAY_START_SIZE, self.REPLAY_MEMORY_SIZE))

        for episode in range(episodes_num):
            # States are kept batched, shape (1, input_size), for Keras.
            state = np.array([self.env.reset()])
            episode_length = 0

            while True:
                episode_length += 1
                total_steps += 1

                action = self._select_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.array([next_state])

                # Store the transition BEFORE advancing `state`, otherwise the
                # buffer would hold (s', a, r, s', done) instead of (s, a, r, s', done).
                self.replay_buffer.append((state, action, reward, next_state, done))
                state = next_state

                if len(self.replay_buffer) >= warmup:
                    self._learn_from_replay()
                    if self.epsilon > self.EPSILON_MIN:
                        self.epsilon *= self.EPSILON_DECAY

                if total_steps % self.TARGET_UPDATE_FREQ == 0:
                    self.target_model.set_weights(self.model.get_weights())

                if done or episode_length == self.MAX_STEPS:
                    break

            print("Training: Episode = %d, Length = %d, Global step = %d" % (episode, episode_length, total_steps))
            with summary_writer.as_default():
                # value = episode length, x-axis (step) = episode index
                tf.summary.scalar("episode length", episode_length, step=episode)

    # Simple function to visually 'test' a policy
    def playPolicy(self):
        """Render one greedy episode using the target network.

        Returns:
            Number of steps taken before the episode ended or ``MAX_STEPS``
            was reached.
        """
        done = False
        steps = 0
        state = self.env.reset()

        # we assume the CartPole task to be solved if the pole remains upright for MAX_STEPS steps
        while not done and steps < self.MAX_STEPS:
            self.env.render()
            # Keras expects a batch dimension: (input_size,) -> (1, input_size).
            q_values = self.target_model.predict(np.array([state]), verbose=0)[0]
            action = int(np.argmax(q_values))
            state, _, done, _ = self.env.step(action)
            steps += 1

        return steps

In [None]:
# Seed every source of randomness so Restart & Run All reproduces the same run.
# Seeding happens before the networks are built because weight initialisation is random.
SEED = 42
rd.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Create and initialize the model
dqn = DQN('CartPole-v0')
dqn.env.seed(SEED)                # classic Gym API, matching the 4-tuple step() used by DQN
dqn.env.action_space.seed(SEED)   # exploration uses action_space.sample()
dqn.initialize_network()

print("\nStarting training...\n")
dqn.train()
print("\nFinished training...\nCheck out some demonstrations\n")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 16)                80        
_________________________________________________________________
hidden2 (Dense)              (None, 16)                272       
_________________________________________________________________
output (Dense)               (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________

Starting training...

Training: Episode = 0, Length = 37, Global step = 37
Training: Episode = 1, Length = 15, Global step = 52
Training: Episode = 2, Length = 28, Global step = 80
Training: Episode = 3, Length = 13, Global step = 93
Training: Episode = 4, Length = 13, Global step = 106
Training: Episode = 5, Length = 19, Global step = 125
Training: Episode = 6, Length = 17, Global st

Training: Episode = 169, Length = 21, Global step = 3680
Training: Episode = 170, Length = 20, Global step = 3700
Training: Episode = 171, Length = 20, Global step = 3720
Training: Episode = 172, Length = 13, Global step = 3733
Training: Episode = 173, Length = 14, Global step = 3747
Training: Episode = 174, Length = 15, Global step = 3762
Training: Episode = 175, Length = 28, Global step = 3790
Training: Episode = 176, Length = 24, Global step = 3814
Training: Episode = 177, Length = 19, Global step = 3833
Training: Episode = 178, Length = 57, Global step = 3890
Training: Episode = 179, Length = 10, Global step = 3900
Training: Episode = 180, Length = 15, Global step = 3915
Training: Episode = 181, Length = 39, Global step = 3954
Training: Episode = 182, Length = 13, Global step = 3967
Training: Episode = 183, Length = 16, Global step = 3983
Training: Episode = 184, Length = 14, Global step = 3997
Training: Episode = 185, Length = 17, Global step = 4014
Training: Episode = 186, Length

In [None]:
# Visualize the learned behaviour for a few episodes
results = []
try:
    for i in range(50):
        episode_length = dqn.playPolicy()
        print("Test steps = ", episode_length)
        results.append(episode_length)
finally:
    # playPolicy() renders every step; release the render window even if an episode fails.
    dqn.env.close()
print("Mean steps = ", sum(results) / len(results))

print("\nFinished.")
print("\nCiao, and hasta la vista...\n")