In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from collections import deque
# Record the library versions this notebook was executed with.
for label, module in (("Gym:", gym), ("Tensorflow:", tf)):
    print(label, module.__version__)

Instructions for updating:
non-resource variables are not supported in the long term
Gym: 0.21.0
Tensorflow: 2.6.0


In [2]:
# Build the CartPole environment that the agent will be trained on.
env_name = "CartPole-v0"
env = gym.make(env_name)

# Show what the agent observes and which actions it can choose from.
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)

Observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Action space: Discrete(2)


In [3]:
class QNetwork():
    """Small fully connected Q-value estimator built with the TF1 graph API.

    The graph maps a batch of states to one Q-value per action and exposes a
    training op that regresses the Q-value of the action actually taken
    towards a supplied target value.

    Args:
        state_dim: Iterable giving the shape of a single state (without the
            batch dimension); it is unpacked into the state placeholder shape.
        action_size: Number of discrete actions. Used as the width of the
            output layer and as the depth of the one-hot action encoding.
        hidden_units: Width of the single hidden ReLU layer. Defaults to 100.
        learning_rate: Learning rate passed to the Adam optimizer.
            Defaults to 0.001.
    """

    def __init__(self, state_dim, action_size, hidden_units=100, learning_rate=0.001):
        # Placeholders: the leading None lets any batch size be fed.
        self.state_in = tf.placeholder(tf.float32, shape=[None, *state_dim])
        # Integer index of the action taken for every sample in the batch.
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        # Regression target for the Q-value of the taken action.
        self.q_target_in = tf.placeholder(tf.float32, shape=[None])

        # One-hot mask used to pick the taken action's Q-value out of q_state.
        action_one_hot = tf.one_hot(self.action_in, depth=action_size)

        # Hidden ReLU layer followed by a linear layer with one unit per action.
        self.hidden1 = tf.layers.dense(self.state_in, hidden_units, activation=tf.nn.relu)
        self.q_state = tf.layers.dense(self.hidden1, action_size, activation=None)

        # Multiplying by the one-hot mask zeroes every Q-value except the one
        # of the taken action; summing over the action axis extracts it.
        self.q_state_action = tf.reduce_sum(tf.multiply(self.q_state, action_one_hot), axis=1)

        # Mean squared error between predicted and target Q-values.
        self.loss = tf.reduce_mean(tf.square(self.q_state_action - self.q_target_in))
        # NOTE: despite the attribute name, this holds the op returned by
        # minimize(); running it performs one Adam update step.
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

    def update_model(self, session, state, action, q_target):
        """Run one optimizer step on a batch.

        Args:
            session: TensorFlow session in which the graph variables live.
            state: Batch of states fed to the state placeholder.
            action: Batch of integer action indices.
            q_target: Batch of target Q-values for the taken actions.
        """
        feed = {self.state_in: state, self.action_in: action, self.q_target_in: q_target}
        session.run(self.optimizer, feed_dict=feed)

    def get_q_state(self, session, state):
        """Return the Q-values of every action for each state in the batch.

        Args:
            session: TensorFlow session in which the graph variables live.
            state: Batch of states fed to the state placeholder.

        Returns:
            The evaluated ``q_state`` tensor: one row per input state and one
            column per action.
        """
        q_state = session.run(self.q_state, feed_dict={self.state_in: state})
        return q_state

In [4]:
class ReplayBuffer():
    """Bounded FIFO store of experience tuples used for experience replay."""

    def __init__(self, maxlen):
        # A deque with maxlen silently discards the oldest entry once full.
        self.buffer = deque(maxlen=maxlen)

    def add(self, experience):
        """Append one experience tuple to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw a random batch and return it field by field.

        At most ``batch_size`` experiences are drawn (fewer while the buffer
        is still smaller than that). Drawing uses ``random.choices``, i.e.
        sampling is done with replacement.

        Returns:
            A lazy ``map`` yielding one list per field of the experience
            tuple (all first elements, then all second elements, ...).
        """
        stored = len(self.buffer)
        count = batch_size if batch_size < stored else stored
        drawn = random.choices(self.buffer, k=count)
        # Transpose the list of tuples into one column per tuple field.
        columns = zip(*drawn)
        return map(list, columns)

In [5]:
class DQNAgent():
    """Epsilon-greedy DQN agent backed by a QNetwork and a ReplayBuffer.

    Args:
        env: Environment whose ``observation_space.shape`` and
            ``action_space.n`` define the network input and output sizes.
        buffer_size: Maximum number of experiences kept in the replay buffer.
        batch_size: Number of experiences requested from the buffer per
            training step.
        gamma: Discount factor applied to the next state's best Q-value.
        eps: Initial probability of choosing a random action.
        eps_min: Lower bound that ``eps`` is never decayed below.
        eps_decay: Factor ``eps`` is multiplied by at the end of each episode.
    """

    def __init__(self, env, buffer_size=10000, batch_size=50, gamma=0.97,
                 eps=1.0, eps_min=0.1, eps_decay=0.99):
        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        # Build the Q-network graph and the experience store.
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.replay_buffer = ReplayBuffer(maxlen=buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        # Exploration schedule: start at eps and decay towards eps_min.
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay = eps_decay

        # The session is created last so that the initializer sees the
        # variables created by the QNetwork above.
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``self.eps`` a uniformly random action index is
        returned; otherwise the index of the highest predicted Q-value.
        The network is only evaluated when the greedy branch is taken.
        """
        if random.random() < self.eps:
            return np.random.randint(self.action_size)
        # The network expects a batch, so wrap the single state in a list.
        q_state = self.q_network.get_q_state(self.sess, [state])
        return np.argmax(q_state)

    def train(self, state, action, next_state, reward, done):
        """Store one transition and run one training step on a sampled batch.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            next_state: State observed after taking the action.
            reward: Reward received for the transition.
            done: Whether the episode ended with this transition.
        """
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = self.replay_buffer.sample(self.batch_size)

        # Best achievable Q-value in each sampled next state.
        q_next_states = self.q_network.get_q_state(self.sess, next_states)
        max_q_next = np.max(q_next_states, axis=1)
        # There is no future return after a terminal transition, so its
        # bootstrap term is forced to zero. An explicit boolean array avoids
        # relying on how a plain Python list would be interpreted as an index.
        terminal = np.asarray(dones, dtype=bool)
        max_q_next = np.where(terminal, 0.0, max_q_next)

        # Target: immediate reward plus discounted best next-state value.
        q_targets = np.asarray(rewards, dtype=np.float32) + self.gamma * max_q_next
        self.q_network.update_model(self.sess, states, actions, q_targets)

        # Decay exploration once per episode, never going below eps_min.
        if done:
            self.eps = max(self.eps_min, self.eps_decay * self.eps)

    def __del__(self):
        """Close the TensorFlow session if one was created."""
        # __init__ may have failed before self.sess was assigned.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

In [6]:
agent = DQNAgent(env)
num_episodes = 400

for ep in range(num_episodes):
    # Every episode starts from a freshly reset environment.
    state = env.reset()
    total_reward = 0
    while True:
        # Act, observe the outcome, and train on it right away.
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train(state, action, next_state, reward, done)
        env.render()
        total_reward += reward
        state = next_state
        if done:
            break

    print("Episode: {}, total_reward: {:.2f}".format(ep, total_reward))



Episode: 0, total_reward: 36.00
Episode: 1, total_reward: 12.00
Episode: 2, total_reward: 46.00
Episode: 3, total_reward: 20.00
Episode: 4, total_reward: 46.00
Episode: 5, total_reward: 18.00
Episode: 6, total_reward: 29.00
Episode: 7, total_reward: 17.00
Episode: 8, total_reward: 60.00
Episode: 9, total_reward: 25.00
Episode: 10, total_reward: 13.00
Episode: 11, total_reward: 16.00
Episode: 12, total_reward: 30.00
Episode: 13, total_reward: 11.00
Episode: 14, total_reward: 19.00
Episode: 15, total_reward: 25.00
Episode: 16, total_reward: 25.00
Episode: 17, total_reward: 75.00
Episode: 18, total_reward: 13.00
Episode: 19, total_reward: 11.00
Episode: 20, total_reward: 22.00
Episode: 21, total_reward: 13.00
Episode: 22, total_reward: 32.00
Episode: 23, total_reward: 17.00
Episode: 24, total_reward: 26.00
Episode: 25, total_reward: 38.00
Episode: 26, total_reward: 54.00
Episode: 27, total_reward: 40.00
Episode: 28, total_reward: 16.00
Episode: 29, total_reward: 21.00
Episode: 30, total_r

Episode: 240, total_reward: 170.00
Episode: 241, total_reward: 128.00
Episode: 242, total_reward: 121.00
Episode: 243, total_reward: 130.00
Episode: 244, total_reward: 200.00
Episode: 245, total_reward: 200.00
Episode: 246, total_reward: 200.00
Episode: 247, total_reward: 149.00
Episode: 248, total_reward: 200.00
Episode: 249, total_reward: 190.00
Episode: 250, total_reward: 200.00
Episode: 251, total_reward: 200.00
Episode: 252, total_reward: 145.00
Episode: 253, total_reward: 200.00
Episode: 254, total_reward: 200.00
Episode: 255, total_reward: 200.00
Episode: 256, total_reward: 195.00
Episode: 257, total_reward: 200.00
Episode: 258, total_reward: 200.00
Episode: 259, total_reward: 189.00
Episode: 260, total_reward: 200.00
Episode: 261, total_reward: 200.00
Episode: 262, total_reward: 200.00
Episode: 263, total_reward: 200.00
Episode: 264, total_reward: 200.00
Episode: 265, total_reward: 200.00
Episode: 266, total_reward: 200.00
Episode: 267, total_reward: 200.00
Episode: 268, total_

In [7]:
# Close the environment created above now that training has finished.
env.close()