In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Model

from tensorflow import keras

#tf.enable_eager_execution()

from tensorflow.keras.layers import Conv1D, Flatten

from baselines.common.atari_wrappers import make_atari, wrap_deepmind

# Deep Q Network

In [8]:
# --- Experiment configuration -------------------------------------------------
seed = 123                    # shared seed for the environment and the RNGs below
gamma = 0.99                  # discount applied to the bootstrapped next-state value
epsilon = 1.0                 # current exploration probability (decayed during training)
epsilon_min = 0.1             # floor that epsilon is clamped to
epsilon_max = 1.0             # starting value of epsilon (was misspelled `epsion_max`)
epsilon_interval = epsilon_max - epsilon_min  # total amount epsilon decays by
max_steps_per_episode = 1000  # bound used for the per-episode step loop

# The training loop draws exploration actions and replay batches from NumPy's
# global RNG and initialises networks through TensorFlow, so seed both as well
# as the environment; otherwise runs are not repeatable.
np.random.seed(seed)
tf.random.set_seed(seed)

# Breakout with the DeepMind-style wrappers: stacked frames, pixel values scaled.
env = make_atari("BreakoutNoFrameskip-v4")
env = wrap_deepmind(env, frame_stack=True, scale=True)
env.seed(seed)

[123, 151010689]

In [3]:
# Draw the environment's current frame; the recorded cell output is True.
env.render()

True

In [11]:
# Close the environment.
# NOTE(review): in file order this runs before the training loop, which still
# calls env.reset()/env.step(); the execution count (In [11]) suggests it was
# run out of order. Confirm whether this cell should be dropped.
env.close()

In [4]:
# Sample one random action from the environment's action space.
# NOTE(review): the recorded output (5) is not a valid index for the
# num_actions=4 used by the network below, so the output may be stale.
# Re-run to confirm.
env.action_space.sample()

5

In [86]:
# Show the environment's observation space.
# NOTE(review): the recorded output Box(128,) does not match the (84, 84, 4)
# input declared by deep_q_network(); it was likely left over from a different
# environment. Re-run to confirm.
env.observation_space

Box(128,)

In [9]:
from tensorflow.keras.layers import Conv2D

In [10]:
num_actions = 4


def deep_q_network():
    """Build the convolutional Q-network.

    The network takes an (84, 84, 4) input, applies three ReLU convolutions
    followed by a 512-unit ReLU dense layer, and ends in a linear dense layer
    with ``num_actions`` outputs (one value per action).

    Returns:
        An uncompiled Keras ``Model`` mapping the input to the action values.
    """
    frames = Input(shape=(84, 84, 4,))

    # (filters, kernel_size, strides) for each convolution, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))
    features = frames
    for filters, kernel, stride in conv_specs:
        features = Conv2D(
            filters, kernel_size=kernel, strides=stride, activation='relu'
        )(features)

    flat = Flatten()(features)
    hidden = Dense(512, activation='relu')(flat)
    q_values = Dense(num_actions, activation='linear')(hidden)
    return Model(inputs=frames, outputs=q_values)

In [11]:
# Build a throwaway network just to print its layer-by-layer summary.
deep_q_network().summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 84, 84, 4)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3136)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1606144   
_________________________________________________________________
dense_1 (Dense)              (None, 4)                

In [12]:
# `model` is the network updated by the optimizer in the training loop;
# `model_target` is a second network of the same architecture that supplies the
# next-state values and is periodically overwritten with `model`'s weights.
model, model_target = deep_q_network(), deep_q_network()

In [13]:
from tensorflow import keras

In [None]:
# --- Optimisation setup -------------------------------------------------------
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = keras.losses.Huber()

# --- Replay memory: parallel lists, one entry per stored transition ----------
action_history = []
state_history = []
state_next_history = []
reward_history = []
done_history = []

# --- Bookkeeping --------------------------------------------------------------
episode_reward_history = []   # rewards of the most recent (up to 100) episodes
running_reward = 0
episode_count = 0
frame_count = 0

# --- Schedule / sizes ---------------------------------------------------------
batch_size = 32
epsilon_random_frames = 50000        # frames during which actions are always random
epsilon_greedy_frames = 1000000.0    # frames over which epsilon decays by epsilon_interval
max_memory_length = 100000           # maximum number of transitions kept in memory
update_after_actions = 4             # run a gradient step every this many frames
update_target_network = 10000        # copy weights into model_target every this many frames

while True:
    state = np.array(env.reset())
    episode_reward = 0

    for time_stamp in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection: always random during the warm-up
        # frames, afterwards random with probability epsilon.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)  # add batch dimension
            action_values = model(state_tensor, training=False)
            action = tf.argmax(action_values[0]).numpy()

        # Linear decay of epsilon, clamped at epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Rendering every step is kept from the original; drop this call to
        # train without the display overhead.
        env.render()
        state_next, reward, done, info = env.step(action)
        state_next = np.array(state_next)

        episode_reward += reward

        # Store the transition.
        action_history.append(action)
        state_history.append(state)
        done_history.append(done)
        state_next_history.append(state_next)
        reward_history.append(reward)

        state = state_next

        # Gradient step every `update_after_actions` frames once enough
        # transitions have been collected.
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            reward_sample = tf.convert_to_tensor(
                np.array([reward_history[i] for i in indices], dtype=np.float32)
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Next-state action values come from the target network.
            future_rewards = model_target.predict(state_next_sample)

            # Bellman target: r + gamma * max_a' Q_target(s', a').
            # The bootstrap term is zeroed for terminal transitions; previously
            # `done_sample` was computed but never applied, so the target kept
            # bootstrapping past the end of an episode.
            updated_q_values = reward_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            ) * (1.0 - done_sample)

            # One-hot mask so the loss only touches the action actually taken.
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Update the target network with the online network's weights.
            model_target.set_weights(model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        # Limit the state and reward history: drop the oldest transition once
        # the memory exceeds max_memory_length.
        if len(reward_history) > max_memory_length:
            del reward_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    # Update running reward (mean over the last 100 episodes) to check the
    # condition for solving.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 40:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break
                
            
            


running reward: 0.21 at episode 296, frame count 10000
running reward: 0.45 at episode 566, frame count 20000
running reward: 0.25 at episode 865, frame count 30000
running reward: 0.32 at episode 1175, frame count 40000


In [90]:
# Close the environment; this is the last cell, after the training loop.
env.close()