In [1]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import gym
from tensorflow.keras.layers import Dense,Activation,Flatten,Conv2D,Permute
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy,EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger,ModelIntervalCheckpoint

In [2]:
# Create the Breakout environment and record the size of its action space;
# the Q-network's output layer and the agent are both sized from this value.
env = gym.make('BreakoutDeterministic-v4')
num_actions = env.action_space.n

In [3]:
# Show how many actions the environment reported.
num_actions

4

In [4]:
# Size every frame is resized to by ImageProcessor.process_observation.
IMG_SHAPE = (84, 84)
# Number of consecutive frames per state: used as the replay memory's
# window_length and as the leading dimension of the network input shape.
WINDOW_LENGTH = 4

In [5]:
class ImageProcessor(Processor):
    """Preprocess frames, state batches and rewards before the agent sees them.

    Frames are resized to ``IMG_SHAPE`` and converted to 8-bit grayscale,
    state batches are converted to ``float32`` and divided by 255, and
    rewards are clipped to the range [-1, 1].
    """

    def process_observation(self, obs):
        """Return ``obs`` as a grayscale ``uint8`` array of shape ``IMG_SHAPE``.

        ``IMG_SHAPE`` is interpreted as ``(rows, cols)``, i.e. the NumPy shape
        of the returned array.
        """
        rows, cols = IMG_SHAPE
        img = Image.fromarray(obs)
        # PIL's resize() takes (width, height), the reverse of a NumPy shape,
        # so the order is swapped here to stay correct for non-square sizes.
        img = img.resize((cols, rows))
        img = img.convert('L')
        return np.array(img).astype('uint8')

    def process_state_batch(self, batch):
        """Return ``batch`` cast to ``float32`` and divided by 255."""
        return batch.astype('float32') / 255

    def process_reward(self, reward):
        """Return ``reward`` clipped to the closed interval [-1.0, 1.0]."""
        return np.clip(reward, -1.0, 1.0)

In [6]:
# Network input: WINDOW_LENGTH stacked frames, each IMG_SHAPE[0] x IMG_SHAPE[1].
input_shape = (
    WINDOW_LENGTH,
    IMG_SHAPE[0],
    IMG_SHAPE[1],
)

In [7]:
# Sanity-check the shape that the first network layer will be given.
print(input_shape)

(4, 84, 84)


In [8]:
# Q-network. Permute moves the frame-window axis from first to last position
# before the three convolutions; the last Dense layer has one linear output
# per action.
S_Net = Sequential([
    Permute((2, 3, 1), input_shape=input_shape),
    Conv2D(32, (8, 8), strides=(4, 4), kernel_initializer='he_normal'),
    Activation('relu'),
    Conv2D(64, (4, 4), strides=(2, 2), kernel_initializer='he_normal'),
    Activation('relu'),
    Conv2D(64, (3, 3), strides=(1, 1), kernel_initializer='he_normal'),
    Activation('relu'),
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dense(num_actions),
    Activation('linear'),
])


In [9]:
# Replay memory holding up to one million entries, read back in windows of
# WINDOW_LENGTH consecutive observations.
memory = SequentialMemory(
    limit=1_000_000,
    window_length=WINDOW_LENGTH,
)

In [10]:
# Single preprocessing instance passed to the DQN agents created below.
processor = ImageProcessor()

In [11]:
"""Linear Annealing Policy computes a current threshold value and
    transfers it to an inner policy which chooses the action. The threshold
    value is following a linear function decreasing over time."""
# Training-time exploration: the inner epsilon-greedy policy's `eps` attribute
# is driven from value_max=1.0 towards value_min=0.1 over nb_steps steps;
# value_test=0.05 is the value supplied for testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
    nb_steps=1_000_000,
)

In [12]:
# Agent configured for training: wires together the Q-network, the annealed
# exploration policy, the replay memory and the frame processor.
dqn = DQNAgent(
    model=S_Net,
    nb_actions=num_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50_000,
    gamma=0.99,
    target_model_update=10_000,
    train_interval=4,
    delta_clip=1,
)

In [13]:
# Compile the training agent with Adam and track mean absolute error.
dqn.compile(
    Adam(learning_rate=0.00025),
    metrics=['mae'],
)

In [14]:
# Output paths and a periodic checkpoint callback (interval of 100000).
# NOTE(review): neither `weights_filename` nor `checkpoint_callback` is used
# by any other visible cell -- confirm whether the training call should take them.
weights_filename = "DQN_BreakOut.h5f"
checkpoint_filename = 'DQN_Checkpoint.h5f'
checkpoint_callback = ModelIntervalCheckpoint(
    checkpoint_filename,
    interval=100_000,
)

In [15]:
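# Training run, left disabled: a later cell loads pretrained weights instead.
# NOTE(review): this call passes no callbacks, so `checkpoint_callback` would not be used.
#dqn.fit(env,nb_steps=1200000,visualize=True,verbose=1)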

In [16]:
# Evaluation of the agent right after training, left disabled along with the training run.
#dqn.test(env,nb_episodes = 10,visualize = True)

In [18]:
"""Loading the pretrained weights"""
# Weights are loaded straight into the Keras model rather than through the agent.
# NOTE(review): this path differs from `weights_filename` ("DQN_BreakOut.h5f")
# defined earlier -- confirm which file is intended.
S_Net.load_weights("weights.h5f")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f726c659c10>

In [19]:
# Replace the annealed training policy with a fixed epsilon of 0.1.
# NOTE(review): keras-rl's DQNAgent appears to pick actions through its
# `test_policy` (not `policy`) while testing -- confirm this epsilon is
# actually applied by the dqn.test() call further down.
policy = EpsGreedyQPolicy(eps=0.1)

In [20]:
# Rebuild the agent around the network that now holds the loaded weights,
# reusing the existing replay memory and processor; every other DQNAgent
# option is left at its default.
dqn = DQNAgent(
    model=S_Net,
    nb_actions=num_actions,
    policy=policy,
    memory=memory,
    processor=processor,
)

In [21]:
# The rebuilt agent is compiled with the same optimizer settings and metric
# as the training agent before it is used.
dqn.compile(
    Adam(learning_rate=0.00025),
    metrics=['mae'],
)

In [22]:
# Play two rendered episodes with the loaded weights.
dqn.test(env, nb_episodes=2, visualize=True)

Testing for 2 episodes ...




Episode 1: reward: 40.000, steps: 1513
Episode 2: reward: 40.000, steps: 1513


<tensorflow.python.keras.callbacks.History at 0x7f726c663f70>