## <center>Reinforcement Learning on Images</center>

In [None]:
from PIL import Image  # To transform the image in the Processor
import numpy as np
import gym
import matplotlib.pyplot as plt
import time

# Convolutional Backbone Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Permute
from tensorflow.keras.optimizers import Adam

# Keras-RL
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import ModelIntervalCheckpoint

### Exploring the environment

In [None]:
# Create the Breakout environment and play a few random actions to see what it looks like.
# `env` is reused by the later cells (hyperparameters, training and testing).
env = gym.make("BreakoutDeterministic-v4")
env.reset()
for _ in range(5):
    env.render(mode="human")  # Show the current frame
    action = env.action_space.sample()  # Draw a random action from the action space
    env.step(action)  # Apply the action; the values returned by step() are not used here
    time.sleep(0.5)  # Pause between frames so that each one can be looked at
env.close()

In [None]:
# HYPERPARAMETERS
IMG_SHAPE = (84, 84)  # Size every frame is resized to by the processor
WINDOW_LENGTH = 4  # Number of consecutive frames that form one network input
num_actions = env.action_space.n  # Number of actions; also the size of the network's output layer

### Building the model

Based on those settings we create our processor. It is the same processor as in the last notebook, with the addition that it normalizes the data into the [0, 1] interval, which often decreases the necessary training time. <br />
We perform this normalization in the process_state_batch function, which is only executed on the current batch and not on the complete replay memory. This decreases RAM usage by a factor of 4, because the replay memory can keep the frames as 8-bit integers instead of 32-bit floats. <br />
Additionally we clip the reward to the interval [-1, 1], which might speed up the training.

In [None]:
class ImageProcessor(Processor):
    """Preprocess Atari frames and rewards before the agent uses them.

    Frames are kept as 8-bit grayscale images and are only converted to
    floats in [0, 1] when a batch is handed to the network. Storing ``uint8``
    instead of ``float32`` frames in the replay memory needs a quarter of
    the RAM, while the network still receives exactly the same values.
    """

    def process_observation(self, observation):
        """Resize a raw frame to ``IMG_SHAPE`` and convert it to grayscale.

        Args:
            observation: The frame returned by the environment, as a numpy
                array that ``PIL.Image.fromarray`` can read.

        Returns:
            The processed frame as a ``uint8`` numpy array.
        """
        # First convert the numpy array to a PIL Image
        img = Image.fromarray(observation)
        # Then resize the image
        img = img.resize(IMG_SHAPE)
        # And convert it to grayscale (the "L" stands for luminance)
        img = img.convert("L")
        # Keep the 8-bit representation: this is the array that is stored in
        # the replay memory, so the float conversion is deferred to
        # process_state_batch.
        return np.array(img, dtype="uint8")

    def process_state_batch(self, batch):
        """Normalize a batch of stacked frames into the interval [0, 1].

        Only the batch that is about to be fed to the network is converted,
        not the complete replay memory.

        Args:
            batch: Array-like batch of ``uint8`` states.

        Returns:
            The batch as a ``float32`` numpy array scaled to [0, 1].
        """
        return np.asarray(batch).astype("float32") / 255.0

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)


In [None]:
# One network input is a stack of WINDOW_LENGTH frames, each of size IMG_SHAPE
input_shape = (WINDOW_LENGTH, *IMG_SHAPE)
input_shape

As our input consists of 4 consecutive frames, each having the shape $(84 \times 84)$, the input to the network has the shape $(4 \times 84 \times 84)$.
But as the Convolutional Layers expect the channels in the last dimension, i.e. an input of shape $(84 \times 84 \times 4)$, we add a Permute layer at the beginning to move the frame axis to the end.


In [None]:
# Convolutional Q-network: three convolutional layers followed by two dense layers.
model = Sequential()
# The input has the shape (WINDOW_LENGTH, 84, 84); move the frame axis to the end
# so that the convolutions see the frames as channels.
# Index 0 does not appear in the permutation as it is the placeholder for the batch dimension.
model.add(Permute((2, 3, 1), input_shape=input_shape))

# All convolutions use ReLU activations together with the he_normal initializer.
model.add(Conv2D(filters=32, kernel_size=(8, 8), strides=(4, 4), padding='valid',
                 activation="relu", kernel_initializer='he_normal'))

model.add(Conv2D(filters=64, kernel_size=(4, 4), strides=(2, 2), padding='valid',
                 activation="relu", kernel_initializer='he_normal'))

model.add(Conv2D(filters=64, kernel_size=(3, 3), strides=(1, 1), padding='valid',
                 activation="relu", kernel_initializer='he_normal'))

model.add(Flatten())
model.add(Dense(512, activation="relu"))
# One linear output per action
model.add(Dense(num_actions, activation="linear"))

# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
model.summary()

#### Definition of some parameters

In [None]:
# Replay memory with a limit of 1,000,000 entries; states are built from
# WINDOW_LENGTH consecutive observations.
memory = SequentialMemory(limit=1_000_000, window_length=WINDOW_LENGTH)
processor = ImageProcessor()

# Epsilon-greedy action selection with a linearly decaying epsilon.
# As we need to train for at least a million steps, epsilon is annealed
# over 1,000,000 steps.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
    nb_steps=1_000_000,
)


Finally we define the agent and compile it. The agent is defined in the same way as in the previous lectures, with an additional train_interval of 4 (we only train on every 4th step). <br />
Besides that, we clip delta (the error) to 1.<br />
Both clipping and a train_interval often improve the result.

In [None]:
# Optimizer used to train the Q-network
optimizer = Adam(learning_rate=0.00025)

# Assemble the agent from the network, the replay memory, the policy and the processor
dqn = DQNAgent(
    model=model,
    nb_actions=num_actions,
    memory=memory,
    policy=policy,
    processor=processor,
    gamma=0.99,
    batch_size=128,
    nb_steps_warmup=50000,
    train_interval=WINDOW_LENGTH,  # only train on every 4th step
    target_model_update=10000,
    delta_clip=1,  # clip delta (the error) to 1
)

dqn.compile(optimizer, metrics=['mae'])

### Training

As the training might take several hours, we store our current model every 500,000 steps. <br />
We can use the *ModelIntervalCheckpoint(checkpoint_name, interval)* callback to do so: we store it in a variable and pass it to the fit method as a callback.

In [None]:
# File for the final weights
weights_filename = 'weights.h5f'
# File name pattern for the intermediate checkpoints; it contains a {step} placeholder
checkpoint_weights_filename = 'dqn_' + 'weights_{step}.h5f'
# Callback that stores the weights every 500,000 steps
checkpoint_callback = ModelIntervalCheckpoint(
    filepath=checkpoint_weights_filename,
    interval=500000,
)

In [None]:
# Train the agent; the checkpoint callback is passed in via `callbacks`.
dqn.fit(
    env,
    nb_steps=1500000,
    callbacks=[checkpoint_callback],
    log_interval=10000,
    visualize=False,
)

# Once training has finished, store the final weights as well.
dqn.save_weights(weights_filename, overwrite=True)

In [None]:
# Run the agent for 5 test episodes with visualization enabled
dqn.test(env, nb_episodes=5, visualize=True)

If you only want to load your model for evaluation, you can use the exact same code from above without calling **fit()**. <br />
You can also leave out the warmup steps, gamma and the target model update variables when defining the DQNAgent, as they are only needed for training.

In [None]:
# Load the weights that were stored with dqn.save_weights() after training.
# Use one of the 'dqn_weights_{step}.h5f' checkpoint files instead to evaluate
# an intermediate model.
model.load_weights('weights.h5f')

# You can choose an arbitrary policy for evaluation; in this case a small epsilon is better to ensure exploitation
policy = EpsGreedyQPolicy(0.1)


# Initialize the DQNAgent with the loaded model and the updated policy and compile it
dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy, memory=memory,
               processor=processor)
# `learning_rate` replaces the deprecated `lr` argument and matches the training cell
dqn.compile(Adam(learning_rate=.00025), metrics=['mae'])

dqn.test(env, nb_episodes=5, visualize=True)