In [1]:
import gymnasium as gym
import numpy as np
import random
import tensorflow as tf
tf.get_logger().setLevel('INFO')

In [2]:
def preprocess(frame: np.ndarray) -> np.ndarray:
    """
    Preprocessing
    - reshape to 84x84
    - extract luminosity 0.299*R + 0.587*G + 0.114*B

    The luminosity is computed with a single vectorized product over the
    channel axis instead of a Python loop over all 84*84 pixels, since this
    function runs once per environment step.

    Args:
        frame: image array of shape (height, width, channels); the first
            three channels are read as R, G and B.

    Returns:
        Array of shape (84, 84, 1) holding the luminosity of the resized
        frame, in the dtype produced by ``tf.image.resize``.
    """
    # reshape
    resized_frame = tf.image.resize(frame, [84, 84]).numpy()

    # weights for R, G, B in the same dtype as the resized frame so the
    # product does not silently upcast
    channel_weights = np.array([0.299, 0.587, 0.114], dtype=resized_frame.dtype)

    # (84, 84, 3) @ (3,) -> (84, 84)
    luminosity = resized_frame[..., :3] @ channel_weights

    return luminosity.reshape(84, 84, 1)

In [3]:
def create_model(input_shape, output_classes):
    """
    Build the (uncompiled) convolutional network used for the Q-functions.

    Layer stack: three ReLU convolutions (32 filters 8x8 stride 4,
    64 filters 4x4 stride 2, 64 filters 3x3 stride 1), a flatten, a
    512-unit ReLU dense layer and a final dense layer without activation.

    Args:
        input_shape: shape of one input sample, passed to the first layer.
        output_classes: number of units of the final dense layer.

    Returns:
        A ``tf.keras.models.Sequential`` instance.
    """
    layers = tf.keras.layers

    stack = [
        layers.Conv2D(filters=32, kernel_size=(8, 8), strides=4,
                      activation='relu', input_shape=input_shape),
        layers.Conv2D(filters=64, kernel_size=(4, 4), strides=2, activation='relu'),
        layers.Conv2D(filters=64, kernel_size=(3, 3), strides=1, activation='relu'),
        layers.Flatten(),
        layers.Dense(units=512, activation='relu'),
        layers.Dense(units=output_classes),
    ]

    return tf.keras.models.Sequential(stack)

In [8]:
class PrioritizedReplay():
    """
    Prioritized Replay Memory
    - Contains list with replay (state, action, reward, next_state)
    - Contains list with priorities (non-negative sampling weights)
    - Holds at most max_size transitions: when clean_up() finds more than
      that, it removes the elements with the least priority

    Attributes:
        replay_memory: list of (state, action, reward, next_state) tuples.
        prio_memory: list of float priorities, parallel to replay_memory.
        sum_of_prio: running sum of prio_memory.
        len: number of stored transitions.
        max_size: capacity enforced by clean_up().
    """
    def __init__(self, max_size):
        self.replay_memory = list()
        self.prio_memory = list()
        self.sum_of_prio = 0
        self.len = 0
        self.max_size = max_size

    @staticmethod
    def _as_priority(value):
        """
        Convert a priority to a non-negative Python float.

        Accepts plain numbers as well as single-element arrays/tensors.
        The magnitude is stored because priorities are used as sampling
        weights, which must not be negative (e.g. when a caller passes a
        signed TD error).
        """
        return abs(float(np.asarray(value).item()))

    def add_transition(self, priority, state, action, reward, next_state):
        """Append a transition with the given priority."""
        priority = self._as_priority(priority)
        self.replay_memory.append((state, action, reward, next_state))
        self.prio_memory.append(priority)
        self.len += 1
        self.sum_of_prio += priority

    def update_transition_prio(self, index, new_prio):
        """Replace the priority stored at ``index`` and keep the running sum in sync."""
        new_prio = self._as_priority(new_prio)
        self.sum_of_prio += new_prio - self.prio_memory[index]
        self.prio_memory[index] = new_prio

    def clean_up(self):
        """
        Evict lowest-priority transitions until at most max_size remain.

        Eviction only happens once the memory EXCEEDS max_size, so the
        memory can actually hold max_size transitions.
        """
        while self.prio_memory and len(self.prio_memory) > self.max_size:
            min_idx = int(np.argmin(self.prio_memory))
            self.sum_of_prio -= self.prio_memory.pop(min_idx)
            self.replay_memory.pop(min_idx)
        self.len = len(self.replay_memory)

    def sample(self):
        """
        Draw one transition with probability proportional to its priority.

        Returns:
            (index, transition) where ``index`` can be passed to
            update_transition_prio().

        Raises:
            IndexError: if the memory is empty.
        """
        size = len(self.prio_memory)
        if size == 0:
            raise IndexError("cannot sample from an empty replay memory")

        if any(self.prio_memory):
            # priorities are non-negative, so at least one weight is positive
            index = random.choices(range(size), weights=self.prio_memory, k=1)[0]
        else:
            # every priority is zero: proportional sampling is undefined,
            # fall back to a uniform draw instead of dividing by zero
            index = random.randrange(size)

        return index, self.replay_memory[index]

In [9]:
import time

In [58]:
def _exploration_rate(frame, exploration_frame, min_exploration, max_exploration):
    """Epsilon for e-greedy: linear decay from max to min over exploration_frame frames."""
    if exploration_frame <= 0 or frame > exploration_frame:
        return min_exploration
    decay_rate = (max_exploration - min_exploration) / exploration_frame
    return max_exploration - decay_rate * frame


def _double_dqn_td_error(q1_network, q2_network, transition, gamma):
    """TD error (target - Q1(s, a)) of one stored transition; the target carries no gradient."""
    state, action, reward, next_state = transition
    reward = float(reward)

    if next_state is None:
        # terminal transition: nothing to bootstrap from
        target = tf.constant(reward, dtype=tf.float32)
    else:
        # Double DQN: q1 picks the next action, q2 evaluates it
        best_next_action = int(tf.argmax(q1_network(next_state)[0]))
        target = reward + gamma * q2_network(next_state)[0, best_next_action]

    return tf.stop_gradient(target) - q1_network(state)[0, action]


def _train_on_minibatch(q1_network, q2_network, replay, optimizer,
                        minibatch_size, gamma, prio_small_pos):
    """Sample a minibatch, refresh its priorities and apply one gradient step to q1_network."""
    # Gradient Tape records the forward pass
    with tf.GradientTape() as tape:
        squared_errors = []
        for _ in range(minibatch_size):
            index, transition = replay.sample()
            error = _double_dqn_td_error(q1_network, q2_network, transition, gamma)
            squared_errors.append(tf.square(error))

            # priority = |TD error| + small positive constant, as a plain float
            replay.update_transition_prio(index, abs(float(error)) + prio_small_pos)

        loss = tf.add_n(squared_errors) / len(squared_errors)

    # Calculate gradients with respect to every trainable variable
    gradients = tape.gradient(loss, q1_network.trainable_variables)
    optimizer.apply_gradients(zip(gradients, q1_network.trainable_variables))


def train(environment: gym.Env,
          q1_network: tf.keras.models.Sequential,
          q2_network: tf.keras.models.Sequential,
          network_update_frequency = 10000,
          minibatch_size = 32,
          min_exploration = 0.1,
          max_exploration = 1,
          exploration_frame = 1000000,
          total_frames=1000000,
          replay_memory_size = 5000,
          no_op_max = 30,
          gamma_discount_factor = 0.99,
          learning_rate = 0.00025,
          discount_factor = 0.99,
          momentum = 0.95,
          max_p = 1,
          prio_small_pos = 0.05,
          training_frequency = 20):
    """
    Train q1_network with Double DQN and prioritized replay.

    Only the arguments are used: the function no longer reads the notebook
    globals ``env``, ``gradient_momentum``, ``action_value_network`` or
    ``target_action_value_network``.

    Args:
        environment: environment to act in; ``info['frame_number']`` returned
            by reset()/step() drives the exploration decay and the stop test.
        q1_network: online network; selects actions and receives gradients.
        q2_network: target network; its weights are overwritten IN PLACE with
            q1_network's weights on every target update.
        network_update_frequency: target update period, counted in steps of
            the current episode (the step counter restarts every episode).
        minibatch_size: transitions sampled per training step.
        min_exploration: final epsilon of the e-greedy policy.
        max_exploration: initial epsilon of the e-greedy policy.
        exploration_frame: frame at which epsilon reaches min_exploration.
        total_frames: no new episode starts once the frame number exceeds this.
        replay_memory_size: capacity passed to PrioritizedReplay.
        no_op_max: number of steps at the start of every episode during which
            transitions are collected but no training or target update runs.
        gamma_discount_factor: discount applied to the bootstrapped value.
        learning_rate: RMSprop learning rate.
        discount_factor: passed to RMSprop as ``rho`` (it is NOT the TD discount).
        momentum: RMSprop momentum.
        max_p: priority given to every newly stored transition.
        prio_small_pos: constant added to |TD error| when updating a priority.
        training_frequency: training period, counted in steps of the current episode.

    Returns:
        (episode_scores, frame_diff): per-episode summed reward and
        per-episode difference of ``info['frame_number']``.

    Side effects:
        Every 10 episodes the weights of both networks are saved under
        ./saved_models/double_dqn/.
    """
    # Optimizer
    optimizer = tf.keras.optimizers.RMSprop(
        learning_rate=learning_rate,
        rho=discount_factor,
        momentum=momentum)

    episode_scores = list()
    frame_diff = list()

    frame_number = 0
    episode = 0

    # Instance of prioritized replay
    replay = PrioritizedReplay(replay_memory_size)

    while frame_number <= total_frames:
        state, info = environment.reset()
        frame_number = info['frame_number']

        # processed state
        proc_state = preprocess(state)
        terminated, truncated = False, False

        n_op = 0

        # logging
        ep_reward = 0

        while not terminated and not truncated:
            reference = random.uniform(0, 1)

            # exploratory factor
            exploratory_factor = _exploration_rate(info['frame_number'],
                                                   exploration_frame,
                                                   min_exploration,
                                                   max_exploration)

            # e greedy with linear decay of exploratory factor
            if reference < exploratory_factor:
                action = int(environment.action_space.sample())
            else:
                action = int(np.argmax(q1_network(np.array([proc_state]))))

            # take action and observe
            next_state, reward, terminated, truncated, info = environment.step(action)
            proc_next_state = preprocess(next_state)

            ep_reward += reward

            # store in replay memory; a terminal transition stores None as
            # next state so that the TD target does not bootstrap from it
            replay.add_transition(max_p,
                                  np.array([proc_state]),
                                  action,
                                  float(reward),
                                  None if terminated else np.array([proc_next_state]))

            # keep the replay at a specific size due to constraints on (our) hardware
            replay.clean_up()

            # advance to the observed state before anything can skip the rest of the step
            proc_state = proc_next_state

            # increment no op
            n_op += 1

            # if no_op timeframe is still valid, continue loop
            if n_op < no_op_max:
                continue

            # train the network every training_frequency operations
            if n_op % training_frequency == 0:
                _train_on_minibatch(q1_network, q2_network, replay, optimizer,
                                    minibatch_size, gamma_discount_factor,
                                    prio_small_pos)

            # update target network every network_update_frequency operations;
            # clone_model would create a fresh, re-initialised model, so the
            # weights are copied into the existing target network instead
            if n_op % network_update_frequency == 0:
                q2_network.set_weights(q1_network.get_weights())

        episode_scores.append(ep_reward)
        frame_diff.append(info['frame_number'] - frame_number)

        episode += 1

        # saves trained weights every 10 episodes
        if episode % 10 == 0:
            q1_network.save_weights(f'./saved_models/double_dqn/seaquest_action_value_network_ep{episode}')
            q2_network.save_weights(f'./saved_models/double_dqn/sequest_target_action_value_network_ep{episode}')

    return episode_scores, frame_diff

In [None]:
env = gym.make('ALE/Seaquest-v5')

# vars
total_frames = 1000000

minibatch_size = 35
replay_memory_size = 5000
target_network_update_frequency = 100 # corresponds to C in the pseudocode
discount_factor = 0.99 # gamma
learning_rate = 0.00025
gradient_momentum = 0.95
initial_exploration = 1
final_exploration = 0.1
training_frequency = 10

final_exploration_frame = total_frames / 50 # 1000000
no_op_max = 32

action_value_network = create_model((84,84,1), env.action_space.n)

# clone_model only reproduces the architecture and gives the clone freshly
# initialised weights, so the online weights are copied over explicitly to
# start both networks from the same parameters.
target_action_value_network = tf.keras.models.clone_model(action_value_network)
target_action_value_network.set_weights(action_value_network.get_weights())

# Every hyper-parameter declared above is passed explicitly; previously
# several of them were defined here but train() silently used its defaults.
scores, frames = train(env,
                       action_value_network,
                       target_action_value_network,
                       network_update_frequency=target_network_update_frequency,
                       minibatch_size=minibatch_size,
                       min_exploration=final_exploration,
                       max_exploration=initial_exploration,
                       exploration_frame=final_exploration_frame,
                       total_frames=total_frames,
                       replay_memory_size=replay_memory_size,
                       no_op_max=no_op_max,
                       gamma_discount_factor=discount_factor,
                       learning_rate=learning_rate,
                       momentum=gradient_momentum,
                       training_frequency=training_frequency)

In [None]:
# Scores and episode frames taken from logs.
# Both lists hold 8 rows of 5 values each.
# NOTE: these assignments rebind the names `scores` and `frames`, which the
# training cell also assigns from the return value of train().
# NOTE(review): presumably frames[i][j] is the frame count of the episode that
# scored scores[i][j], and each row is one logged group of episodes — confirm
# against the logs.

# Per-episode scores.
scores=[[80, 340, 540, 420, 360],
        [240, 520, 360, 300, 320],
        [380, 640, 220, 440, 580],
        [320, 360, 420, 540, 400],
        [380, 420, 360, 260, 220],
        [260, 260, 420, 300, 400],
        [520, 360, 400, 320, 140],
        [380, 240, 180, 620, 440]]

# Per-episode frame counts.
# NOTE(review): the fourth row repeats 4985 for two different scores — verify
# it is not a transcription error.
frames=[[2189, 4597, 6721, 5554, 5350],
        [3934, 6786, 5230, 4330, 5114],
        [5606, 8198, 3561, 6125, 7450],
        [4985, 4985, 5674, 7033, 6058],
        [5242, 6430, 5110, 4089, 3549],
        [4790, 4198, 5546, 4713, 6554],
        [6774, 5466, 5674, 5098, 2950],
        [5098, 3950, 3230, 8094, 5978]]