In [1]:
import gymnasium as gym
import numpy as np
import random
import tensorflow as tf
tf.get_logger().setLevel('INFO')

In [2]:
def preprocess(frame: np.array):
    """
    Preprocessing
    - extract luminosity 0.299*R + 0.587*G + 0.114*B
    - reshape to 84x84
    """
    def getLuminosity(r, g, b):
        return 0.299 * r + 0.587 * g + 0.114 * b

    # reshape
    reshaped_frame = tf.image.resize(frame, [84, 84]).numpy()

    preprocessed_frame = list()
    
    for x in reshaped_frame:
        x_list = list()
        for y in x:
            x_list.append(getLuminosity(y[0], y[1], y[2]))
        
        preprocessed_frame.append(x_list)

    return np.array(preprocessed_frame).reshape(84, 84, 1)

In [3]:
def create_model(input_shape, output_classes):
    """
    - Network
        - input: 84x84x4
        - conv1: 32 filters of 8x8 with stride 4 rectifier nonlinearly
        - conv2: 64 filters of 4x4 with stride 2 rectifier nonlinearly
        - conv3: 64 filters of 3x3 with stride 1 rectifier
        - dense 512 rectifier units
        - output: action space
    """

    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (8,8), strides=4, activation='relu', input_shape=input_shape),
        tf.keras.layers.Conv2D(64, (4,4), strides=2, activation='relu'),
        tf.keras.layers.Conv2D(64, (3,3), strides=1, activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(output_classes)
    ])

    return model

In [5]:
import time

In [11]:
def train(environment: gym.Env, 
          n_episodes: int, 
          q1_network: tf.keras.models.Sequential, 
          q2_network: tf.keras.models.Sequential,
          epochs: int,
          network_update_frequency = 10000,
          minibatch_size = 32,
          min_exploration = 0.1,
          max_exploration = 1,
          exploration_frame = 1000000,
          total_frames=1000000,
          replay_memory_size = 1000000,
          no_op_max = 30,
          gamma_discount_factor = 0.99,
         learning_rate = 0.00025,
         discount_factor = 0.99,
         momentum = 0.95):
    
    exploratory_decay_rate = (max_exploration - min_exploration)/exploration_frame
    
    loss_fn = tf.keras.losses.Huber()
    optimizer=tf.keras.optimizers.RMSprop(
        learning_rate=learning_rate,
        rho=discount_factor,
        momentum=gradient_momentum)
    
    epoch_scores = list()
    frame_number = 0
    
    while frame_number <= total_frames:
        for epoch in range(epochs):
            episode_scores = list()
            print(f'Epoch: {epoch}')

            for episode in range(n_episodes):
                state, info = environment.reset()
                frame_number = info['frame_number']
                proc_state = preprocess(state)
                terminated, truncated = False, False

                replay_memory = list()
                n_op = 0

                # logging
                ep_reward = 0
                losses = list()

                while not terminated and not truncated:
                    #print(f"Frame {info['frame_number']}")
                    #start = time.time()
                    reference = random.uniform(0, 1)

                    # exploratory factor
                    if info['frame_number'] > exploration_frame:
                        exploratory_factor = min_exploration
                    else:
                        exploratory_factor = max_exploration - exploratory_decay_rate * info['frame_number']

                    # e greedy with linear decay of exploratory factor
                    if reference < exploratory_factor:
                        action = environment.action_space.sample() 
                    else:
                        action = np.argmax(q1_network(np.array([proc_state])))

                    # take action and observe
                    next_state, reward, terminated, truncated, info = env.step(action)
                    proc_next_state = preprocess(next_state)

                    ep_reward += reward

                    # store in replay memory
                    replay_memory.append((np.array([proc_state]), action, reward, np.array([proc_next_state])))
                    if len(replay_memory) >= replay_memory_size:
                        replay_memory.pop(0)

                    # increment no op
                    n_op += 1

                    # if no_op timeframe is still valid, continue loop
                    if n_op < no_op_max:
                        continue

                    # Sample replay memory
                    minibatch = random.sample(replay_memory, minibatch_size)

                    accumulated_loss = 0

                    # Gradient Tape records the forward pass
                    with tf.GradientTape() as tape:
                        for transition in minibatch:
                            s = transition[0]
                            a = transition[1]
                            r = transition[2]
                            s_= transition[3]

                            if terminated:
                                y = r
                            else:
                                y = r + gamma_discount_factor*(np.argmax(q2_network(s_)))

                            #with tf.GradientTape() as tape:
                            #tape.watch(s)
                            y_ = q1_network(s)[:, a] # workaround so that the result has shape (1,)

                                # Calculate loss
                            accumulated_loss += loss_fn(y, y_)

                    # Calculate gradients with respect to every trainable variable
                    gradients = tape.gradient(accumulated_loss, q1_network.trainable_variables)
                    optimizer.apply_gradients(zip(gradients, q1_network.trainable_variables))

                    losses.append(accumulated_loss)

                    if n_op % network_update_frequency == 0:
                        q2_network = tf.keras.models.clone_model(q1_network)

                    #end = time.time()

                    #print(f'\tend: {end - start}')

                episode_scores.append(ep_reward)

                print(f"\tEpisode {episode} with score {ep_reward} in the frame {info['frame_number']} and mean loss {np.mean(losses)}")

            epoch_scores.append(episode_scores)

    return epoch_scores

In [None]:
env = gym.make('ALE/SpaceInvaders-v5')

# vars
total_frames = 1000000

minibatch_size = 32
replay_memory_size = 20000
agent_history_length = 4
target_network_update_frequency = 200 #10000 # corresponds to C in the pseudocode
discount_factor = 0.99
learning_rate = 0.00025
gradient_momentum = 0.95
squared_gradient_momentum = 0.95
min_squared_gradient = 0.01
initial_exploration = 1
final_exploration = 0.1
final_exploration_frame = total_frames / 50 # 1000000
replay_start_size = 50000
no_op_max = 32


n_episodes = 10
n_epochs = 20

action_value_network = create_model((84,84,1), env.action_space.n)

target_action_value_network = tf.keras.models.clone_model(action_value_network)

scores = train(env,
              n_episodes,
              action_value_network,
              target_action_value_network,
              n_epochs,
              no_op_max=no_op_max,
              exploration_frame=final_exploration_frame,
              network_update_frequency=target_network_update_frequency,
              replay_memory_size=replay_memory_size,
              total_frames=total_frames)

Epoch: 0
	Episode 0 with score 85.0 in the frame 1545 and mean loss 15.163395881652832
	Episode 1 with score 110.0 in the frame 3268 and mean loss 22.31584930419922
	Episode 2 with score 135.0 in the frame 4995 and mean loss 18.546566009521484
	Episode 3 with score 45.0 in the frame 6768 and mean loss 21.50010871887207
	Episode 4 with score 20.0 in the frame 8517 and mean loss 21.054903030395508
	Episode 5 with score 210.0 in the frame 11012 and mean loss 26.75890350341797
	Episode 6 with score 290.0 in the frame 13937 and mean loss 32.25963592529297
	Episode 7 with score 180.0 in the frame 16472 and mean loss 20.420379638671875
	Episode 8 with score 105.0 in the frame 18229 and mean loss 25.978275299072266
	Episode 9 with score 50.0 in the frame 20310 and mean loss 21.67955207824707
Epoch: 1
	Episode 0 with score 255.0 in the frame 22789 and mean loss 47.754024505615234
	Episode 1 with score 475.0 in the frame 28792 and mean loss 41.77257537841797
	Episode 2 with score 185.0 in the fr

	Episode 2 with score 185.0 in the frame 271946 and mean loss 42.91035079956055
	Episode 3 with score 90.0 in the frame 273695 and mean loss 26.363506317138672
	Episode 4 with score 210.0 in the frame 276172 and mean loss 23.66357421875
	Episode 5 with score 15.0 in the frame 277311 and mean loss 25.29501724243164
	Episode 6 with score 420.0 in the frame 281744 and mean loss 45.70964813232422
	Episode 7 with score 15.0 in the frame 283687 and mean loss 25.92783546447754
	Episode 8 with score 110.0 in the frame 286398 and mean loss 31.55324363708496
	Episode 9 with score 270.0 in the frame 291085 and mean loss 36.502803802490234
Epoch: 11
	Episode 0 with score 185.0 in the frame 293644 and mean loss 42.032623291015625
	Episode 1 with score 110.0 in the frame 296053 and mean loss 38.425148010253906
	Episode 2 with score 440.0 in the frame 299032 and mean loss 47.10844421386719
	Episode 3 with score 410.0 in the frame 301757 and mean loss 54.479957580566406
	Episode 4 with score 280.0 in 

	Episode 4 with score 140.0 in the frame 554624 and mean loss 41.665626525878906
	Episode 5 with score 155.0 in the frame 556949 and mean loss 52.178733825683594
	Episode 6 with score 210.0 in the frame 559456 and mean loss 52.92827606201172
	Episode 7 with score 80.0 in the frame 560939 and mean loss 15.887828826904297
	Episode 8 with score 115.0 in the frame 563726 and mean loss 31.581083297729492
	Episode 9 with score 30.0 in the frame 565745 and mean loss 35.196319580078125
Epoch: 1
	Episode 0 with score 35.0 in the frame 567688 and mean loss 17.86121368408203
	Episode 1 with score 50.0 in the frame 569759 and mean loss 27.202539443969727
	Episode 2 with score 55.0 in the frame 571050 and mean loss 19.138446807861328
	Episode 3 with score 105.0 in the frame 572961 and mean loss 19.2747859954834
	Episode 4 with score 285.0 in the frame 576440 and mean loss 30.18077278137207
	Episode 5 with score 155.0 in the frame 578865 and mean loss 15.40310287475586
	Episode 6 with score 135.0 in

In [None]:
scores

In [None]:
env.close() # there is a chance it is not needed for now