In [18]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
import ale_py
import cv2
import numpy as np
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing
from gymnasium.wrappers.frame_stack import FrameStack
from gymnasium import ObservationWrapper

gym.register_envs(ale_py)

In [19]:
# env = gym.make('SpaceInvadersNoFrameskip-v4', render_mode='human')

# observation, info = env.reset()

# episode_over = False
# while not episode_over:
#     action = env.action_space.sample()
#     observation, reward, terminated, truncated, info = env.step(action)

#     episode_over = terminated or truncated

# env.close()

In [20]:
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array")
env = AtariPreprocessing(env)
env = FrameStack(env, num_stack=4)

num_actions = env.action_space.n
trigger = lambda t: t & 20
env = gym.wrappers.RecordVideo(env, video_folder="./videos", episode_trigger=trigger, disable_logger=True)

In [21]:
def create_q_model():
    return keras.Sequential(
        [
            keras.Input(shape=(84, 84, 4)),
            layers.Conv2D(32, kernel_size=8, strides=4, activation="relu"),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="relu"),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(num_actions, activation="linear"),
        ]
    )

In [22]:
model = create_q_model()
model_target = create_q_model()

optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

In [23]:
action_history = []
state_history = []
state_next_history = []
rewards_history = []
episode_reward_history = []
done_history = []
running_reward = 0
episode_count = 0
frame_count = 0

In [None]:
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
batch_size = 32 # Enlarge (maybe) for better performance

# Maximun episodes to run, set to zero means run till solved
max_episodes = 0
# Max frames to run
max_frames = 1e7

# Number of frames to take random actions and observe output
epsilon_random_frames = 1e5 # Exploring longer before exploitation
# Number of frames for exploration
epsilon_greedy_frames = 2e6 # Slow down epsilon decay
# Maximum length of replay buffer
max_memory_length = 1e6
# Abort if more than the below frames are spent in a single game (results in truncated = True)
max_steps_per_episode = 10000
# How often should the action network be updated
update_after_actions = 4
# How often should the Q-network be cloned from the action network?
update_target_network = 10000
# Use Huber loss for stability (specifically for Adam)
loss_function = keras.losses.Huber()

In [25]:
print(model_target.input_shape)

(None, 84, 84, 4)


In [None]:
# # Function for preprocessing
# def preprocess_frame(frame, prev_frame=None):
#     # Ensure frame is a numpy array
#     frame = np.array(frame)
#     # Convert to grayscale
#     frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
#     # Normalize to 0-1
#     frame = frame / 255.0
#     # Resize to 84x84
#     frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
#     # Frame differencing (if previous frame exists)
#     if prev_frame is not None:
#         frame = frame - prev_frame
#     return frame
    

<function __main__.preprocess_frame(frame, prev_frame=None)>

In [None]:
# Replay buffer (Segmented sampling)

class ReplayBuffer:
    def __init__(self, max_length):
        self.buffer = []
        self.priorities = [] # Holds TD errors or priorities
        self.max_length = max_length

    def add()

In [None]:
# prev_frame = None
while True:
    observation, _ = env.reset()
    state = preprocess_frame(observation) # Preprocess the initial state
    # prev_frame = state
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            # Ensure the state tensor has the correct shape (batch_size, height, width, channels)
            state_tensor = tf.transpose(state_tensor, perm=[1, 2, 0])  # Change to (84, 84, 4)
            state_tensor = tf.expand_dims(state_tensor, axis=0)  # Add batch dimension
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = keras.ops.argmax(action_probs[0].numpy())

        # Perform the action and get the next frame
        observation, reward, done, _, _ = env.step(action)

        # Preprocess the next frame
        preprocess_frame = preprocess_frame(observation, prev_frame)
        prev_frame = preprocess_frame # Update the previous frame

        # Store the preprocessed frame in the replay buffer
        state_next = np.array(preprocess_frame)
        
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, done, _, _ = env.step(action)
        state_next = np.array(state_next)

        episode_reward += reward

        # Replay buffer 
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        
        # Update state
        state = state_next

        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # get indices of samples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Prepare batches
            state_sample = np.array([state_history[i] for i in indices])
            state_sample = tf.transpose(state_sample, perm=[0, 2, 3, 1]) # Correct shape
            state_sample = tf.cast(state_sample, tf.float32) # Ensure the tensor has the correct data type (float32)
            
            state_next_sample = np.array([state_next_history[i] for i in indices])
            state_next_sample = tf.transpose(state_next_sample, perm=[0, 2, 3, 1]) # Correct shape
            state_next_sample = tf.cast(state_next_sample, tf.float32) # Ensure float32
            
            rewards_sample = np.array([rewards_history[i] for i in indices])
            action_sample = np.array([action_history[i] for i in indices])
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            # Predict Q-values for the next states
            future_rewards = model_target.predict(state_next_sample, verbose=0)
            
            # Update Q values
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            updated_q_values = updated_q_values * (1 - done_sample) # Mask terminal states 

            # One-hot encoding of actions
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Compute loss
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation    
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            print(f"Best score of last 100: {np.max(episode_reward_history)}, running reward: {running_reward} at episode {episode_count}, frame {frame_count}")
            model.save(f"Space-Invaders_q_model{episode_count}.keras")

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break
        
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 200:
        print(f"Solved at episode {episode_count}!")
        model.save(f"Space-Invaders_q_model_solved.keras")
        break

    if (max_episodes > 0 and episode_count >= max_episodes):
        print(f"Stopped at episode {episode_count}!")
        break
    if (max_frames > 0 and frame_count >= max_frames):
        print(f"Stopped at frame {frame_count}!")
        break

error: OpenCV(4.10.0) d:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.simd_helpers.hpp:92: error: (-2:Unspecified error) in function '__cdecl cv::impl::`anonymous-namespace'::CvtHelper<struct cv::impl::`anonymous namespace'::Set<3,4,-1>,struct cv::impl::A0x46dff480::Set<1,-1,-1>,struct cv::impl::A0x46dff480::Set<0,2,5>,4>::CvtHelper(const class cv::_InputArray &,const class cv::_OutputArray &,int)'
> Invalid number of channels in input image:
>     'VScn::contains(scn)'
> where
>     'scn' is 84
