In [1]:
import logging
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL has to be in the environment BEFORE TensorFlow is
# imported; assigning it afterwards does not silence the native messages that
# are emitted while the library loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all messages, 1 = filter out INFO, 2 = filter out WARNING, 3 = filter out ERROR

import tensorflow as tf

# Python-side TensorFlow logger: only errors.
tf.get_logger().setLevel('ERROR')

# Hide Python warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# gym logs through the standard logging module under the 'gym' logger name.
gym_logger = logging.getLogger('gym')
gym_logger.setLevel(logging.ERROR)

In [2]:
import wandb
# Start the Weights & Biases run that the wandb.log() calls in later cells report to.
wandb.init(project="ActorCritic grayscale", entity = "rl_proj")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mroysgc[0m ([33mrl_proj[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
import gym
import random
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
from ale_py import ALEInterface
import wandb
import imageio
import matplotlib.pyplot as plt

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Set random seeds for reproducibility.
# Python's own `random` module is seeded as well: it drives the epsilon-greedy
# draw in take_action() and the minibatch selection in ReplayMemory.sample().
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# NOTE(review): this ALEInterface instance is not referenced anywhere else in
# the visible notebook - confirm whether it is still needed.
ale = ALEInterface()

class Config:
    """Hyperparameters for the actor-critic training run, kept as class attributes."""

    # --- Exploration -------------------------------------------------------
    # Epsilon begins at EPSILON_START, is multiplied by EPSILON_DECAY_RATE
    # once per episode and is floored at EPSILON_END.
    EPSILON_START = 1.0
    EPSILON_DECAY_RATE = 0.99
    EPSILON_END = 0.01

    # --- Run length --------------------------------------------------------
    EPISODES = 50
    MAX_STEPS_PER_EPISODE = 1000

    # --- Optimisation ------------------------------------------------------
    BATCH_SIZE = 128       # transitions drawn from replay memory per update
    GAMMA = 0.999          # discount factor
    LEARNING_RATE = 1e-4   # learning-rate hyperparameter

    # --- Replay memory -----------------------------------------------------
    MEMORY_SIZE = 10000    # maximum number of stored transitions


# Single shared settings object used by the rest of the notebook.
config = Config()

# Highest episode reward seen so far; the training loop saves models when it is beaten.
best_reward = 0

# The environment is created without a render_mode; the training loop below
# requests frames explicitly through env.render(mode="rgb_array").
env = gym.make("Assault-v4")
# Size of the discrete action space; used as the actor's output width.
n_actions = env.action_space.n

class ReplayMemory(object):
    """Bounded FIFO store of environment transitions.

    Once `capacity` transitions are held, appending a new one silently drops
    the oldest (behaviour of `deque(maxlen=...)`).
    """

    def __init__(self, capacity):
        # Record type for one step: (state, action, next_state, reward, done).
        self.transition = namedtuple(
            'Transition', 'state action next_state reward done')
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Wrap the positional arguments in a Transition and store it."""
        record = self.transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises ValueError (from random.sample) if fewer than `batch_size`
        transitions are stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)



class Actor(keras.Model):
    """Policy network: three convolutions, then two dense layers.

    Batch normalisation sits between the second convolution and its ReLU.
    The last dense layer has `n_actions` softmax units, so a forward pass
    yields one probability per action for every input in the batch.
    """

    def __init__(self, n_actions):
        super().__init__()
        self.conv1 = layers.Conv2D(
            filters=32, kernel_size=8, strides=4,
            activation="relu", kernel_initializer='he_normal')
        # conv2 is linear on purpose: its ReLU is applied in call(), after batch norm.
        self.conv2 = layers.Conv2D(filters=64, kernel_size=4, strides=2, activation=None)
        self.batch_norm1 = layers.BatchNormalization()
        self.conv3 = layers.Conv2D(filters=64, kernel_size=3, strides=1, activation="relu")
        self.flatten = layers.Flatten()
        self.d1 = layers.Dense(
            units=512, activation="relu",
            kernel_regularizer=keras.regularizers.l2(0.001))
        # Output layer for action probabilities.
        self.d2 = layers.Dense(units=n_actions, activation="softmax")

    def call(self, inputs):
        """Map a batch of inputs to a batch of action-probability vectors."""
        features = self.conv2(self.conv1(inputs))
        features = tf.nn.relu(self.batch_norm1(features))
        features = self.flatten(self.conv3(features))
        return self.d2(self.d1(features))


class Critic(keras.Model):
    """Value network: three ReLU convolutions, a 512-unit dense layer and a
    single linear output unit per input."""

    def __init__(self):
        super().__init__()
        self.conv1 = layers.Conv2D(filters=32, kernel_size=8, strides=4, activation="relu")
        self.conv2 = layers.Conv2D(filters=64, kernel_size=4, strides=2, activation="relu")
        self.conv3 = layers.Conv2D(filters=64, kernel_size=3, strides=1, activation="relu")
        self.flatten = layers.Flatten()
        self.d1 = layers.Dense(units=512, activation="relu")
        self.d2 = layers.Dense(units=1)

    def call(self, inputs):
        """Map a batch of inputs to one scalar estimate each (output shape (batch, 1))."""
        features = self.conv3(self.conv2(self.conv1(inputs)))
        hidden = self.d1(self.flatten(features))
        return self.d2(hidden)

# Policy network with one softmax output per environment action.
actor_model = Actor(n_actions)

# Value network with a single scalar output.
critic_model = Critic()

def rgb_to_grayscale(rgb):
    """Collapse the colour axis of an image into one weighted-sum channel.

    Only the first three entries of the last axis are used (any extra channel
    is ignored); they are combined with weights 0.2989, 0.5870 and 0.1140.
    The result has the input's shape without its last axis.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

# Build both networks for batches of 210x160 single-channel frames.
actor_model.build(input_shape=(None, 210, 160, 1))  # 1 channel for grayscale
critic_model.build(input_shape=(None, 210, 160, 1))

# Push one random frame-shaped input through each network; the outputs are discarded.
dummy_input = np.random.random((1, 210, 160, 1))  # same shape as one grayscale frame
actor_model(dummy_input)
critic_model(dummy_input)

# Exponentially decaying learning rate. The starting value is read from the
# Config class instead of being repeated here as a literal, so changing
# Config.LEARNING_RATE actually affects training.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=config.LEARNING_RATE,
    decay_steps=10000,
    decay_rate=0.9)

# One Adam optimizer per network, both following the schedule above.
actor_optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
critic_optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

# Replay buffer shared by the training loop (writer) and optimize_model() (reader).
memory = ReplayMemory(config.MEMORY_SIZE)


def take_action(state, epsilon):
    """Choose an action with epsilon-greedy exploration over the actor's policy.

    Args:
        state: Input accepted by `actor_model`; the training loop passes a
            single preprocessed frame of shape (1, 210, 160, 1).
        epsilon: Probability of ignoring the policy and sampling uniformly
            from the environment's action space.

    Returns:
        An action index: either a uniform sample from `env.action_space` or a
        draw from the actor's predicted probability distribution.
    """
    if random.random() < epsilon:
        return env.action_space.sample()

    # Call the model directly instead of Model.predict(): predict() is built
    # for large batched inputs and, invoked once per environment step, adds
    # overhead and progress output. training=False keeps inference behaviour.
    action_probabilities = np.squeeze(actor_model(state, training=False).numpy())

    # A diverged network can emit NaNs; fall back to a random action rather
    # than letting np.random.choice raise.
    if np.isnan(action_probabilities).any():
        return env.action_space.sample()
    return np.random.choice(n_actions, p=action_probabilities)

def _apply_clipped_gradients(optimizer, grads, variables, clip_norm, label):
    """Clip gradients by global norm and apply them exactly once.

    Variables whose gradient is None are reported and skipped. If the global
    norm is Inf/NaN the whole update is skipped, because
    tf.clip_by_global_norm turns every gradient into NaN in that case and
    applying them would corrupt the weights.
    """
    usable = []
    for grad, var in zip(grads, variables):
        if grad is None:
            print(f"Gradient is None for variable {var.name}")
        else:
            usable.append((grad, var))

    if not usable:
        print("No valid gradients to apply.")
        return

    clipped, global_norm = tf.clip_by_global_norm([g for g, _ in usable], clip_norm)
    if not np.isfinite(global_norm.numpy()):
        print(f"Inf or NaN detected in {label} gradients")
        return

    optimizer.apply_gradients(zip(clipped, [v for _, v in usable]))


def optimize_model():
    """Run one actor-critic update on a minibatch drawn from replay memory.

    Does nothing until the memory holds at least `config.BATCH_SIZE`
    transitions. Otherwise:

    * Critic: regress V(s) onto the one-step TD target
      r + GAMMA * (1 - done) * V(s'), with the target held constant.
    * Actor: policy-gradient step on -log pi(a|s) * advantage, where the
      advantage is the TD error (target - V(s)), also held constant.

    Returns:
        None. The actor and critic weights are updated in place through
        their optimizers.
    """
    if len(memory) < config.BATCH_SIZE:
        return  # Not enough samples yet

    # Sample a batch of transitions and regroup it field by field.
    transitions = memory.sample(config.BATCH_SIZE)
    batch = memory.transition(*zip(*transitions))

    state_batch = np.asarray(batch.state, dtype=np.float32).reshape(-1, 210, 160, 1)
    next_state_batch = np.asarray(batch.next_state, dtype=np.float32).reshape(-1, 210, 160, 1)
    action_batch = np.asarray(batch.action, dtype=np.int32)
    reward_batch = np.asarray(batch.reward, dtype=np.float32)
    # 1.0 where the episode ended on this transition, so the bootstrap term is dropped.
    done_batch = np.asarray(batch.done, dtype=np.float32)

    # One-step TD target, computed outside any tape and explicitly detached so
    # neither network receives gradients through it.
    next_values = tf.squeeze(critic_model(next_state_batch), axis=-1)
    td_target = tf.stop_gradient(
        reward_batch + config.GAMMA * (1.0 - done_batch) * next_values)

    # ---- Critic update ----------------------------------------------------
    with tf.GradientTape() as tape:
        # axis=-1 keeps the batch axis even for a batch of one.
        values = tf.squeeze(critic_model(state_batch), axis=-1)
        critic_loss = tf.math.reduce_mean(tf.math.square(td_target - values))

    critic_grads = tape.gradient(critic_loss, critic_model.trainable_variables)
    _apply_clipped_gradients(
        critic_optimizer, critic_grads, critic_model.trainable_variables, 1.0, "critic")

    # ---- Actor update -----------------------------------------------------
    # TD error as the advantage estimate; constant with respect to the actor.
    advantage = tf.stop_gradient(td_target - values)

    with tf.GradientTape() as tape:
        action_probs = actor_model(state_batch)
        # One-hot mask picks out the probability of the action actually taken.
        action_mask = tf.one_hot(action_batch, n_actions)
        selected_action_probs = tf.reduce_sum(action_probs * action_mask, axis=1)

        log_floor = 1e-8  # keeps log() finite when a probability underflows to 0
        actor_loss = -tf.math.reduce_mean(
            tf.math.log(selected_action_probs + log_floor) * advantage)

    actor_grads = tape.gradient(actor_loss, actor_model.trainable_variables)
    _apply_clipped_gradients(
        actor_optimizer, actor_grads, actor_model.trainable_variables, 0.5, "actor")

episode_rewards = []
epsilon = config.EPSILON_START

for episode in range(config.EPISODES):
    # reset() is used here as returning the first RGB observation only.
    state = env.reset()

    # Preprocess: grayscale, scale to [0, 1], add batch and channel axes.
    state = rgb_to_grayscale(state) / 255.0
    state = state.reshape(1, 210, 160, 1)

    done = False
    episode_reward = 0
    steps = 0
    # Placeholder so the loop condition can be evaluated before the first step;
    # it is replaced by the real info dict returned from env.step().
    info = {'ale.lives': 4, 'episode_frame_number': 2, 'frame_number': 2}
    frames = []

    # The default of 0 keeps the comparison valid if the info dict has no
    # "ale.lives" key (None >= 0 would raise TypeError).
    while not done and info.get("ale.lives", 0) >= 0:
        action = take_action(state, epsilon)

        next_state, reward, done, info = env.step(action)

        next_state = rgb_to_grayscale(next_state) / 255.0
        next_state = next_state.reshape(1, 210, 160, 1)

        memory.push(state, action, next_state, reward, done)
        optimize_model()

        # Keep every rendered frame so a GIF can be written if this episode
        # turns out to be a new best.
        frames.append(env.render(mode="rgb_array"))

        state = next_state
        episode_reward += reward

        if reward != 0:
            print("step: ", steps, "action: ", action, " reward: ", reward)
            print("Lives: ", info.get("ale.lives"))

        steps += 1

    print(f"Episode: {episode+1}, Reward: {episode_reward}, Epsilon: {epsilon}")

    # Metrics logged for every episode; the GIF is attached only when one was
    # actually written for THIS episode. Referencing gif_path unconditionally
    # raised NameError when the first episode was not a new best and otherwise
    # re-uploaded the previous best episode's GIF.
    episode_log = {"episode": episode + 1, "reward": episode_reward}

    if episode_reward > best_reward:
        best_reward = episode_reward

        actor_model.save("./best_actor_model", save_format="tf")
        critic_model.save("./best_critic_model", save_format="tf")
        print("New best model saved with reward:", episode_reward)

        gif_path = f"./episode_{episode+1}_reward_{episode_reward}.gif"
        imageio.mimsave(gif_path, frames, duration=20)
        episode_log["episode_gif"] = wandb.Video(gif_path, fps=4, format="gif")

    episode_rewards.append(episode_reward)

    # Multiplicative decay once per episode, floored at EPSILON_END.
    epsilon = max((epsilon * config.EPSILON_DECAY_RATE), config.EPSILON_END)
    episode_log["epsilon"] = epsilon

    wandb.log(episode_log)

env.close()

import matplotlib.pyplot as plt
# Reward curve for the training run, drawn through matplotlib's object-oriented API.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(episode_rewards)
ax.set_title("Rewards per Episode")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.grid(True)

# Write the PNG first, then display the figure in the notebook.
plot_path = "./rewards_plot.png"
fig.savefig(plot_path)
plt.show()

# Attach the saved image to the wandb run.
wandb.log({"Training process of Actor-Critic": wandb.Image(plot_path)})


step:  68 action:  2  reward:  21.0
Lives:  4
step:  99 action:  1  reward:  21.0
Lives:  3
step:  154 action:  2  reward:  21.0
Lives:  3
step:  290 action:  3  reward:  21.0
Lives:  2
step:  320 action:  4  reward:  21.0
Lives:  2
step:  343 action:  2  reward:  21.0
Lives:  2
step:  451 action:  4  reward:  21.0
Lives:  1
step:  459 action:  4  reward:  21.0
Lives:  1
step:  494 action:  5  reward:  21.0
Lives:  1
Episode: 1, Reward: 189.0, Epsilon: 1.0




New best model saved with reward: 189.0
step:  34 action:  6  reward:  21.0
Lives:  4
step:  52 action:  2  reward:  21.0
Lives:  4
step:  89 action:  2  reward:  21.0
Lives:  4
step:  96 action:  1  reward:  21.0
Lives:  4
step:  131 action:  1  reward:  21.0
Lives:  3
step:  146 action:  0  reward:  21.0
Lives:  3
step:  199 action:  6  reward:  21.0
Lives:  3
step:  279 action:  0  reward:  21.0
Lives:  2
step:  317 action:  5  reward:  21.0
Lives:  2
step:  368 action:  1  reward:  21.0
Lives:  1
Episode: 2, Reward: 210.0, Epsilon: 0.99




New best model saved with reward: 210.0
step:  19 action:  0  reward:  21.0
Lives:  4
step:  60 action:  1  reward:  21.0
Lives:  4
step:  124 action:  1  reward:  21.0
Lives:  4
step:  158 action:  6  reward:  21.0
Lives:  4
step:  174 action:  4  reward:  21.0
Lives:  4
step:  300 action:  5  reward:  21.0
Lives:  3
step:  644 action:  2  reward:  21.0
Lives:  1
step:  662 action:  2  reward:  21.0
Lives:  1
step:  668 action:  2  reward:  21.0
Lives:  1
Episode: 3, Reward: 189.0, Epsilon: 0.9801
step:  20 action:  6  reward:  21.0
Lives:  4
step:  98 action:  2  reward:  21.0
Lives:  4
step:  145 action:  0  reward:  21.0
Lives:  4
step:  159 action:  0  reward:  21.0
Lives:  4
step:  184 action:  4  reward:  21.0
Lives:  4
step:  281 action:  5  reward:  21.0
Lives:  2
step:  309 action:  1  reward:  21.0
Lives:  2
step:  325 action:  6  reward:  21.0
Lives:  2
step:  348 action:  6  reward:  21.0
Lives:  2
step:  514 action:  0  reward:  21.0
Lives:  1
step:  565 action:  4  rewar



New best model saved with reward: 273.0
step:  14 action:  2  reward:  21.0
Lives:  4
step:  22 action:  2  reward:  21.0
Lives:  4
step:  31 action:  1  reward:  21.0
Lives:  4
step:  92 action:  6  reward:  21.0
Lives:  4
step:  141 action:  2  reward:  21.0
Lives:  4
step:  280 action:  3  reward:  21.0
Lives:  3
step:  322 action:  2  reward:  21.0
Lives:  3
step:  341 action:  5  reward:  21.0
Lives:  3
step:  364 action:  1  reward:  21.0
Lives:  3
step:  382 action:  6  reward:  21.0
Lives:  3
step:  508 action:  2  reward:  21.0
Lives:  2
step:  552 action:  6  reward:  21.0
Lives:  2
step:  584 action:  4  reward:  21.0
Lives:  2
step:  596 action:  1  reward:  21.0
Lives:  2
step:  617 action:  6  reward:  21.0
Lives:  2
step:  673 action:  6  reward:  21.0
Lives:  2
step:  744 action:  1  reward:  21.0
Lives:  1
step:  822 action:  1  reward:  21.0
Lives:  1
step:  859 action:  5  reward:  21.0
Lives:  1
Episode: 5, Reward: 399.0, Epsilon: 0.96059601




New best model saved with reward: 399.0
step:  96 action:  4  reward:  21.0
Lives:  4
step:  125 action:  6  reward:  21.0
Lives:  4
step:  142 action:  4  reward:  21.0
Lives:  4
step:  172 action:  1  reward:  21.0
Lives:  4
step:  196 action:  1  reward:  21.0
Lives:  4
step:  236 action:  3  reward:  21.0
Lives:  4
step:  335 action:  2  reward:  21.0
Lives:  3
step:  386 action:  3  reward:  21.0
Lives:  3
step:  431 action:  4  reward:  21.0
Lives:  3
step:  494 action:  2  reward:  21.0
Lives:  2
step:  599 action:  0  reward:  21.0
Lives:  2
step:  709 action:  3  reward:  21.0
Lives:  2
step:  766 action:  1  reward:  21.0
Lives:  2
step:  773 action:  1  reward:  21.0
Lives:  2
step:  836 action:  4  reward:  21.0
Lives:  1
Episode: 6, Reward: 315.0, Epsilon: 0.9509900498999999
step:  20 action:  4  reward:  21.0
Lives:  4
step:  100 action:  0  reward:  21.0
Lives:  4
step:  117 action:  2  reward:  21.0
Lives:  4
step:  139 action:  4  reward:  21.0
Lives:  4
step:  197 act



New best model saved with reward: 441.0
step:  21 action:  3  reward:  21.0
Lives:  4
step:  81 action:  3  reward:  21.0
Lives:  4
step:  103 action:  0  reward:  21.0
Lives:  4
step:  150 action:  5  reward:  21.0
Lives:  4
step:  172 action:  3  reward:  21.0
Lives:  4
step:  235 action:  2  reward:  21.0
Lives:  3
step:  356 action:  1  reward:  21.0
Lives:  2
step:  393 action:  3  reward:  21.0
Lives:  2
step:  405 action:  5  reward:  21.0
Lives:  2
step:  564 action:  3  reward:  21.0
Lives:  1
step:  604 action:  5  reward:  21.0
Lives:  1
step:  676 action:  3  reward:  21.0
Lives:  1
Episode: 9, Reward: 252.0, Epsilon: 0.92274469442792
step:  14 action:  5  reward:  21.0
Lives:  4
step:  89 action:  0  reward:  21.0
Lives:  3
step:  139 action:  1  reward:  21.0
Lives:  3
step:  190 action:  2  reward:  21.0
Lives:  3
step:  264 action:  1  reward:  21.0
Lives:  2
step:  272 action:  5  reward:  21.0
Lives:  2
step:  322 action:  6  reward:  21.0
Lives:  2
step:  328 action:

In [None]:

#TEST THE SAVED MODEL
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# Restore the actor from the directory written by the training cell's model.save() call.
actor_model = keras.models.load_model("./best_actor_model")

# The critic is restored as well, although nothing below in this cell reads it.
critic_model = keras.models.load_model("./best_critic_model")

# Unlike the training cell, the environment is created with render_mode set,
# and env.render() is called without arguments further down.
env = gym.make("Assault-v4", render_mode = "rgb_array")
n_actions = env.action_space.n

def take_action(state):
    """Sample an action from the loaded actor's predicted distribution.

    No epsilon-greedy exploration is applied during evaluation.

    Args:
        state: Preprocessed frame batch; the evaluation loop passes shape
            (1, 210, 160, 1).

    Returns:
        An action index drawn according to the predicted probabilities.
    """
    # verbose=0: predict() is called once per environment step, and its
    # per-call progress output would otherwise flood the cell output.
    action_probabilities = actor_model.predict(state, verbose=0)
    return np.random.choice(n_actions, p=np.squeeze(action_probabilities))


# NOTE: this cell relies on `rgb_to_grayscale`, `imageio` and `wandb` from the
# training cell; they are not imported or defined again here.
rewards_per_episode = []
best_reward = 0

for episode in range(50):
    # reset() is unpacked as (observation, info) in this cell.
    state, info = env.reset()
    state = rgb_to_grayscale(state) / 255.0
    state = state.reshape(1, 210, 160, 1)
    done = False
    episode_reward = 0
    frames = []

    while not done:
        action = take_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)[:5]
        # An episode is over when the game ends OR the environment cuts it
        # short; ignoring `truncated` could leave this loop running forever.
        done = terminated or truncated

        next_state = rgb_to_grayscale(next_state) / 255.0
        next_state = next_state.reshape(1, 210, 160, 1)

        frames.append(env.render())

        state = next_state
        episode_reward += reward

        if reward != 0:
            print("action: ", action, " reward: ", reward)
            # Accept either spelling of the lives key so the printout is not
            # None when the info dict uses "lives".
            print("Lives: ", info.get("lives", info.get("ale.lives")))

    rewards_per_episode.append(episode_reward)
    print(f"Episode {episode + 1}: Reward = {episode_reward}")

    # `epsilon` is intentionally not logged: evaluation uses no exploration
    # and the name only exists as a leftover of the training cell.
    episode_log = {"episode": episode + 1, "reward": episode_reward}

    if episode_reward > best_reward:
        best_reward = episode_reward

        gif_path = f"./test_episode_{episode+1}_reward_{episode_reward}.gif"
        imageio.mimsave(gif_path, frames, fps=30)
        # Attach the GIF only when one was written for THIS episode; using
        # gif_path unconditionally raised NameError or re-sent a stale file.
        episode_log["episode_gif"] = wandb.Video(gif_path, fps=4, format="gif")

    wandb.log(episode_log)


env.close()

# Plot the reward obtained in each evaluation episode.
plt.plot(rewards_per_episode)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward per Episode')

# Save BEFORE plt.show(): once the figure has been shown, the current figure
# is a new empty one and savefig() would write a blank image.
plot_path = "./test_rewards_plot.png"
plt.savefig(plot_path)
plt.show()


wandb.log({"Testing of Actor-Critic": wandb.Image(plot_path)})
