In [1]:
import logging
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL is read by TensorFlow's native (C++) logger when the
# library is loaded, so it has to be set BEFORE `import tensorflow`.
# 0 = all messages, 1 = hide INFO, 2 = hide INFO + WARNING, 3 = hide INFO + WARNING + ERROR
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf

# Silence the Python-side TensorFlow logger below ERROR.
tf.get_logger().setLevel('ERROR')

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Only let gym report errors.
gym_logger = logging.getLogger('gym')
gym_logger.setLevel(logging.ERROR)

In [None]:
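# API key redacted: never commit credentials to a notebook. Authenticate with `wandb login` or the WANDB_API_KEY environment variable, and revoke the key that was previously stored here.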

In [2]:
import wandb

# Open the Weights & Biases run that the later cells log metrics to.
wandb.init(entity="rl_proj", project="DuelingDQN")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mroysgc[0m ([33mrl_proj[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
import time
from ale_py import ALEInterface
import imageio
import matplotlib.pyplot as plt

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Seed every random number generator this notebook draws from. Python's
# `random` module drives both the epsilon-greedy coin flip and the replay
# buffer sampling, so seeding NumPy and TensorFlow alone is not enough.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): the gym environment / action space keep their own RNG and are
# not seeded here — seed them as well if fully repeatable runs are required.

# Instantiate the Arcade Learning Environment interface (not referenced again
# in the visible cells).
ale = ALEInterface()

class Config:
    """Hyperparameters for the Dueling DQN training run."""

    # --- training schedule ---
    EPISODES = 1_500
    MAX_STEPS_PER_EPISODE = 1_000
    TARGET_UPDATE = 100

    # --- epsilon-greedy exploration ---
    EPSILON_START = 1.0
    EPSILON_END = 1e-2
    EPSILON_DECAY_RATE = 0.99

    # --- optimisation / replay ---
    BATCH_SIZE = 128
    GAMMA = 0.999
    LEARNING_RATE = 0.00025
    MEMORY_SIZE = 10_000

# Hyperparameter object read by the model setup and training code below.
config = Config()
# Best episode reward seen so far (assigned 0 again right before the training loop).
best_reward = 0

# Create the Assault environment and read the size of its action space,
# which is passed to the DuelingDQN constructor further down.
env = gym.make("Assault-v4")
n_actions = env.action_space.n


class ReplayMemory:
    """Fixed-capacity buffer of transitions with uniform random sampling.

    Once `capacity` transitions are stored, pushing a new one drops the oldest.
    """

    def __init__(self, capacity):
        # Record type for one stored step; also used by callers to regroup a batch.
        self.transition = namedtuple(
            'Transition', ['state', 'action', 'next_state', 'reward', 'done']
        )
        # A bounded deque evicts from the left automatically when full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition given as (state, action, next_state, reward, done)."""
        record = self.transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class DuelingDQN(keras.Model):
    """Convolutional Q-network with separate state-value and advantage heads."""

    def __init__(self, n_actions):
        super().__init__()
        # All three convolutions share kernel size, stride and activation.
        conv_args = dict(kernel_size=5, strides=2, activation="relu")

        # Feature extractor: three conv + batch-norm stages, then a dense layer.
        self.layer1 = layers.Conv2D(filters=16, **conv_args)
        self.bn1 = layers.BatchNormalization()
        self.layer2 = layers.Conv2D(filters=16, **conv_args)
        self.bn2 = layers.BatchNormalization()
        self.layer3 = layers.Conv2D(filters=32, **conv_args)
        self.bn3 = layers.BatchNormalization()
        self.flatten = layers.Flatten()
        self.layer4 = layers.Dense(units=512, activation="relu")

        # Dueling heads: one scalar V(s) and one advantage per action.
        self.state_value = layers.Dense(units=1)
        self.action_advantage = layers.Dense(units=n_actions)

    def call(self, inputs):
        """Return one Q-value per action for each input in the batch."""
        features = inputs
        conv_stages = (
            (self.layer1, self.bn1),
            (self.layer2, self.bn2),
            (self.layer3, self.bn3),
        )
        for conv, norm in conv_stages:
            features = norm(conv(features))
        features = self.layer4(self.flatten(features))

        value = self.state_value(features)
        advantage = self.action_advantage(features)

        # Q = V + (A - mean(A)): centre the advantages across the action axis
        # before adding the state value.
        mean_advantage = tf.reduce_mean(advantage, axis=1, keepdims=True)
        return value + (advantage - mean_advantage)


# Online network (trained every step) and target network (used for bootstrapped targets).
model = DuelingDQN(n_actions)
model_target = DuelingDQN(n_actions)

# A subclassed Keras model only creates its variables on the first call, so
# push one dummy observation through both networks before copying weights.
_dummy_obs = np.zeros((1, *env.observation_space.shape), dtype=np.float32)
model(_dummy_obs)
model_target(_dummy_obs)

# Start the target network as an exact copy of the online network. Without
# this it keeps unrelated random weights until the first periodic sync, so the
# early TD targets would come from a network unrelated to the one being trained.
model_target.set_weights(model.get_weights())

# Replay buffer of past transitions.
memory = ReplayMemory(config.MEMORY_SIZE)

# Adam with gradient-norm clipping, and Huber loss on the TD error.
optimizer = keras.optimizers.Adam(learning_rate=config.LEARNING_RATE, clipnorm=1.0)
loss_function = keras.losses.Huber()

def take_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: A single observation (no batch axis) as a NumPy array.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        An action index: a random sample from the environment's action space
        with probability `epsilon`, otherwise the argmax of the online
        network's Q-values for `state`.
    """
    if random.random() < epsilon:
        return env.action_space.sample()

    # Call the model directly instead of `model.predict`: for a single
    # observation `predict` builds a full batched-inference pipeline on every
    # call, which dominates the per-step cost of the training loop.
    q_values = model(state[np.newaxis, ...], training=False)
    return int(np.argmax(q_values[0]))

def optimize_model():
    """Run one gradient step on the online network from a replay-buffer batch.

    Does nothing until the buffer holds at least `config.BATCH_SIZE`
    transitions. Otherwise it samples a batch, builds one-step TD targets with
    the target network, minimises the Huber loss between those targets and the
    online network's Q-value for the action actually taken, and logs the loss
    to wandb.
    """
    if len(memory) < config.BATCH_SIZE:
        return

    transitions = memory.sample(config.BATCH_SIZE)
    # Turn a list of Transition records into one Transition of per-field tuples.
    batch = memory.transition(*zip(*transitions))

    # Explicit dtypes keep every operand float32 (TensorFlow's default) instead
    # of relying on implicit float64/int8 -> float32 conversion in mixed
    # NumPy/Tensor arithmetic.
    state_batch = np.asarray(batch.state, dtype=np.float32)
    action_batch = np.asarray(batch.action, dtype=np.int32)
    next_state_batch = np.asarray(batch.next_state, dtype=np.float32)
    reward_batch = np.asarray(batch.reward, dtype=np.float32)
    done_batch = np.asarray(batch.done, dtype=np.float32)

    # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
    # zeroed for terminal transitions. Computed outside the tape so no gradient
    # flows through the target network.
    future_rewards = model_target(next_state_batch)
    max_future = tf.reduce_max(future_rewards, axis=-1)
    target = reward_batch + config.GAMMA * max_future * (1.0 - done_batch)

    # One-hot mask that selects the Q-value of the action taken in each row.
    action_mask = tf.one_hot(action_batch, n_actions)

    with tf.GradientTape() as tape:
        # NOTE(review): the model is called without `training=True`, so the
        # BatchNormalization layers presumably run in inference mode here —
        # confirm this is intended.
        q_values = model(state_batch)
        q_action = tf.reduce_sum(tf.multiply(q_values, action_mask), axis=-1)
        loss = loss_function(target, q_action)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    wandb.log({"loss": loss.numpy()})

# Checkpoint setup
checkpoint_path = "training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Per-episode history used by the plotting cell.
episode_rewards = []
losses = []

# Best episode seen so far.
best_reward = 0
best_episode = 0
best_frames = []

epsilon = config.EPSILON_START

# try/finally guarantees the emulator is released even when training is
# interrupted (e.g. KeyboardInterrupt from the notebook's stop button).
try:
    for episode in range(config.EPISODES):
        # Keep observations as float32: they are stored twice per transition in
        # the replay buffer, and the float64 produced by `uint8 / 255.0` would
        # double the buffer's memory footprint.
        state = np.asarray(env.reset(), dtype=np.float32) / 255.0
        done = False
        episode_reward = 0
        steps = 0
        # Placeholder until the first step reports the real life count.
        lives = 1
        frames = []

        while not done and lives > 0:
            action = take_action(state, epsilon)
            next_state, reward, done, info = env.step(action)
            next_state = np.asarray(next_state, dtype=np.float32) / 255.0
            # Fall back to the last known value if the env does not report lives.
            lives = info.get("ale.lives", lives)

            memory.push(state, action, next_state, reward, done)
            optimize_model()

            # Keep every rendered frame so the best episode can be saved as a GIF.
            frames.append(env.render(mode="rgb_array"))

            state = next_state
            episode_reward += reward

            if reward != 0:
                print("step: ", steps, "action: ", action, " reward: ", reward)
                print("Lives: ", info.get("ale.lives"))

            steps += 1

        print(f"\nEpisodio: {episode+1}, Recompensa: {episode_reward}, Epsilon: {epsilon}")

        if episode_reward > best_reward:
            best_reward = episode_reward
            best_episode = episode + 1
            # Ensure the model is built before saving.
            if not model.built:
                dummy_input = np.zeros((1, *env.observation_space.shape))
                model(dummy_input)
            # Save the model
            # TODO: change the path to the duelingDQN folder
            model_save_path = f"./dueling_model"
            model.save(model_save_path)
            print(f"Model saved")

            gif_path = f"./episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, format='GIF', fps=30)

        # Log the epsilon that was actually used for this episode, then decay it.
        wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon})
        epsilon = max((epsilon * config.EPSILON_DECAY_RATE), config.EPSILON_END)

        episode_rewards.append(episode_reward)

        # Periodically copy the online weights into the target network.
        if (episode + 1) % config.TARGET_UPDATE == 0:
            model_target.set_weights(model.get_weights())
finally:
    env.close()



Episodio: 1, Recompensa: 0.0, Epsilon: 1.0
step:  16 action:  5  reward:  21.0
Lives:  4
step:  78 action:  3  reward:  21.0
Lives:  4

Episodio: 2, Recompensa: 42.0, Epsilon: 0.99




Model saved
step:  20 action:  4  reward:  21.0
Lives:  4


KeyboardInterrupt: 

In [7]:
# Draw the per-episode reward curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(episode_rewards, label="Recompensas por episodio")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("Evolution of the rewards per episode")
ax.legend()

# TODO: change the path to the duelingDQN folder
fig.savefig(f"./reward_plot_episode_{episode+1}.png")
# Upload the same figure to wandb, then release it.
wandb.log({"training process Dueling DQN": wandb.Image(fig)})
plt.close(fig)

In [10]:
#TEST THE SAVED MODEL
import gym
import imageio
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import wandb

# Load the saved model
# TODO: change the path to the duelingDQN folder
dueling_model = keras.models.load_model(f"./dueling_model")

# Create the environment
env = gym.make("Assault-v4")
n_actions = env.action_space.n

rewards_per_episode = []
best_reward = 0

# try/finally releases the emulator even if the evaluation is interrupted.
try:
    for episode in range(500):
        state = np.asarray(env.reset(), dtype=np.float32) / 255.0
        done = False
        episode_reward = 0
        frames = []

        while not done:
            # Greedy policy. Calling the model directly avoids the per-call
            # overhead of `predict` for a single observation.
            q_values = dueling_model(state[np.newaxis, ...], training=False)
            action = int(np.argmax(q_values))
            next_state, reward, done, info = env.step(action)
            next_state = np.asarray(next_state, dtype=np.float32) / 255.0

            frames.append(env.render(mode='rgb_array'))

            state = next_state
            episode_reward += reward

            if reward != 0:
                print("action: ", action, " reward: ", reward)
                print("Lives: ", info.get("ale.lives"))

        rewards_per_episode.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

        # Evaluation is purely greedy, so epsilon is logged as 0 rather than
        # reusing whatever value the training cell left in the kernel.
        log_payload = {"episode": episode + 1, "reward": episode_reward, "epsilon": 0.0}

        if episode_reward > best_reward:
            best_reward = episode_reward

            gif_path = f"./test_episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=30)
            # Attach the GIF only when one was written for this episode; before,
            # `gif_path` could be undefined (NameError) or point at a stale
            # recording from an earlier episode.
            log_payload["episode_gif"] = wandb.Video(gif_path, fps=4, format="gif")

        # Log episode metrics (and the GIF, if any) to wandb
        wandb.log(log_payload)
finally:
    env.close()

# Plot the reward per episode
plt.plot(rewards_per_episode)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Reward per Episode')
plt.show()

action:  1  reward:  21.0
Lives:  3
action:  0  reward:  21.0
Lives:  2


KeyboardInterrupt: 