In [None]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
import time
from ale_py import ALEInterface
import imageio
import matplotlib.pyplot as plt
import warnings
import logging
from utils_dueling import *

In [None]:

# Quieten noisy third-party output so the notebook mostly shows training logs.

# TensorFlow C++ backend verbosity:
#   0 = all messages, 1 = filter out INFO, 2 = filter out WARNING, 3 = filter out ERROR
# NOTE(review): tensorflow is imported in an earlier cell, so this value may be
# read too late to take effect — confirm, or set it before the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Hide Python warnings raised by any library.
warnings.filterwarnings('ignore')

# Python-side loggers: let TensorFlow and gym report errors only.
tf.get_logger().setLevel('ERROR')
gym_logger = logging.getLogger('gym')
gym_logger.setLevel(logging.ERROR)

In [2]:
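# API key removed: never commit credentials to a notebook. Supply it at runtime
# (presumably the Weights & Biases key, i.e. the WANDB_API_KEY environment variable)
# and rotate the key that was previously exposed here.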

In [None]:
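# Weights & Biases setup (disabled). The wandb.log call in the training cell only
# works once both lines below are uncommented.
#import wandb
#wandb.init(project="DuelingDQN", entity = "rl_proj")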

In [None]:
# ---------------------------------------------------------------------------
# Dueling DQN training on Assault-v4.
#
# Builds the online/target networks, replay memory and optimizer, then runs
# config.EPISODES episodes of epsilon-greedy interaction. After each episode
# the reward is recorded, epsilon is decayed, and whenever a new best reward
# is reached the online model and a GIF of the episode are written to disk.
# ---------------------------------------------------------------------------

# Suppress TensorFlow logging
# NOTE(review): tensorflow is already imported at this point, so this value may
# be read too late to take effect — confirm, or set it before the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Set random seeds for reproducibility
random.seed(42)  # Seed for Python's built-in RNG (previously left unseeded)
np.random.seed(42)  # Seed for NumPy random number generator
tf.random.set_seed(42)  # Seed for TensorFlow random number generator

# Initialize ALEInterface (Arcade Learning Environment)
ale = ALEInterface()

# Configuration settings
config = Config()

# Create the environment for the Assault game with RGB array rendering
env = gym.make("Assault-v4", render_mode="rgb_array")
n_actions = env.action_space.n  # Number of possible actions in the environment

# Initialize Dueling DQN models for training and target
model = DuelingDQN(n_actions)
model_target = DuelingDQN(n_actions)

# Initialize replay memory
memory = ReplayMemory(config.MEMORY_SIZE)

# Set up the optimizer and loss function for training
optimizer = keras.optimizers.Adam(learning_rate=config.LEARNING_RATE, clipnorm=1.0)
loss_function = keras.losses.Huber()

# Lists to store episode rewards and losses
episode_rewards = []
losses = []

best_reward = 0  # Best episode reward seen so far
best_episode = 0  # 1-based index of the episode that achieved best_reward
best_frames = []  # Not populated by this cell; kept so the name stays defined

epsilon = config.EPSILON_START  # Initial epsilon value for epsilon-greedy strategy

# Weights & Biases logging is optional: the import/init cell may be commented
# out, in which case `wandb` is undefined and logging is skipped instead of
# raising NameError. Logging is also skipped if wandb was imported but no run
# was started with wandb.init().
_wandb = globals().get("wandb")
wandb_enabled = _wandb is not None and getattr(_wandb, "run", None) is not None

# Training loop (try/finally so the environment is closed even on interrupt)
try:
    for episode in range(config.EPISODES):
        state, info = env.reset()
        state = state / 255.0  # Normalize state
        done = False  # True once the episode terminated or was truncated
        episode_reward = 0  # Reward accumulated in the episode
        steps = 0  # Step counter
        frames = []  # List to store frames for GIF

        # Use the info dict returned by the environment. The default of 1 keeps
        # the loop running on `done` alone if "lives" is not reported.
        while not done and info.get("lives", 1) > 0:
            action = take_action(state, epsilon, env, model)  # Epsilon-greedy action
            next_state, reward, terminated, truncated, info = env.step(action)
            # A truncated episode (time limit) must also end the loop; stepping
            # past truncation is outside the environment API contract.
            done = terminated or truncated
            next_state = next_state / 255.0  # Normalize the next state

            # Store the experience in replay memory. Only true termination is
            # stored as the done flag, exactly as before.
            memory.push(state, action, next_state, reward, terminated)
            # Optimize models based on the stored experience
            optimize_model(memory, config, model, model_target, n_actions, loss_function, optimizer)

            # Render the environment and store the frame
            frames.append(env.render())

            # Update state and episode reward
            state = next_state
            episode_reward += reward

            # Print step details if reward is non-zero
            if reward != 0:
                print("step: ", steps, "action: ", action, " reward: ", reward)
                print("Lives: ", info.get("lives"))

            steps += 1

        # Print episode summary
        print(f"\nEpisodio: {episode+1}, Recompensa: {episode_reward}, Epsilon: {epsilon}")

        # Check if the episode reward is the best so far and save models if so
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_episode = episode + 1
            # Ensure the model is built before saving
            if not model.built:
                dummy_input = np.zeros((1, *env.observation_space.shape))
                model(dummy_input)
            # Save the model
            model_save_path = "./dueling_model"
            model.save(model_save_path)
            print("Model saved")

            # Save the frames as a GIF
            gif_path = f"./episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, format='GIF', fps=30)

        # Update epsilon for the epsilon-greedy strategy
        epsilon = max((epsilon * config.EPSILON_DECAY_RATE), config.EPSILON_END)

        # Log episode metrics to wandb (Weights & Biases) when it is available
        if wandb_enabled:
            _wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon})
        episode_rewards.append(episode_reward)

        # Update the target model weights
        if (episode + 1) % config.TARGET_UPDATE == 0:
            model_target.set_weights(model.get_weights())
finally:
    # Close the environment after training (also on error / KeyboardInterrupt)
    env.close()


In [5]:
# Plot the per-episode rewards collected in `episode_rewards` by the training loop.
# NOTE(review): `episode` is the loop variable left over from whichever loop ran
# last. After training it is config.EPISODES - 1; if the evaluation cell has been
# run it is 499. plot_rewards is presumably provided by utils_dueling — confirm
# whether it expects the last index or the episode count.
plot_rewards(episode_rewards, episode)


In [None]:
# Import necessary libraries
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Greedy evaluation of the saved Dueling DQN model on Assault-v4.
#
# Loads the model written by the training cell, plays N_TEST_EPISODES episodes
# always choosing the highest-valued action, saves a GIF each time a new best
# reward is reached, and finally plots the reward per episode.
# NOTE: this cell rebinds `env`, `n_actions`, `best_reward` and `episode`,
# which the training cell also defines.
# ---------------------------------------------------------------------------

# Load the saved Dueling DQN model
dueling_model = keras.models.load_model("./dueling_model")

# Create the Assault-v4 environment with RGB array rendering
env = gym.make("Assault-v4", render_mode="rgb_array")
n_actions = env.action_space.n  # Number of possible actions in the environment

# List to store rewards for each episode
rewards_per_episode = []
best_reward = 0  # Variable to track the best reward achieved

N_TEST_EPISODES = 500  # Number of evaluation episodes

# Testing loop (try/finally so the environment is closed even on interrupt)
try:
    for episode in range(N_TEST_EPISODES):
        state, info = env.reset()
        state = state / 255.0  # Normalize the initial state
        done = False  # True once the episode terminated or was truncated
        episode_reward = 0  # Reward accumulated in the episode
        frames = []  # List to store frames for GIF

        while not done:
            # Select the greedy action from the model's prediction. verbose=0
            # stops Keras printing a progress bar for every single step.
            q_values = dueling_model.predict(state[np.newaxis, ...], verbose=0)
            action = int(np.argmax(q_values))
            next_state, reward, terminated, truncated, info = env.step(action)
            # End the episode on truncation (time limit) as well as termination;
            # otherwise the loop would keep stepping a finished episode.
            done = terminated or truncated
            next_state = next_state / 255.0  # Normalize the next state

            # Render the environment and store the frame
            frames.append(env.render())

            # Update state and episode reward
            state = next_state
            episode_reward += reward

            # Print action and reward details if reward is non-zero
            if reward != 0:
                print("action: ", action, " reward: ", reward)
                print("Lives: ", info.get("lives"))

        rewards_per_episode.append(episode_reward)  # Append the episode reward to the list
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

        # Save the best performing episode as a GIF
        if episode_reward > best_reward:
            best_reward = episode_reward
            gif_path = f"./test_episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=30)

        # Uncomment to log episode metrics and GIF to wandb
        #wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon, "episode_gif": wandb.Video(gif_path, fps=4, format="gif")})
finally:
    # Close the environment after testing (also on error / KeyboardInterrupt)
    env.close()

# Plot the rewards per episode
fig, ax = plt.subplots()
ax.plot(rewards_per_episode)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Reward per Episode')
plt.show()
