In [None]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
from ale_py import ALEInterface
import wandb
import imageio
import matplotlib.pyplot as plt
import warnings
import logging
from utils_AC import *



In [1]:
# Quiet down TensorFlow, Python warnings and gym logging for the rest of the notebook.
import logging
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL is read when TensorFlow's native runtime is loaded, so it
# has to be assigned BEFORE the first `import tensorflow` in the kernel.
# If TensorFlow was already imported by an earlier cell this assignment is a no-op.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all messages, 1 = filter out INFO, 2 = filter out WARNING, 3 = filter out ERROR

import tensorflow as tf

# Python-side TensorFlow logger: only errors.
tf.get_logger().setLevel('ERROR')

# Silence every Python warning (deprecations included) for a cleaner training log.
warnings.filterwarnings('ignore')

# gym emits its own warnings through the 'gym' logger; restrict it to errors.
gym_logger = logging.getLogger('gym')
gym_logger.setLevel(logging.ERROR)

In [15]:
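# API key removed: never store credentials in the notebook. Provide the W&B key through the WANDB_API_KEY environment variable (or `wandb login`) and revoke the key that was previously committed here.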

In [None]:
#import wandb
#wandb.init(project="ActorCritic", entity = "rl_proj")

In [None]:
# ---------------------------------------------------------------------------
# Training setup: seeds, environment, actor/critic networks, optimizers, memory.
# Relies on Config, Actor, Critic and ReplayMemory, which are not defined in this
# notebook (presumably provided by `from utils_AC import *` -- verify).
# ---------------------------------------------------------------------------

# NOTE: TensorFlow reads TF_CPP_MIN_LOG_LEVEL when its native runtime is first
# loaded, so this only silences C++ logs if it runs before the first
# `import tensorflow` of the kernel session.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Seed every random source used by the notebook, not just NumPy/TensorFlow.
SEED = 42
random.seed(SEED)         # Python's `random` module
np.random.seed(SEED)      # NumPy global generator
tf.random.set_seed(SEED)  # TensorFlow global generator

# Initialize ALEInterface (Arcade Learning Environment)
ale = ALEInterface()

# Configuration settings
config = Config()
best_reward = 0  # Best episode reward seen so far

# Create the environment for the Assault game
env = gym.make("Assault-v4", render_mode="rgb_array")
env.action_space.seed(SEED)  # makes env.action_space.sample() repeatable, if the policy helper uses it
n_actions = env.action_space.n  # Number of possible actions in the environment

# Take the frame shape from the environment instead of repeating (210, 160, 3).
obs_shape = tuple(env.observation_space.shape)
batched_input_shape = (None, *obs_shape)

# Initialize actor model
actor_model = Actor(n_actions)
print("Number of actions:", n_actions)
actor_model.build(input_shape=batched_input_shape)
actor_model.summary()
dummy_input = np.random.random((1, *obs_shape))  # One random frame to run a forward pass
actor_model(dummy_input)

# Initialize critic model
critic_model = Critic()
critic_model.build(input_shape=batched_input_shape)
critic_model.summary()
dummy_input = np.random.random((1, *obs_shape))
critic_model(dummy_input)

# Learning rate schedule shared by both optimizers: 1e-4 decayed by 0.9 every 10000 steps
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=10000,
    decay_rate=0.9)
actor_optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
critic_optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

# Initialize replay memory
memory = ReplayMemory(config.MEMORY_SIZE)

# Per-episode reward history and the starting exploration rate
episode_rewards = []
epsilon = config.EPSILON_START

# Training loop: play config.EPISODES episodes, learn after every step, and save
# the models plus a GIF whenever an episode beats the best reward so far.
def _lives_remaining(info):
    """Return the remaining-lives count from an env info dict, or None if absent.

    Depending on the installed gym / ale-py version the count is reported under
    "lives" or "ale.lives"; both spellings are checked.
    """
    lives = info.get("lives")
    if lives is None:
        lives = info.get("ale.lives")
    return lives


try:
    for episode in range(config.EPISODES):
        # Use the info dict returned by reset() rather than a hard-coded one.
        state, info = env.reset()
        state = state / 255.0  # Scale pixel values to [0, 1]
        done = False
        episode_reward = 0  # Reward accumulated in the episode
        steps = 0  # Step counter
        frames = []  # Rendered frames, turned into a GIF if this episode is the best

        while not done:
            # Stop once the environment reports no lives left; an absent lives
            # entry is treated as "unknown" instead of raising a TypeError.
            lives = _lives_remaining(info)
            if lives is not None and lives <= 0:
                break

            action = take_action(state, epsilon, env, actor_model, n_actions)
            next_state, reward, terminated, truncated, info = env.step(action)[:5]
            # The episode is over on a real terminal state OR a time-limit truncation.
            done = terminated or truncated
            next_state = next_state / 255.0

            # Store the terminal flag only (same value the loop stored before), so
            # a truncation is not recorded as a true terminal state.
            memory.push(state, action, next_state, reward, terminated)
            # Optimize models based on the stored experience
            optimize_model(memory, config, critic_model, critic_optimizer, actor_model, actor_optimizer, n_actions)

            # Render the environment and store the frame
            frames.append(env.render())

            # Update state and episode reward
            state = next_state
            episode_reward += reward

            # Print step details if reward is non-zero
            if reward != 0:
                print("step: ", steps, "action: ", action, " reward: ", reward)
                print("Lives: ", _lives_remaining(info))

            steps += 1

        # Print episode summary
        print(f"Episode: {episode+1}, Reward: {episode_reward}, Epsilon: {epsilon}")

        # Check if the episode reward is the best so far and save models if so
        if episode_reward > best_reward:
            best_reward = episode_reward
            actor_model.save("./best_actor_model", save_format="tf")
            critic_model.save("./best_critic_model", save_format="tf")
            print("New best model saved with reward:", episode_reward)

            # Save the frames as a GIF
            gif_path = f"./episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=30)

        # Add episode reward to the list
        episode_rewards.append(episode_reward)

        # Update epsilon for the epsilon-greedy strategy
        epsilon = max((epsilon * config.EPSILON_DECAY_RATE), config.EPSILON_END)

        # Log episode metrics and GIF to wandb (Weights & Biases)
        #wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon, "episode_gif": wandb.Video(gif_path, fps=4, format="gif")})
finally:
    # Close the environment even if training is interrupted or raises.
    env.close()


In [None]:
# Plot the per-episode rewards collected in `episode_rewards` by the training loop.
# NOTE(review): plot_rewards is not defined in this notebook; presumably it comes
# from the `utils_AC` star import -- confirm.
plot_rewards(episode_rewards)

In [None]:
#TEST THE SAVED MODEL
# Loads the saved best actor/critic, plays N_TEST_EPISODES episodes with the actor
# and writes a GIF each time an episode beats the best evaluation reward so far.
# NOTE(review): take_action_test is not defined in this notebook; presumably it
# comes from the `utils_AC` star import -- confirm.
import gym
import imageio  # used below for the GIFs; previously missing from this cell's imports
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

N_TEST_EPISODES = 500  # number of evaluation episodes

actor_model = keras.models.load_model("./best_actor_model")

critic_model = keras.models.load_model("./best_critic_model")

env = gym.make("Assault-v4", render_mode = "rgb_array")
n_actions = env.action_space.n

rewards_per_episode = []
best_reward = 0

try:
    for episode in range(N_TEST_EPISODES):
        state, info = env.reset()
        state = state / 255.0  # Scale pixel values to [0, 1]
        done = False
        episode_reward = 0
        frames = []

        while not done:
            action = take_action_test(state, actor_model, n_actions)
            next_state, reward, terminated, truncated, info = env.step(action)[:5]
            # End the episode on a terminal state OR a time-limit truncation.
            done = terminated or truncated
            next_state = next_state / 255.0

            frames.append(env.render())

            state = next_state
            episode_reward += reward

            if reward != 0:
                print("action: ", action, " reward: ", reward)
                # The lives count is reported as "lives" or "ale.lives" depending
                # on the installed gym / ale-py version.
                print("Lives: ", info.get("lives", info.get("ale.lives")))

        rewards_per_episode.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

        if episode_reward > best_reward:
            best_reward = episode_reward

            gif_path = f"./test_episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=30)

        # Log episode metrics and GIF to wandb
        #wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon, "episode_gif": wandb.Video(gif_path, fps=4, format="gif")})
finally:
    # Close the environment even if evaluation is interrupted or raises.
    env.close()

# Line chart of the evaluation reward obtained in each test episode.
fig, ax = plt.subplots()
ax.plot(rewards_per_episode)
ax.set(xlabel='Episode', ylabel='Reward', title='Reward per Episode')
plt.show()