In [None]:
#73c9a156b91f3e0c01c3d5f332d23bfc66f4cdbf

In [1]:
import wandb
# Open a Weights & Biases run under the "rl_proj" entity, in the "DQN" project.
wandb.init(entity="rl_proj", project="DQN")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/codespace/.netrc


In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
import time
from ale_py import ALEInterface
import imageio
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
from utils_DQN import *
import os

In [None]:

# ---------------------------------------------------------------------------
# DQN training on Assault-v4.
#
# Builds the online/target networks and the replay buffer, then runs
# config.EPISODES episodes with an epsilon-greedy policy. After each episode
# the best-scoring model (and a GIF of that episode) is written to disk,
# epsilon is decayed, and every config.TARGET_UPDATE episodes the target
# network is synced with the online network.
# ---------------------------------------------------------------------------

# NOTE(review): TF_CPP_MIN_LOG_LEVEL is conventionally set *before* TensorFlow
# is imported; TensorFlow is imported in an earlier cell, so confirm this
# assignment still has the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

ale = ALEInterface()

# Seed every RNG available in this notebook (Python, NumPy, TensorFlow) so a
# Restart & Run All is as repeatable as the environment allows.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

config = Config()

env = gym.make("Assault-v4")
n_actions = env.action_space.n

# Online network, target network and replay memory.
REPLAY_CAPACITY = 10000
model = DQN(n_actions)
model_target = DQN(n_actions)
memory = ReplayMemory(REPLAY_CAPACITY)

# Optimizer and loss used by optimize_model.
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

episode_rewards = []
losses = []       # declared for later use; this loop does not populate it

# Start at -inf so the very first episode always becomes the initial "best".
best_reward = float("-inf")
best_episode = 0  # 1-based index of the best episode seen so far
best_frames = []  # declared for later use; this loop does not populate it

GIF_FPS = 30

epsilon = config.EPSILON_START
try:
    for episode in range(config.EPISODES):
        # Per-episode state. Observations are scaled by 1/255 before use.
        # NOTE(review): this presumably turns uint8 frames into float arrays,
        # which are then stored in the replay memory — check memory footprint.
        state = env.reset() / 255.0
        done = False
        episode_reward = 0
        steps = 0
        frames = []

        # Run until the environment reports termination. The original guard
        # `info.get("ale.lives") >= 0` could never be False for a non-negative
        # life counter (and raised TypeError when the key was absent), so the
        # loop is driven by `done` alone.
        while not done:
            # Epsilon-greedy action selection.
            action = take_action(state, epsilon, env, model)

            # Apply the action and scale the new observation like the state.
            next_state, reward, done, info = env.step(action)
            next_state = next_state / 255.0

            # Record the transition, then take one optimisation step.
            memory.push(state, action, next_state, reward, done)
            optimize_model(memory, config, model, model_target, n_actions, loss_function, optimizer)

            # Keep the rendered frame so the episode can be exported as a GIF.
            frames.append(env.render(mode='rgb_array'))

            state = next_state
            episode_reward += reward

            # Only report steps that produced a reward.
            if reward != 0:
                print("step: ", steps, "action: ", action, " reward: ", reward)
                print("Lives: ", info.get("ale.lives"))

            steps += 1

        print(f"\nEpisode: {episode+1}, Reward: {episode_reward}, Epsilon: {epsilon}")

        # Persist both networks and the episode GIF whenever a new best
        # episode reward is reached.
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_episode = episode + 1
            model.save("./best_DQN_model", save_format="tf")
            model_target.save("./best_DQN_model_target", save_format="tf")
            print("New best model saved with reward:", episode_reward)

            gif_path = f"./episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=GIF_FPS)

        # Record the episode result and decay epsilon, never below EPSILON_END.
        episode_rewards.append(episode_reward)
        epsilon = max((epsilon * config.EPSILON_DECAY_RATE), config.EPSILON_END)
        #wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon})

        # Periodically copy the online weights into the target network.
        if (episode + 1) % config.TARGET_UPDATE == 0:
            model_target.set_weights(model.get_weights())
finally:
    # Release the emulator even if training is interrupted or raises.
    env.close()



In [None]:
# Plot the per-episode training rewards collected in `episode_rewards` by the
# training cell, using the `plot_rewards` helper (not defined in this notebook;
# presumably provided by the `utils_DQN` star import — confirm).
plot_rewards(episode_rewards)


In [None]:
# ---------------------------------------------------------------------------
# Test the saved model.
#
# Reloads the best networks written by the training cell, plays TEST_EPISODES
# episodes of Assault-v4 with the reloaded online network, saves a GIF every
# time a new best test reward is reached, and plots the reward per episode.
# ---------------------------------------------------------------------------
import gym
import imageio
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

from utils_DQN import take_action_test

TEST_EPISODES = 20
TEST_GIF_FPS = 30

DQN_model = keras.models.load_model("./best_DQN_model")
# The target network is reloaded as well but is not used for action selection.
DQN_model_target = keras.models.load_model("./best_DQN_model_target")

env = gym.make("Assault-v4")
n_actions = env.action_space.n

rewards_per_episode = []
best_reward = 0

try:
    for episode in range(TEST_EPISODES):
        # Fresh episode; observations are scaled by 1/255 as during training.
        state = env.reset()
        state = state / 255.0
        done = False
        episode_reward = 0
        frames = []

        while not done:
            # Act with the model loaded from disk. The original passed the
            # in-memory training network `model`, so the saved model was never
            # actually evaluated and the cell failed on a fresh kernel.
            action = take_action_test(state, DQN_model)

            next_state, reward, done, info = env.step(action)
            next_state = next_state / 255.0

            # Keep the rendered frame for the optional GIF export.
            frames.append(env.render(mode='rgb_array'))

            state = next_state
            episode_reward += reward

            # Only report steps that produced a reward.
            if reward != 0:
                print("action: ", action, " reward: ", reward)
                print("Lives: ", info.get("ale.lives"))

        rewards_per_episode.append(episode_reward)
        print(f"Episode {episode + 1}: Reward = {episode_reward}")

        # Export a GIF whenever this episode beats the best test reward so far.
        if episode_reward > best_reward:
            best_reward = episode_reward
            gif_path = f"./test_episode_{episode+1}_reward_{episode_reward}.gif"
            imageio.mimsave(gif_path, frames, fps=TEST_GIF_FPS)

        # Log episode metrics and GIF to wandb
        # (disabled; note that `gif_path` only exists after a new best episode)
        #wandb.log({"episode": episode + 1, "reward": episode_reward, "epsilon": epsilon, "episode_gif": wandb.Video(gif_path, fps=4, format="gif")})
finally:
    # Release the emulator even if evaluation is interrupted or raises.
    env.close()

# Plot the rewards per episode
fig, ax = plt.subplots()
ax.plot(rewards_per_episode)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Reward per Episode')
plt.show()
