In [2]:
import wandb
# Start a Weights & Biases run; metrics logged later in the notebook go to it.
wandb.init(
    project="assault_1try",
    entity="rl_proj",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/codespace/.netrc


In [3]:
import os
import random
import time
from collections import namedtuple, deque

import gym
import imageio
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from ale_py import ALEInterface
from tensorflow import keras
from tensorflow.keras import layers

ale = ALEInterface()

# Environment setup. Frames are rendered as RGB arrays so they can be saved.
# NOTE: "Assault-v0" was used previously; "Assault-v4" is the version being tried.
env = gym.make("Assault-v4", render_mode="rgb_array")
n_actions = env.action_space.n

# --- Exploration schedule (epsilon-greedy) ---
# High initial exploration with a slow decay.
EPSILON_START, EPSILON_END = 1.0, 0.01
EPSILON_DECAY = 10000  # increase for a slower decay

# --- Training schedule ---
EPISODES = 5  # raise for longer training
MAX_STEPS_PER_EPISODE = 500
TARGET_UPDATE = 5  # episodes between target-network weight copies

# --- Learning ---
BATCH_SIZE = 128
GAMMA = 0.999  # discount factor

class ReplayMemory(object):
    """Fixed-capacity buffer of transitions; the oldest entries are dropped first.

    Attributes:
        memory: ``deque`` bounded by ``capacity`` that holds the stored records.
        transition: ``namedtuple`` type with the fields
            ``state, action, next_state, reward, done``.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        field_names = ('state', 'action', 'next_state', 'reward', 'done')
        self.transition = namedtuple('Transition', field_names)
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap ``args`` in a ``Transition`` record and append it to the buffer."""
        record = self.transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class DQN(keras.Model):
    """Convolutional Q-network.

    Three strided 5x5 convolutions (16, 16 and 32 filters), each followed by
    batch normalization, then a 512-unit dense layer and a linear output
    layer with one unit per action.

    Args:
        n_actions: Number of units in the output layer.
    """

    def __init__(self, n_actions):
        super().__init__()

        self.layer1 = layers.Conv2D(16, 5, strides=2, activation="relu")
        self.bn1 = layers.BatchNormalization()
        self.layer2 = layers.Conv2D(16, 5, strides=2, activation="relu")
        self.bn2 = layers.BatchNormalization()
        self.layer3 = layers.Conv2D(32, 5, strides=2, activation="relu")
        self.bn3 = layers.BatchNormalization()
        self.flatten = layers.Flatten()
        self.layer4 = layers.Dense(512, activation="relu")
        self.action = layers.Dense(n_actions, activation="linear")

    def call(self, inputs, training=None):
        """Compute the network output for a batch of inputs.

        Args:
            inputs: Batched input tensor fed to the first convolution
                (presumably image observations with a leading batch axis;
                confirm against the caller).
            training: Forwarded to the batch-normalization layers so callers
                can select their training or inference behaviour. ``None``
                (the default) leaves the decision to Keras, which matches
                calling the model without the argument.

        Returns:
            The output of the final linear layer (``n_actions`` units).
        """
        x = self.layer1(inputs)
        x = self.bn1(x, training=training)
        x = self.layer2(x)
        x = self.bn2(x, training=training)
        x = self.layer3(x)
        x = self.bn3(x, training=training)
        x = self.flatten(x)
        x = self.layer4(x)
        return self.action(x)

# Create the online network, the target network and the replay memory
model = DQN(n_actions)
model_target = DQN(n_actions)
memory = ReplayMemory(10000)

# Subclassed Keras models only create their weights on the first call, so run
# one dummy observation through both networks and then copy the online
# weights into the target network. Without this the two networks start from
# different random initialisations and the first TD targets are computed
# from parameters unrelated to the network being trained.
_dummy_obs = np.zeros((1,) + tuple(env.observation_space.shape), dtype=np.float32)
model(_dummy_obs)
model_target(_dummy_obs)
model_target.set_weights(model.get_weights())

# Optimizer and loss function
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

def take_action(state, epsilon):
    """Select an action with an epsilon-greedy policy.

    Args:
        state: A single observation without a batch axis; one is added here
            before the observation is passed to ``model``.
        epsilon: Probability of returning a random action sampled from
            ``env.action_space`` instead of the greedy one.

    Returns:
        The selected action index as a Python ``int``.
    """
    if random.random() < epsilon:
        return int(env.action_space.sample())

    # Call the model directly rather than through model.predict(): predict()
    # is designed for large batched inputs and, for one observation per
    # environment step, adds per-call overhead and progress-bar output.
    q_values = model(state[np.newaxis, ...])
    return int(np.argmax(np.asarray(q_values)[0]))

def optimize_model():
    """Run one gradient step on a random minibatch drawn from ``memory``.

    The TD target is ``reward + GAMMA * max_a Q_target(next_state, a)``, with
    the bootstrap term zeroed for transitions whose ``done`` flag is set.
    ``model`` is updated through ``optimizer`` using ``loss_function``.

    Returns:
        The minibatch loss as a ``float``, or ``None`` when the memory holds
        fewer than ``BATCH_SIZE`` transitions and no update was performed.
    """
    if len(memory) < BATCH_SIZE:
        return None

    transitions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transition records into one Transition of tuples.
    batch = memory.transition(*zip(*transitions))

    # Explicit dtypes keep the NumPy arrays compatible with the float32
    # tensors produced by the networks instead of relying on implicit casts.
    state_batch = np.asarray(batch.state, dtype=np.float32)
    action_batch = np.asarray(batch.action, dtype=np.int32)
    next_state_batch = np.asarray(batch.next_state, dtype=np.float32)
    reward_batch = np.asarray(batch.reward, dtype=np.float32)
    done_batch = np.asarray(batch.done, dtype=np.float32)

    # Targets are computed outside the tape, so no gradient flows through them.
    future_rewards = model_target(next_state_batch)
    target = reward_batch + GAMMA * tf.reduce_max(future_rewards, axis=-1) * (1.0 - done_batch)

    # One-hot mask that selects the Q-value of the action actually taken.
    action_mask = tf.one_hot(action_batch, n_actions)

    with tf.GradientTape() as tape:
        q_values = model(state_batch)
        q_action = tf.reduce_sum(tf.multiply(q_values, action_mask), axis=-1)
        loss = loss_function(target, q_action)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return float(loss)

episode_rewards = []
losses = []

best_reward = -float("inf")
best_episode = 0
best_frames = []

# Directory for the GIFs; created up front so imageio does not fail on a
# missing folder.
GIF_DIR = "/workspaces/RL_Project/Assault_gifs"
os.makedirs(GIF_DIR, exist_ok=True)

# Agent training
epsilon = EPSILON_START
epsilon_step = (EPSILON_START - EPSILON_END) / EPSILON_DECAY  # linear decay per env step

try:
    for episode in range(EPISODES):
        state, info = env.reset()
        # float32 halves the replay-memory footprint compared with the
        # float64 array that a plain `state / 255.0` would produce.
        state = state.astype(np.float32) / 255.0
        done = False
        episode_reward = 0
        steps = 0
        frames = []  # rendered frames of the current episode

        while not done and steps < MAX_STEPS_PER_EPISODE:
            frames.append(env.render())

            action = take_action(state, epsilon)
            next_state, reward, terminated, truncated, info = env.step(action)[:5]
            next_state = next_state.astype(np.float32) / 255.0

            # Only a true terminal state is stored as `done`: a truncated
            # episode is not terminal, so bootstrapping from next_state
            # must remain enabled for it.
            memory.push(state, action, next_state, reward, terminated)
            loss = optimize_model()
            if loss is not None:
                losses.append(loss)

            state = next_state
            episode_reward += reward
            done = terminated or truncated

            if reward != 0:
                print("step: ", steps, "action: ", action, " reward: ", reward)
                print("Lives: ", info.get("lives"))

            steps += 1
            epsilon = max(epsilon - epsilon_step, EPSILON_END)

        print(f"\nEpisodio: {episode+1}, Recompensa: {episode_reward}, Epsilon: {epsilon}")

        # Update the best episode once the episode's total reward is known,
        # and write its GIF only then, so the file named after the best
        # episode always contains that episode's own frames.
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_episode = episode
            best_frames = frames
            gif_path = os.path.join(GIF_DIR, f"best_episode_{best_episode+1}.gif")
            imageio.mimsave(gif_path, best_frames, format='GIF', fps=30)

        episode_rewards.append(episode_reward)

        if (episode + 1) % TARGET_UPDATE == 0:
            model_target.set_weights(model.get_weights())

        # At the end of each episode, plot and save the metrics
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(episode_rewards, label="Recompensas por episodio")
        ax.set_xlabel("Episodio")
        ax.set_ylabel("Recompensa")
        ax.set_title("Evolución de la Recompensa por Episodio")
        ax.legend()
        fig.savefig(f"reward_plot_episode_{episode+1}.png")
        # A single log call keeps all of an episode's metrics on one wandb step.
        wandb.log({
            "episode": episode + 1,
            "reward": episode_reward,
            "epsilon": epsilon,
            "reward_plot": wandb.Image(fig),
        })
        plt.close(fig)
finally:
    # Release the emulator even if training is interrupted (e.g. Ctrl+C).
    env.close()


2023-11-28 17:40:17.075960: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 17:40:17.261014: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 17:40:17.261053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 17:40:17.293962: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 17:40:17.358529: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 17:40:17.359595: I tensorflow/core/platform/cpu_feature_guard.cc:1

step:  111 action:  5  reward:  21.0
Lives:  3


KeyboardInterrupt: 

In [None]:
#pip install imageio