In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import namedtuple, deque
import time
from ale_py import ALEInterface
import imageio

ale = ALEInterface()

# Inicialización del entorno
env = gym.make("Assault-v0", render_mode="rgb_array")
n_actions = env.action_space.n

# Parámetros modificados para una mayor exploración inicial y un decaimiento más lento
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 10000  # Aumenta para un decaimiento más lento
EPISODES = 100  # Más episodios para permitir un aprendizaje más prolongado
TARGET_UPDATE = 5
BATCH_SIZE = 128
GAMMA = 0.999
MAX_STEPS_PER_EPISODE = 200

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
        self.transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

    def push(self, *args):
        """Save a transition"""
        self.memory.append(self.transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DQN(keras.Model):

    def __init__(self, n_actions):
        super(DQN, self).__init__()

        self.layer1 = layers.Conv2D(16, 5, strides=2, activation="relu")
        self.bn1 = layers.BatchNormalization()
        self.layer2 = layers.Conv2D(16, 5, strides=2, activation="relu")
        self.bn2 = layers.BatchNormalization()
        self.layer3 = layers.Conv2D(32, 5, strides=2, activation="relu")
        self.bn3 = layers.BatchNormalization()
        self.flatten = layers.Flatten()
        self.layer4 = layers.Dense(512, activation="relu")
        self.action = layers.Dense(n_actions, activation="linear")

    def call(self, inputs):
        x = self.layer1(inputs)
        x = self.bn1(x)
        x = self.layer2(x)
        x = self.bn2(x)
        x = self.layer3(x)
        x = self.bn3(x)
        x = self.flatten(x)
        x = self.layer4(x)
        return self.action(x)

# Creación del modelo y la memoria
model = DQN(n_actions)
model_target = DQN(n_actions)
memory = ReplayMemory(10000)

# Preparación del optimizador y la función de pérdida
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

def take_action(state, epsilon):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        q_values = model.predict(state[np.newaxis, ...])
        return np.argmax(q_values[0])

def optimize_model():
    if memory.__len__() < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = memory.transition(*zip(*transitions))

    state_batch = np.array(batch.state)
    action_batch = np.array(batch.action)
    next_state_batch = np.array(batch.next_state)
    rewad_batch = np.array(batch.reward)
    done_batch = np.array(batch.done, dtype=np.int8)

    future_rewards = model_target(next_state_batch)
    target = rewad_batch + GAMMA * tf.reduce_max(future_rewards, axis=-1) * (1 - done_batch)

    action_mask = tf.one_hot(action_batch, n_actions)

    with tf.GradientTape() as tape:
        q_values = model(state_batch)
        q_action = tf.reduce_sum(tf.multiply(q_values, action_mask), axis=-1)
        loss = loss_function(target, q_action)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    #model.save_weights('/Users/roy/Desktop/UNI')

# Entrenamiento del agente
epsilon = EPSILON_START
for episode in range(EPISODES):
    state, info = env.reset()  
    state = state / 255.0
    #state = env.reset() / 255.0
    done = False
    episode_reward = 0
    steps = 0

    frames = []

    while not done and steps < MAX_STEPS_PER_EPISODE:
        frame = env.render()
        frames.append(frame)

        action = take_action(state, epsilon)
        step_result = env.step(action)
        next_state, reward, done, _ = step_result[:4]
        next_state = next_state / 255.0

        memory.push(state, action, next_state, reward, done)
        optimize_model()

        state = next_state
        episode_reward += reward
        steps += 1
        
        epsilon = max(epsilon - (EPSILON_START - EPSILON_END) / EPSILON_DECAY, EPSILON_END)
        
        print("action: ", action, " reward: ", reward)
    print(f"\nEpisodio: {episode+1}, Recompensa: {episode_reward}, Epsilon: {epsilon}")

    #gif_path = f"episode_{episode+1}.gif"
    gif_path = f"/workspaces/RL_Project/Assault_gifs/episode_{episode+1}.gif"
    imageio.mimsave(gif_path, frames, format='GIF', fps=30)

    if (episode + 1) % TARGET_UPDATE == 0:
        model_target.set_weights(model.get_weights())

env.close()


2023-11-28 11:22:25.014419: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 11:22:25.130195: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 11:22:25.130243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 11:22:25.164542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 11:22:25.194021: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 11:22:25.194909: I tensorflow/core/platform/cpu_feature_guard.cc:1

action:  6  reward:  0.0
action:  2  reward:  0.0
action:  4  reward:  0.0
action:  0  reward:  0.0
action:  2  reward:  0.0
action:  4  reward:  0.0
action:  3  reward:  0.0
action:  3  reward:  0.0
action:  6  reward:  0.0
action:  0  reward:  0.0
action:  2  reward:  0.0
action:  3  reward:  0.0
action:  6  reward:  0.0
action:  6  reward:  0.0
action:  0  reward:  0.0
action:  1  reward:  0.0
action:  6  reward:  0.0
action:  3  reward:  0.0
action:  1  reward:  0.0
action:  2  reward:  0.0
action:  5  reward:  0.0
action:  3  reward:  0.0
action:  3  reward:  0.0
action:  5  reward:  0.0
action:  4  reward:  0.0
action:  4  reward:  0.0
action:  3  reward:  0.0
action:  2  reward:  0.0
action:  2  reward:  0.0
action:  2  reward:  0.0
action:  2  reward:  0.0
action:  5  reward:  0.0
action:  3  reward:  0.0
action:  4  reward:  0.0
action:  3  reward:  0.0
action:  3  reward:  0.0
action:  0  reward:  0.0
action:  5  reward:  0.0
action:  0  reward:  0.0
action:  0  reward:  0.0


2023-11-28 11:22:35.250281: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 103219200 exceeds 10% of free system memory.
2023-11-28 11:22:35.311445: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51609600 exceeds 10% of free system memory.
2023-11-28 11:22:35.354738: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65814528 exceeds 10% of free system memory.
2023-11-28 11:22:35.466682: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65814528 exceeds 10% of free system memory.
2023-11-28 11:22:35.512369: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65814528 exceeds 10% of free system memory.


action:  5  reward:  0.0
action:  5  reward:  0.0
action:  2  reward:  0.0
action:  5  reward:  0.0
action:  1  reward:  0.0
action:  6  reward:  0.0
action:  4  reward:  0.0
action:  3  reward:  0.0
action:  1  reward:  0.0
action:  0  reward:  0.0
action:  3  reward:  0.0
action:  5  reward:  0.0
action:  6  reward:  0.0
action:  3  reward:  0.0
action:  3  reward:  0.0
action:  2  reward:  0.0
action:  3  reward:  0.0
action:  4  reward:  0.0
action:  4  reward:  0.0
action:  0  reward:  0.0
action:  1  reward:  0.0
action:  0  reward:  0.0
action:  4  reward:  0.0

Episodio: 3, Recompensa: 21.0, Epsilon: 0.985150000000006
action:  3  reward:  0.0
action:  0  reward:  0.0
action:  3  reward:  0.0
action:  4  reward:  0.0
action:  5  reward:  0.0
action:  0  reward:  0.0
action:  5  reward:  0.0
action:  5  reward:  0.0
action:  6  reward:  0.0
action:  1  reward:  0.0
action:  3  reward:  0.0
action:  3  reward:  0.0
action:  0  reward:  0.0
action:  0  reward:  0.0
action:  5  rewa

: 

In [None]:
#pip install imageio