# 02_train_dqn_pong.ipynb
Este notebook define la arquitectura DQN, configura el agente, entrena durante 10M pasos y evalúa en 50 episodios para obtener la media de recompensa.

In [None]:
# Imports esenciales
import gym
import numpy as np
from gym.wrappers import AtariPreprocessing, FrameStack
from rl.core import Processor
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.agents.dqn import DQNAgent
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Permute, Conv2D, BatchNormalization, Flatten, Dense

In [None]:
# Wrappers y Processor
def wrap_env(env, frame_skip=4):
    """Wrap an Atari environment so it emits single 84x84 grayscale frames.

    Args:
        env: Atari gym environment to preprocess.
        frame_skip: Frame skip to request from ``AtariPreprocessing``. It is
            forced to 1 when the base environment appears to skip frames on
            its own, because stacking both skips is rejected by the wrapper.

    Returns:
        The environment wrapped in ``AtariPreprocessing``.
    """
    # AtariPreprocessing refuses frame_skip > 1 unless the base env has frame
    # skipping disabled (a *NoFrameskip* id or a native frameskip of 1).
    spec_id = getattr(getattr(env, 'spec', None), 'id', '') or ''
    native_skip = getattr(getattr(env, 'unwrapped', env), '_frameskip', None)
    if 'NoFrameskip' not in spec_id and native_skip != 1:
        frame_skip = 1

    # scale_obs stays False: AtariProcessor.process_state_batch already divides
    # by 255, so scaling here as well would squash the inputs towards zero.
    # No FrameStack either: SequentialMemory(window_length=...) assembles the
    # frame window, and stacking twice would not match the model input shape.
    return AtariPreprocessing(env,
                              frame_skip=frame_skip,
                              screen_size=84,
                              grayscale_obs=True,
                              scale_obs=False)

class AtariProcessor(Processor):
    """Processor hooks applied to observations, state batches and rewards."""

    def process_observation(self, observation):
        """Return the observation unchanged."""
        return observation

    def process_state_batch(self, batch):
        """Cast a state batch to float32 and divide it by 255."""
        # NOTE(review): this assumes observations arrive as raw 0-255 pixel
        # intensities; if the environment wrapper already rescales to [0, 1],
        # the inputs end up scaled twice - confirm against the wrapper config.
        as_float = batch.astype('float32')
        return as_float / 255.0

    def process_reward(self, reward):
        """Clip the reward into the interval [-1, 1]."""
        return np.clip(reward, a_min=-1.0, a_max=1.0)

In [None]:
# Global parameters and environment
# WINDOW_LENGTH frames of INPUT_SHAPE pixels make up one network input.
WINDOW_LENGTH = 4
INPUT_SHAPE = (84, 84)

env = wrap_env(gym.make('PongDeterministic-v4'))
# Number of discrete actions; sizes the network's output layer.
nb_actions = env.action_space.n

In [None]:
# CNN architecture for the DQN
model = Sequential()

# Reorder (window, height, width) -> (height, width, window) so the stacked
# frames act as the channel axis for the convolutions.
model.add(Permute((2, 3, 1), input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE))

# Three convolution stages, each followed by batch normalisation.
model.add(Conv2D(32, (8, 8), strides=4, activation='relu'))
model.add(BatchNormalization())
model.add(Conv2D(64, (4, 4), strides=2, activation='relu'))
model.add(BatchNormalization())
model.add(Conv2D(64, (3, 3), strides=1, activation='relu'))
model.add(BatchNormalization())

# Fully connected head: one linear output per action.
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(nb_actions, activation='linear'))

model.summary()

In [None]:
# DQN agent configuration

# Replay memory; it also assembles the WINDOW_LENGTH-frame states.
memory = SequentialMemory(limit=2_000_000, window_length=WINDOW_LENGTH)

# Epsilon-greedy exploration with eps annealed linearly from 1.0 down to
# 0.001 over the first 2M steps.
policy = LinearAnnealedPolicy(
    inner_policy=EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.0,
    value_min=0.001,
    value_test=0.001,
    nb_steps=2_000_000,
)

# Double + dueling DQN built on top of the CNN defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    processor=AtariProcessor(),
    gamma=0.995,
    nb_steps_warmup=100_000,
    train_interval=2,
    target_model_update=5_000,
    delta_clip=1.0,
    enable_double_dqn=True,
    enable_dueling_network=True,
    dueling_type='max',
    dueling_size=512,
)
dqn.compile(optimizer=Adam(learning_rate=6.25e-5), metrics=['mae'])

In [None]:
# Extended training run (10M steps), then persist the learned weights.
fit_options = dict(
    nb_steps=10_000_000,
    log_interval=250_000,
    verbose=2,
)
dqn.fit(env, **fit_options)

dqn.save_weights('dqn_pong_weights.h5f', overwrite=True)

In [None]:
# Evaluation in test mode (50 episodes)
# Agent.test() does not accept a `policy` argument; the policy used while
# testing is the agent's `test_policy` attribute, so set it before the call.
dqn.test_policy = EpsGreedyQPolicy(eps=0.001)
results = dqn.test(
    env,
    nb_episodes=50,
    visualize=False
)

# Summary statistics over the per-episode total rewards.
rewards = results.history['episode_reward']
print("Recompensa media:", np.mean(rewards))
print("Desviación estándar:", np.std(rewards))
print("Recompensa máxima:", np.max(rewards))
print("Recompensa mínima:", np.min(rewards))