In [None]:
# Instalación de dependencias
!pip install swig
!pip install gymnasium[classic_control]

import math
import random
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gymnasium as gym
import matplotlib.pyplot as plt


# Device selection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Collecting swig
  Downloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.4.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.4.1


In [None]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    The network has two ReLU hidden layers of equal width followed by a linear
    output layer.

    Args:
        state_size: Number of input features (size of the state vector).
        action_size: Number of outputs (one Q-value per discrete action).
        hidden_size: Width of both hidden layers. Defaults to 128, the value
            that used to be hard-coded, so existing callers are unaffected.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        # Attribute names are kept as fc1/fc2/fc3 so state_dict keys do not change.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Compute the Q-value estimates for ``x``.

        Args:
            x: Float tensor whose last dimension has ``state_size`` entries.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``action_size``.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the output layer: the values are returned as-is.
        return self.fc3(hidden)

In [None]:
class ReplayBuffer:
    """Bounded FIFO memory of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        # A deque with `maxlen` discards its oldest entry once it is full.
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the memory."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions at random and return them as tensors.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            moved to the module-level ``device``. ``actions`` is a ``long``
            tensor with a trailing dimension of size 1; the other four are
            ``float32``.
        """
        picked = random.sample(self.memory, batch_size)
        # Transpose the list of transitions into one tuple per field.
        obs, acts, rews, next_obs, flags = zip(*picked)

        def as_float(values):
            # Shared conversion for every float32 field.
            return torch.tensor(values, dtype=torch.float32).to(device)

        # States are stacked through NumPy first, then converted in one go.
        states_tensor = as_float(np.array(obs))
        next_states_tensor = as_float(np.array(next_obs))
        rewards_tensor = as_float(rews)
        dones_tensor = as_float(flags)
        # Extra trailing dimension on the action indices (added via unsqueeze(1)).
        actions_tensor = torch.tensor(acts, dtype=torch.long).unsqueeze(1).to(device)

        return (
            states_tensor,
            actions_tensor,
            rewards_tensor,
            next_states_tensor,
            dones_tensor,
        )

In [None]:
def select_action(state, online_network, epsilon, action_size):
    """Pick an action with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index in
    ``range(action_size)`` is returned; otherwise the index of the largest
    value produced by ``online_network`` for ``state`` is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(range(action_size))

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        # A leading batch dimension of size 1 is added before the forward pass.
        batch = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        best = torch.argmax(online_network(batch))
    return best.item()

def epsilon_decay_by_step(step, start, end, decay):
    """Exponentially interpolate from ``start`` (at step 0) towards ``end``.

    The gap ``start - end`` shrinks by a factor of ``exp(-step / decay)``.
    """
    gap = start - end
    shrink = math.exp(-step / decay)
    return end + gap * shrink

def update_target_network(target_network, online_network, tau=1.0):
    """Move the target network's parameters towards the online network, in place.

    Each target parameter becomes ``tau * online + (1 - tau) * target``.

    Args:
        target_network: Module whose parameters are overwritten.
        online_network: Module whose parameters are read. Its parameters are
            paired with the target's in iteration order.
        tau: Interpolation factor. ``tau == 1.0`` is a "hard" update (exact
            copy); ``tau < 1.0`` is a "soft" update.

    Note:
        Only ``parameters()`` are synchronized; buffers are not touched.
    """
    # no_grad replaces the previous `.data` access: the in-place writes are not
    # recorded by autograd and the parameters stay ordinary leaf tensors.
    with torch.no_grad():
        for target_param, online_param in zip(target_network.parameters(), online_network.parameters()):
            if tau == 1.0:
                # Exact copy. Computing 1.0 * online + 0.0 * target instead
                # would keep any NaN/Inf already present in the target
                # (0 * inf is NaN), so a hard update could never repair it.
                target_param.copy_(online_param)
            else:
                target_param.copy_(tau * online_param + (1.0 - tau) * target_param)

In [None]:
# Hyperparameters
env_name = "CartPole-v1"

# One seed for every random number generator used by this experiment. The
# generators are seeded before the networks are created so that weight
# initialisation, exploration and replay sampling start from a known state.
# NOTE(review): GPU kernels may still introduce run-to-run differences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make(env_name)
# Seed the environment once, right after creation; later `env.reset()` calls
# without a seed keep drawing from the same generator.
env.reset(seed=SEED)
env.action_space.seed(SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Networks and optimizer: the target network starts as an exact copy of the
# online network, and only the online network is optimized.
online_net = QNetwork(state_size, action_size).to(device)
target_net = QNetwork(state_size, action_size).to(device)
target_net.load_state_dict(online_net.state_dict())
optimizer = optim.Adam(online_net.parameters(), lr=1e-3)

# DDQN parameters
gamma = 0.99
buffer_capacity = 10_000
batch_size = 64
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 500  # Decay constant of the exploration rate, in environment steps
target_update_freq = 10  # Target-network update frequency (in episodes)
num_episodes = 2000

replay_buffer = ReplayBuffer(buffer_capacity)

In [None]:
# Double DQN training loop: act epsilon-greedily, store transitions, and fit
# the online network on random minibatches from the replay buffer.
total_steps = 0
list_rewards = []
for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    done = False

    while not done:
        # Exploration rate decays exponentially with the global step count.
        # The shared helper is used instead of repeating its formula inline.
        epsilon = epsilon_decay_by_step(total_steps, epsilon_start, epsilon_end, epsilon_decay)

        action = select_action(state, online_net, epsilon, action_size)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode loop stops on either signal ...
        done = terminated or truncated

        # ... but only `terminated` is stored for learning. A truncated episode
        # (for example one cut off by a time limit) did not reach a terminal
        # state, so the TD target must still bootstrap from `next_state`.
        # Storing `done` here would zero out that bootstrap term.
        replay_buffer.push(state, action, reward, next_state, terminated)
        state = next_state
        episode_reward += reward
        total_steps += 1

        if len(replay_buffer) > batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            # Q(s, a) from the online network for the actions actually taken.
            q_values = online_net(states).gather(1, actions).squeeze(1)

            # --- DDQN target ---
            with torch.no_grad():
                # 1. The ONLINE network selects the best next action.
                next_actions = online_net(next_states).argmax(dim=1).unsqueeze(1)
                # 2. The TARGET network evaluates that action.
                next_q_values = target_net(next_states).gather(1, next_actions).squeeze(1)
                # TD target; (1 - dones) removes the bootstrap term for terminal transitions.
                target_q = rewards + (gamma * next_q_values * (1 - dones))

            loss = F.mse_loss(q_values, target_q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    list_rewards.append(episode_reward)
    # Periodic hard update of the target network.
    if episode % target_update_freq == 0:
        update_target_network(target_net, online_net)

    if (episode + 1) % 100 == 0:
        print(f"Episodio {episode+1} | Recompensa: {episode_reward} | Epsilon: {epsilon:.2f}")

# Learning curve: total reward obtained in each training episode.
plt.plot(list_rewards)
plt.title('Recompensa por Episodio durante Entrenamiento')
plt.xlabel('Episodio')
plt.ylabel('Recompensa')
plt.show()
env.close()

Episodio 100 | Recompensa: 322.0 | Epsilon: 0.01
Episodio 200 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 300 | Recompensa: 11.0 | Epsilon: 0.01
Episodio 400 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 500 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 600 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 700 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 800 | Recompensa: 500.0 | Epsilon: 0.01
Episodio 900 | Recompensa: 100.0 | Epsilon: 0.01
Episodio 1000 | Recompensa: 347.0 | Epsilon: 0.01


KeyboardInterrupt: 

In [None]:
def evaluate_policy(env_name, model, n_episodes=100):
    """Run ``model`` greedily for ``n_episodes`` episodes and summarize the returns.

    A fresh environment is created for the evaluation and is always closed
    before the function returns, even if an episode raises.

    Args:
        env_name: Environment id passed to ``gym.make``.
        model: Callable mapping a batched float32 state tensor to action
            values; the action with the largest value is taken at every step.
        n_episodes: Number of evaluation episodes.

    Returns:
        Tuple ``(avg_reward, std_reward)``: mean and standard deviation of the
        per-episode total reward.
    """
    test_env = gym.make(env_name)
    rewards = []

    try:
        for _ in range(n_episodes):
            state, _ = test_env.reset()
            done = False
            total_reward = 0
            while not done:
                # Purely greedy action selection; no gradients are needed.
                with torch.no_grad():
                    state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                    action = torch.argmax(model(state_t)).item()
                state, reward, terminated, truncated, _ = test_env.step(action)
                total_reward += reward
                done = terminated or truncated
            rewards.append(total_reward)
    finally:
        # Release the environment; previously it was never closed.
        test_env.close()

    avg_reward = np.mean(rewards)
    std_reward = np.std(rewards)
    return avg_reward, std_reward

# Run the evaluation and report the mean and standard deviation of the return.
mean_r, std_r = evaluate_policy(env_name, online_net)
print(
    "\nEvaluación final:",
    f"Recompensa media: {mean_r:.2f} +/- {std_r:.2f}",
    sep="\n",
)


Evaluación final:
Recompensa media: 500.00 +/- 0.00


In [None]:
!pip install "stable-baselines3[extra]"
!pip install shimmy[atari]
!pip install autorom[accept-rom-license]

!AutoROM --accept-license

AutoROM will download the Atari 2600 ROMs.
They will be installed to:
	/usr/local/lib/python3.12/dist-packages/AutoROM/roms

Existing ROMs will be overwritten.


In [None]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Environment id shared by the training AND evaluation environments.
# Previously the training environment was hard-coded to a different game
# ("PongNoFrameskip-v4") than the one used for evaluation.
ENV_ID = "BreakoutNoFrameskip-v4"
# Number of parallel training environments.
N_ENVS = 8

# NOTE(review): the recorded run of this cell failed with
# "NameNotFound: Environment `PongNoFrameskip` doesn't exist". That looks like
# the Atari environments not being registered with Gymnasium in this runtime;
# confirm that they are registered before relying on this cell.

# 1. Training environment: N_ENVS parallel copies for the Actor-Critic method,
# each stacking the last 4 frames.
env = make_atari_env(ENV_ID, n_envs=N_ENVS)
env = VecFrameStack(env, n_stack=4)

# 2. Evaluation environment and callbacks.
eval_env = make_atari_env(ENV_ID, n_envs=1, seed=42)
eval_env = VecFrameStack(eval_env, n_stack=4)

# Requested reward threshold: 25 points. It is checked whenever the evaluation
# finds a new best model.
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=25, verbose=1)
# NOTE(review): with several parallel environments `eval_freq` may be counted
# in vectorized steps rather than individual timesteps; confirm against the
# Stable-Baselines3 documentation.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             eval_freq=5000,
                             best_model_save_path='./logs/',
                             verbose=1)

NameNotFound: Environment `PongNoFrameskip` doesn't exist.

In [None]:
# 3. Build the Actor-Critic (A2C) agent.
# "CnnPolicy" is the policy needed to process the Atari frames.
model = A2C(policy="CnnPolicy", env=env, verbose=1)

# 4. Train the model.
# The step budget is set high on purpose; the evaluation callback is expected
# to stop training once the 25-point threshold is reached.
model.learn(
    total_timesteps=1_000_000,
    callback=eval_callback,
)