In [1]:
import gymnasium as gym
import ale_py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
from gymnasium.wrappers import FrameStackObservation, AtariPreprocessing
import os
import random
from copy import deepcopy

# Register the environments provided by ale_py with Gymnasium.
gym.register_envs(ale_py)

# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
ENV_NAME = "PongNoFrameskip-v4"
env = gym.make(ENV_NAME, render_mode="rgb_array")

# Grayscale + wrapper-side scaling with a frame skip of 4, then stack the
# 4 most recent processed frames into a single observation.
env = AtariPreprocessing(env, grayscale_obs=True, scale_obs=True, frame_skip=4)
env = FrameStackObservation(env, 4)

n_actions = env.action_space.n
print("Action space:", n_actions)
obs_shape = env.observation_space.shape
print("Observation shape:", obs_shape)

# --- Hyperparameters -------------------------------------------------------
SEED = 42
MAX_STEPS = 20_000_000       # total environment steps in the main training loop
REPLAY_SIZE = 100_000        # replay buffer capacity (transitions)
WARMUP_STEPS = 10_000        # random transitions collected before training starts
BATCH_SIZE = 32
# NOTE(review): GAMMA = 1 means undiscounted returns; DQN setups commonly use
# 0.99 for stability. Left unchanged here -- confirm this value is intentional.
GAMMA = 1
LR = 1e-4
TARGET_UPDATE_FREQ = 10_000  # steps between policy -> target network copies
UPDATE_FREQ = 4              # environment steps between gradient updates
EPS_START = 1.0
EPS_END = 0.02
EPS_DECAY = 0.999995         # multiplicative epsilon decay applied every step
SAVE_PATH = "./model_checkpoints"
EVAL_EVERY = 100_000         # steps between greedy evaluations / checkpoints
EVAL_EPISODES = 10
trained = False              # set to True once train() has completed in this session

# --- Reproducibility -------------------------------------------------------
# Seed every source of randomness, including the environment itself and its
# action space: env.action_space.sample() uses its own generator, so seeding
# only random / numpy / torch leaves exploration non-reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
env.action_space.seed(SEED)
env.reset(seed=SEED)

# Kept as the last statement so the cell has no stray repr output
# (os.makedirs returns None).
os.makedirs(SAVE_PATH, exist_ok=True)

Action space: 6
Observation shape: (4, 84, 84)


<torch._C.Generator at 0x7b9cbc069210>

In [3]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        in_channels: number of channels of the input tensor.
        n_actions: length of the output vector (one value per action).

    The first linear layer expects 64 * 7 * 7 features, so the input's spatial
    size must shrink to 7x7 after the three convolutions (84x84 inputs do).
    """

    def __init__(self, in_channels, n_actions):
        super().__init__()
        # Attribute names double as state_dict keys, and the creation order
        # fixes the order in which parameters are initialised.
        self.conv1 = nn.Conv2d(
            in_channels, 32, kernel_size=8, stride=4
        )
        self.conv2 = nn.Conv2d(
            32, 64, kernel_size=4, stride=2
        )
        self.conv3 = nn.Conv2d(
            64, 64, kernel_size=3, stride=1
        )
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=512)
        self.out = nn.Linear(in_features=512, out_features=n_actions)

    def forward(self, x):
        """Return a (batch, n_actions) tensor of action values for batch ``x``."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        # Collapse everything but the batch dimension before the dense layers.
        flat = x.view(x.shape[0], -1)
        hidden = F.relu(self.fc1(flat))
        return self.out(hidden)

In [4]:
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)


class ReplayBuffer:
    """Bounded FIFO store of Transition tuples with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition; ``args`` are the Transition fields in order."""
        transition = Transition(*args)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays; actions are int64, rewards float32 and dones uint8, while
            states / next_states are the stacked stored arrays.
        """
        picked = random.sample(self.buffer, batch_size)

        def column(field):
            # Collect one Transition field across the sampled items.
            return [getattr(item, field) for item in picked]

        return (
            np.stack(column('state')),
            np.array(column('action'), dtype=np.int64),
            np.array(column('reward'), dtype=np.float32),
            np.stack(column('next_state')),
            np.array(column('done'), dtype=np.uint8),
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

def preprocess_observation(obs):
    """
    Convert an observation returned by env.reset() / env.step() into a
    float32 array in channels-first (4, H, W) layout.

    A 3-D input whose last axis has length 4 is treated as (H, W, 4) and
    transposed; otherwise a 3-D input whose first axis has length 4 is kept
    as is. Values are only cast to float32, never rescaled.

    Raises:
        RuntimeError: if the input is not 3-D or neither its first nor its
            last axis has length 4.
    """
    arr = np.array(obs)

    # Reject anything that is not recognisably a 4-frame stack.
    if arr.ndim != 3 or 4 not in (arr.shape[0], arr.shape[2]):
        raise RuntimeError(f"Forma inesperada de observación: {arr.shape}")

    # The channels-last check takes priority, matching (H, W, 4) inputs first.
    if arr.shape[2] == 4:
        arr = arr.transpose(2, 0, 1)

    return arr.astype(np.float32)

def to_tensor(x, dtype=torch.float32):
    """Wrap NumPy array ``x`` as a tensor on the global ``device``, cast to ``dtype``."""
    tensor = torch.from_numpy(x)
    return tensor.to(device=device, dtype=dtype)

# The networks take 4 input channels. Accept a 3-D observation shape that has
# length 4 on its last axis (H, W, 4) or on its first axis (4, H, W).
if len(obs_shape) != 3:
    raise RuntimeError("Observación con dimensionalidad inesperada.")
if obs_shape[2] != 4 and obs_shape[0] != 4:
    raise RuntimeError("No pude inferir canales de la observación.")
in_ch = 4

# Online network (trained) and target network (starts as an exact copy of the
# online weights and is kept in eval mode).
policy_net = DQN(in_ch, n_actions).to(device)
target_net = DQN(in_ch, n_actions).to(device)
initial_weights = policy_net.state_dict()
target_net.load_state_dict(initial_weights)
target_net.eval()

# Only the online network's parameters are handed to the optimizer.
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
replay = ReplayBuffer(REPLAY_SIZE)

def compute_td_loss(batch_size):
    """Run one gradient step on a minibatch sampled from ``replay``.

    Builds the one-step TD target from ``target_net``, fits ``policy_net`` to
    it with a smooth-L1 loss, clamps every gradient element to [-1, 1] and
    applies one ``optimizer`` step.

    Args:
        batch_size: number of transitions to sample from ``replay``.

    Returns:
        The loss of this minibatch as a Python float.
    """
    states, actions, rewards, next_states, dones = replay.sample(batch_size)

    obs_t = to_tensor(states)
    next_obs_t = to_tensor(next_states)
    action_idx = torch.from_numpy(actions).to(device, dtype=torch.int64).unsqueeze(1)
    reward_t = torch.from_numpy(rewards).to(device)
    done_t = torch.from_numpy(dones).to(device, dtype=torch.uint8)

    # Value predicted by the online network for the action actually taken.
    chosen_q = policy_net(obs_t).gather(1, action_idx).squeeze(1)

    with torch.no_grad():
        best_next_q = target_net(next_obs_t).max(dim=1).values
        # Drop the bootstrap term wherever the stored done flag is set.
        bootstrap = best_next_q * (1 - done_t)
        target_q = reward_t + GAMMA * bootstrap

    loss = F.smooth_l1_loss(chosen_q, target_q)

    optimizer.zero_grad()
    loss.backward()
    # Element-wise clamp of all existing gradients to [-1, 1].
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

    return loss.item()

In [None]:
def _evaluate_greedy(n_episodes):
    """Play ``n_episodes`` greedy episodes on ``env`` and return their total rewards.

    Actions are the argmax of ``policy_net`` (no exploration). ``env`` is left
    at the end of a finished episode, so the caller must reset it before
    stepping it again.
    """
    totals = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        obs = preprocess_observation(obs)
        finished = False
        total = 0.0
        while not finished:
            obs_v = to_tensor(np.expand_dims(obs, axis=0))
            with torch.no_grad():
                action = int(policy_net(obs_v).argmax().item())
            obs, reward, terminated, truncated, _ = env.step(action)
            finished = terminated or truncated
            obs = preprocess_observation(obs)
            total += reward
        totals.append(total)
    return totals


def _plot_rewards(episode_rewards, mean_rewards):
    """Plot per-episode rewards together with their running mean."""
    plt.figure(figsize=(12,6))
    plt.title("Recompensa episódica / media móvil")
    plt.xlabel("Episodios")
    plt.ylabel("Recompensa")
    plt.plot(episode_rewards, alpha=0.4, label="episodios")
    if len(mean_rewards) > 0:
        plt.plot(mean_rewards, label="media móvil 100")
    plt.legend()
    plt.grid()
    plt.show()


def train():
    """Train ``policy_net`` with DQN on ``env`` for ``MAX_STEPS`` steps.

    Phases:
      1. Fill ``replay`` with random-action transitions until it holds
         ``WARMUP_STEPS`` entries.
      2. Epsilon-greedy interaction with a gradient step every ``UPDATE_FREQ``
         steps and a target-network sync every ``TARGET_UPDATE_FREQ`` steps.
      3. Every ``EVAL_EVERY`` steps: greedy evaluation plus a checkpoint in
         ``SAVE_PATH``.
    At the end the final weights are saved and the reward curve is plotted.

    Returns:
        True once training has run to completion.
    """
    step_count = 0
    episode_rewards = []
    mean_rewards = []

    state, _ = env.reset()
    state = preprocess_observation(state)

    print("Rellenando replay buffer con acciones aleatorias...")
    while len(replay) < WARMUP_STEPS:
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)
        next_state_p = preprocess_observation(next_state)
        # Store `terminated` (not `terminated or truncated`) as the done flag:
        # it masks the bootstrap term, and a truncated episode did not reach a
        # terminal state.
        replay.push(state, action, reward, next_state_p, terminated)
        # Reset exactly once when the episode is over.
        if terminated or truncated:
            state = preprocess_observation(env.reset()[0])
        else:
            state = next_state_p

    print(f"Replay buffer listo: {len(replay)} transiciones. Comenzando entrenamiento...")

    episode_reward = 0.0
    episode_idx = 0
    eps = EPS_START

    while step_count < MAX_STEPS:
        eps = max(eps*EPS_DECAY, EPS_END)

        # Epsilon-greedy action selection.
        if random.random() < eps:
            action = env.action_space.sample()
        else:
            state_v = to_tensor(np.expand_dims(state, axis=0))
            with torch.no_grad():
                action = int(policy_net(state_v).argmax().item())

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        next_state_p = preprocess_observation(next_state)
        replay.push(state, action, reward, next_state_p, terminated)

        state = next_state_p
        episode_reward += reward
        step_count += 1

        # The buffer was already warmed up above, so learning starts right
        # away; the length check only protects against WARMUP_STEPS < BATCH_SIZE.
        if len(replay) >= BATCH_SIZE and step_count % UPDATE_FREQ == 0:
            compute_td_loss(BATCH_SIZE)

        if step_count % TARGET_UPDATE_FREQ == 0:
            # load_state_dict copies the values, no deepcopy needed.
            target_net.load_state_dict(policy_net.state_dict())
            print(f"[Paso {step_count}] Copiado a target network.")

        # End of episode.
        if done:
            episode_rewards.append(episode_reward)
            episode_idx += 1
            # Running mean over the last 100 episodes.
            mean_rewards.append(np.mean(episode_rewards[-100:]))
            if episode_idx % 10 == 0:
                print(f"Episode {episode_idx} | Step {step_count} | Episodic reward: {episode_reward:.1f} | "
                    f"eps={eps:.3f} | mean100={mean_rewards[-1]:.2f} | replay={len(replay)}")
            # Start a new episode.
            state, _ = env.reset()
            state = preprocess_observation(state)
            episode_reward = 0.0

        if step_count % EVAL_EVERY == 0:
            eval_rewards = _evaluate_greedy(EVAL_EPISODES)
            avg_eval = float(np.mean(eval_rewards))
            print(f"*** Eval at step {step_count}: avg reward over {EVAL_EPISODES} eps = {avg_eval:.2f}")

            ckpt = {
                "policy_state_dict": policy_net.state_dict(),
                "target_state_dict": target_net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "step": step_count,
                "episode": episode_idx
            }
            torch.save(ckpt, os.path.join(SAVE_PATH, f"checkpoint_{step_count}.pth"))
            print(f"Checkpoint guardado en {SAVE_PATH}/checkpoint_{step_count}.pth")

            # Evaluation used the shared training env and left it finished.
            # Start a fresh training episode so `state` matches the emulator
            # and no transition is recorded across the evaluation boundary.
            state, _ = env.reset()
            state = preprocess_observation(state)
            episode_reward = 0.0

    torch.save(policy_net.state_dict(), os.path.join(SAVE_PATH, "dqn_final.pth"))
    print("Entrenamiento terminado. Modelo guardado en:", os.path.join(SAVE_PATH, "dqn_final.pth"))

    _plot_rewards(episode_rewards, mean_rewards)
    return True


trained = train()

Rellenando replay buffer con acciones aleatorias...
Replay buffer listo: 10000 transiciones. Comenzando entrenamiento...
Episode 10 | Step 9661 | Episodic reward: -21.0 | eps=0.953 | mean100=-19.60 | replay=19661
[Paso 10000] Copiado a target network.
Episode 20 | Step 18625 | Episodic reward: -20.0 | eps=0.911 | mean100=-19.95 | replay=28625
[Paso 20000] Copiado a target network.
Episode 30 | Step 27837 | Episodic reward: -21.0 | eps=0.870 | mean100=-20.17 | replay=37837
[Paso 30000] Copiado a target network.
Episode 40 | Step 37000 | Episodic reward: -21.0 | eps=0.831 | mean100=-20.18 | replay=47000
[Paso 40000] Copiado a target network.
Episode 50 | Step 46157 | Episodic reward: -21.0 | eps=0.794 | mean100=-20.18 | replay=56157
[Paso 50000] Copiado a target network.
Episode 60 | Step 54782 | Episodic reward: -20.0 | eps=0.760 | mean100=-20.23 | replay=64782
[Paso 60000] Copiado a target network.
Episode 70 | Step 64146 | Episodic reward: -20.0 | eps=0.726 | mean100=-20.17 | replay=7

In [None]:
from PIL import Image

def watch_agent(env, model, max_steps=10000):
    """Roll out one greedy episode and collect the rendered frames.

    Args:
        env: environment exposing reset(), step(), render() and close().
        model: callable mapping a (1, ...) float32 observation tensor to a
            (1, n_actions) tensor of action values.
        max_steps: upper bound on the number of environment steps.

    Returns:
        List with one frame per step taken; NumPy frames are converted to PIL
        images, anything else is stored as returned by ``env.render()``.

    Side effects:
        Prints the accumulated reward and closes ``env`` before returning, so
        the environment cannot be reused afterwards.
    """
    # Place observations on the device the model actually lives on instead of
    # assuming the global `device`; otherwise a CPU-resident model fails with
    # a device mismatch on a CUDA machine. Fall back to `device` for models
    # without parameters.
    params = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(iter(params), None)
    model_device = device if first_param is None else first_param.device

    state, _ = env.reset()
    total_reward = 0
    images = []

    for _ in range(max_steps):
        state_tensor = (
            torch.tensor(state, dtype=torch.float32)
            .unsqueeze(0)
            .to(model_device)
        )

        with torch.no_grad():
            q_values = model(state_tensor)
        action = torch.argmax(q_values, dim=1).item()

        # Capture the frame the agent is acting on, before stepping.
        frame = env.render()
        if isinstance(frame, np.ndarray):
            frame = Image.fromarray(frame)
        images.append(frame)

        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward

        if terminated or truncated:
            break

    env.close()
    print("Total reward:", total_reward)
    return images

# Checkpoint used when no training run finished in this session. It defaults
# to a file inside SAVE_PATH instead of a machine-specific absolute path; set
# the DQN_CHECKPOINT environment variable to load a checkpoint stored elsewhere.
CHECKPOINT_PATH = os.environ.get(
    "DQN_CHECKPOINT", os.path.join(SAVE_PATH, "checkpoint_best.pth")
)

if not trained:
    # Load a previously trained policy. The network is moved to `device` so
    # it sits on the same device as the tensors it will be fed.
    policy_net = DQN(4, env.action_space.n).to(device)
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
    policy_net.load_state_dict(checkpoint["policy_state_dict"])

# Generate frames (note: watch_agent closes `env` when it finishes).
images = watch_agent(env, policy_net)

# Save as GIF
gif_file = "./output.gif"
if images:
    images[0].save(
        gif_file,
        save_all=True,
        append_images=images[1:],
        duration=60,
        loop=0
    )
    print(f"Saved GIF to {gif_file}")
else:
    # Nothing was rendered (e.g. max_steps=0); indexing images[0] would raise.
    print("No frames were captured; GIF not written.")


Total reward: 21.0
Saved GIF to ./output.gif
