In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# ------- Entorno SingleStepABEnv (igual que antes) -------
class SingleStepABEnv(gym.Env):
    def __init__(self):
        super(SingleStepABEnv, self).__init__()
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.array([0.0], dtype=np.float32)
        self.done = False
        info = {}
        return self.state, info

    def step(self, action):
        if self.done:
            raise RuntimeError("Episodio ya terminó. Llama reset().")
        if action == 0:  # A
            reward = 10.0 if np.random.rand() < 0.75 else 0.0
        else:            # B
            reward = 10.0 if np.random.rand() < 0.25 else 0.0
        self.done = True
        return self.state, reward, self.done, False, {}

# ------- Red Actor-Crítico -------
class ActorCriticNet(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_size)
        self.actor = nn.Linear(hidden_size, act_dim)
        self.critic = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        logits = self.actor(x)
        value = self.critic(x)
        return logits, value

# ------- Función para calcular returns y ventajas en 1-step -------
def compute_returns_and_advantages(rewards, values, gamma=1.0):
    returns = []
    advantages = []
    for r, v in zip(rewards, values):
        G = r  # un solo paso
        A = G - v
        returns.append(G)
        advantages.append(A)
    return returns, advantages

def ppo_train(env, policy_net, optimizer, epochs=10, episodes_per_epoch=500, gamma=1.0, epsilon=0.2, entropy_coef=0.1, eps_greedy=0.1):
    for epoch in range(epochs):
        states, actions, rewards, log_probs_old, values_old = [], [], [], [], []

        # Recolectar datos
        for _ in range(episodes_per_epoch):
            state, _ = env.reset()
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            # Forward
            logits, value = policy_net(state_tensor)
            dist = Categorical(logits=logits)

            # Epsilon-greedy
            if np.random.rand() < eps_greedy:
                action = np.random.randint(env.action_space.n)
                log_prob = dist.log_prob(torch.tensor(action))
            else:
                action = dist.sample().item()
                log_prob = dist.log_prob(torch.tensor(action))

            # Step
            next_state, reward, done, truncated, info = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            log_probs_old.append(log_prob.item())
            values_old.append(value.item())

        returns, advantages = compute_returns_and_advantages(rewards, values_old, gamma=gamma)

        # Convertir a tensores
        states_tensor       = torch.tensor(states,      dtype=torch.float32)
        actions_tensor      = torch.tensor(actions,     dtype=torch.long)
        old_log_probs_tensor = torch.tensor(log_probs_old, dtype=torch.float32)
        returns_tensor      = torch.tensor(returns,     dtype=torch.float32)
        advantages_tensor   = torch.tensor(advantages,  dtype=torch.float32)

        # Forward en batch
        logits, values = policy_net(states_tensor)
        dist = Categorical(logits=logits)
        log_probs = dist.log_prob(actions_tensor)

        ratio = torch.exp(log_probs - old_log_probs_tensor)
        ratio_clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)

        policy_loss_1 = ratio         * advantages_tensor
        policy_loss_2 = ratio_clipped * advantages_tensor
        policy_loss   = -torch.mean(torch.min(policy_loss_1, policy_loss_2))

        value_loss = torch.mean((values.squeeze() - returns_tensor)**2)
        entropy = dist.entropy().mean()

        # Aumentamos el peso de la entropía (entropy_coef) -> fomenta exploración
        loss = policy_loss + 0.5 * value_loss - entropy_coef * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_reward = np.mean(rewards)
        print(f"Epoch {epoch+1}/{epochs}  Loss={loss.item():.3f}  PolicyLoss={policy_loss.item():.3f}  ValueLoss={value_loss.item():.3f}  Entropy={entropy.item():.3f}  AvgReward={avg_reward:.2f}")

In [2]:
def main():
    env = SingleStepABEnv()
    obs_dim = env.observation_space.shape[0] # 1
    act_dim = env.action_space.n            # 2

    policy_net = ActorCriticNet(obs_dim, act_dim, hidden_size=32)
    optimizer = optim.Adam(policy_net.parameters(), lr=5e-4)  # Reducimos un poco LR

    ppo_train(env, policy_net, optimizer,
              epochs=1000,           # más epochs si lo deseas
              episodes_per_epoch=500,
              gamma=1.0, epsilon=0.2,
              entropy_coef=0.1,    # subimos la entropía
              eps_greedy=0.1)      # Forzamos 10% de exploración aleatoria

    # Evaluación
    policy_net.eval()
    test_rewards = []
    n_test = 1000
    for _ in range(n_test):
        state, _ = env.reset()
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            logits, value = policy_net(state_tensor)
            dist = Categorical(logits=logits)
            action = dist.sample().item()
        _, reward, _, _, _ = env.step(action)
        test_rewards.append(reward)

    print(f"\nEvaluación final: Recompensa media en {n_test} episodios = {np.mean(test_rewards):.2f}")

In [3]:
main()

  states_tensor       = torch.tensor(states,      dtype=torch.float32)


Epoch 1/1000  Loss=18.416  PolicyLoss=-4.607  ValueLoss=46.181  Entropy=0.680  AvgReward=4.80
Epoch 2/1000  Loss=18.170  PolicyLoss=-4.542  ValueLoss=45.560  Entropy=0.681  AvgReward=4.74
Epoch 3/1000  Loss=18.077  PolicyLoss=-4.517  ValueLoss=45.325  Entropy=0.682  AvgReward=4.72
Epoch 4/1000  Loss=17.226  PolicyLoss=-4.292  ValueLoss=43.174  Entropy=0.682  AvgReward=4.50
Epoch 5/1000  Loss=18.195  PolicyLoss=-4.547  ValueLoss=45.622  Entropy=0.683  AvgReward=4.76
Epoch 6/1000  Loss=17.951  PolicyLoss=-4.483  ValueLoss=45.005  Entropy=0.683  AvgReward=4.70
Epoch 7/1000  Loss=19.219  PolicyLoss=-4.818  ValueLoss=48.211  Entropy=0.684  AvgReward=5.04
Epoch 8/1000  Loss=17.541  PolicyLoss=-4.373  ValueLoss=43.964  Entropy=0.685  AvgReward=4.60
Epoch 9/1000  Loss=17.675  PolicyLoss=-4.408  ValueLoss=44.304  Entropy=0.685  AvgReward=4.64
Epoch 10/1000  Loss=17.282  PolicyLoss=-4.304  ValueLoss=43.309  Entropy=0.686  AvgReward=4.54
Epoch 11/1000  Loss=18.770  PolicyLoss=-4.699  ValueLoss=47