<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Deep_Reinforcement_Learning_with_Proximal_Policy_Optimization_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    """Two independent MLP heads: a categorical policy and a state-value estimate.

    Args:
        input_dim: Size of the observation vector fed to both heads.
        action_dim: Number of discrete actions (width of the policy output).
        hidden_dim: Width of the single hidden layer in each head. Defaults
            to 128, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Policy head: the trailing Softmax turns logits into probabilities
        # over the last dimension.
        self.actor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        )
        # Value head: one scalar per input state.
        self.critic = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        """Return ``(action_probs, state_value)`` for ``x``.

        ``action_probs`` has a trailing dimension of ``action_dim`` and
        ``state_value`` a trailing dimension of 1; leading dimensions of
        ``x`` are preserved.
        """
        action_probs = self.actor(x)
        state_value = self.critic(x)
        return action_probs, state_value

# Hyperparameters
# Discount factor passed to compute_advantages.
gamma = 0.99
# PPO clip range: probability ratios are clamped to [1 - eps, 1 + eps].
clip_epsilon = 0.2
# Weight of the value (critic) loss in the total loss.
critic_coef = 0.5
# Weight of the entropy term in the total loss.
entropy_coef = 0.01
# Learning rate for the Adam optimizer below.
lr = 3e-4
# Number of episodes the training loop runs.
num_episodes = 1000
# Number of optimisation passes over each collected episode.
update_epochs = 4
# Maximum number of transitions per minibatch during the PPO update.
batch_size = 64

# Environment setup
env = gym.make("CartPole-v1")
# First axis of the observation shape is used as the network input width.
input_dim = env.observation_space.shape[0]
# `.n` is read as the number of actions (presumably a Discrete action space).
action_dim = env.action_space.n

# A single optimizer updates both the actor and critic parameters.
model = ActorCritic(input_dim, action_dim)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Storage for trajectories
class RolloutBuffer:
    """Plain container of parallel per-step lists for one rollout.

    Each attribute named in ``_FIELDS`` is a list; index ``t`` across all of
    them describes the same environment step.
    """

    _FIELDS = ("states", "actions", "logprobs", "rewards", "dones", "values")

    def __init__(self):
        self.clear()

    def clear(self):
        """Rebind every field to a fresh empty list.

        Lists are replaced rather than emptied in place, so references taken
        before the call keep their old contents.
        """
        for field_name in self._FIELDS:
            setattr(self, field_name, [])


buffer = RolloutBuffer()

# Advantage calculation
def compute_advantages(rewards, dones, values, next_value, gamma, gae_lambda=0.95):
    """Compute Generalized Advantage Estimation (GAE) advantages and returns.

    For each step ``t`` (walking backwards through the rollout)::

        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * gae_lambda * (1 - done_t) * A_{t+1}

    where ``V(s_{t+1})`` for the last step is ``next_value``. The previous
    implementation folded ``-value`` into the running accumulator, which
    subtracted every later value estimate again with discounting, and it
    never used ``next_value`` at all.

    Args:
        rewards: Per-step rewards.
        dones: Per-step episode-end flags; a truthy flag stops both the
            bootstrap and the accumulation from crossing that step.
        values: Per-step value estimates ``V(s_t)`` as plain numbers.
        next_value: Value estimate for the state after the final step.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor. ``1.0`` gives the discounted
            return minus the baseline, ``0.0`` the one-step TD error.

    Returns:
        ``(advantages, returns)`` as float32 tensors, one entry per step,
        with ``returns = advantages + values``.
    """
    advantages = []
    gae = 0.0
    following_value = next_value
    for reward, done, value in zip(reversed(rewards), reversed(dones), reversed(values)):
        not_done = 0.0 if done else 1.0
        delta = reward + gamma * following_value * not_done - value
        gae = delta + gamma * gae_lambda * not_done * gae
        # Append and reverse once at the end; insert(0, ...) is O(n) per step.
        advantages.append(gae)
        following_value = value
    advantages.reverse()
    returns = [a + v for a, v in zip(advantages, values)]
    # Pin float32 so the result matches the network outputs even when the
    # inputs are float64 scalars.
    return (
        torch.tensor(advantages, dtype=torch.float32),
        torch.tensor(returns, dtype=torch.float32),
    )

# Training loop
# Each iteration collects one episode (capped at 200 steps) with the current
# policy, then runs `update_epochs` passes of clipped-surrogate PPO updates
# over that episode in minibatches of `batch_size`.
for episode in range(num_episodes):
    # gym < 0.26 returns only the observation from reset(); gym >= 0.26
    # returns an (observation, info) tuple. Accept either form.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    buffer.clear()
    episode_reward = 0

    for t in range(200):  # Limit steps per episode
        state_tensor = torch.tensor(state, dtype=torch.float32)

        # Sample action. Rollout quantities are only ever used as constants
        # (the old log-prob was detached, the value stored via .item()), so
        # skip building autograd graphs for them.
        with torch.no_grad():
            action_probs, value = model(state_tensor)
            dist = Categorical(action_probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        # Step environment; handle both the 4-tuple (old) and 5-tuple (new)
        # step signatures.
        step_result = env.step(action.item())
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        episode_reward += reward

        # Store experience
        buffer.states.append(state)
        buffer.actions.append(action)
        buffer.logprobs.append(log_prob)
        buffer.rewards.append(reward)
        buffer.dones.append(done)
        buffer.values.append(value.item())

        state = next_state
        if done:
            break

    # Compute advantages and returns
    with torch.no_grad():
        _, next_value = model(torch.tensor(next_state, dtype=torch.float32))
    advantages, returns = compute_advantages(
        buffer.rewards, buffer.dones, buffer.values, next_value.item(), gamma
    )

    # Build the episode tensors once instead of re-converting Python lists
    # for every minibatch of every epoch.
    all_states = torch.stack(
        [torch.tensor(s, dtype=torch.float32) for s in buffer.states]
    )
    all_actions = torch.stack(buffer.actions)
    all_logprobs = torch.stack(buffer.logprobs)  # graph-free: made under no_grad

    # PPO Update
    for _ in range(update_epochs):
        for i in range(0, len(buffer.states), batch_size):
            # Batch samples
            batch = slice(i, i + batch_size)
            states = all_states[batch]
            actions = all_actions[batch]
            logprobs = all_logprobs[batch]
            advs = advantages[batch]
            rets = returns[batch]

            # Forward pass
            new_action_probs, new_values = model(states)
            new_dist = Categorical(new_action_probs)
            new_logprobs = new_dist.log_prob(actions)

            # Policy ratio
            ratios = torch.exp(new_logprobs - logprobs)

            # Clipped objective
            clipped_advs = torch.clamp(ratios, 1 - clip_epsilon, 1 + clip_epsilon) * advs
            policy_loss = -torch.min(ratios * advs, clipped_advs).mean()

            # Value function loss. squeeze(-1) drops only the trailing
            # size-1 dimension, so a one-sample minibatch keeps shape (1,)
            # and matches `rets`.
            value_loss = nn.functional.mse_loss(new_values.squeeze(-1), rets)

            # Entropy for exploration (negated: minimising the loss
            # maximises entropy)
            entropy_loss = -new_dist.entropy().mean()

            # Total loss
            loss = policy_loss + critic_coef * value_loss + entropy_coef * entropy_loss

            # Optimize model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    print(f"Episode {episode + 1}, Reward: {episode_reward}")

env.close()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

class ActorCritic(nn.Module):
    """Policy network and value network that share nothing but their input.

    ``actor`` maps an observation to action probabilities (softmax over the
    last dimension); ``critic`` maps it to a single value estimate.
    """

    def __init__(self, input_dim, action_dim):
        super().__init__()
        hidden_units = 128

        # Actor layers are created before critic layers so parameter
        # initialisation order is unchanged.
        policy_layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            nn.Softmax(dim=-1),
        ]
        value_layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.actor = nn.Sequential(*policy_layers)
        self.critic = nn.Sequential(*value_layers)

    def forward(self, x):
        """Return the pair ``(action_probs, state_value)`` for input ``x``."""
        return self.actor(x), self.critic(x)

# Environment setup with new API enabled
# `new_step_api` is a gym 0.25.x-only keyword. gym >= 0.26 removed it (the
# 5-tuple step signature became the default) and rejects the unknown keyword
# with a TypeError, so fall back to a plain make() there.
try:
    env = gym.make('CartPole-v1', new_step_api=True)
except TypeError:
    env = gym.make('CartPole-v1')

# NOTE: this builds a fresh model; nothing in this cell trains it or loads
# weights, so the rollout below runs an untrained policy. The optimizer is
# created but not used in this cell.
model = ActorCritic(input_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# reset() returns a bare observation on gym 0.25.x (even with
# new_step_api=True) and an (observation, info) tuple on gym >= 0.26, so
# unpacking unconditionally fails on the former.
reset_result = env.reset()
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
for t in range(200):
    state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    # Inference only: no gradients are needed for a greedy rollout.
    with torch.no_grad():
        action_probs, _ = model(state_tensor)
    # Greedy action: index of the highest-probability action.
    action = torch.argmax(action_probs).item()

    # Step environment; tolerate the legacy 4-tuple as well as the 5-tuple.
    step_result = env.step(action)
    if len(step_result) == 5:
        next_state, reward, terminated, truncated, _ = step_result
        done = terminated or truncated  # Combine flags for done
    else:
        next_state, reward, done, _ = step_result

    if done:
        break

    state = next_state

env.close()