<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Proximal_Policy_Optimization_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    """Small MLP policy that turns a state vector into action probabilities.

    Architecture: ``Linear(input_dim, 64) -> ReLU -> Linear(64, action_dim)``,
    followed by a softmax over the last dimension.
    """

    def __init__(self, input_dim, action_dim):
        """Create the layers.

        Args:
            input_dim: Number of features in each input vector.
            action_dim: Number of outputs, one per discrete action.
        """
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 64)
        output_layer = nn.Linear(64, action_dim)
        # Stored under the attribute name ``fc`` so state_dict keys
        # (fc.0.*, fc.2.*) stay stable.
        self.fc = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return action probabilities for ``x``; the last dimension sums to 1."""
        logits = self.fc(x)
        return logits.softmax(dim=-1)

def compute_ppo_loss(policy_net, old_probs, actions, states, rewards, epsilon=0.2):
    """Compute the PPO clipped-surrogate loss for a batch of samples.

    The value returned is
    ``-mean(min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A))`` where ``r``
    is the ratio between the current and the old probability of each taken
    action and ``A`` is the per-sample weight given in ``rewards``.

    Args:
        policy_net: Callable mapping ``states`` to a 2-D tensor of action
            probabilities (one row per sample, one column per action).
        old_probs: Action probabilities recorded before the update, laid out
            like the output of ``policy_net``. Treated as a constant: no
            gradient flows back into it.
        actions: 1-D integer tensor holding the index of the action taken
            for each sample.
        states: Batch of states, passed unchanged to ``policy_net``.
        rewards: Per-sample weight multiplied with the probability ratio.
            NOTE(review): PPO normally uses advantage estimates here;
            confirm what callers pass.
        epsilon: Half-width of the clipping interval around 1.

    Returns:
        Scalar tensor containing the loss to minimise.
    """
    # Floor for the denominator so a zero old probability cannot yield
    # inf/NaN; probabilities above the floor are left untouched.
    min_prob = 1e-8

    # torch.gather needs int64 indices; build the (batch, 1) index once and
    # reuse it for both policies.
    action_index = actions.long().unsqueeze(1)

    new_action_probs = policy_net(states).gather(1, action_index).squeeze(1)

    # The old policy is fixed data: detach it so backward() only updates the
    # current policy.
    old_action_probs = old_probs.detach().gather(1, action_index).squeeze(1)
    old_action_probs = old_action_probs.clamp_min(min_prob)

    prob_ratio = new_action_probs / old_action_probs
    clipped_ratio = torch.clamp(prob_ratio, 1 - epsilon, 1 + epsilon)

    # Pessimistic (element-wise minimum) surrogate, negated so that
    # minimising the loss maximises the objective.
    surrogate = torch.min(prob_ratio * rewards, clipped_ratio * rewards)
    return -surrogate.mean()

# Demo: one PPO optimisation step on randomly generated data
input_dim = 4  # Size of each state vector
action_dim = 2  # Number of discrete actions
policy_net = PolicyNetwork(input_dim, action_dim)
optimizer = optim.Adam(params=policy_net.parameters(), lr=1e-3)

# Synthetic batch of 32 samples; each old_probs row is normalised by softmax
states = torch.randn(32, input_dim)
old_probs = torch.softmax(torch.randn(32, action_dim), dim=-1)
actions = torch.randint(low=0, high=action_dim, size=(32,))
rewards = torch.randn(32)

# Clear gradients, evaluate the loss, back-propagate, and update the weights
optimizer.zero_grad()
loss = compute_ppo_loss(
    policy_net=policy_net,
    old_probs=old_probs,
    actions=actions,
    states=states,
    rewards=rewards,
)
loss.backward()
optimizer.step()
print(f'PPO Training Loss: {loss.item():.4f}')