<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Proximal_Policy_Optimization_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Define the policy network
class PolicyNet(nn.Module):
    """One-hidden-layer MLP that turns a state into action probabilities."""

    def __init__(self, state_dim, action_dim):
        """Build the network.

        Args:
            state_dim: Size of the input state vector.
            action_dim: Number of outputs; the final softmax normalises
                over this last dimension.
        """
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            nn.Softmax(dim=-1),
        ]
        # Stored as `fc` so parameter names (fc.0.*, fc.2.*) stay the same.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax output of the network for input ``x``."""
        probabilities = self.fc(x)
        return probabilities

# Define the value network
class ValueNet(nn.Module):
    """One-hidden-layer MLP that maps a state to a single scalar output."""

    def __init__(self, state_dim):
        """Build the network.

        Args:
            state_dim: Size of the input state vector.
        """
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        # Stored as `fc` so parameter names (fc.0.*, fc.2.*) stay the same.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``; the last dimension has size 1."""
        value = self.fc(x)
        return value

# Environment and hyperparameters
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]  # length of the observation vector
action_dim = env.action_space.n  # size of the action space; used as the policy's output width
lr = 1e-3  # learning rate shared by both Adam optimizers below
gamma = 0.99  # discount factor applied in compute_returns
epsilon = 0.2  # clipping range for the probability ratio in the policy loss
epochs = 3  # number of policy update passes per collected episode
batch_size = 32  # NOTE(review): not referenced anywhere in the visible code

# Initialize networks and optimizers.
# The policy and value functions are separate networks, each with its own
# optimizer, so their updates do not share parameters or optimizer state.
policy_net = PolicyNet(state_dim, action_dim)
value_net = ValueNet(state_dim)
optimizer_policy = optim.Adam(policy_net.parameters(), lr=lr)
optimizer_value = optim.Adam(value_net.parameters(), lr=lr)

def compute_returns(rewards, dones, next_value, discount=None):
    """Compute the discounted return for every step of one trajectory.

    Walks the trajectory backwards, accumulating
    ``R = reward + discount * R * (1 - done)`` so that a truthy ``done``
    flag prevents anything after that step from leaking into its return.

    Args:
        rewards: Per-step rewards in chronological order.
        dones: Per-step flags aligned with ``rewards``.
        next_value: Bootstrap value for the state following the final step.
        discount: Discount factor. When omitted, the module-level ``gamma``
            is used.

    Returns:
        A list with one return per step, in chronological order.
    """
    if discount is None:
        discount = gamma
    returns = []
    R = next_value
    for reward, done in zip(reversed(rewards), reversed(dones)):
        R = reward + discount * R * (1 - done)
        # Append and reverse once at the end; inserting at the front of a
        # list on every step is quadratic in the episode length.
        returns.append(R)
    returns.reverse()
    return returns

# Training loop
#
# Each iteration collects one full episode with the current policy, fits the
# value network to the discounted returns of that episode, and then runs
# `epochs` clipped-surrogate updates on the policy using the same data.
for episode in range(1000):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the bare observation.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    states, actions, rewards, dones = [], [], [], []
    total_reward = 0
    done = False

    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        # Sampling needs no autograd graph; log-probabilities are recomputed
        # in one batch after the episode.
        with torch.no_grad():
            action = Categorical(policy_net(state_tensor)).sample()

        # gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older releases return (obs, reward, done, info).
        step_result = env.step(action.item())
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            next_state, reward, terminated, _ = step_result
            truncated = False
        done = terminated or truncated
        total_reward += reward

        states.append(state_tensor)
        actions.append(action)
        rewards.append(reward)
        # Only a real terminal state should zero the bootstrap value; a
        # time-limit truncation (when the API reports it separately) does not.
        dones.append(bool(terminated))

        state = next_state

    state_batch = torch.stack(states)
    action_batch = torch.stack(actions)

    # Value of the state after the last transition; compute_returns multiplies
    # it by zero when that transition was flagged as done.
    with torch.no_grad():
        next_value = value_net(torch.tensor(state, dtype=torch.float32)).item()
    returns = compute_returns(rewards, dones, next_value)
    returns = torch.tensor(returns, dtype=torch.float32)

    # Update value network.
    # squeeze(-1) drops only the trailing size-1 dimension, so predictions and
    # returns keep matching shapes even for a one-step episode.
    value_loss = nn.functional.mse_loss(value_net(state_batch).squeeze(-1), returns)
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # Update policy network.
    # Advantages and the old log-probabilities are fixed for all epochs. The
    # old log-probabilities must be a gradient-free snapshot taken before any
    # policy step: if they are recomputed from the live policy each epoch, the
    # ratio is identically 1 and its gradient is exactly zero.
    with torch.no_grad():
        advantages = returns - value_net(state_batch).squeeze(-1)
        old_log_probs = Categorical(policy_net(state_batch)).log_prob(action_batch)

    for _ in range(epochs):
        new_log_probs = Categorical(policy_net(state_batch)).log_prob(action_batch)
        ratio = torch.exp(new_log_probs - old_log_probs)
        clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
        policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()

        optimizer_policy.zero_grad()
        policy_loss.backward()
        optimizer_policy.step()

    print(f"Episode {episode}, Total Reward: {total_reward}")

print("Training complete.")