<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL)_with_Proximal_Policy_Optimization_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class PolicyNetwork(nn.Module):
    """Single-hidden-layer MLP that maps a state to action probabilities.

    Args:
        input_dim: Size of the last dimension of the input state tensor.
        action_dim: Number of output entries (one probability per action).
    """

    def __init__(self, input_dim, action_dim):
        super().__init__()
        hidden_units = 128
        stack = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            # Softmax over the last axis so the outputs sum to 1.
            nn.Softmax(dim=-1),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the softmax-normalised action probabilities for ``x``."""
        action_probs = self.fc(x)
        return action_probs

class ValueNetwork(nn.Module):
    """Single-hidden-layer MLP that maps a state to one scalar output.

    Args:
        input_dim: Size of the last dimension of the input state tensor.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 128
        stack = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for ``x``; the last dimension has size 1."""
        state_value = self.fc(x)
        return state_value

# Environment whose observation/action spaces size the two networks.
env = gym.make('CartPole-v1')
obs_dim = env.observation_space.shape[0]
num_actions = env.action_space.n

# The policy network is built before the value network (same order as
# before) so that parameter initialisation draws from the RNG identically.
policy_net = PolicyNetwork(obs_dim, num_actions)
value_net = ValueNetwork(obs_dim)

# One Adam optimizer per network, both with the same learning rate.
optimizer_policy = optim.Adam(policy_net.parameters(), lr=1e-3)
optimizer_value = optim.Adam(value_net.parameters(), lr=1e-3)

# Discount factor used when computing advantages.
gamma = 0.99
# Half-width of the interval [1 - eps, 1 + eps] the probability ratio is
# clamped to in the policy loss.
clip_epsilon = 0.2

def compute_advantages(rewards, values, next_value, gamma, lam=0.95):
    """Compute Generalized Advantage Estimation (GAE) for one trajectory.

    Walks the trajectory backwards and applies the recurrence::

        delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        A_t     = delta_t + gamma * lam * A_{t+1}

    Args:
        rewards: Per-step rewards, in time order.
        values: Per-step value estimates V(s_t), aligned with ``rewards``.
            If the two sequences differ in length, the extra leading
            entries of the longer one are ignored (``zip`` over the
            reversed sequences).
        next_value: Value estimate for the state following the last step.
            No terminal flag is handled here, so the caller should pass 0
            when the trajectory ended in a terminal state.
        gamma: Discount factor.
        lam: GAE smoothing factor. Defaults to 0.95, the value that was
            previously hard-coded.

    Returns:
        A list of advantages in time order, one per step. Elements have
        whatever numeric type the arithmetic on the inputs produces
        (e.g. floats for float inputs, tensors for tensor inputs).
    """
    reversed_advantages = []
    advantage = 0.0
    for r, v in zip(reversed(rewards), reversed(values)):
        td_error = r + gamma * next_value - v
        advantage = td_error + gamma * lam * advantage
        # Append and reverse once at the end: O(n) overall, instead of the
        # O(n^2) cost of inserting at the front of the list on every step.
        reversed_advantages.append(advantage)
        next_value = v
    reversed_advantages.reverse()
    return reversed_advantages

# Number of gradient passes over each collected episode. With a single pass
# the new/old probability ratio is always exactly 1, so the clipped
# objective would never take effect.
ppo_epochs = 4

# Main loop: collect one full episode with the current policy, then update
# the policy and value networks on it with the clipped PPO objective.
for episode in range(1000):
    # gym >= 0.26 (and gymnasium) return (observation, info) from reset();
    # older gym versions return the observation alone.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    states, actions, rewards, values, log_probs = [], [], [], [], []
    done = False
    terminated = False
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        # Rollout statistics are constants for the update below, so no
        # autograd graph is recorded while acting.
        with torch.no_grad():
            dist = Categorical(policy_net(state_tensor))
            value = value_net(state_tensor)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        result = env.step(action.item())

        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            done = terminated or truncated
        else:
            next_state, reward, done, info = result
            # Old API: the TimeLimit wrapper marks time-limit cut-offs in
            # info, which lets us tell them apart from real terminations.
            terminated = done and not info.get("TimeLimit.truncated", False)

        states.append(state_tensor)
        actions.append(action)
        rewards.append(reward)
        values.append(value.item())
        log_probs.append(log_prob)

        state = next_state

    # Bootstrap from the value network only when the episode was cut off;
    # a true terminal state has no future return.
    if terminated:
        next_value = 0.0
    else:
        with torch.no_grad():
            next_value = value_net(torch.tensor(state, dtype=torch.float32)).item()

    advantages = torch.tensor(
        compute_advantages(rewards, values, next_value, gamma),
        dtype=torch.float32,
    )
    # Value targets use the un-normalised advantages.
    returns = advantages + torch.tensor(values, dtype=torch.float32)
    # std() of a single element is NaN, so one-step episodes are left as is.
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

    batch_states = torch.stack(states)
    batch_actions = torch.stack(actions)
    # Recorded under no_grad, so these act as fixed "old policy" values.
    # If they carried gradients, the gradient of exp(new - old) would
    # cancel to zero and the policy would never change.
    old_log_probs = torch.stack(log_probs)

    for _ in range(ppo_epochs):
        new_log_probs = Categorical(policy_net(batch_states)).log_prob(batch_actions)

        # ratio, advantages and returns are all 1-D of length N; keeping
        # them the same shape avoids an accidental (N, N) broadcast.
        ratio = torch.exp(new_log_probs - old_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = nn.MSELoss()(value_net(batch_states).squeeze(-1), returns)

        optimizer_policy.zero_grad()
        optimizer_value.zero_grad()

        policy_loss.backward()
        value_loss.backward()

        optimizer_policy.step()
        optimizer_value.step()

    # Losses reported are those of the final update pass for this episode.
    print(f"Episode {episode+1}, Policy Loss: {policy_loss.item()}, Value Loss: {value_loss.item()}")