<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_with_Proximal_Policy_Optimization_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  # Ensure this is imported
import gym
import numpy as np

class PolicyNetwork(nn.Module):
    """Feed-forward policy head producing a probability for each action.

    Architecture: ``input_dim -> 128 -> 64 -> output_dim`` with ReLU after the
    two hidden layers and a softmax over the last dimension of the output.
    """

    def __init__(self, input_dim, output_dim):
        """Create the three linear layers.

        Args:
            input_dim: Size of the last dimension of the input tensor.
            output_dim: Number of entries in the returned distribution.
        """
        super().__init__()
        # Layers are built in fc1, fc2, fc3 order and keep these attribute
        # names so seeded initialisation and state_dict keys stay the same.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=output_dim)

    def forward(self, x):
        """Return action probabilities for ``x`` (softmax over the last dim)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim=-1)

class ValueNetwork(nn.Module):
    """Feed-forward critic that maps an input to a single scalar estimate.

    Architecture: ``input_dim -> 128 -> 64 -> 1`` with ReLU after the two
    hidden layers and no activation on the output.
    """

    def __init__(self, input_dim):
        """Create the three linear layers.

        Args:
            input_dim: Size of the last dimension of the input tensor.
        """
        super().__init__()
        # Layers are built in fc1, fc2, fc3 order and keep these attribute
        # names so seeded initialisation and state_dict keys stay the same.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        estimate = self.fc3(hidden)
        return estimate

class PPOAgent:
    """Actor-critic agent trained with the PPO clipped surrogate objective.

    Holds a categorical policy network and a state-value network, each with
    its own Adam optimizer, sized from the spaces of the given environment.
    """

    def __init__(self, env):
        """Build the networks and optimizers for ``env``.

        Args:
            env: Environment exposing ``observation_space.shape[0]`` (network
                input size) and ``action_space.n`` (number of discrete
                actions).
        """
        self.env = env
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n).to(self.device)
        self.value = ValueNetwork(env.observation_space.shape[0]).to(self.device)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=1e-3)
        self.value_optimizer = optim.Adam(self.value.parameters(), lr=1e-3)

    def select_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: A single observation; it is converted to a float32 tensor
                and given a leading batch dimension of 1.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python
            number and its log-probability as a tensor. The tensor is still
            attached to the policy's autograd graph; callers that only store
            the value should call ``.item()`` on it.
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        probs = self.policy(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def compute_advantages(self, rewards, values, gamma=0.99, lam=0.95):
        """Compute generalized advantage estimates for one episode.

        Args:
            rewards: Per-step rewards, in time order.
            values: Per-step value estimates, same length as ``rewards``.
            gamma: Discount factor.
            lam: GAE smoothing factor.

        Returns:
            A float64 NumPy array with one advantage per step. The value
            following the final step is taken to be 0.
        """
        # Force a float dtype: np.zeros_like would inherit an integer dtype
        # from integer rewards and truncate every stored advantage.
        advantages = np.zeros_like(rewards, dtype=np.float64)
        num_steps = len(rewards)
        last_adv = 0.0
        for t in reversed(range(num_steps)):
            next_value = values[t + 1] if t + 1 < num_steps else 0
            delta = rewards[t] + gamma * next_value - values[t]
            last_adv = delta + gamma * lam * last_adv
            advantages[t] = last_adv
        return advantages

    def update(self, trajectories, clip_eps=0.2, epochs=1):
        """Run PPO policy and value updates on one batch of experience.

        Args:
            trajectories: Tuple ``(states, actions, log_probs, returns,
                advantages)`` of equal-length sequences; ``log_probs`` are
                the log-probabilities recorded when the actions were sampled.
            clip_eps: Half-width of the probability-ratio clip range; the
                ratio is clamped to ``[1 - clip_eps, 1 + clip_eps]``.
            epochs: Number of gradient steps taken on this batch. With the
                default of 1 the ratio is evaluated before the policy has
                changed, so larger values are needed for clipping to matter.
        """
        states, actions, log_probs, returns, advantages = trajectories
        if len(states) == 0:
            # An empty batch would give NaN losses and corrupt the weights.
            return

        states = torch.tensor(np.array(states), dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions).to(self.device)
        old_log_probs = torch.tensor(log_probs, dtype=torch.float32).to(self.device)
        returns = torch.tensor(np.array(returns), dtype=torch.float32).to(self.device)
        advantages = torch.tensor(np.array(advantages), dtype=torch.float32).to(self.device)

        for _ in range(epochs):
            # One batched forward pass instead of a per-sample Python loop.
            dist = torch.distributions.Categorical(self.policy(states))
            new_log_probs = dist.log_prob(actions)

            ratio = torch.exp(new_log_probs - old_log_probs)
            unclipped = ratio * advantages
            clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
            # Pessimistic (element-wise minimum) surrogate, negated for descent.
            policy_loss = -torch.min(unclipped, clipped).mean()

            value_loss = F.mse_loss(self.value(states).squeeze(-1), returns)

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

# Training script: collect one full CartPole episode per iteration, then run a
# single PPO update on it.
# NOTE(review): the `new_step_api` keyword and the single-value `env.reset()`
# return used below appear to be specific to one gym release line -- confirm
# the installed gym version before running.
env = gym.make('CartPole-v1', new_step_api=True)
agent = PPOAgent(env)

for episode in range(1000):
    state = env.reset()
    done = False
    states, actions, rewards, log_probs, values = [], [], [], [], []

    while not done:
        action, log_prob = agent.select_action(state)
        next_state, reward, done, truncated, _ = env.step(action)  # Updated unpacking
        # Truncation is folded into `done`, so a truncated episode is treated
        # as terminal below (no value bootstrap for the final step).
        done = done or truncated
        # Only the scalar estimate is stored, so skip building an autograd graph.
        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(agent.device)
            value = agent.value(state_tensor).item()

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        log_probs.append(log_prob.item())
        values.append(value)

        state = next_state

    # Discounted returns, accumulated backwards from the end of the episode.
    # Explicit float dtype so integer rewards cannot truncate the returns.
    returns = np.zeros_like(rewards, dtype=np.float64)
    R = 0.0
    for t in reversed(range(len(rewards))):
        R = rewards[t] + 0.99 * R
        returns[t] = R

    advantages = agent.compute_advantages(rewards, values)

    agent.update((states, actions, log_probs, returns, advantages))