<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStardust/blob/main/Reinforcement_Learning_with_Policy_Gradient_(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# Define the neural network policy
class PolicyNetwork(nn.Module):
    """Small MLP policy that turns a state vector into action probabilities.

    The network is two hidden layers of 128 ReLU units followed by a linear
    output layer; the output is normalised with a softmax over the last
    dimension, so each row of the result sums to 1.

    Args:
        state_dim: Size of the input feature dimension.
        action_dim: Number of outputs (one probability per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in input-to-output order; attribute names are
        # part of the state-dict layout and must stay stable.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return softmax probabilities over the last dimension of ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

# Proximal Policy Optimization (PPO) training loop
def _reset_env(env):
    """Reset ``env`` and return only the observation, for old and new Gym APIs."""
    result = env.reset()
    # Gym >= 0.26 / Gymnasium return ``(observation, info)``; older Gym
    # returns the bare observation.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done)`` for either Gym API."""
    result = env.step(action)
    if len(result) == 5:
        # New API: episode ends on either termination or truncation.
        observation, reward, terminated, truncated, _ = result
        return observation, float(reward), bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, float(reward), bool(done)


def _as_state_tensor(observation):
    """Convert one observation into a float32 tensor with a leading batch axis."""
    return torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)


def _discounted_returns(rewards, dones, gamma):
    """Return the discounted reward-to-go for each step, restarting at episode ends."""
    returns = []
    running = 0.0
    for reward, done in zip(reversed(rewards), reversed(dones)):
        if done:
            # Nothing after a terminal step belongs to this episode.
            running = 0.0
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train_ppo(env, policy, n_steps=1000, gamma=0.99, clip_eps=0.2, lr=1e-3,
              *, rollout_steps=128, update_epochs=4):
    """Train ``policy`` in place with a clipped-surrogate (PPO-style) objective.

    The loop alternates between collecting a short rollout with the current
    policy and running several gradient epochs on that rollout. Because the
    policy has no value head, the advantage of each step is its discounted
    return-to-go, normalised over the rollout. Returns are cut at the rollout
    boundary without bootstrapping.

    Args:
        env: Gym-style environment with a discrete action space. Both the
            classic API (``reset() -> obs``, 4-tuple ``step``) and the newer
            API (``reset() -> (obs, info)``, 5-tuple ``step``) are accepted.
        policy: Module mapping a ``(batch, state_dim)`` float tensor to
            per-action probabilities of shape ``(batch, n_actions)``.
        n_steps: Total number of environment steps to take.
        gamma: Discount factor for the return-to-go.
        clip_eps: Half-width of the PPO probability-ratio clipping interval.
        lr: Adam learning rate.
        rollout_steps: Maximum number of environment steps collected between
            policy updates.
        update_epochs: Number of gradient steps taken on each rollout.

    Returns:
        None. ``policy`` is updated in place.

    Raises:
        ValueError: If ``rollout_steps`` or ``update_epochs`` is less than 1.
    """
    if rollout_steps < 1:
        raise ValueError("rollout_steps must be at least 1")
    if update_epochs < 1:
        raise ValueError("update_epochs must be at least 1")

    optimizer = optim.Adam(policy.parameters(), lr=lr)
    state = _as_state_tensor(_reset_env(env))

    for start in range(0, n_steps, rollout_steps):
        horizon = min(rollout_steps, n_steps - start)
        states, actions, old_log_probs, rewards, dones = [], [], [], [], []

        # --- Collect a rollout with the current (soon to be "old") policy ---
        for _ in range(horizon):
            with torch.no_grad():
                action_probs = policy(state)
                action = action_probs.multinomial(num_samples=1)
                # Clamp before the log so a zero probability cannot yield -inf.
                log_prob = action_probs.gather(1, action).clamp_min(1e-8).log()

            next_obs, reward, done = _step_env(env, action.item())

            states.append(state)
            actions.append(action)
            old_log_probs.append(log_prob)
            rewards.append(reward)
            dones.append(done)

            state = _as_state_tensor(_reset_env(env) if done else next_obs)

        batch_states = torch.cat(states)
        batch_actions = torch.cat(actions)
        batch_old_log_probs = torch.cat(old_log_probs).squeeze(1)

        advantages = torch.tensor(
            _discounted_returns(rewards, dones, gamma), dtype=torch.float32
        )
        if advantages.numel() > 1:
            # Normalising acts as a simple baseline and keeps the step size
            # independent of the reward scale.
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # --- Several clipped-surrogate updates on the same rollout ---
        for _ in range(update_epochs):
            new_probs = policy(batch_states).gather(1, batch_actions).squeeze(1)
            ratio = torch.exp(new_probs.clamp_min(1e-8).log() - batch_old_log_probs)

            unclipped = ratio * advantages
            clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
            loss = -torch.min(unclipped, clipped).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# Example usage: train the policy on the CartPole balancing task.
env = gym.make("CartPole-v1")

# Size the network from the environment: one input per observation
# component and one output per discrete action.
observation_size = env.observation_space.shape[0]
action_count = env.action_space.n
policy = PolicyNetwork(observation_size, action_count)

train_ppo(env, policy)