<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Reinforcement_Learning_(RL)_with_Policy_Gradient_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

class PolicyNetwork(nn.Module):
    """Small MLP that turns a state vector into action probabilities.

    The network is ``Linear(state_dim, 128) -> ReLU -> Linear(128, action_dim)``
    followed by a softmax over the last dimension, so each output row sums
    to one.
    """

    def __init__(self, state_dim, action_dim):
        """Build the layer stack.

        Args:
            state_dim: Size of the input state vector.
            action_dim: Number of outputs (one probability per action).
        """
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            nn.Softmax(dim=-1),
        ]
        # Kept under the name ``fc`` so parameter names in ``state_dict``
        # stay ``fc.<index>.*``.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax-normalised action probabilities for ``x``."""
        action_probs = self.fc(x)
        return action_probs

def train_policy_gradient(env, policy_net, optimizer, num_episodes=1000):
    for episode in range(num_episodes):
        state, _ = env.reset()  # Adjusted for new Gym API
        rewards = []
        log_probs = []
        done = False

        while not done:
            state = torch.FloatTensor(state)
            action_probs = policy_net(state)
            action = torch.multinomial(action_probs, 1).item()
            log_prob = torch.log(action_probs[action])
            log_probs.append(log_prob)

            step_result = env.step(action)
            if len(step_result) == 5:  # New API
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:  # Old API
                next_state, reward, done, _ = step_result

            rewards.append(reward)
            state = next_state

        # Compute the discounted rewards
        discounted_rewards = []
        R = 0
        for r in reversed(rewards):
            R = r + 0.99 * R
            discounted_rewards.insert(0, R)

        discounted_rewards = torch.FloatTensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)

        loss = 0
        for log_prob, reward in zip(log_probs, discounted_rewards):
            loss -= log_prob * reward

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Create the environment without the `new_step_api` flag: that keyword only
# existed in Gym 0.25.x and is rejected by later releases, whose reset() already
# returns the (observation, info) pair the training loop unpacks.
env = gym.make('CartPole-v1')
try:
    policy_net = PolicyNetwork(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
    optimizer = optim.Adam(policy_net.parameters(), lr=1e-2)
    train_policy_gradient(env, policy_net, optimizer)
finally:
    # Release the environment's resources even if training raises.
    env.close()