<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Policy_Gradient_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

class PolicyNetwork(nn.Module):
    """Linear softmax policy: one affine layer followed by a softmax.

    Args:
        input_dim: Size of the last dimension of the input (state features).
        output_dim: Number of outputs (one probability per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # A single affine map produces the unnormalised action scores.
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return probabilities obtained by a softmax over the last axis."""
        logits = self.fc(x)
        return logits.softmax(dim=-1)

def train_policy_gradient(env, policy_net, optimizer, gamma=0.99, episodes=1000):
    """Train ``policy_net`` with the REINFORCE (Monte Carlo policy gradient) rule.

    For every episode the policy is rolled out until the environment signals
    the end of the episode, then one optimizer step is taken on
    ``-sum_t log pi(a_t | s_t) * G_t`` where ``G_t`` is the discounted return
    collected from step ``t`` onward (reward-to-go).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            classic Gym API (``reset() -> obs``, ``step -> (obs, reward,
            done, info)``) and the newer one (``reset() -> (obs, info)``,
            ``step -> (obs, reward, terminated, truncated, info)``) are
            accepted.
        policy_net: Callable mapping a 1-D float state tensor to a 1-D tensor
            of action probabilities.
        optimizer: Optimizer over the parameters of ``policy_net``.
        gamma: Discount factor applied to future rewards.
        episodes: Number of episodes to run.

    Returns:
        None. Progress is printed every 100 episodes.
    """
    for episode in range(episodes):
        reset_result = env.reset()
        # Newer Gym/Gymnasium versions return ``(observation, info)`` from
        # ``reset``; older versions return the observation alone.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        log_probs = []
        rewards = []
        done = False

        while not done:
            state = torch.as_tensor(state, dtype=torch.float32)
            action_probs = policy_net(state)
            action = torch.multinomial(action_probs, 1).item()
            log_probs.append(torch.log(action_probs[action]))

            step_result = env.step(action)
            if len(step_result) == 5:
                # New API: the episode ends on termination OR truncation.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            rewards.append(reward)
            state = next_state

        total_reward = sum(rewards)

        # Reward-to-go: G_t = r_t + gamma * G_{t+1}, accumulated backwards.
        # Weighting each log-prob by only its own step's reward would never
        # credit an action for rewards it causes later in the episode.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = float(reward) + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        stacked_log_probs = torch.stack(log_probs)
        returns_tensor = torch.as_tensor(
            returns,
            dtype=stacked_log_probs.dtype,
            device=stacked_log_probs.device,
        )
        # Negated because the optimizer minimises while we want to maximise
        # the expected return.
        loss = -(stacked_log_probs * returns_tensor).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f'Episode {episode}, Total Reward: {total_reward}')

# Example usage: build a CartPole environment, size the policy from its
# observation/action spaces, and run the training loop with default settings.
env = gym.make('CartPole-v1')

policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)

train_policy_gradient(env=env, policy_net=policy_net, optimizer=optimizer)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

class PolicyNetwork(nn.Module):
    """Minimal policy model: a linear layer whose outputs are softmax-normalised.

    Args:
        input_dim: Number of input features per state.
        output_dim: Number of action probabilities to produce.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The only learnable layer; maps state features to action scores.
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Map ``x`` to probabilities that sum to one along the last axis."""
        scores = self.fc(x)
        return nn.functional.softmax(scores, dim=-1)

def train_policy_gradient(env, policy_net, optimizer, gamma=0.99, episodes=1000):
    """Train ``policy_net`` with the REINFORCE (Monte Carlo policy gradient) rule.

    For every episode the policy is rolled out until the environment signals
    the end of the episode, then one optimizer step is taken on
    ``-sum_t log pi(a_t | s_t) * G_t`` where ``G_t`` is the discounted return
    collected from step ``t`` onward (reward-to-go).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            newer Gym API (``reset() -> (obs, info)``, ``step -> (obs,
            reward, terminated, truncated, info)``) and the classic one
            (``reset() -> obs``, ``step -> (obs, reward, done, info)``) are
            accepted.
        policy_net: Callable mapping a 1-D float state tensor to a 1-D tensor
            of action probabilities.
        optimizer: Optimizer over the parameters of ``policy_net``.
        gamma: Discount factor applied to future rewards.
        episodes: Number of episodes to run.

    Returns:
        None. Progress is printed every 100 episodes.
    """
    for episode in range(episodes):
        reset_result = env.reset()
        # ``reset`` returns ``(observation, info)`` in newer Gym/Gymnasium
        # versions and the bare observation in older ones.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        log_probs = []
        rewards = []
        done = False

        while not done:
            state = torch.as_tensor(state, dtype=torch.float32)
            action_probs = policy_net(state)
            action = torch.multinomial(action_probs, 1).item()
            log_probs.append(torch.log(action_probs[action]))

            step_result = env.step(action)
            if len(step_result) == 5:
                # The episode ends on termination OR truncation (time limit).
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                # Classic 4-tuple API with a single ``done`` flag.
                next_state, reward, done, _ = step_result

            rewards.append(reward)
            state = next_state

        total_reward = sum(rewards)

        # Reward-to-go: G_t = r_t + gamma * G_{t+1}, accumulated backwards.
        # Weighting each log-prob by only its own step's reward would never
        # credit an action for rewards it causes later in the episode.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = float(reward) + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        stacked_log_probs = torch.stack(log_probs)
        returns_tensor = torch.as_tensor(
            returns,
            dtype=stacked_log_probs.dtype,
            device=stacked_log_probs.device,
        )
        # Negated because the optimizer minimises while we want to maximise
        # the expected return.
        loss = -(stacked_log_probs * returns_tensor).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f'Episode {episode}, Total Reward: {total_reward}')

# Example usage: create CartPole with the five-value step API enabled, size
# the policy from the environment's spaces, and train with default settings.
env = gym.make('CartPole-v1', new_step_api=True)

policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)

train_policy_gradient(env=env, policy_net=policy_net, optimizer=optimizer)