In [26]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Define the policy network
class PolicyNetwork(nn.Module):
    """Two-layer fully connected policy that outputs action probabilities.

    Args:
        state_size: Width of the input state vector.
        action_size: Number of discrete actions (width of the output).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, action_size)

    def forward(self, x):
        """Map states to a probability distribution over actions.

        Args:
            x: Float tensor whose last dimension has size ``state_size``.
                Both a batch ``(batch, state_size)`` and a single unbatched
                state ``(state_size,)`` are accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``action_size`` that sums to 1.
        """
        hidden = torch.relu(self.fc1(x))
        # Normalise over the last (action) axis instead of a hard-coded
        # dim=1: identical for 2-D batches, but also valid for unbatched 1-D
        # input, which dim=1 rejects with an IndexError.
        return torch.softmax(self.fc2(hidden), dim=-1)

# Function to select action
def select_action(policy_net, state):
    """Sample one action from the policy for a single observation.

    Args:
        policy_net: Callable that maps a ``(1, state_size)`` float tensor to
            a ``(1, action_size)`` tensor of action probabilities.
        state: NumPy array holding one observation.

    Returns:
        A ``(action, log_prob)`` pair: the sampled action as a Python int and
        its log-probability as a one-element tensor that keeps the autograd
        graph.
    """
    # Convert to float32 and add a leading batch axis of size one.
    observation = torch.from_numpy(state).float()
    batch = observation.unsqueeze(0)

    action_dist = Categorical(policy_net(batch))
    chosen = action_dist.sample()
    return chosen.item(), action_dist.log_prob(chosen)

# Function to compute discounted rewards
def compute_discounted_rewards(rewards, gamma=0.99):
    """Compute the discounted return for every step of one episode.

    The return at step ``t`` is ``rewards[t] + gamma * return[t + 1]``, with
    the return after the final step taken to be 0.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied to future rewards.

    Returns:
        List with one discounted return per reward, in the same order as
        ``rewards``. An empty input yields an empty list.
    """
    discounted_rewards = []
    R = 0
    # Accumulate from the last step backwards, appending as we go, then flip
    # once at the end. Appending is O(1); inserting at index 0 on every step
    # would make the whole pass quadratic in the episode length.
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.append(R)
    discounted_rewards.reverse()
    return discounted_rewards

# Main training loop
def train_policy_gradient(env, policy_net, optimizer, num_episodes=1000, gamma=0.99):
    """Train ``policy_net`` with a REINFORCE-style policy-gradient update.

    Each episode is rolled out to completion, the per-step discounted returns
    are standardised, and a single optimiser step is taken on
    ``-sum(log_prob * return)``. One progress line is printed per episode.

    Args:
        env: Gym-style environment. ``reset()`` may return either the
            observation or an ``(observation, info)`` tuple; ``step()`` may
            return either the 5-tuple
            ``(obs, reward, terminated, truncated, info)`` or the older
            4-tuple ``(obs, reward, done, info)``.
        policy_net: Network passed to ``select_action`` to choose actions.
        optimizer: Optimiser already bound to ``policy_net``'s parameters.
        num_episodes: Number of episodes to run.
        gamma: Discount factor forwarded to ``compute_discounted_rewards``.

    Returns:
        None. ``policy_net`` is updated in place.
    """
    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):
            state = state[0]  # Handle (observation, info) returned by env.reset()

        log_probs = []
        rewards = []
        total_reward = 0

        done = False
        truncated = False
        while not (done or truncated):
            action, log_prob = select_action(policy_net, state)

            # Accept both step() conventions, mirroring the tolerance that
            # reset() handling above already has.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, done, truncated, _ = step_result
            else:
                next_state, reward, done, _ = step_result
            if isinstance(next_state, tuple):
                next_state = next_state[0]  # Handle tuple returned by env.step()

            log_probs.append(log_prob)
            rewards.append(reward)
            total_reward += reward
            state = next_state

        returns = torch.tensor(
            compute_discounted_rewards(rewards, gamma), dtype=torch.float32
        )
        centered = returns - returns.mean()
        if returns.numel() > 1:
            returns = centered / (returns.std() + 1e-9)
        else:
            # The sample std of a single value is NaN. Dividing by it would
            # turn the loss, the gradients and then every weight into NaN, so
            # a one-step episode only gets centred (a zero update).
            returns = centered

        # Each log_prob is a one-element tensor; concatenate them and weight
        # by the matching return in a single vectorised expression.
        policy_loss = -(torch.cat(log_probs) * returns).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}, Total Loss: {policy_loss:.4f}")

In [27]:
# Seed the sources of randomness so a fresh "Restart & Run All" repeats the
# same training run (network initialisation, action sampling, and the
# environment's initial states).
SEED = 0
torch.manual_seed(SEED)

# Initialize environment, policy network, and optimizer
env = gym.make('CartPole-v1')
# Seeding on one reset is expected to carry over to the unseeded reset()
# calls made during training — NOTE(review): confirm for the installed gym.
env.reset(seed=SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
policy_net = PolicyNetwork(state_size, action_size)
optimizer = optim.Adam(policy_net.parameters(), lr=0.01)

# Train the policy gradient agent
train_policy_gradient(env, policy_net, optimizer)


Episode 1/1000, Total Reward: 30.0, Total Loss: 0.2857
Episode 2/1000, Total Reward: 41.0, Total Loss: -0.2600
Episode 3/1000, Total Reward: 25.0, Total Loss: 1.1561
Episode 4/1000, Total Reward: 19.0, Total Loss: 0.4861
Episode 5/1000, Total Reward: 14.0, Total Loss: 0.5826
Episode 6/1000, Total Reward: 33.0, Total Loss: -1.1324
Episode 7/1000, Total Reward: 15.0, Total Loss: -0.5757
Episode 8/1000, Total Reward: 16.0, Total Loss: 0.8058
Episode 9/1000, Total Reward: 12.0, Total Loss: 1.9251
Episode 10/1000, Total Reward: 10.0, Total Loss: -0.4776
Episode 11/1000, Total Reward: 18.0, Total Loss: 3.1433
Episode 12/1000, Total Reward: 37.0, Total Loss: 7.8965
Episode 13/1000, Total Reward: 30.0, Total Loss: 8.0176
Episode 14/1000, Total Reward: 52.0, Total Loss: 0.6334
Episode 15/1000, Total Reward: 45.0, Total Loss: -0.4117
Episode 16/1000, Total Reward: 43.0, Total Loss: 8.2262
Episode 17/1000, Total Reward: 67.0, Total Loss: 0.1068
Episode 18/1000, Total Reward: 13.0, Total Loss: 2.9

KeyboardInterrupt: 