#Setup

Install Dependencies

In [4]:
!pip install gymnasium



Import dependencies

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import time
import matplotlib.pyplot as plt


#Preperation

Define the Policy Network

In [2]:
class Policy(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

Create the environment, instantiate the policy network and define the optimizer

In [3]:
# Create the environment
env = gym.make('CartPole-v1',render_mode="rgb_array")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Initialize the policy network
policy = Policy(state_dim, action_dim)

# Define the optimizer
optimizer = optim.Adam(policy.parameters(), lr=0.01)

#Algorithm

Pick an action based on policy

In [4]:
def select_action(state):
    state = np.array(state)
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

Policy Gradiant Algorithm, the actual training loop

In [8]:
def policy_gradient():
    num_episodes = 750
    gamma = 0.99

    rewards_per_episode = []  # List to store rewards for each episode

    # for 1000 episodes
    for episode in range(num_episodes):
        observations = env.reset()
        state = np.array(observations[0])
        episode_reward = 0
        log_probs = []
        rewards = []

        # loop through each time step in one episode
        while True:
            action, log_prob = select_action(state)
            next_state, reward, done, _, _ = env.step(action)

            log_probs.append(log_prob)
            rewards.append(reward)
            episode_reward += reward

            if done:
                break

            state = next_state


        # Compute the discounted rewards
        discounts = [gamma**i for i in range(len(rewards))]
        discounted_rewards = [discount * reward for discount, reward in zip(discounts, rewards)]
        
        rewards_per_episode.append(discounted_rewards)
        
        # Convert the discounted_rewards into a Tensor
        discounted_rewards = torch.Tensor(discounted_rewards)

        # Normalize the discounted rewards
        discounted_rewards -= torch.mean(discounted_rewards)
        discounted_rewards /= torch.std(discounted_rewards)

        # Calculate the loss
        policy_loss = []
        for log_prob, reward in zip(log_probs, discounted_rewards):
            policy_loss.append(-log_prob * reward)
        policy_loss = torch.cat(policy_loss).sum()

        # Update the policy network
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        # Print the episode statistics
        if episode % 10 == 0:
            print('Episode {}: reward = {}'.format(episode, episode_reward))
    
    
    # Plot the rewards per episode
    plt.plot(rewards_per_episode)
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('Reward per Episode')
    plt.show()

#Run Trials

In [9]:
policy_gradient()

Episode 0: reward = 87.0
Episode 10: reward = 56.0
Episode 20: reward = 83.0
Episode 30: reward = 86.0
Episode 40: reward = 146.0
Episode 50: reward = 201.0
Episode 60: reward = 125.0
Episode 70: reward = 116.0
Episode 80: reward = 181.0
Episode 90: reward = 104.0
Episode 100: reward = 198.0
Episode 110: reward = 143.0
Episode 120: reward = 109.0
Episode 130: reward = 259.0
Episode 140: reward = 171.0
Episode 150: reward = 134.0
Episode 160: reward = 93.0
Episode 170: reward = 116.0
Episode 180: reward = 216.0
Episode 190: reward = 766.0
Episode 200: reward = 233.0
Episode 210: reward = 204.0
Episode 220: reward = 216.0
Episode 230: reward = 121.0
Episode 240: reward = 89.0
Episode 250: reward = 102.0
Episode 260: reward = 96.0
Episode 270: reward = 109.0
Episode 280: reward = 91.0
Episode 290: reward = 94.0
Episode 300: reward = 122.0
Episode 310: reward = 175.0
Episode 320: reward = 196.0
Episode 330: reward = 144.0
Episode 340: reward = 113.0
Episode 350: reward = 107.0
Episode 360:

KeyboardInterrupt: 