In [None]:
# Import necessary libraries
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import gym

import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

# --- Hyperparameters -------------------------------------------------------
lr = 0.005          # optimizer step size
gamma = 0.995       # discount factor applied to future rewards

# --- Run configuration -----------------------------------------------------
episodes = 1_000             # upper bound on the number of training episodes
env_name = 'CartPole-v1'     # id of the Gym environment to train on

In [None]:
# Define a neural network to model the policy
class NeuralNetwork(nn.Module):
    """Two-layer policy network that maps observations to action probabilities.

    Args:
        env: Environment whose ``observation_space.shape[0]`` gives the input
            size and whose ``action_space.n`` gives the number of actions.
    """

    def __init__(self, env):
        super().__init__()

        # One hidden layer of 64 units, then one output unit per action
        self.fc1 = nn.Linear(env.observation_space.shape[0], 64)
        self.fc2 = nn.Linear(64, env.action_space.n)

    def forward(self, x):
        """Return the action probabilities for one state or a batch of states.

        Args:
            x: Observation(s) as a tensor or anything ``torch.tensor`` accepts
                (list, NumPy array, ...). The last dimension must match the
                input size of ``fc1``.

        Returns:
            Float tensor with the same leading dimensions as ``x`` and one
            entry per action along the last dimension; that dimension sums to 1.
        """
        if torch.is_tensor(x):
            # Cast in place of torch.tensor(tensor), which copies and warns
            x = x.to(torch.float)
        else:
            x = torch.tensor(x, dtype=torch.float)

        # Hidden layer with ReLU activation
        x = F.relu(self.fc1(x))

        # Softmax over the action dimension (last axis) so that both a single
        # state and a batch of states yield valid per-state distributions
        return F.softmax(self.fc2(x), dim=-1)

# Instantiate the environment, then build a policy network sized to match it
env = gym.make(env_name)
network = NeuralNetwork(env)

# Adam updates the policy network's parameters during training
optim = torch.optim.Adam(params=network.parameters(), lr=lr)


In this code, we define a class NeuralNetwork that represents the policy model for the agent. The NeuralNetwork class inherits from nn.Module, which is the base class for all neural network models in PyTorch.

The `__init__` method of the NeuralNetwork class takes an env argument, which represents the environment that the agent is interacting with. This method initializes the base class using `super().__init__()`, and then creates two fully-connected layers. The first (hidden) layer maps the observation vector to 64 units, while the second (output) layer has a number of units equal to the number of actions in the environment's action space.

The `forward` method of the NeuralNetwork class takes an input x, converts it to a float tensor, and passes it through the hidden layer followed by a ReLU activation. The final layer applies a softmax activation, which outputs probabilities for each action in the environment's action space.

After defining the NeuralNetwork class, we create an instance of the environment using `gym.make(env_name)`. We then create an instance of the NeuralNetwork class, passing the environment as an argument.

Finally, we use the Adam optimizer from the torch.optim module to optimize the parameters of the NeuralNetwork instance. This optimizer will be used to update the network's weights, and thereby the action probabilities the policy outputs, based on the rewards the agent receives while interacting with the environment.

In [None]:
# deque provides a fixed-length sliding window
from collections import deque

# Per-episode training history (rewards and losses)
train_rewards, train_loss = [], []

# Rolling window holding the rewards of the 100 most recent episodes
recent_rewards = deque(maxlen=100)

# Iterate over the number of episodes
for episode in range(episodes):
    # Reset the environment and initialize empty lists for actions, states, and rewards
    state, _ = env.reset()
    Actions, States, Rewards = [], [], []

    # Roll out a single episode (at most 1000 steps) with the current policy
    for _ in range(1000):
        # The update phase below recomputes the probabilities with gradients
        # enabled, so no autograd graph is needed while sampling
        with torch.no_grad():
            probs = network(state)

        # Sample an action from the distribution
        action = Categorical(probs).sample().item()

        # Take the action in the environment and get the new state, reward, and done flag
        new_state, rew, done, trunc, _ = env.step(action)

        # Save the action, state, and reward for later
        Actions.append(torch.tensor(action, dtype=torch.int))
        States.append(state)
        Rewards.append(rew)

        # Update the current state with the new state
        state = new_state

        # If the episode is done or the time limit is reached, stop the rollout
        if done or trunc:
            break

    # Discounted return for every step, G_t = r_t + gamma * G_{t+1},
    # accumulated backwards in one O(n) pass over the episode
    DiscountedReturns = []
    G = 0.0
    for r in reversed(Rewards):
        G = r + gamma * G
        DiscountedReturns.append(G)
    DiscountedReturns.reverse()

    # Policy-gradient update: one optimizer step per recorded transition
    episode_losses = []
    for State, Action, G in zip(States, Actions, DiscountedReturns):
        # Get the probabilities for the recorded state
        probs = network(State)

        # Loss is the negative log-probability of the chosen action,
        # weighted by the discounted return that followed it
        loss = -Categorical(probs).log_prob(Action) * G

        # Clear the gradients, backpropagate the loss, and update the network parameters
        optim.zero_grad()
        loss.backward()
        optim.step()

        episode_losses.append(loss.item())

    # Save the total reward for the episode and append it to the recent rewards queue
    train_rewards.append(np.sum(Rewards))
    recent_rewards.append(train_rewards[-1])

    # Record the mean per-step loss of this episode
    train_loss.append(np.mean(episode_losses))

    # Print the mean recent reward every 50 episodes
    if episode % 50 == 0:
        print(f"Episode {episode:>6}: \tR:{np.mean(recent_rewards):>6.3f}")

    # If the mean recent reward is greater than 200, stop training
    if np.mean(recent_rewards) > 200:
        break

In [None]:
# Plot the total reward of every training episode with a smoothed trend line
fig, ax = plt.subplots()

ax.plot(train_rewards, label='Episode reward')
ax.plot(gaussian_filter1d(train_rewards, sigma=20), linewidth=4, label='Smoothed (sigma=20)')
ax.set_title('Rewards')
ax.set_xlabel('Episode')
ax.set_ylabel('Total reward')
ax.legend()

# plt.show() works with notebook backends; Figure.show() can warn on
# non-GUI backends because there is no window to raise
plt.show()

In [None]:
# Create a rendering environment. With render_mode="human" Gym draws each
# frame itself on reset() and step(), so no explicit env.render() calls are
# needed.
# NOTE: this rebinds `env`, and the environment is closed at the end of the
# cell; re-create it with gym.make(env_name) before training again.
env = gym.make(env_name, render_mode="human")

try:
    # Run the trained agent for five episodes, with each episode lasting for a
    # maximum of 1000 steps.
    for _ in range(5):
        # Initialize empty list for rewards for the current episode
        Rewards = []

        # Reset the environment and get the initial state
        state, _ = env.reset()
        done = False

        # Run the agent for 1000 steps or until the episode ends
        for _ in range(1000):
            # Inference only: no gradients are needed to pick an action
            with torch.no_grad():
                probs = network(state)

            # Sample an action from the resulting distribution
            c = torch.distributions.Categorical(probs=probs)
            action = c.sample().item()

            # Take the action and get the new state, reward, and done flag from
            # the environment
            new_state, reward, done, trunc, _ = env.step(action)

            # Update the current state for the next iteration
            state = new_state

            # Add the reward to the total rewards for the current episode
            Rewards.append(reward)

            # Stop when the episode terminates or is truncated by the time limit
            if done or trunc:
                break

        # Print the total rewards for the current episode
        print(f'Reward: {sum(Rewards)}')
finally:
    # Always release the render window, even if the rollout fails or is interrupted
    env.close()