In [1]:
# Import required libraries

import argparse
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
#Set constants for training
seed = 543
log_interval = 10
gamma = 0.99

env = gym.make('CartPole-v1')
env.reset(seed=seed)
torch.manual_seed(seed)

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [3]:
env = gym.make('CartPole-v1')
env.reset(seed=seed)
torch.manual_seed(seed)


SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [4]:
class Policy(nn.Module):
    """
    Implements the policy network for REINFORCE with baseline.
    """
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.action_head = nn.Linear(128, 2)  # Actor's layer

        # Action and reward buffer
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """
        Forward pass of the policy network.
        """
        x = F.relu(self.affine1(x))
        action_prob = F.softmax(self.action_head(x), dim=-1)  # Actor: Action probabilities
        return action_prob

In [5]:
model = Policy()
optimizer = optim.Adam(model.parameters(), lr=3e-2)
eps = np.finfo(np.float32).eps.item()

In [6]:
def select_action(state):
    """
    Selects an action from the policy network given the current state.
    """
    state = torch.from_numpy(state).float()
    probs = model(state)

    # Create a categorical distribution over the list of probabilities of actions
    m = Categorical(probs)

    # Sample an action using the distribution
    action = m.sample()

    # Return the action to take (left or right)
    return action.item()

In [7]:
def finish_episode():
    """
    Performs the policy update at the end of an episode.
    """
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []  # List to save actor (policy) loss

    # Calculate the returns and advantages
    returns = []
    for r in model.rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    # Compute policy loss and update policy parameters
    for log_prob, R in zip(saved_actions, returns):
        policy_losses.append(-log_prob * R)

    # Reset gradients
    optimizer.zero_grad()

    # Sum up all the values of policy_losses
    loss = torch.stack(policy_losses).sum()

    # Perform backpropagation and optimization
    loss.backward()
    optimizer.step()

    # Reset rewards and action buffer for the next episode
    del model.rewards[:]
    del model.saved_actions[:]

In [8]:
def train():
    running_reward = 10  # Initialize running reward

    # Run infinitely many episodes
    for i_episode in range(2000):

        # Reset environment and episode reward
        state = env.reset()
        ep_reward = 0

        # For each episode, only run 9999 steps to avoid infinite loop
        for t in range(1, 10000):

            # Select action from policy
            action = select_action(state)

            # Take the action
            state, reward, done, _ = env.step(action)

            # Save rewards for this episode
            model.rewards.append(reward)
            ep_reward += reward

            if done:
                break

        # Update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Perform policy update
        finish_episode()

        # Log results
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))

        # Check if the problem is solved
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

In [9]:
train()

TypeError: expected np.ndarray (got tuple)