# REINFORCE with a Value-Function Baseline for CartPole-v1

In [None]:
# Import required libraries

import argparse
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

## Environment Parameters

In [None]:
# Layer sizes read by the policy and value networks when they are built
num_i_nodes, num_h_nodes, num_o_nodes = 4, 64, 2

# Training constants: discount factor, logging period (in episodes), RNG seed
gamma = 0.99
log_interval = 10
seed = 9474

# Record pairing an action's log-probability with a state-value estimate
SavedAction = namedtuple('SavedAction', ('log_prob', 'value'))

# Create the environment, then seed both it and PyTorch
env = gym.make('CartPole-v1')
env.reset(seed=seed)
torch.manual_seed(seed)

## Neural Networks for Policy and Value Function

### Policy Neural Network

In [None]:
class Policy(nn.Module):
    """
    Implements the policy network for REINFORCE with baseline.

    The network is one hidden linear layer with ReLU followed by a linear
    action head whose outputs are passed through a softmax.

    Args:
        num_inputs: Number of input features. When omitted, the module-level
            ``num_i_nodes`` is used.
        num_hidden: Width of the hidden layer. When omitted, the module-level
            ``num_h_nodes`` is used.
        num_actions: Number of output actions. When omitted, the module-level
            ``num_o_nodes`` is used.
    """
    def __init__(self, num_inputs=None, num_hidden=None, num_actions=None):
        super().__init__()

        # Fall back to the module-level sizes at construction time (not at
        # definition time) so code that rebinds those globals before calling
        # Policy() keeps working.
        if num_inputs is None:
            num_inputs = num_i_nodes
        if num_hidden is None:
            num_hidden = num_h_nodes
        if num_actions is None:
            num_actions = num_o_nodes

        self.affine1 = nn.Linear(num_inputs, num_hidden)
        self.action_head = nn.Linear(num_hidden, num_actions)  # Output layer for actions

        # Initialize the weights
        self.init_weights()

        # Per-episode buffers: select_action() appends to saved_actions,
        # train() appends to rewards, and finish_episode() empties both.
        self.saved_actions = []
        self.rewards = []

    def init_weights(self):
        """Apply Kaiming-normal weights and zero biases to both linear layers."""
        nn.init.kaiming_normal_(self.affine1.weight, nonlinearity='relu')
        nn.init.constant_(self.affine1.bias, 0)
        nn.init.kaiming_normal_(self.action_head.weight, nonlinearity='relu')
        nn.init.constant_(self.action_head.bias, 0)

    def forward(self, x):
        """
        Forward pass of the policy network.

        Args:
            x: Input tensor whose last dimension matches the input layer.

        Returns:
            Tensor of action probabilities (softmax over the last dimension).
        """
        x = F.relu(self.affine1(x))
        action_scores = F.softmax(self.action_head(x), dim=-1)  # Actor: Action probabilities
        return action_scores

### Value Function Neural Network

In [None]:
# Create NN for value function
class ValueFunction(nn.Module):
    """
    Implements the value function network for REINFORCE with baseline.

    The network is one hidden linear layer with ReLU followed by a linear
    head that produces a single value per input.

    Args:
        num_inputs: Number of input features. When omitted, the module-level
            ``num_i_nodes`` is used.
        num_hidden: Width of the hidden layer. When omitted, the module-level
            ``num_h_nodes`` is used.
    """
    def __init__(self, num_inputs=None, num_hidden=None):
        super().__init__()

        # Fall back to the module-level sizes at construction time (not at
        # definition time) so code that rebinds those globals before calling
        # ValueFunction() keeps working.
        if num_inputs is None:
            num_inputs = num_i_nodes
        if num_hidden is None:
            num_hidden = num_h_nodes

        self.affine1 = nn.Linear(num_inputs, num_hidden)
        self.value_head = nn.Linear(num_hidden, 1)    # Output layer for value function

        # Initialize the weights
        self.init_weights()

        # Per-episode buffer of state-value estimates: train() appends to it
        # and finish_episode() reads and then empties it.
        self.state_values = []

    def init_weights(self):
        """Apply Kaiming-normal weights and zero biases to both linear layers."""
        nn.init.kaiming_normal_(self.affine1.weight, nonlinearity='relu')
        nn.init.constant_(self.affine1.bias, 0)
        nn.init.kaiming_normal_(self.value_head.weight, nonlinearity='relu')
        nn.init.constant_(self.value_head.bias, 0)

    def forward(self, x):
        """
        Forward pass of the value function network.

        Args:
            x: Input tensor whose last dimension matches the input layer.

        Returns:
            Tensor with a trailing dimension of size 1 holding the value estimate.
        """
        x = F.relu(self.affine1(x))
        state_value = self.value_head(x)
        return state_value

## Object Definitions

In [None]:
# float32 machine epsilon as a plain Python float
eps = float(np.finfo(np.float32).eps)

# Policy and value networks, each driven by its own Adam optimizer
model, value_model = Policy(), ValueFunction()
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)
value_optimizer = optim.Adam(params=value_model.parameters(), lr=1e-2)

## Training Functions

### Action

In [None]:
def select_action(state):
    """
    Selects an action from the policy network given the current state.

    The state is evaluated by both the module-level ``model`` (policy) and
    ``value_model`` (baseline). The sampled action's log-probability and the
    state-value estimate are appended to ``model.saved_actions``.

    Args:
        state: Current observation; anything ``torch.as_tensor`` can convert
            to a float32 tensor (e.g. a NumPy array or a list of floats).

    Returns:
        tuple: ``(action, log_prob, state_value)`` where ``action`` is the
        sampled action as a Python number, ``log_prob`` is the tensor
        log-probability of that action, and ``state_value`` is the value
        network's output tensor for ``state``.
    """
    # as_tensor accepts lists and float64 arrays too, not only float32 ndarrays
    state = torch.as_tensor(state, dtype=torch.float32)
    probs = model(state)
    state_value = value_model(state)

    # Create a categorical distribution over the list of probabilities of actions
    m = Categorical(probs)

    # Sample an action using the distribution
    action = m.sample()

    # Evaluate the log-probability once and reuse it for both the buffer
    # entry and the return value
    log_prob = m.log_prob(action)

    # Save the log probability and state value in the model's saved_actions
    model.saved_actions.append(SavedAction(log_prob, state_value))

    # Return the action to take together with its log-probability and value
    return action.item(), log_prob, state_value

### Returns

In [None]:
def calculate_returns(rewards, discount_factor, normalize = False):
    """
    Computes the discounted return for every step of an episode.

    The return at step ``t`` is ``r_t + discount_factor * G_{t+1}``,
    accumulated from the last reward backwards.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        discount_factor: Multiplier applied to the return of the following step.
        normalize: When True, the returns are shifted to zero mean and scaled
            by their (population) standard deviation.

    Returns:
        list: One return per reward, in chronological order. Empty when
        ``rewards`` is empty.
    """
    returns = []
    running_return = 0

    # Accumulate back-to-front with O(1) appends, then flip once; inserting at
    # index 0 on every step would make the loop quadratic.
    for r in reversed(rewards):
        running_return = r + running_return * discount_factor
        returns.append(running_return)
    returns.reverse()

    if normalize and returns:
        values = np.asarray(returns, dtype=np.float64)
        # The small epsilon keeps the division finite when all returns are equal
        spread = values.std() + np.finfo(np.float32).eps
        returns = ((values - values.mean()) / spread).tolist()

    return returns

### Episode Completion and Parameter Update

In [None]:
def finish_episode():
    """
    Performs one policy and one value-function update from the buffered episode.

    Reads ``model.rewards``, ``model.saved_actions`` and
    ``value_model.state_values``, computes discounted returns with the
    module-level ``gamma``, and then:

    * steps ``optimizer`` on the summed ``-log_prob * (G - V(s))`` terms, with
      the value estimate detached so it acts purely as a baseline;
    * steps ``value_optimizer`` on the summed smooth-L1 loss between each
      value estimate and its return.

    All three buffers are emptied before returning. If the buffers are empty,
    no optimizer step is taken.
    """
    policy_loss = []
    value_loss = []

    # Explicit float32 so the targets match the networks' default dtype even
    # if the environment reports rewards as NumPy float64 scalars.
    returns = torch.tensor(calculate_returns(model.rewards, gamma), dtype=torch.float32)

    for saved_action, G, state_value in zip(model.saved_actions, returns, value_model.state_values):
        # Give the estimate the same (scalar) shape as its target so the loss
        # is computed without relying on broadcasting.
        state_value = state_value.reshape(G.shape)

        # Detach the baseline: the policy loss must not send gradients into
        # the value network.
        advantage = G - state_value.detach()
        policy_loss.append(-saved_action.log_prob * advantage)
        value_loss.append(F.smooth_l1_loss(state_value, G))

    # torch.stack fails on an empty list, so skip the update for an empty episode
    if policy_loss:
        optimizer.zero_grad()
        torch.stack(policy_loss).sum().backward()
        optimizer.step()

        # The value graph is independent of the policy graph once the baseline
        # is detached, so neither backward pass needs retain_graph.
        value_optimizer.zero_grad()
        torch.stack(value_loss).sum().backward()
        value_optimizer.step()

    del model.rewards[:]
    del model.saved_actions[:]
    del value_model.state_values[:]


### Training

In [None]:
def train(episodes):
    """
    Trains the module-level policy and value networks on ``env``.

    Each episode is rolled out with ``select_action``, its rewards and value
    estimates are buffered on ``model`` / ``value_model``, and
    ``finish_episode`` is called once the episode ends. Training stops early
    when the exponentially smoothed reward exceeds
    ``env.spec.reward_threshold``.

    Args:
        episodes: Maximum number of episodes to run.

    Returns:
        list: The smoothed (running) reward recorded after each completed
        episode; shorter than ``episodes`` when training stops early.
    """
    running_reward = 10  # Initialize running reward

    # Keep track of the rewards for plotting
    reward_history = []

    # Run at most `episodes` episodes
    for episode in range(episodes):

        # Reset environment and episode reward
        state = env.reset()
        # Newer Gym releases return (observation, info) from reset();
        # older ones return the observation alone.
        if isinstance(state, tuple):
            state = state[0]
        ep_reward = 0

        # For each episode, only run 9999 steps to avoid infinite loop
        for t in range(1, 10000):

            # Select action (the log-probability is already buffered by select_action)
            action, _, state_value = select_action(state)

            # Take action; accept both the 4-value step API and the 5-value
            # (terminated / truncated) step API.
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result

            # Save reward and state value
            model.rewards.append(reward)
            value_model.state_values.append(state_value)

            ep_reward += reward

            if done:
                break

        # Update cumulative reward (exponential moving average, weight 0.05)
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Store the reward
        reward_history.append(running_reward)

        # Perform policy update
        finish_episode()

        # Log results
        if episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                episode, ep_reward, running_reward))

        # Check if the problem is solved
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode ({}) runs to {} time steps!".format(running_reward, episode, t))
            break

    return reward_history

## Commence Training

In [None]:
# Settings used for the repeated training runs
num_expts, num_episodes = 5, 2000
lr, num_h_nodes = 1e-2, 64
log_interval = 100

# One reward curve per run, plus placeholders for the aggregate statistics
reward_histories, mean_rewards, std_rewards = [], [], []

for i in range(num_expts):
    # Fresh networks and optimizers for every run
    model, value_model = Policy(), ValueFunction()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    value_optimizer = optim.Adam(value_model.parameters(), lr=lr)

    reward_history = train(num_episodes)

    # Pad a run that stopped early with the environment's reward threshold
    # so every curve has num_episodes entries
    shortfall = num_episodes - len(reward_history)
    if shortfall > 0:
        reward_history.extend([env.spec.reward_threshold] * shortfall)

    reward_histories.append(reward_history)

In [None]:
# Convert the collected curves to an array and reduce over its first axis
reward_histories = np.array(reward_histories)

mean_rewards = reward_histories.mean(axis=0)
std_rewards = reward_histories.std(axis=0)

## Reward History Plot During Training

In [None]:
# Draw the mean reward curve with a shaded band of one standard deviation

ax = plt.gca()
ax.plot(mean_rewards)
ax.fill_between(
    range(len(mean_rewards)),
    mean_rewards + std_rewards,
    mean_rewards - std_rewards,
    alpha=0.5,
)
ax.set_title('REINFORCE with Baseline CartPole-v1')
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.legend(['Running Reward'])
plt.show()

## Save Model

In [None]:
# Serialize the whole policy module object (not only its state_dict) to disk
torch.save(obj=model, f='REINFORCEB_CartPole_v1.pth')

In [None]:
# Write the mean and standard-deviation reward curves to separate CSV files
for csv_path, curve in (
    ('REINFORCEB_CartPole_v1_mean.csv', mean_rewards),
    ('REINFORCEB_CartPole_v1_std.csv', std_rewards),
):
    np.savetxt(csv_path, curve, delimiter=',')

## Evaluation of the Model

In [None]:
# Evaluate the policy using total regret

def calculate_total_regret(episodes, rewards=None, threshold=None):
    """
    Evaluate the policy using total regret.

    Total regret is the sum of ``threshold - reward`` over every entry of
    ``rewards``.

    Args:
        episodes: Kept for backward compatibility; it is not used, and the sum
            always covers every entry of ``rewards``.
        rewards: Iterable of per-episode rewards. When omitted, the
            module-level ``reward_history`` is used.
        threshold: Reward level regret is measured against. When omitted,
            ``env.spec.reward_threshold`` is used.

    Returns:
        The accumulated regret (0 for an empty ``rewards``).
    """
    # Fall back to the notebook's globals only when the caller gives no data
    if rewards is None:
        rewards = reward_history
    if threshold is None:
        threshold = env.spec.reward_threshold

    return sum(threshold - reward for reward in rewards)

## Hyperparameter Tuning

In [None]:
# Grid search over learning rate and hidden-layer width

# Candidate values for each hyperparameter
hyperparameters = {
    'lr': [1e-2, 1e-3, 1e-4],
    'num_h_nodes': [32, 64, 128]
}

# Re-seed the environment and PyTorch before the sweep
env.reset(seed=seed)
torch.manual_seed(seed)

log_interval = 100
episodes = 2000

# Best configuration seen so far (lowest total regret)
best_total_regret = float('inf')
best_hyperparameters = {}
best_episodes = 2000

# regret_storage: one [lr, num_h_nodes, total_regret] row per configuration
# reward_history_storage: padded reward curve for each (lr, num_h_nodes) cell
regret_storage = []
reward_history_storage = np.zeros((len(hyperparameters['lr']), len(hyperparameters['num_h_nodes']), episodes))

for lr_idx, lr in enumerate(hyperparameters['lr']):
    for width_idx, num_h_nodes in enumerate(hyperparameters['num_h_nodes']):

        # The networks read the module-level num_h_nodes, which this loop rebinds
        model, value_model = Policy(), ValueFunction()
        optimizer = optim.Adam(model.parameters(), lr=lr)
        value_optimizer = optim.Adam(value_model.parameters(), lr=lr)

        # Train, remembering how many episodes actually ran before padding
        reward_history = train(episodes)
        len_episodes = len(reward_history)
        reward_history.extend([env.spec.reward_threshold] * (episodes - len_episodes))
        reward_history_storage[lr_idx, width_idx, :] = reward_history

        # Score this configuration and record the result
        total_regret = calculate_total_regret(episodes)
        regret_storage.append([lr, num_h_nodes, total_regret])

        # Keep the configuration with the lowest total regret
        if total_regret < best_total_regret:
            best_total_regret = total_regret
            best_hyperparameters = {'lr': lr, 'num_h_nodes': num_h_nodes}
            best_episodes = len_episodes

In [None]:
# Report the winning configuration and its total regret
for label, value in (
    ('Best hyperparameters:', best_hyperparameters),
    ('Best total regret:', best_total_regret),
):
    print(label, value)

In [None]:
# Plot the stored reward curve of the best configuration, cut at the number
# of episodes that were actually trained

best_lr_idx = hyperparameters['lr'].index(best_hyperparameters['lr'])
best_width_idx = hyperparameters['num_h_nodes'].index(best_hyperparameters['num_h_nodes'])

ax = plt.gca()
ax.plot(reward_history_storage[best_lr_idx, best_width_idx, :best_episodes])
ax.set_title('Reward history for the best hyperparameters')
ax.set_xlabel('Episode')
ax.set_ylabel('Average Reward')
ax.legend(['lr: {}, num_h_nodes: {}'.format(best_hyperparameters['lr'], best_hyperparameters['num_h_nodes'])])
plt.show()