In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, hidden_sizes, output_dim):
        super(PolicyNetwork, self).__init__()
        layers = []
        prev_size = input_dim
        for hs in hidden_sizes:
            layers.append(nn.Linear(prev_size, hs))
            layers.append(nn.ReLU())
            prev_size = hs
        layers.append(nn.Linear(prev_size, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.model(x)
        return torch.softmax(logits, dim=-1)


class ValueNetwork(nn.Module):
    def __init__(self, input_dim, hidden_sizes):
        super(ValueNetwork, self).__init__()
        layers = []
        prev_size = input_dim
        for hs in hidden_sizes:
            layers.append(nn.Linear(prev_size, hs))
            layers.append(nn.ReLU())
            prev_size = hs
        layers.append(nn.Linear(prev_size, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

In [4]:
def reinforce(env,
              input_dim,
              output_dim,
              hidden_sizes,
              alpha=0.001,
              beta=0.001,
              episodes=500,
              gamma=0.99,
              log_dir="runs/reinforce_with_baseline"):
    """
    REINFORCE training loop with a baseline (value function).

    Args:
        env: OpenAI gym environment
        input_dim: dimension of input space
        output_dim: number of actions
        hidden_sizes: list of sizes for each hidden layer
        alpha: The learning rate for the policy network
        beta: The learning rate for the value network
        episodes: number of training episodes
        gamma: discount factor
        log_dir: directory for tensorboard logs

    Returns:
        policy_network: the trained policy network
        rewards_per_episode: a list containing the total reward per episode
    """

    writer = SummaryWriter(log_dir=log_dir)

    policy_network = PolicyNetwork(input_dim, hidden_sizes, output_dim).to(device)
    value_network = ValueNetwork(input_dim, hidden_sizes).to(device)

    policy_optimizer = optim.Adam(policy_network.parameters(), lr=alpha)
    value_optimizer = optim.Adam(value_network.parameters(), lr=beta)

    rewards_per_episode = []

    for episode in range(episodes):
        # generate a new episode
        state, _ = env.reset()
        done = False
        truncated = False
        states = []
        actions_log_probs = []
        rewards = []

        while not (done or truncated):
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            action_probs = policy_network(state_tensor)
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()
            log_prob = action_dist.log_prob(action)

            next_state, reward, done, truncated, info = env.step(action.item())

            states.append(state)
            actions_log_probs.append(log_prob)
            rewards.append(reward)

            state = next_state

        # compute the returns (G_t) for the episode (including gamma)
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.append(G)
        returns.reverse()

        states_tensor = torch.tensor(states, dtype=torch.float32, device=device)
        returns_tensor = torch.tensor(returns, dtype=torch.float32, device=device).unsqueeze(1)

        # compute value predictions
        values = value_network(states_tensor)

        # compute advantages A_t = G_t - V(s_t)
        advantages = returns_tensor - values.detach()

        # update policy network
        policy_loss = []
        for log_prob, A in zip(actions_log_probs, advantages):
            policy_loss.append(-log_prob * A)
        policy_loss = torch.cat(policy_loss).sum()

        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()

        # update value network (MSE loss between returns and values)
        value_loss = nn.MSELoss()(values, returns_tensor)
        value_optimizer.zero_grad()
        value_loss.backward()
        value_optimizer.step()

        total_reward = sum(rewards)
        rewards_per_episode.append(total_reward)

        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

        # log metrics to TensorBoard
        writer.add_scalar("Policy Loss", policy_loss.item(), episode)
        writer.add_scalar("Value Loss", value_loss.item(), episode)
        writer.add_scalar("Total Reward", total_reward, episode)

        # early stopping if solved
        if len(rewards_per_episode) >= 100:
            avg_reward = np.mean(rewards_per_episode[-100:])
            print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Average Reward: {avg_reward}")
            if avg_reward >= 475.0:
                print(f"Solved in {episode + 1} episodes!")
                break
        else:
            print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

    writer.close()
    return policy_network, value_network, rewards_per_episode

In [5]:
def test_policy(env, policy_network, episodes=10):
    policy_network.eval()
    avg_reward = 0
    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                action_probs = policy_network(state_tensor)
            action = torch.argmax(action_probs).item()
            state, reward, done, truncated, info = env.step(action)
            total_reward += reward
            if total_reward >= 10000:
                break

        avg_reward += total_reward
        print(f"Test Episode {ep + 1}, Total Reward: {total_reward}")
    return avg_reward / episodes

In [6]:
def grid_search(env, input_dim, output_dim, hidden_sizes, gamma_values, alpha_values, episodes=2000):
    best_combination = None
    best_episodes = float('inf')
    best_policy_network = None
    best_value_network = None
    best_reward = float('inf')
    results = []

    for gamma in gamma_values:
        for alpha in alpha_values:
            print(f"Training with gamma={gamma}, alpha={alpha}")
            policy_network, value_network, rewards = reinforce(
                env,
                input_dim=input_dim,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                alpha=alpha,
                episodes=episodes,
                gamma=gamma,
                log_dir=f"runs/reinforce_with_baseline_{alpha}_{gamma}"
            )

            avg_reward = np.mean(rewards[-100:])
            solved_episodes = len(rewards)
            results.append((gamma, alpha, solved_episodes, avg_reward))

            if avg_reward >= 475 and solved_episodes < best_episodes:
                best_combination = (gamma, alpha)
                best_episodes = solved_episodes
                best_policy_network = policy_network
                best_value_network = value_network
                best_reward = avg_reward

            print(f"Gamma={gamma}, Alpha={alpha}, Solved in {solved_episodes} episodes")

    return best_policy_network, best_value_network, best_combination, best_episodes, best_reward, results

In [7]:
# Define the environment and input/output dimensions
env = gym.make("CartPole-v1")
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
hidden_sizes = [16, 32, 16]

# Define the ranges for gamma and alpha
gamma_values = [0.95, 0.99, 0.995]
alpha_values = [0.001, 0.0005, 0.0001]

# Perform grid search
best_policy_network, best_value_network, best_params, best_episodes, best_reward, all_results = grid_search(env,
                                                                                                            input_dim,
                                                                                                            output_dim,
                                                                                                            hidden_sizes,
                                                                                                            gamma_values,
                                                                                                            alpha_values,
                                                                                                            1000)

# Print the best parameters
if best_params:
    print("\nBest Parameters:")
    print(f"Gamma: {best_params[0]}, Alpha: {best_params[1]}, Solved in {best_episodes} episodes\n")

for gamma, alpha, solved_episode, average_reward in all_results:
    print(f"Gamma: {gamma}, Alpha: {alpha}, Solved in: {solved_episode} episodes, Average Reward: {average_reward}")

Training with gamma=0.95, alpha=0.001


  if not isinstance(terminated, (bool, np.bool8)):
  states_tensor = torch.tensor(states, dtype=torch.float32, device=device)


Episode 1/1000, Total Reward: 17.0
Episode 1/1000, Total Reward: 17.0
Episode 2/1000, Total Reward: 20.0
Episode 2/1000, Total Reward: 20.0
Episode 3/1000, Total Reward: 9.0
Episode 3/1000, Total Reward: 9.0
Episode 4/1000, Total Reward: 12.0
Episode 4/1000, Total Reward: 12.0
Episode 5/1000, Total Reward: 15.0
Episode 5/1000, Total Reward: 15.0
Episode 6/1000, Total Reward: 18.0
Episode 6/1000, Total Reward: 18.0
Episode 7/1000, Total Reward: 27.0
Episode 7/1000, Total Reward: 27.0
Episode 8/1000, Total Reward: 23.0
Episode 8/1000, Total Reward: 23.0
Episode 9/1000, Total Reward: 20.0
Episode 9/1000, Total Reward: 20.0
Episode 10/1000, Total Reward: 25.0
Episode 10/1000, Total Reward: 25.0
Episode 11/1000, Total Reward: 10.0
Episode 11/1000, Total Reward: 10.0
Episode 12/1000, Total Reward: 18.0
Episode 12/1000, Total Reward: 18.0
Episode 13/1000, Total Reward: 16.0
Episode 13/1000, Total Reward: 16.0
Episode 14/1000, Total Reward: 54.0
Episode 14/1000, Total Reward: 54.0
Episode 15/1

In [8]:
# test the trained policy
print("Testing the policy...")
test_avg_reward = test_policy(env, best_policy_network, episodes=10)
print(f"Average Test Reward: {test_avg_reward}")

Testing the policy...
Test Episode 1, Total Reward: 10000.0
Test Episode 2, Total Reward: 10000.0
Test Episode 3, Total Reward: 10000.0
Test Episode 4, Total Reward: 10000.0
Test Episode 5, Total Reward: 10000.0
Test Episode 6, Total Reward: 10000.0
Test Episode 7, Total Reward: 10000.0
Test Episode 8, Total Reward: 10000.0
Test Episode 9, Total Reward: 10000.0
Test Episode 10, Total Reward: 10000.0
Average Test Reward: 10000.0


In [10]:
# Save the policy
torch.save(best_policy_network.state_dict(), 'reinforce_with_baseline_policy.pth')
torch.save(best_value_network.state_dict(), 'reinforce_with_baseline_value.pth')