In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Pick the compute device once for the whole notebook: CUDA first, then
# Apple's Metal (MPS) backend, otherwise the CPU.
# The MPS check goes through `torch.backends.mps`, which is the documented
# availability API; `torch.mps.is_available` is not present in every PyTorch
# release, so calling it can raise AttributeError on machines without CUDA.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
class PolicyNetwork(nn.Module):
    """Fully connected network that outputs action probabilities.

    The network is a stack of ``Linear -> ReLU`` pairs, one per entry of
    ``hidden_sizes``, followed by a final ``Linear`` layer with ``output_dim``
    units. ``forward`` applies a softmax over the last dimension.
    """

    def __init__(self, input_dim, hidden_sizes, output_dim):
        super().__init__()
        # Consecutive pairs of `widths` give (in_features, out_features)
        # for each hidden layer.
        widths = [input_dim, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        stack.append(nn.Linear(widths[-1], output_dim))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the softmax (over the last dimension) of the network output."""
        return self.model(x).softmax(dim=-1)


class ValueNetwork(nn.Module):
    """Fully connected network that outputs a single scalar per input row.

    Built from one ``Linear -> ReLU`` pair per entry of ``hidden_sizes`` and
    a final ``Linear`` layer with one output unit (no output activation).
    """

    def __init__(self, input_dim, hidden_sizes):
        super().__init__()
        modules = []
        in_features = input_dim
        for width in hidden_sizes:
            modules.extend((nn.Linear(in_features, width), nn.ReLU()))
            in_features = width
        # Single-unit head producing the value estimate.
        modules.append(nn.Linear(in_features, 1))
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw output of the layer stack for ``x``."""
        estimate = self.model(x)
        return estimate

In [4]:
def actor_critic(env,
                 input_dim,
                 output_dim,
                 hidden_sizes_theta,
                 hidden_sizes_w,
                 alpha_theta=0.001,
                 alpha_w=0.001,
                 episodes=500,
                 gamma=0.99,
                 log_dir="runs/actor_critic",
                 solved_threshold=475.0):
    """
    One-step Actor-Critic (episodic).

    At every environment step the TD-error
    ``delta = reward + gamma * V(next_state) - V(state)`` is computed, detached,
    and used to scale both the value-network update and the policy-network
    log-probability update. Both updates are additionally weighted by ``I``,
    which starts at 1 each episode and is multiplied by ``gamma`` after every
    step.

    Both networks are created on the module-level ``device``.

    Args:
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(next_state, reward, done, truncated, info)``
        input_dim: dimension of input space
        output_dim: number of actions
        hidden_sizes_theta: list of hidden layer sizes for the policy network
        hidden_sizes_w: list of hidden layer sizes for the value network
        alpha_theta: learning rate for the policy network
        alpha_w: learning rate for the value network
        episodes: maximum number of training episodes
        gamma: discount factor
        log_dir: directory for tensorboard logs
        solved_threshold: training stops early once the mean total reward of
            the last 100 episodes reaches this value

    Returns:
        policy_network: the trained policy network
        value_network: the trained value network
        rewards_per_episode: a list containing the total reward per episode
    """

    policy_network = PolicyNetwork(input_dim, hidden_sizes_theta, output_dim).to(device)
    value_network = ValueNetwork(input_dim, hidden_sizes_w).to(device)

    policy_optimizer = optim.Adam(policy_network.parameters(), lr=alpha_theta)
    value_optimizer = optim.Adam(value_network.parameters(), lr=alpha_w)

    rewards_per_episode = []

    writer = SummaryWriter(log_dir=log_dir)
    # try/finally guarantees the writer is closed even when the environment
    # or an optimizer step raises in the middle of training.
    try:
        for episode in range(episodes):
            # generate a new episode
            state, _ = env.reset()
            done = False
            truncated = False
            total_reward = 0.0
            I = 1

            while not (done or truncated):
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

                # compute policy and choose action
                action_probs = policy_network(state_tensor)
                dist = torch.distributions.Categorical(action_probs)
                action = dist.sample()
                log_prob_action = dist.log_prob(action)

                next_state, reward, done, truncated, info = env.step(action.item())
                total_reward += reward

                # compute value estimates; the bootstrap target is never
                # differentiated, hence no_grad
                value = value_network(state_tensor)
                with torch.no_grad():
                    if done:
                        # `done` set: use 0 as the next-state value. A merely
                        # truncated episode still bootstraps from the critic.
                        next_value = torch.tensor([[0.0]], dtype=torch.float32, device=device)
                    else:
                        next_state_tensor = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
                        next_value = value_network(next_state_tensor)

                # TD-error
                delta = reward + gamma * next_value - value

                # update value network (delta is detached, so the gradient
                # flows only through `value`)
                value_loss = -value * delta.detach() * I
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()

                # update policy network
                policy_loss = -log_prob_action * delta.detach() * I
                policy_optimizer.zero_grad()
                policy_loss.backward()
                policy_optimizer.step()

                # update I
                I *= gamma

                state = next_state

            rewards_per_episode.append(total_reward)

            # Only the value loss of the episode's final step is logged.
            writer.add_scalar("Value Loss", value_loss.item(), episode)
            writer.add_scalar("Total Reward", total_reward, episode)

            # Early stopping if solved
            if len(rewards_per_episode) >= 100:
                avg_reward = np.mean(rewards_per_episode[-100:])
                print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Average Reward: {avg_reward}")
                if avg_reward >= solved_threshold:
                    print(f"Solved in {episode + 1} episodes!")
                    break
            else:
                print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")
    finally:
        writer.close()

    return policy_network, value_network, rewards_per_episode

In [5]:
def test_policy(env, policy_network, episodes=10):
    policy_network.eval()
    avg_reward = 0
    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                action_probs = policy_network(state_tensor)
            action = torch.argmax(action_probs).item()
            state, reward, done, truncated, info = env.step(action)
            total_reward += reward
            if total_reward >= 10000:
                break

        avg_reward += total_reward
        print(f"Test Episode {ep + 1}, Total Reward: {total_reward}")
    return avg_reward / episodes

In [12]:
def grid_search(env, input_dim, output_dim, hidden_sizes_theta, hidden_sizes_w, gamma_values, alpha_theta_values,
                alpha_w_values, episodes=2000):
    """
    Train one actor-critic agent per (gamma, alpha_theta, alpha_w) combination
    and keep the one that is solved in the fewest episodes.

    A run counts as solved when the mean of its last 100 episode rewards is at
    least 475. Among solved runs, the one with the smallest number of training
    episodes wins; ties keep the earlier combination.

    Args:
        env: environment passed through to ``actor_critic``
        input_dim: dimension of input space
        output_dim: number of actions
        hidden_sizes_theta: hidden layer sizes for the policy network
        hidden_sizes_w: hidden layer sizes for the value network
        gamma_values: iterable of discount factors to try
        alpha_theta_values: iterable of policy learning rates to try
        alpha_w_values: iterable of value learning rates to try
        episodes: maximum number of training episodes per combination

    Returns:
        best_policy_network: policy network of the best solved run, or None
        best_value_network: value network of the best solved run, or None
        best_combination: ``(gamma, alpha_theta, alpha_w)`` of the best run, or None
        best_episodes: episodes the best run trained for (``inf`` if none solved)
        best_reward: last-100 average reward of the best run (``inf`` if none solved)
        results: one ``(gamma, alpha_theta, alpha_w, episodes_trained, avg_reward)``
            tuple per combination, solved or not
    """
    best_combination = None
    best_episodes = float('inf')
    best_policy_network = None
    best_value_network = None
    # Sentinel kept as +inf for backward compatibility; it is only meaningful
    # once a solved run has replaced it.
    best_reward = float('inf')
    results = []

    for gamma in gamma_values:
        for alpha_theta in alpha_theta_values:
            for alpha_w in alpha_w_values:
                print(f"Training with gamma={gamma}, alpha_theta={alpha_theta}, alpha_w={alpha_w}")
                policy_network, value_network, rewards = actor_critic(
                    env,
                    input_dim=input_dim,
                    output_dim=output_dim,
                    hidden_sizes_theta=hidden_sizes_theta,
                    hidden_sizes_w=hidden_sizes_w,
                    alpha_theta=alpha_theta,
                    alpha_w=alpha_w,
                    episodes=episodes,
                    gamma=gamma,
                    log_dir=f"runs/actor_critic_{alpha_theta}_{alpha_w}_{gamma}"
                )

                avg_reward = np.mean(rewards[-100:])
                # Number of episodes actually trained; this equals the
                # solve time only when the run was solved.
                solved_episodes = len(rewards)
                results.append((gamma, alpha_theta, alpha_w, solved_episodes, avg_reward))

                solved = avg_reward >= 475
                if solved and solved_episodes < best_episodes:
                    best_combination = (gamma, alpha_theta, alpha_w)
                    best_episodes = solved_episodes
                    best_policy_network = policy_network
                    best_value_network = value_network
                    best_reward = avg_reward

                # Report the real outcome instead of claiming every run was solved.
                if solved:
                    print(f"Gamma={gamma}, alpha_theta={alpha_theta}, alpha_w={alpha_w}, Solved in {solved_episodes} episodes")
                else:
                    print(f"Gamma={gamma}, alpha_theta={alpha_theta}, alpha_w={alpha_w}, "
                          f"Not solved after {solved_episodes} episodes (average reward {avg_reward})")

    return best_policy_network, best_value_network, best_combination, best_episodes, best_reward, results

In [14]:
# Define the environment and input/output dimensions
env = gym.make("CartPole-v1")
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

hidden_sizes_theta = [16, 32, 16]
hidden_sizes_w = [16, 32, 16]

# Define the ranges for gamma and the two learning rates
gamma_values = [0.95, 0.97, 0.99]
alpha_theta_values = [0.0008, 0.0005]
alpha_w_values = [0.0008, 0.0005]

# Perform grid search (at most 1000 training episodes per combination)
(best_policy_network,
 best_value_network,
 best_params,
 best_episodes,
 best_reward,
 all_results) = grid_search(env,
                            input_dim,
                            output_dim,
                            hidden_sizes_theta,
                            hidden_sizes_w,
                            gamma_values,
                            alpha_theta_values,
                            alpha_w_values,
                            1000)

# Print the best parameters. `best_params` is (gamma, alpha_theta, alpha_w),
# so both learning rates are reported; it is None when no run was solved.
if best_params:
    print("\nBest Parameters:")
    print(f"Gamma: {best_params[0]}, Alpha theta: {best_params[1]}, Alpha w: {best_params[2]}, "
          f"Solved in {best_episodes} episodes\n")
else:
    print("\nNo hyperparameter combination solved the environment.\n")


Training with gamma=0.95, alpha_theta=0.0008, alpha_w=0.0008
Episode 1/1000, Total Reward: 18.0
Episode 2/1000, Total Reward: 11.0
Episode 3/1000, Total Reward: 13.0
Episode 4/1000, Total Reward: 16.0
Episode 5/1000, Total Reward: 26.0
Episode 6/1000, Total Reward: 15.0
Episode 7/1000, Total Reward: 10.0
Episode 8/1000, Total Reward: 11.0
Episode 9/1000, Total Reward: 13.0
Episode 10/1000, Total Reward: 15.0
Episode 11/1000, Total Reward: 17.0
Episode 12/1000, Total Reward: 14.0
Episode 13/1000, Total Reward: 11.0
Episode 14/1000, Total Reward: 13.0
Episode 15/1000, Total Reward: 9.0
Episode 16/1000, Total Reward: 22.0
Episode 17/1000, Total Reward: 21.0
Episode 18/1000, Total Reward: 16.0
Episode 19/1000, Total Reward: 13.0
Episode 20/1000, Total Reward: 8.0
Episode 21/1000, Total Reward: 10.0
Episode 22/1000, Total Reward: 14.0
Episode 23/1000, Total Reward: 17.0
Episode 24/1000, Total Reward: 11.0
Episode 25/1000, Total Reward: 34.0
Episode 26/1000, Total Reward: 11.0
Episode 27/100

ValueError: too many values to unpack (expected 4)

In [15]:
# Print one summary line per hyperparameter combination tried by the grid search.
# Each entry of `all_results` is a 5-tuple built in `grid_search`; its fourth
# field is the number of episodes the run trained for (len of the reward
# list), so the "Solved in" label is printed even for runs whose average
# reward never reached the solved threshold.
for gamma, alpha_theta, alpha_w, solved_episode, average_reward in all_results:
    print(f"Gamma: {gamma}, alpha_theta={alpha_theta}, alpha_w={alpha_w}, Solved in: {solved_episode} episodes, Average Reward: {average_reward}")

Gamma: 0.95, alpha_theta=0.0008, alpha_w=0.0008, Solved in: 1000 episodes, Average Reward: 9.31
Gamma: 0.95, alpha_theta=0.0008, alpha_w=0.0005, Solved in: 1000 episodes, Average Reward: 13.37
Gamma: 0.95, alpha_theta=0.0005, alpha_w=0.0008, Solved in: 1000 episodes, Average Reward: 9.41
Gamma: 0.95, alpha_theta=0.0005, alpha_w=0.0005, Solved in: 1000 episodes, Average Reward: 206.82
Gamma: 0.97, alpha_theta=0.0008, alpha_w=0.0008, Solved in: 1000 episodes, Average Reward: 9.43
Gamma: 0.97, alpha_theta=0.0008, alpha_w=0.0005, Solved in: 1000 episodes, Average Reward: 9.36
Gamma: 0.97, alpha_theta=0.0005, alpha_w=0.0008, Solved in: 1000 episodes, Average Reward: 9.33
Gamma: 0.97, alpha_theta=0.0005, alpha_w=0.0005, Solved in: 1000 episodes, Average Reward: 56.7
Gamma: 0.99, alpha_theta=0.0008, alpha_w=0.0008, Solved in: 1000 episodes, Average Reward: 9.28
Gamma: 0.99, alpha_theta=0.0008, alpha_w=0.0005, Solved in: 1000 episodes, Average Reward: 182.45
Gamma: 0.99, alpha_theta=0.0005, al

In [16]:
# Test the trained policy. `grid_search` returns None for the policy network
# when no combination was solved, so guard against that instead of failing
# inside `test_policy` with an opaque AttributeError.
if best_policy_network is None:
    print("No solved policy available to test.")
else:
    print("Testing the policy...")
    test_avg_reward = test_policy(env, best_policy_network, episodes=10)
    print(f"Average Test Reward: {test_avg_reward}")

Testing the policy...
Test Episode 1, Total Reward: 10000.0
Test Episode 2, Total Reward: 10000.0
Test Episode 3, Total Reward: 10000.0
Test Episode 4, Total Reward: 10000.0
Test Episode 5, Total Reward: 10000.0
Test Episode 6, Total Reward: 10000.0
Test Episode 7, Total Reward: 10000.0
Test Episode 8, Total Reward: 10000.0
Test Episode 9, Total Reward: 10000.0
Test Episode 10, Total Reward: 10000.0
Average Test Reward: 10000.0


In [19]:
# Save the policy and value networks. Both are None when the grid search
# found no solved combination; skip saving in that case rather than crashing
# on `.state_dict()`.
if best_policy_network is None or best_value_network is None:
    print("No solved networks to save.")
else:
    torch.save(best_policy_network.state_dict(), 'actor_critic_policy.pth')
    torch.save(best_value_network.state_dict(), 'actor_critic_value.pth')