In [None]:
pip install torch gym



In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
# Define the Actor-Critic Network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared two-layer trunk and two heads.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dim: Width of both shared hidden layers.
        output_dim: Number of outputs of the actor (policy) head.

    ``forward`` returns ``(actor_probs, state_value)``: a softmax over the
    last dimension of the actor head's output, and the critic head's
    single-unit output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Shared trunk used by both heads.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Policy head: one output per action.
        self.actor = nn.Linear(hidden_dim, output_dim)

        # Value head: a single output.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Run the shared trunk, then both heads, on ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()

        # Actor: softmax turns the head's raw scores into probabilities.
        action_scores = self.actor(hidden)
        actor_probs = action_scores.softmax(dim=-1)

        # Critic: value estimate computed from the same hidden features.
        state_value = self.critic(hidden)

        return actor_probs, state_value


In [None]:
def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Some gym releases return the observation directly from ``reset()``;
    gym>=0.26 returns an ``(observation, info)`` pair. Both are accepted.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done)``.

    Accepts the 4-tuple step result (``obs, reward, done, info``) as well as
    the 5-tuple one (``obs, reward, terminated, truncated, info``), in which
    case the episode is done when either flag is set.
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, bool(done)


def train(env, model, optimizer, num_episodes=1000, gamma=0.99):
    """Train ``model`` on ``env`` with one Monte-Carlo actor-critic update per episode.

    Each episode is rolled out to completion by sampling actions from the
    model's action probabilities. Discounted returns are then computed and
    normalized, and a single optimizer step is taken on
    ``actor_loss + critic_loss``, where the advantage is
    ``normalized_return - state_value``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            4-tuple and the 5-tuple step conventions are supported.
        model: Callable mapping a 1-D float state tensor to
            ``(action_probs, state_value)``; ``state_value`` must have at
            least one dimension so the per-step values can be concatenated.
        optimizer: Optimizer over ``model``'s parameters.
        num_episodes: Number of episodes to run.
        gamma: Discount factor applied to future rewards.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    episode_rewards = []

    for episode in range(num_episodes):
        # torch.tensor copies, so later changes to the env's array cannot
        # alter tensors that autograd has saved for the backward pass.
        state = torch.tensor(_reset_env(env), dtype=torch.float32)
        log_probs = []
        values = []
        rewards = []
        done = False

        while not done:
            # Get the action probabilities and state value from the model
            action_probs, state_value = model(state)
            dist = torch.distributions.Categorical(action_probs)
            action = dist.sample()

            # Perform the action
            next_state, reward, done = _step_env(env, action.item())

            # Store log_probs, values, and rewards
            log_probs.append(dist.log_prob(action))
            values.append(state_value)
            rewards.append(reward)

            # Move to the next state
            state = torch.tensor(next_state, dtype=torch.float32)

        # Discounted returns, accumulated back-to-front and reversed once
        # (repeated insert(0, ...) would be quadratic in episode length).
        returns = []
        Gt = 0.0
        for reward in reversed(rewards):
            Gt = reward + gamma * Gt
            returns.append(Gt)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        values = torch.cat(values)

        # Normalize returns. The default (unbiased) std of a single element
        # is NaN, which would poison the loss and every parameter after the
        # optimizer step, so a one-step episode is only mean-centred.
        centred = returns - returns.mean()
        if returns.numel() > 1:
            returns = centred / (returns.std() + 1e-8)
        else:
            returns = centred

        # Compute loss for Actor and Critic
        log_probs = torch.stack(log_probs)
        advantage = returns - values

        # The advantage is detached in the actor term so the policy gradient
        # does not flow into the critic head; the critic learns from the
        # squared advantage instead.
        actor_loss = -torch.mean(log_probs * advantage.detach())
        critic_loss = torch.mean(advantage ** 2)
        loss = actor_loss + critic_loss

        # Update model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Store the total reward of this episode
        episode_rewards.append(sum(rewards))

        # Print progress every 100 episodes
        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f'Episode {episode + 1}/{num_episodes}, Average Reward: {avg_reward}')

    return episode_rewards

In [None]:
# Build the environment first; everything that uses it runs inside try/finally
# so the environment is closed even if training or saving raises.
env = gym.make('CartPole-v1')
try:
    # Hyperparameters
    input_dim = env.observation_space.shape[0]   # network input size, from the env
    hidden_dim = 128
    output_dim = env.action_space.n              # one policy output per discrete action
    lr = 0.001
    num_episodes = 1000
    gamma = 0.99

    # Initialize the model, optimizer, and train
    model = ActorCritic(input_dim, hidden_dim, output_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    episode_rewards = train(env, model, optimizer, num_episodes, gamma)

    # Save the trained model (weights only, via state_dict)
    torch.save(model.state_dict(), 'actor_critic_cartpole.pth')

    # To load the model:
    # model.load_state_dict(torch.load('actor_critic_cartpole.pth'))
    # model.eval()
finally:
    env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100/1000, Average Reward: 27.06
Episode 200/1000, Average Reward: 52.88
Episode 300/1000, Average Reward: 135.9
Episode 400/1000, Average Reward: 216.3
Episode 500/1000, Average Reward: 269.01
Episode 600/1000, Average Reward: 361.47
Episode 700/1000, Average Reward: 352.27
Episode 800/1000, Average Reward: 369.56
Episode 900/1000, Average Reward: 399.2
Episode 1000/1000, Average Reward: 392.14
