In [None]:
# Install required packages
# Explicitly install a compatible triton version
!pip install gymnasium[atari] ale-py opencv-python torch torchvision tensorboard triton==2.3.0

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

Collecting triton==2.3.0
  Downloading triton-2.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cublas-cu12==12.8.4.1 (from torch)
  Downloading nvidia_cublas_cu12-12.8.4.1-py3-non

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import time
import collections

In [None]:
class CartPoleA2C(nn.Module):
    """Actor-critic model made of two independent two-layer MLPs.

    ``policy`` maps an observation vector to one logit per action and
    ``value`` maps the same observation to a single scalar estimate.
    The two heads share no parameters.

    Args:
        input_shape: Observation shape; only ``input_shape[0]`` (the feature
            count) is used.
        n_actions: Number of logits produced by the policy head.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        n_features = input_shape[0]
        hidden_units = 128

        def build_head(n_outputs):
            # Linear -> ReLU -> Linear, identical layout for both heads.
            return nn.Sequential(
                nn.Linear(n_features, hidden_units),
                nn.ReLU(),
                nn.Linear(hidden_units, n_outputs),
            )

        # The actor is created before the critic so parameter initialisation
        # draws from the RNG in the same order as before.
        self.policy = build_head(n_actions)  # Policy Network (Actor)
        self.value = build_head(1)           # Value Network (Critic)

    def forward(self, x):
        """Return ``(action_logits, state_value)`` for a batch of observations.

        The input is fed to both heads unchanged (no scaling is applied).
        """
        action_logits = self.policy(x)
        state_value = self.value(x)
        return action_logits, state_value

In [None]:
# --- A2C hyperparameters -----------------------------------------------------
GAMMA = 0.99           # discount factor used when accumulating n-step returns
LEARNING_RATE = 0.001  # step size handed to the Adam optimizer
ENTROPY_BETA = 0.01    # weight of the entropy term in the total loss
NUM_ENVS = 8           # number of environments stepped side by side
REWARD_STEPS = 5       # transitions collected per environment for each batch
CLIP_GRAD = 0.5        # max gradient norm passed to clip_grad_norm_
def iterate_batches(envs, net, device="cpu", gamma=None, reward_steps=None):
    """Endlessly roll out ``envs`` with ``net`` and yield n-step A2C batches.

    Each iteration steps every environment ``reward_steps`` times, sampling
    actions from the softmax of the policy logits, then converts the collected
    rewards into discounted n-step returns. A return never crosses an episode
    boundary: when a step ended an episode, the accumulated future return is
    reset to zero before that step's reward is added. If the final step of the
    window did not end an episode, the return is bootstrapped from the value
    estimate of the next observation.

    Args:
        envs: Sequence of Gymnasium-style environments (``reset()`` returning
            ``(obs, info)`` and ``step()`` returning a 5-tuple). The number of
            parallel environments is taken from ``len(envs)``.
        net: Callable returning ``(logits, values)`` for a batch of
            observations; ``values`` is expected to have shape ``(N, 1)``.
        device: Device the observation tensors are moved to before calling
            ``net``.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.
        reward_steps: Rollout length per environment. Defaults to the
            module-level ``REWARD_STEPS``.

    Yields:
        Tuple ``(obs, returns, actions, values, done_rewards, done_steps)``.
        The first four are flattened over ``(env, step)``; ``done_rewards`` and
        ``done_steps`` hold the total reward and length of every episode that
        finished during this rollout.

    Note:
        ``terminated`` and ``truncated`` are both treated as the end of an
        episode, so a time-limit truncation is not bootstrapped.
    """
    gamma = GAMMA if gamma is None else gamma
    reward_steps = REWARD_STEPS if reward_steps is None else reward_steps
    n_envs = len(envs)

    obs = [e.reset()[0] for e in envs]
    obs_shape = obs[0].shape

    # Running totals for the episode currently in progress in each env.
    total_reward = [0.0] * n_envs
    total_steps = [0] * n_envs

    mb_obs = np.zeros((n_envs, reward_steps) + obs_shape, dtype=np.float32)
    mb_rewards = np.zeros((n_envs, reward_steps), dtype=np.float32)
    mb_values = np.zeros((n_envs, reward_steps), dtype=np.float32)
    mb_actions = np.zeros((n_envs, reward_steps), dtype=np.int32)
    mb_dones = np.zeros((n_envs, reward_steps), dtype=bool)

    while True:
        done_rewards = []
        done_steps = []

        for n in range(reward_steps):
            obs_arr = np.asarray(obs, dtype=np.float32)
            mb_obs[:, n] = obs_arr

            # Rollouts only need numbers, not gradients.
            with torch.no_grad():
                logits_v, values_v = net(torch.as_tensor(obs_arr, device=device))
                probs = F.softmax(logits_v, dim=1).cpu().numpy()

            actions = np.array([np.random.choice(len(p), p=p) for p in probs])

            mb_actions[:, n] = actions
            # squeeze(-1) keeps the batch axis even when there is one env.
            mb_values[:, n] = values_v.squeeze(-1).cpu().numpy()

            for e_idx, e in enumerate(envs):
                o, r, terminated, truncated, _ = e.step(actions[e_idx])
                done = terminated or truncated

                total_reward[e_idx] += r
                total_steps[e_idx] += 1

                if done:
                    # Start the next episode right away; its first observation
                    # becomes the input of the following step.
                    o, _ = e.reset()
                    done_rewards.append(total_reward[e_idx])
                    done_steps.append(total_steps[e_idx])
                    total_reward[e_idx] = 0.0
                    total_steps[e_idx] = 0

                obs[e_idx] = o
                mb_rewards[e_idx, n] = r
                mb_dones[e_idx, n] = done

        # Value of the observation that follows the last collected step.
        with torch.no_grad():
            last_obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32), device=device)
            _, values_v = net(last_obs_v)
        values_last = values_v.squeeze(-1).cpu().numpy()

        # Discount rewards (in place), walking each rollout backwards.
        for e_idx in range(n_envs):
            running = float(values_last[e_idx])
            for n in reversed(range(reward_steps)):
                if mb_dones[e_idx, n]:
                    # Episode ended here: nothing after this step belongs to it.
                    running = 0.0
                running = float(mb_rewards[e_idx, n]) + gamma * running
                mb_rewards[e_idx, n] = running

        # Copies are yielded because the buffers are overwritten next rollout.
        out_mb_obs = mb_obs.reshape((-1,) + obs_shape).copy()
        out_mb_rewards = mb_rewards.flatten()
        out_mb_actions = mb_actions.flatten()
        out_mb_values = mb_values.flatten()

        yield out_mb_obs, out_mb_rewards, out_mb_actions, out_mb_values, \
              np.array(done_rewards), np.array(done_steps)
def train_a2c_step(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, step_idx, writer=None, device="cpu"):
    """Apply one A2C gradient update for a single batch and optionally log it.

    The total loss is ``ENTROPY_BETA * entropy_term + value_loss + policy_loss``
    where the value loss is the MSE between the critic output and
    ``mb_rewards``, the policy loss is the negated mean of
    ``advantage * log pi(action)``, and the entropy term is the mean of
    ``sum(p * log p)`` (i.e. negative entropy). Gradients are clipped to a norm
    of ``CLIP_GRAD`` before the optimizer step.

    Args:
        net: Model returning ``(logits, values)`` for a batch of observations.
        mb_obs: Batch of observations.
        mb_rewards: Per-sample regression targets for the critic; also the
            return used to form the advantage.
        mb_actions: Per-sample action indices.
        mb_values: Per-sample baseline values subtracted from ``mb_rewards``
            to form the advantage (passed in as data, so no gradient flows
            through them).
        optimizer: Optimizer over ``net``'s parameters.
        step_idx: Global step used as the x-axis for logged scalars.
        writer: Optional TensorBoard-style writer; nothing is logged when falsy.
        device: Device the batch tensors are moved to.
    """
    # Prepare tensors
    states_t = torch.FloatTensor(mb_obs).to(device)
    returns_t = torch.FloatTensor(mb_rewards).to(device)
    chosen_t = torch.LongTensor(mb_actions).to(device)
    baseline_t = torch.FloatTensor(mb_values).to(device)

    optimizer.zero_grad()

    # Network forward
    logits, critic_out = net(states_t)

    # Value loss: pull the critic towards the supplied returns.
    value_loss = F.mse_loss(critic_out.squeeze(-1), returns_t)

    # Policy loss: A(s,a) ~ R - V(s), weighted log-probability of taken actions.
    advantage = returns_t - baseline_t
    log_probs = F.log_softmax(logits, dim=1)
    batch_rows = range(len(mb_actions))
    weighted_log_probs = advantage * log_probs[batch_rows, chosen_t]
    policy_loss = -weighted_log_probs.mean()

    # Entropy term: sum(p * log p) is the negative entropy of the policy.
    probs = F.softmax(logits, dim=1)
    entropy_loss = (probs * log_probs).sum(dim=1).mean()

    # Total loss, backward pass, clipped update.
    total_loss = ENTROPY_BETA * entropy_loss + value_loss + policy_loss
    total_loss.backward()
    nn.utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    # Metrics & logging
    if not writer:
        return

    # Value bias: mean predicted value minus mean return.
    mean_val = critic_out.mean().item()
    mean_ret = returns_t.mean().item()
    scalars = (
        ("value_bias", mean_val - mean_ret),
        ("mean_value", mean_val),
        ("mean_return", mean_ret),
        ("loss_entropy", entropy_loss.item()),
        ("loss_policy", policy_loss.item()),
        ("loss_value", value_loss.item()),
        ("loss_total", total_loss.item()),
    )
    for tag, scalar in scalars:
        writer.add_scalar(tag, scalar, step_idx)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create Env
make_env = lambda: gym.make("CartPole-v1")
envs = [make_env() for _ in range(NUM_ENVS)]

# Create Model
net = CartPoleA2C(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
print(net)

# TensorBoard Writer
writer = SummaryWriter(comment="-cartpole-a2c")
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

# Training stops once the mean reward of the last REWARD_WINDOW finished
# episodes exceeds SOLVED_REWARD.
SOLVED_REWARD = 195.0
REWARD_WINDOW = 100

step_idx = 0
total_steps = 0      # environment frames collected so far (all envs)
best_reward = None
ts_start = time.time()
# Rewards of the most recent finished episodes; the deque drops old entries.
recent_rewards = collections.deque(maxlen=REWARD_WINDOW)

print("Starting CartPole A2C Training...")
try:
    for mb_obs, mb_rewards, mb_actions, mb_values, done_rewards, done_steps in iterate_batches(envs, net, device=device):
        # Every transition in the batch is a frame that was actually collected,
        # whether or not its episode has finished yet.
        total_steps += len(mb_rewards)

        if len(done_rewards) > 0:
            elapsed = max(time.time() - ts_start, 1e-9)  # guard against a zero interval
            speed = total_steps / elapsed

            batch_best = done_rewards.max()
            if best_reward is None or best_reward < batch_best:
                best_reward = batch_best

            mean_reward = done_rewards.mean()
            # Track individual episodes so avg_100 really is a per-episode mean
            # rather than a mean of per-batch means.
            recent_rewards.extend(done_rewards.tolist())
            avg_mean_reward = sum(recent_rewards) / len(recent_rewards)

            print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, avg_100=%.2f, speed=%.2f f/s" % (
                step_idx, len(done_rewards), mean_reward, best_reward, avg_mean_reward, speed))

            # Log Episode Metrics
            writer.add_scalar("speed", speed, step_idx)
            writer.add_scalar("reward_100", avg_mean_reward, step_idx)
            writer.add_scalar("reward", mean_reward, step_idx)

            # Only declare success once a full window of episodes backs the average.
            if len(recent_rewards) == REWARD_WINDOW and avg_mean_reward > SOLVED_REWARD:
                print(f"SOLVED in {step_idx} steps!")
                print(f"Total Training Time: {time.time() - ts_start:.2f} seconds")
                print(f"Total Training Samples: {total_steps}")
                break

        train_a2c_step(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, step_idx, writer=writer, device=device)
        step_idx += 1
except KeyboardInterrupt:
    print("Training stopped")
finally:
    # Release the log file and the environments however the loop ended.
    writer.close()
    for env in envs:
        env.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
13501: done 5 episodes, mean_reward=9.80, best_reward=500.00, avg_100=9.36, speed=4496.80 f/s
13502: done 3 episodes, mean_reward=9.33, best_reward=500.00, avg_100=9.35, speed=4496.68 f/s
13503: done 5 episodes, mean_reward=9.20, best_reward=500.00, avg_100=9.35, speed=4496.72 f/s
13504: done 5 episodes, mean_reward=9.20, best_reward=500.00, avg_100=9.35, speed=4496.76 f/s
13505: done 3 episodes, mean_reward=9.00, best_reward=500.00, avg_100=9.35, speed=4496.64 f/s
13506: done 7 episodes, mean_reward=8.86, best_reward=500.00, avg_100=9.35, speed=4496.79 f/s
13507: done 3 episodes, mean_reward=9.00, best_reward=500.00, avg_100=9.34, speed=4496.64 f/s
13508: done 5 episodes, mean_reward=9.40, best_reward=500.00, avg_100=9.34, speed=4496.68 f/s
13509: done 4 episodes, mean_reward=9.50, best_reward=500.00, avg_100=9.34, speed=4496.65 f/s
13510: done 4 episodes, mean_reward=9.00, best_reward=500.00, avg_100=9.34, speed=4496.61

In [None]:
# Show the TensorBoard dashboard inline (Colab only).
import subprocess

from google.colab import output

TENSORBOARD_PORT = 6006

# The iframe call below only proxies a kernel port; no visible cell in this
# notebook starts a server on it, so launch TensorBoard in the background
# first. SummaryWriter was created without a log_dir, so its event files are
# expected under ./runs.
# NOTE(review): if a TensorBoard server is already listening on this port, the
# extra process is expected to fail to bind and exit -- confirm on Colab.
subprocess.Popen(
    ["tensorboard", "--logdir", "runs", "--port", str(TENSORBOARD_PORT)],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

output.serve_kernel_port_as_iframe(TENSORBOARD_PORT)

<IPython.core.display.Javascript object>