In [3]:
# System build dependencies: the pip cell below compiles box2d-py from a source
# tarball and pulls in SWIG, so a C/C++ toolchain and swig are installed first.
# NOTE(review): nothing visible in this notebook uses MPI — confirm that
# libopenmpi-dev is still required.
!sudo apt-get install -y build-essential swig libopenmpi-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libopenmpi-dev is already the newest version (4.1.2-2ubuntu1).
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 29 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 2s (696 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Di

In [4]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel. The requirement is quoted so the shell does not try to
# glob the [box2d] extra or treat ">" as a redirect. The code below uses
# "LunarLander-v3", hence the lower bound.
# NOTE(review): the >=1.0 floor assumes LunarLander-v3 first shipped in
# Gymnasium 1.0 — confirm against the release notes.
%pip install "gymnasium[box2d]>=1.0"

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Using cached swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Using cached swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2351178 sha256=315472ecb5f883d305130a276a36983429bfb068dc42097b0d3c944b4642228d
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daadb5eb9753c78fa5308
Successfully built box2d-py
Installing collected packages: swig, box2d-py
Successfully installed box2d-py-2.3.5 swig-4.3.0


In [17]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import imageio

class ActorCritic(nn.Module):
    """Two independent MLP heads: a softmax policy (actor) and a scalar value (critic).

    Both heads share the same layout -- two 128-unit hidden layers with tanh
    activations -- but do not share any weights.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Number of discrete actions; size of the actor's output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128

        def build_layers(out_features):
            # state -> hidden -> hidden -> out_features, tanh between linear layers.
            return [
                nn.Linear(state_dim, hidden_units),
                nn.Tanh(),
                nn.Linear(hidden_units, hidden_units),
                nn.Tanh(),
                nn.Linear(hidden_units, out_features),
            ]

        # The actor is built before the critic so parameter creation order
        # (and therefore seeded initialisation) is actor first, critic second.
        self.actor = nn.Sequential(*build_layers(action_dim), nn.Softmax(dim=-1))
        self.critic = nn.Sequential(*build_layers(1))

    def forward(self, state):
        """Return ``(action_probs, value)`` for ``state``.

        ``action_probs`` is the actor's softmax output over the last dimension;
        ``value`` is the critic's output with a trailing dimension of size 1.
        """
        return self.actor(state), self.critic(state)

def record_video(agent, filename="lunar_lander.mp4", max_steps=1000):
    """Roll out one LunarLander-v3 episode with ``agent`` and save it as a video.

    Actions are sampled from the categorical distribution defined by the
    agent's action probabilities (the policy is not made greedy).

    Args:
        agent: Callable mapping a float32 state tensor to
            ``(action_probs, value)``; only ``action_probs`` is used.
        filename: Output path handed to ``imageio.mimsave``.
        max_steps: Upper bound on the number of environment steps recorded.
    """
    env = gym.make("LunarLander-v3", render_mode="rgb_array")
    frames = []
    try:
        state, _ = env.reset()
        frames.append(env.render())

        # Pure inference: no autograd graph is needed for the rollout.
        with torch.no_grad():
            for _ in range(max_steps):
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action_probs, _value = agent(state_tensor)
                action = torch.distributions.Categorical(action_probs).sample()

                state, _, terminated, truncated, _ = env.step(action.item())
                # Render after the step so the final (landing/crash) frame
                # is part of the video as well.
                frames.append(env.render())

                if terminated or truncated:
                    break
    finally:
        # Always release the environment, even if the rollout raises.
        env.close()

    imageio.mimsave(filename, frames, fps=30)
    print(f"Video saved as {filename}")

def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go for every step of one episode."""
    returns = []
    running = 0.0
    # Accumulate back-to-front, then reverse once (linear time instead of
    # repeatedly inserting at the head of the list).
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def a2c(env, num_episodes, lr=0.001, gamma=0.99, model=None):
    """Train an actor-critic model with one Monte-Carlo update per episode.

    For each episode the policy is rolled out until the environment reports
    ``terminated`` or ``truncated``; discounted returns are computed,
    normalised within the episode, and used both as the critic's regression
    target and as the baseline-corrected signal for the policy gradient.

    Args:
        env: Environment exposing ``reset()`` -> ``(state, info)`` and
            ``step(action)`` -> ``(state, reward, terminated, truncated, info)``.
            ``observation_space.shape`` and ``action_space.n`` are read only
            when ``model`` is not supplied.
        num_episodes: Number of episodes (and optimizer steps) to run.
        lr: Adam learning rate.
        gamma: Discount factor.
        model: Optional module mapping a state tensor to
            ``(action_probs, value)``. When ``None`` a fresh ``ActorCritic``
            sized from ``env`` is created. Pass an existing model to resume
            training or to use a custom architecture.

    Returns:
        The trained model (the same object as ``model`` when one was passed).
    """
    if model is None:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        model = ActorCritic(state_dim, action_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        log_probs = []
        values = []
        rewards = []

        while not done:
            # torch.tensor copies, so the graph never aliases a buffer owned
            # by the environment.
            state_tensor = torch.tensor(state, dtype=torch.float32)
            action_probs, value = model(state_tensor)
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()

            state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated

            log_probs.append(action_dist.log_prob(action))
            values.append(value.squeeze())
            rewards.append(reward)

        returns = torch.tensor(_discounted_returns(rewards, gamma), dtype=torch.float32)
        # The sample std of a single value is NaN, which would turn every
        # parameter into NaN after one optimizer step; only normalise when
        # the episode has at least two steps.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        log_probs_t = torch.stack(log_probs)
        values_t = torch.stack(values)
        advantages = returns - values_t

        # Detach the advantage in the policy term: the actor loss must not
        # push gradients into the critic. The critic is trained solely by
        # its own squared-error loss.
        actor_loss = -(log_probs_t * advantages.detach()).mean()
        critic_loss = advantages.pow(2).mean()
        loss = actor_loss + critic_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 10 == 0:
            print(f"Episode {episode + 1}, Total Reward: {sum(rewards)}")

    return model

if __name__ == "__main__":
    # Train on a headless environment, then record one episode with a
    # separate rendering environment created inside record_video.
    env = gym.make("LunarLander-v3")
    try:
        model = a2c(env, num_episodes=1000)
    finally:
        # Release the training environment even if training raises or is
        # interrupted from the notebook.
        env.close()
    record_video(model)

Episode 1, Total Reward: -143.44459430416927
Episode 11, Total Reward: -80.8122206671884
Episode 21, Total Reward: -504.9605307767998
Episode 31, Total Reward: -539.0449824176541
Episode 41, Total Reward: -135.97904181446552
Episode 51, Total Reward: -63.081606443897236
Episode 61, Total Reward: -153.3198251042425
Episode 71, Total Reward: -181.87435596561232
Episode 81, Total Reward: -81.17411488928859
Episode 91, Total Reward: -143.47633967183455
Episode 101, Total Reward: -204.5243730045829
Episode 111, Total Reward: -44.66312456864934
Episode 121, Total Reward: -34.95299309396995
Episode 131, Total Reward: -62.49685196126113
Episode 141, Total Reward: -134.24189648438406
Episode 151, Total Reward: -30.56108172166992
Episode 161, Total Reward: -63.092337773548756
Episode 171, Total Reward: -304.85197182301187
Episode 181, Total Reward: 21.78769002773491
Episode 191, Total Reward: 9.823293397086431
Episode 201, Total Reward: -175.4219761726331
Episode 211, Total Reward: -94.616161828



Video saved as lunar_lander.mp4
