In [None]:
# Install system-level build tooling (C/C++ toolchain and SWIG) before the pip cell below.
# Presumably needed because box2d-py is compiled from source during `pip install`.
# NOTE(review): libopenmpi-dev does not appear to be used by any code in this notebook — confirm it is required.
!sudo apt-get install -y build-essential swig libopenmpi-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libopenmpi-dev is already the newest version (4.1.2-2ubuntu1).
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 29 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,090 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/

In [None]:
# Install Gymnasium with the Box2D extra (needed for the LunarLander environment).
# `%pip` installs into the environment of the running kernel, whereas `!pip` runs
# whichever pip is first on the shell PATH. The quotes stop the shell from treating
# `[box2d]` as a glob pattern.
%pip install "gymnasium[box2d]"

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2351172 sha256=854102401aa689184c423782db50195956b336934d9ea3130a750c9a72f2d6ba
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daad

In [24]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import imageio

class ActorCritic(nn.Module):
    """Two independent MLP heads: a softmax policy (actor) and a scalar value (critic).

    Both heads take the raw state vector and use two hidden Tanh layers of
    width 128. They share no parameters.

    Args:
        state_dim: Size of the state vector fed to both heads.
        action_dim: Number of discrete actions (width of the actor's softmax).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128

        # Actor layers are created before critic layers so parameter
        # initialisation draws from the RNG in a fixed order.
        policy_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, action_dim),
            nn.Softmax(dim=-1),  # normalise over the action axis
        ]
        value_layers = [
            nn.Linear(state_dim, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, 1),
        ]

        self.actor = nn.Sequential(*policy_layers)
        self.critic = nn.Sequential(*value_layers)

    def forward(self, state):
        """Return ``(action_probs, value)`` for ``state``.

        ``action_probs`` sums to 1 along the last dimension; ``value`` has a
        trailing dimension of size 1.
        """
        return self.actor(state), self.critic(state)

def record_video(agent, filename="lunar_lander.mp4", max_steps=1000, fps=30):
    """Roll out ``agent`` for one LunarLander-v3 episode and save it as a video.

    Actions are sampled from the categorical distribution returned by the
    agent (the same way they are chosen during training), not taken greedily.

    Args:
        agent: Callable mapping a float32 state tensor to
            ``(action_probs, value)``; only ``action_probs`` is used.
        filename: Output path handed to ``imageio.mimsave``.
        max_steps: Upper bound on the number of environment steps recorded.
        fps: Frame rate of the saved video.

    Raises:
        Whatever the environment, the agent or ``imageio`` raises; the
        environment is closed before the exception propagates.
    """
    env = gym.make("LunarLander-v3", render_mode="rgb_array")
    frames = []

    try:
        state, _ = env.reset()

        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            for _step in range(max_steps):
                frames.append(env.render())

                # torch.tensor copies, so the env may reuse its observation buffer.
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action_probs, _ = agent(state_tensor)
                action = torch.distributions.Categorical(action_probs).sample()

                state, _, terminated, truncated, _ = env.step(action.item())
                if terminated or truncated:
                    break

        # Rendering happens before each step, so capture the state reached by
        # the last step too; otherwise the final outcome is not in the video.
        frames.append(env.render())
    finally:
        # Release the environment even if the rollout fails.
        env.close()

    imageio.mimsave(filename, frames, fps=fps)
    print(f"Video saved as {filename}")

def _discounted_returns(rewards, gamma):
    """Return the discounted reward-to-go for every step of one episode."""
    returns = []
    running = 0.0
    # Walk backwards so each step reuses the already-discounted tail, then
    # flip once at the end (avoids the O(n^2) cost of inserting at index 0).
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def a2c(env, num_episodes, lr=0.001, gamma=0.99, solved_score=200):
    """Train an ``ActorCritic`` on ``env`` with one Monte-Carlo update per episode.

    Each episode is rolled out to completion, discounted returns are computed
    and (for episodes longer than one step) standardised, and a single Adam
    step is taken on ``actor_loss + critic_loss``.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and a five-tuple ``step()``.
        num_episodes: Maximum number of episodes to train for.
        lr: Adam learning rate.
        gamma: Discount factor.
        solved_score: Training stops early once the mean score over a full
            100-episode window reaches this value.

    Returns:
        The trained ``ActorCritic`` model.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    model = ActorCritic(state_dim, action_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Sliding window of the most recent episode scores.
    score_track = deque(maxlen=100)

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        log_probs = []
        values = []
        rewards = []

        while not done:
            # torch.tensor copies, so the env may safely reuse its observation buffer.
            state_tensor = torch.tensor(state, dtype=torch.float32)
            action_probs, value = model(state_tensor)
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()

            state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated

            log_probs.append(action_dist.log_prob(action))
            values.append(value)
            rewards.append(reward)

        # Force float32 so the targets match the model's dtype even when the
        # environment hands back float64 rewards.
        returns = torch.tensor(_discounted_returns(rewards, gamma), dtype=torch.float32)
        if returns.numel() > 1:
            # Normalize returns. Skipped for single-step episodes, where the
            # sample standard deviation is NaN and would poison the weights.
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        log_probs = torch.stack(log_probs).reshape(-1)
        values = torch.stack(values).reshape(-1)

        # The advantage is a fixed weight for the policy gradient; detach it so
        # the actor loss does not back-propagate into the critic.
        advantages = (returns - values).detach()
        actor_loss = -(log_probs * advantages).mean()
        critic_loss = (returns - values).pow(2).mean()

        loss = actor_loss + critic_loss

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
        optimizer.step()

        total_reward = sum(rewards)
        score_track.append(total_reward)
        avg_reward = np.mean(score_track)
        if episode % 10 == 0:
            print(f"Episode {episode + 1}, Total Reward: {total_reward:.2f}, Avg Reward: {avg_reward:.2f}")

        # Only declare success once the averaging window is full, so a few
        # lucky early episodes cannot end training prematurely.
        if len(score_track) == score_track.maxlen and avg_reward >= solved_score:
            print('Solved!')
            break

    return model

if __name__ == "__main__":
    # Train on a headless environment, then record one episode with the
    # trained model (record_video creates its own rendering environment).
    env = gym.make("LunarLander-v3")
    try:
        model = a2c(env, num_episodes=1000)
    finally:
        # Release the training environment even if training fails or the
        # cell is interrupted.
        env.close()
    record_video(model)

Episode 1, Total Reward: -386.73, Avg Reward: -386.73
Episode 11, Total Reward: -187.78, Avg Reward: -233.78
Episode 21, Total Reward: -173.35, Avg Reward: -199.66
Episode 31, Total Reward: -282.48, Avg Reward: -192.10
Episode 41, Total Reward: -223.79, Avg Reward: -191.95
Episode 51, Total Reward: -269.91, Avg Reward: -188.12
Episode 61, Total Reward: -85.55, Avg Reward: -190.43
Episode 71, Total Reward: -147.63, Avg Reward: -180.41
Episode 81, Total Reward: -89.54, Avg Reward: -169.56
Episode 91, Total Reward: -25.66, Avg Reward: -159.30
Episode 101, Total Reward: -106.18, Avg Reward: -149.24
Episode 111, Total Reward: -3.54, Avg Reward: -135.61
Episode 121, Total Reward: -36.94, Avg Reward: -128.80
Episode 131, Total Reward: -165.78, Avg Reward: -120.40
Episode 141, Total Reward: -70.91, Avg Reward: -111.72
Episode 151, Total Reward: -80.83, Avg Reward: -112.10
Episode 161, Total Reward: 6.75, Avg Reward: -95.16
Episode 171, Total Reward: -27.80, Avg Reward: -87.41
Episode 181, Tota



Video saved as lunar_lander.mp4
