In [1]:
!sudo apt-get install -y build-essential swig libopenmpi-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libopenmpi-dev is already the newest version (4.1.2-2ubuntu1).
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 29 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (808 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Di

In [2]:
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/374.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m368.6/374.4 kB[0m [31m12.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created 

In [9]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import imageio

# Policy Network
class PolicyNetwork(nn.Module):
    """Small MLP that maps a state vector to a probability distribution over actions.

    Architecture: two hidden ``Linear`` layers of width ``hidden_size`` with ReLU
    activations, followed by a ``Linear`` output layer and a softmax over the
    last dimension.

    Args:
        state_dim: Number of input features.
        action_dim: Number of discrete actions (size of the output distribution).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x`` (last dimension sums to 1)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)

# REINFORCE Algorithm
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent for a discrete action space.

    The agent owns a ``PolicyNetwork`` and an Adam optimizer. Actions are sampled
    from the categorical distribution produced by the network, and the policy is
    updated once per episode from the collected rewards and log-probabilities.

    Args:
        state_dim: Size of the observation vector fed to the policy network.
        action_dim: Number of discrete actions.
        lr: Adam learning rate.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.995):
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        # Exponential moving average of the mean (normalized) return.
        self.baseline = 0

    def select_action(self, state):
        """Sample an action for ``state``.

        Returns:
            A ``(action, log_prob)`` pair: the sampled action as a Python int and
            the log-probability tensor of that action (still attached to the
            autograd graph so it can be used in ``update_policy``).
        """
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = self.policy(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        return action.item(), action_dist.log_prob(action)

    def normalize_rewards(self, rewards):
        """Standardize ``rewards`` to zero mean and (approximately) unit variance.

        The input is copied into a float64 array, so integer rewards are accepted
        and the caller's data is never modified. An empty input yields an empty
        array instead of NaNs.
        """
        # Force a float dtype: subtracting a float mean from an integer array
        # in place is rejected by NumPy's casting rules.
        values = np.array(rewards, dtype=np.float64)
        if values.size == 0:
            return values
        centered = values - values.mean()
        # The epsilon keeps the division finite when every return is identical.
        return centered / (centered.std() + 1e-6)

    def update_policy(self, rewards, log_probs):
        """Run one REINFORCE gradient step from a finished episode.

        Args:
            rewards: Per-step rewards, in the order they were received.
            log_probs: Per-step log-probability tensors from ``select_action``.

        Raises:
            ValueError: If ``rewards`` and ``log_probs`` differ in length.

        An episode with no steps is a no-op.
        """
        if len(rewards) != len(log_probs):
            raise ValueError(
                f"rewards and log_probs must have the same length, "
                f"got {len(rewards)} and {len(log_probs)}"
            )
        if len(rewards) == 0:
            return

        # Discounted return-to-go, accumulated back to front; appending and then
        # reversing avoids the quadratic cost of inserting at the list head.
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()

        returns = torch.tensor(self.normalize_rewards(returns), dtype=torch.float32)

        # NOTE(review): the returns were just standardized to zero mean, so this
        # running average stays close to its initial value of 0 and the
        # subtraction below barely changes the advantages.
        self.baseline = 0.9 * self.baseline + 0.1 * returns.mean()
        advantages = returns - self.baseline

        # Flatten so that log-probs of shape (1,) cannot broadcast against the
        # (T,) advantages into a (T, T) matrix.
        stacked_log_probs = torch.stack(list(log_probs)).reshape(-1)

        # Advantages are treated as constants: gradients flow only through the log-probs.
        loss = torch.sum(-stacked_log_probs * advantages.detach())

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=1.0)
        self.optimizer.step()

def record_video(agent, filename="lunar_lander.mp4", max_steps=1000):
    """Roll out one LunarLander episode with ``agent`` and save it as a video.

    A frame is rendered before every environment step; the rollout stops when
    the episode terminates, is truncated, or ``max_steps`` steps have been taken.

    Args:
        agent: Object exposing ``select_action(state) -> (action, log_prob)``.
        filename: Output path handed to ``imageio.mimsave``.
        max_steps: Upper bound on the number of rendered/stepped frames.
    """
    env = gym.make("LunarLander-v3", render_mode="rgb_array")
    frames = []

    # Close the environment even if rendering or stepping raises.
    try:
        state, _ = env.reset()
        # Inference only: the log-prob is discarded, so skip autograd bookkeeping.
        with torch.no_grad():
            for _ in range(max_steps):
                frames.append(env.render())
                action, _ = agent.select_action(state)
                state, _, terminated, truncated, _ = env.step(action)
                if terminated or truncated:
                    break
    finally:
        env.close()

    if not frames:
        # max_steps <= 0: there is nothing to encode, so do not call the writer.
        print(f"No frames captured; {filename} was not written")
        return

    imageio.mimsave(filename, frames, fps=30)
    print(f"Video saved as {filename}")

# Training loop
def train_agent(episodes=2000, max_steps=1000, seed=None):
    """Train a REINFORCE agent on LunarLander-v3, then record a video of it.

    Training stops early once the mean score over a *full* window of the last
    100 episodes reaches 200.

    Args:
        episodes: Maximum number of training episodes.
        max_steps: Maximum number of steps per episode.
        seed: Optional seed. When given, it seeds PyTorch's global RNG (network
            initialisation and action sampling) and the environment's first
            reset. ``None`` leaves all random state untouched.

    Returns:
        The trained ``REINFORCEAgent``.
    """
    if seed is not None:
        torch.manual_seed(seed)

    env = gym.make("LunarLander-v3")
    # Close the environment even if training is interrupted or raises.
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        agent = REINFORCEAgent(state_dim, action_dim)

        scores = deque(maxlen=100)

        for episode in range(episodes):
            # Seed only the first reset; later resets continue the same RNG stream.
            state, _ = env.reset(seed=seed if episode == 0 else None)
            log_probs = []
            rewards = []
            score = 0

            for _ in range(max_steps):
                action, log_prob = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)

                log_probs.append(log_prob)
                rewards.append(reward)
                score += reward
                state = next_state

                if terminated or truncated:
                    break

            agent.update_policy(rewards, log_probs)
            scores.append(score)
            avg_score = np.mean(scores)

            if episode % 10 == 0:
                print(f"Episode {episode + 1}: Score = {score}, Avg Score = {avg_score:.2f}")

            # Require a full window so that one lucky early episode cannot
            # end training prematurely.
            if len(scores) == scores.maxlen and avg_score >= 200:
                print("Environment solved!")
                break
    finally:
        env.close()

    record_video(agent=agent)
    return agent

# Entry point: start training with the default settings when this code is run
# as the main module (which is the case when the notebook cell is executed).
if __name__ == "__main__":
    train_agent()


Episode 1: Score = -119.29343324065972, Avg Score = -119.29
Episode 11: Score = -141.1762166414236, Avg Score = -156.45
Episode 21: Score = -248.61103874304916, Avg Score = -186.31
Episode 31: Score = -246.10220665389983, Avg Score = -169.02
Episode 41: Score = -403.32881514134453, Avg Score = -188.92
Episode 51: Score = -149.19830651571058, Avg Score = -184.56
Episode 61: Score = -31.932345993517544, Avg Score = -190.04
Episode 71: Score = -83.19592660737484, Avg Score = -178.35
Episode 81: Score = -54.57273820874984, Avg Score = -171.67
Episode 91: Score = -81.99677183204506, Avg Score = -168.85
Episode 101: Score = -199.3870056588955, Avg Score = -167.00
Episode 111: Score = -155.12409953340443, Avg Score = -160.42
Episode 121: Score = -186.66137387224376, Avg Score = -156.89
Episode 131: Score = -80.93929446665268, Avg Score = -159.00
Episode 141: Score = -143.01664897626458, Avg Score = -146.57
Episode 151: Score = -98.23742098207072, Avg Score = -139.86
Episode 161: Score = -75.4



Video saved as lunar_lander.mp4
