In [None]:
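# One-time setup: uncomment and run only if the Box2D environments are not installed yet.
# swig is installed first because it is needed to build the Box2D bindings.
# %pip install swig
# %pip install "gymnasium[box2d]"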

In [11]:
import sys
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory; it is placed first on sys.path so that modules located
# there can be imported by the cells below.
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

# Output directory for files written by this notebook (e.g. saved weights).
results_dir = project_root.joinpath('results')

In [None]:
import math

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

from dqn import DQNAgent, DQNAgentConfig

In [22]:
# Build the agent shared by the training, checkpointing and evaluation cells
# below (they read agent.env, agent.memory, agent.policy_net, agent.device and
# the agent.eps_* / agent.steps_done attributes).
# NOTE(review): the DQNAgentConfig *class* is passed here, not an instance
# (`DQNAgentConfig()`). Presumably DQNAgent only reads class-level defaults or
# instantiates the config itself; confirm against the dqn module.
agent = DQNAgent(DQNAgentConfig)

In [None]:
# Train the agent on its own environment (agent.env) and collect the total
# (undiscounted) reward of every episode in `scores`.
#
# Requires `math` and `numpy as np` from the import cell: the periodic log
# line below uses math.exp and np.mean.
NUM_EPISODES = 600            # number of training episodes
MAX_STEPS_PER_EPISODE = 1000  # upper bound on environment steps per episode
LOG_EVERY = 20                # print a progress line every this many episodes
AVG_WINDOW = 20               # episodes included in the moving average

scores = []

for i_episode in range(NUM_EPISODES):
    state, info = agent.env.reset()
    total_reward = 0

    for t in range(MAX_STEPS_PER_EPISODE):
        # select_action returns an object exposing .item(); convert it once
        # and reuse the plain value for both the env step and the replay push.
        action = agent.select_action(state).item()

        observation, reward, terminated, truncated, _ = agent.env.step(action)
        done = terminated or truncated

        # NOTE(review): `done` is also True on truncation. If the learning
        # target drops the bootstrap term whenever done is set, time-limit
        # truncations are treated like true terminal states; confirm against
        # agent.optimize_model / agent.memory.
        agent.memory.push(state, action, reward, observation, done)
        agent.optimize_model()

        state = observation
        total_reward += reward

        if done:
            break

    scores.append(total_reward)

    # Periodic progress log.
    if i_episode % LOG_EVERY == 0:
        avg_score = np.mean(scores[-AVG_WINDOW:])
        # Exponentially decayed exploration rate, recomputed from the agent's
        # attributes purely for display. Presumably this mirrors the schedule
        # used inside select_action; verify against the dqn module.
        epsilon = agent.eps_end + (agent.eps_start - agent.eps_end) * math.exp(
            -1.0 * agent.steps_done / agent.eps_decay
        )
        print(
            f"Episode {i_episode}, Score: {total_reward:.2f}, "
            f"Avg (last {AVG_WINDOW}): {avg_score:.2f}, Epsilon: {epsilon:.2f}"
        )

print("Training Complete!")

Episode 0, Score: -110.35, Avg (last 20): -110.35, Epsilon: 0.68
Episode 20, Score: -176.17, Avg (last 20): -237.89, Epsilon: 0.05
Episode 40, Score: -156.15, Avg (last 20): -54.54, Epsilon: 0.05
Episode 60, Score: 145.12, Avg (last 20): -51.61, Epsilon: 0.05
Episode 80, Score: -145.00, Avg (last 20): -121.08, Epsilon: 0.05
Episode 100, Score: -0.82, Avg (last 20): -74.39, Epsilon: 0.05
Episode 120, Score: -32.83, Avg (last 20): -45.05, Epsilon: 0.05
Episode 140, Score: 157.99, Avg (last 20): -2.95, Epsilon: 0.05
Episode 160, Score: -64.84, Avg (last 20): -0.40, Epsilon: 0.05
Episode 180, Score: -23.17, Avg (last 20): -29.83, Epsilon: 0.05
Episode 200, Score: -3.46, Avg (last 20): 19.70, Epsilon: 0.05
Episode 220, Score: -20.50, Avg (last 20): -24.23, Epsilon: 0.05
Episode 240, Score: -58.33, Avg (last 20): 20.11, Epsilon: 0.05
Episode 260, Score: 137.87, Avg (last 20): 15.84, Epsilon: 0.05
Episode 280, Score: 257.22, Avg (last 20): 195.31, Epsilon: 0.05
Episode 300, Score: 171.44, Avg

In [None]:
# Persist the trained policy network's weights. The output directory is
# created first because torch.save does not create missing parent directories.
results_dir.mkdir(parents=True, exist_ok=True)
torch.save(agent.policy_net.state_dict(), results_dir / "lunar_lander_dqn.pth")

In [27]:
# Roll out the trained policy greedily (always the highest-valued action, no
# exploration) in a separately created, on-screen rendered environment and
# print the total reward of each episode.
# NOTE(review): assumes "LunarLander-v3" matches the environment behind
# agent.env that the policy was trained on; confirm against the dqn module.
visual_env = gym.make("LunarLander-v3", render_mode="human")

num_test_episodes = 5

try:
    for ep in range(num_test_episodes):
        state, info = visual_env.reset()
        done = False
        total_reward = 0

        while not done:
            # Add a leading batch dimension so the network sees a batch of one state.
            state_tensor = torch.tensor(state, dtype=torch.float32, device=agent.device).unsqueeze(0)

            # Greedy action: index of the largest network output along dim 1.
            with torch.no_grad():
                action_index = agent.policy_net(state_tensor).argmax(dim=1).item()

            # Advance the environment with the chosen action.
            state, reward, terminated, truncated, _ = visual_env.step(action_index)
            done = terminated or truncated
            total_reward += reward

        print(f"Test Episode {ep + 1} Score: {total_reward:.2f}")
finally:
    # Always release the render window, even if the rollout raises or the
    # cell is interrupted.
    visual_env.close()

Test Episode 1 Score: 261.00
Test Episode 2 Score: 298.20
Test Episode 3 Score: 270.90
Test Episode 4 Score: 228.09
Test Episode 5 Score: 258.59
