In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gymnasium as gym
import random
from collections import deque

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network.

    Maps an input whose last dimension is ``n_obs`` to ``n_act`` outputs
    through two hidden layers of 128 units with ReLU activations.
    """

    def __init__(self, n_obs, n_act):
        """Create the three linear layers.

        Args:
            n_obs: Size of the input feature dimension.
            n_act: Number of outputs produced by the final layer.
        """
        super().__init__()
        hidden_units = 128
        # The attribute names end up in the state_dict keys, so they are kept
        # as fc1 / fc2 / fc3.
        self.fc1 = nn.Linear(n_obs, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, n_act)

    def forward(self, x):
        """Apply both hidden layers with ReLU, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

In [None]:
# Replay buffer; once 100000 entries are stored the oldest ones are evicted.
memory = deque(maxlen=100000)

# NOTE(review): newer Gymnasium releases may register this environment under a
# different version suffix - confirm against the installed version.
env = gym.make("LunarLander-v2", render_mode="rgb_array")
state, info = env.reset()
# Network input/output sizes are taken from the environment itself.
n_obs, n_act = len(state), env.action_space.n

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Online network: the one that is optimised and used to pick actions.
net = DQN(n_obs, n_act).to(dev)
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_f = nn.MSELoss()

# Target network: supplies the bootstrapped next-state values. It starts as an
# exact copy of the online network; without this sync it would hold unrelated
# random weights until the first periodic copy.
target_net = DQN(n_obs, n_act).to(dev)
target_net.load_state_dict(net.state_dict())
target_net.eval()  # only ever used for inference

In [None]:
# --- Hyperparameters and run-wide counters ---

# Return discounting
gamma = 0.99          # discount factor applied to the bootstrapped next-state value

# Epsilon-greedy exploration schedule
epsilon = 1.0         # current probability of taking a random action
eps_min = 0.01        # floor used by the epsilon schedule
eps_decay = 0.995     # multiplicative decay factor for epsilon

# Optimisation / run length
batch_size = 64       # number of transitions sampled per train() call
episodes = 10_000     # episode budget for the training loop

# Score accumulated between two progress printouts
total_reward = 0.0

In [None]:
def act_no_random(state):
    """Return the greedy action for ``state`` under the online network.

    Reads the module-level ``net`` and ``dev``.

    Args:
        state: Observation convertible to a float tensor (e.g. a NumPy array).

    Returns:
        int: Index of the largest value in ``net``'s output for ``state``.
    """
    # Action selection never backpropagates, so skip autograd bookkeeping
    # instead of building a graph on every environment step.
    with torch.no_grad():
        act_val = net(torch.as_tensor(state, dtype=torch.float, device=dev))
    return torch.argmax(act_val).item()

def act(state):
    """Pick an action with an epsilon-greedy policy.

    Reads the module-level ``epsilon`` and ``env``.

    Args:
        state: Current observation, forwarded to ``act_no_random`` when the
            greedy branch is taken.

    Returns:
        A random sample from ``env.action_space`` when a uniform draw is
        ``<= epsilon``; otherwise the greedy action from ``act_no_random``.
    """
    if random.random() <= epsilon:
        return env.action_space.sample()
    # The greedy branch lives in one place so both policies stay in sync.
    return act_no_random(state)

def train(memory, batch_size, epsilon, eps_decay, gamma):
    """Fit the online network on a random minibatch and decay epsilon.

    Reads the module-level ``net``, ``target_net``, ``optimizer``, ``loss_f``,
    ``dev`` and ``eps_min``.

    Args:
        memory: Sequence of ``(state, action, reward, next_state, done)``
            tuples; must hold at least ``batch_size`` entries.
        batch_size: Number of transitions to sample.
        epsilon: Current exploration rate.
        eps_decay: Multiplicative decay applied to ``epsilon``.
        gamma: Discount factor for the bootstrapped next-state value.

    Returns:
        float: ``epsilon * eps_decay``, never lower than ``eps_min``.

    Raises:
        ValueError: From ``random.sample`` if ``memory`` has fewer than
            ``batch_size`` entries.
    """
    minibatch = random.sample(memory, batch_size)
    # NOTE: one optimizer step is taken per sampled transition (not one per
    # minibatch); this matches the existing training schedule.
    for state, action, reward, next_state, done in minibatch:
        state = torch.as_tensor(state, dtype=torch.float, device=dev)

        # Target for the taken action: r, or r + gamma * max_a' Q_target(s', a').
        target = float(reward)
        if not done:
            next_state = torch.as_tensor(next_state, dtype=torch.float, device=dev)
            with torch.no_grad():  # the target is a constant for the optimizer
                target += gamma * torch.max(target_net(next_state)).item()

        # Single forward pass. The regression target is a detached copy of the
        # prediction with only the taken action's entry replaced, so the other
        # entries contribute zero error and zero gradient.
        q_values = net(state)
        target_q = q_values.detach().clone()
        target_q[action] = target

        optimizer.zero_grad()
        loss = loss_f(q_values, target_q)
        loss.backward()
        optimizer.step()

    # Clamp after decaying so epsilon can never undershoot the floor.
    return max(eps_min, epsilon * eps_decay)

In [None]:
# Main training loop: play `episodes` episodes with the epsilon-greedy policy,
# train on replayed transitions after every step, and report the mean score of
# each block of 10 episodes.
max_steps = 1000  # safety cap on steps per episode

for e in range(1, episodes + 1):  # inclusive, so exactly `episodes` episodes run
    state, _ = env.reset()
    state = np.array(state)
    score = 0.
    done = False
    count = 0
    while not done:
        action = act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        if count > max_steps:
            # Hitting the step cap is a cut-off, not a real terminal state.
            truncated = True
        score += float(reward)
        # Only true termination is stored as `done`: a truncated episode's last
        # state still has future value, so its target must keep bootstrapping.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
        count += 1
        # Stop on either signal; stepping an environment after it reported
        # termination or truncation is not supported without a reset.
        done = terminated or truncated
        if len(memory) > batch_size:
            epsilon = train(memory, batch_size, epsilon, eps_decay, gamma)
    # Add this episode before reporting so every printed mean covers exactly
    # the last 10 episodes.
    total_reward += score
    if e % 10 == 0:
        print("Episode:", e, "Mean Score:", total_reward / 10)
        total_reward = 0.
    # Refresh the target network once per episode.
    target_net.load_state_dict(net.state_dict())

In [None]:
# Watch the greedy policy play in a rendered window.
env = gym.make("LunarLander-v2", render_mode="human")

eval_episodes = 5     # set to None to keep playing until interrupted
max_eval_steps = 400  # per-episode step cap for the demo

played = 0
try:
    while eval_episodes is None or played < eval_episodes:
        state, _ = env.reset()
        state = np.array(state)
        score = 0.
        count = 0
        done = False
        while not done and count < max_eval_steps:
            action = act_no_random(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Either signal ends the episode; the env must be reset afterwards.
            done = terminated or truncated
            score += float(reward)
            state = next_state
            count += 1
        print("Score:", score)
        played += 1
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to stop an open-ended run.
    print("Evaluation interrupted.")
finally:
    # Always release the render window, even when interrupted.
    env.close()

In [None]:
# Switch back to the off-screen environment. The currently bound environment is
# closed first so its resources (e.g. a render window) are released; calling
# close() on an already-closed environment is harmless.
env.close()
env = gym.make("LunarLander-v2", render_mode="rgb_array")