In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gymnasium as gym
import random
from collections import deque, namedtuple

#### Define the neural network

In [None]:
class DQN(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output.

    Args:
        input_size: Number of features in each input row.
        output_size: Number of output units produced for each input row.
        hidden_size: Width of both hidden layers. Defaults to 128, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, input_size, output_size, hidden_size=128):
        super().__init__()
        # Layer names (fc1/fc2/fc3) and creation order are kept so that
        # state_dict keys stay the same as before.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the final layer for ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

#### Define the Agent class

Includes:

- Hyperparameters
- Training, action-selection, and replay-memory methods
- A policy network and a target network for predicting Q-values

In [None]:
# One replay-memory entry. `next_state` may be None and/or `done` may be truthy
# to mark a terminal transition; Agent.train() honours both.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))


class Agent:
    """DQN agent with an epsilon-greedy policy, replay memory and a target network.

    Args:
        n_obs: Input size of the Q-networks.
        n_act: Number of discrete actions (output size of the Q-networks).
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the next-state value.
        epsilon: Initial probability of picking a random action in ``act``.
        epsilon_decay: Multiplicative factor applied by ``update_epsilon``.
        epsilon_min: Lower bound for ``epsilon``.
        batch_size: Number of transitions sampled per ``train`` call.
        max_mem: Maximum number of transitions kept in replay memory.
    """

    def __init__(self, n_obs, n_act, lr=0.0001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, batch_size=32, max_mem=100000):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_obs = n_obs
        self.n_act = n_act
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # Bounded FIFO: the oldest transitions are dropped once max_mem is reached.
        self.memory = deque(maxlen=max_mem)
        self.batch_size = batch_size
        self.policy_net = DQN(n_obs, n_act).to(self.device)
        self.target_net = DQN(n_obs, n_act).to(self.device)
        # The target network starts as an exact copy and is only ever updated
        # through update_target(); the optimizer touches policy_net only.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.SmoothL1Loss()

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Batch of one state accepted by the policy network.

        Returns:
            A ``(1, 1)`` integer tensor on ``self.device`` holding the action index.
        """
        if random.random() < self.epsilon:
            return torch.tensor([[random.randrange(self.n_act)]], device=self.device)
        return self.act_eval(state)

    def act_eval(self, state):
        """Pick the greedy action (no exploration) as a ``(1, 1)`` index tensor."""
        with torch.no_grad():
            return self.policy_net(state).max(1)[1].view(1, 1)

    def store(self, transition):
        """Append a ``Transition`` to replay memory."""
        self.memory.append(transition)

    def train(self):
        """Run one optimization step on a random minibatch from replay memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        A transition is treated as terminal (next-state value 0) when its
        ``done`` flag is truthy or its ``next_state`` is ``None``.
        """
        if len(self.memory) < self.batch_size:
            return

        transitions = random.sample(self.memory, self.batch_size)
        batch = Transition(*zip(*transitions))

        # Terminal transitions must not bootstrap from the target network.
        # Checking `done` as well as `None` matters because callers may store
        # a real observation tensor for the terminal step.
        non_final = [
            (nxt is not None) and not bool(done)
            for nxt, done in zip(batch.next_state, batch.done)
        ]
        non_final_mask = torch.tensor(non_final, device=self.device, dtype=torch.bool)

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s, a) for the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        # Guard: torch.cat raises on an empty list when every sample is terminal.
        if any(non_final):
            non_final_next_states = torch.cat(
                [nxt for nxt, keep in zip(batch.next_state, non_final) if keep]
            )
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        loss = self.loss_fn(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def update_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

#### Create environment and Agent objects

In [None]:
# Training environment, created with the 'rgb_array' render mode.
env = gym.make('LunarLander-v2', render_mode='rgb_array')

# Size the agent's networks from the environment's observation/action spaces.
n_obs = env.observation_space.shape[0]
n_act = env.action_space.n
agent = Agent(n_obs, n_act)

# Training-loop settings: number of episodes and the per-episode step cap.
episodes, max_steps = 1500, 300
# Running sum of episode scores; the training loop resets it after each report.
total_reward = 0.

#### Training loop

In [None]:
# Number of episodes averaged into each progress line.
report_every = 100

for e in range(1, episodes + 1):
    obs, _ = env.reset()
    # Shape (1, n_obs): the agent expects a batch dimension.
    state = torch.as_tensor(obs, dtype=torch.float32, device=agent.device).unsqueeze(0)
    score = 0.
    for s in range(max_steps):
        action = agent.act(state)
        # Gymnasium returns separate `terminated` and `truncated` flags.
        obs, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=agent.device, dtype=torch.float32)
        # A terminal step is stored with next_state=None so its target is just
        # the reward; a truncated step keeps its next state.
        if terminated:
            next_state = None
        else:
            next_state = torch.as_tensor(obs, dtype=torch.float32, device=agent.device).unsqueeze(0)
        agent.store(Transition(state, action, next_state, reward, terminated))
        score += reward.item()
        state = next_state
        agent.train()
        # NOTE(review): epsilon is decayed once per environment step here, not
        # once per episode — confirm this schedule is intended.
        agent.update_epsilon()
        if terminated or truncated:
            break
    total_reward += score
    # Sync the target network with the policy network after every episode.
    agent.update_target()
    if e % report_every == 0:
        print(f'Episode {e}, Reward: {total_reward / report_every}')
        total_reward = 0.

#### Evaluation loop

In [None]:
# Evaluation environment, created with the 'human' render mode.
env = gym.make('LunarLander-v2', render_mode='human')

# Runs until the cell is interrupted; `finally` makes sure the environment is
# closed when that happens.
try:
    while True:
        obs, _ = env.reset()
        state = torch.as_tensor(obs, dtype=torch.float32, device=agent.device).unsqueeze(0)
        score = 0.
        for s in range(400):
            # Greedy action only: no exploration during evaluation.
            action = agent.act_eval(state)
            obs, reward, terminated, truncated, _ = env.step(action.item())
            state = torch.as_tensor(obs, dtype=torch.float32, device=agent.device).unsqueeze(0)
            score += float(reward)
            # Stop on either end-of-episode signal.
            if terminated or truncated:
                break
        print(f'Score: {score}')
finally:
    env.close()

#### If you want to continue training after evaluation, you must first run this cell

In [None]:
# Rebind `env` to an 'rgb_array' environment, replacing the 'human' one
# created by the evaluation cell.
env = gym.make(
    'LunarLander-v2',
    render_mode='rgb_array',
)