# Import

In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# True when the active matplotlib backend name contains "inline"; only in
# that case is IPython's ``display`` module imported.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

<contextlib.ExitStack at 0x7f815b1d8dc0>

# Memory & Model

In [3]:
# Record type built by ``ReplayMemory.push``. Fields are positional, in this
# order: memory, state, action, next_state, reward.
# NOTE(review): what the leading ``memory`` field is meant to hold is not
# visible here -- confirm against the code that pushes transitions.
Transition = namedtuple(
    'Transition',
    ['memory', 'state', 'action', 'next_state', 'reward'],
)

In [4]:
class ReplayMemory:
    """Bounded buffer of ``Transition`` records.

    Entries are kept in ``self.memory``, a ``deque`` whose ``maxlen`` is the
    requested capacity, so appending to a full buffer discards the oldest
    entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` entries."""
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.memory)

    def push(self, *args):
        """Build a ``Transition`` from ``args`` and append it to the buffer."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored entries chosen at random without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` is larger
        than the number of stored entries.
        """
        stored = self.memory
        return random.sample(stored, k=batch_size)

In [5]:
class DQN(nn.Module):
    """Recurrent Q-network: an LSTM over an observation sequence feeding an MLP head.

    Args:
        n_observations: Size of each observation vector (the LSTM ``input_size``).
        n_actions: Number of outputs produced by the final linear layer.
        hidden_size: Width of the LSTM hidden state. Defaults to 128.
        fc_size: Width of the hidden linear layer in the head. Defaults to 64.
    """

    def __init__(self, n_observations, n_actions, hidden_size=128, fc_size=64):
        super().__init__()
        # Construction order (LSTM first, then the head) decides how seeded RNG
        # draws map onto parameters, so keep it stable.
        self.lstm = nn.LSTM(input_size=n_observations, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, fc_size),
            nn.ReLU(),
            nn.Linear(fc_size, n_actions)
        )

    def forward(self, x):
        """Compute one output per action from a batch of observation sequences.

        Args:
            x: Tensor assumed to be of shape
                ``(batch_size, sequence_length, n_observations)``; the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape ``(batch_size, n_actions)``.
        """
        # The LSTM starts from its default (zero) initial state; the final
        # hidden/cell states it returns are discarded.
        lstm_out, _ = self.lstm(x)
        # Only the output at the last time step is passed to the head.
        lstm_out_last = lstm_out[:, -1, :]
        return self.fc(lstm_out_last)

# Training

In [6]:
# Create the environment registered under the id "LunarLander-v2".
# NOTE(review): newer Gymnasium releases reportedly register this environment
# as "LunarLander-v3" and reject the v2 id -- confirm against the installed version.
env = gym.make("LunarLander-v2")

In [7]:
# NOTE(review): presumably the number of transitions sampled per optimisation
# step -- confirm where the replay memory is sampled.
BATCH_SIZE = 128

# Discount factor.
GAMMA = 0.99

# Epsilon schedule: EPS_START is the starting value of epsilon, EPS_END the
# final value, and EPS_DECAY controls the rate of exponential decay
# (higher means a slower decay).
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000

# Update rate of the target network.
TAU = 0.01

# Learning rate of the ``AdamW`` optimizer.
LR = 0.0001

# NOTE(review): presumably the replay memory capacity -- confirm where
# ``ReplayMemory`` is constructed.
MEMORY_SIZE = 10_000