# AS3.1 - Deep Q-learning Network (Lunar Lander)

## Imports

In [1]:
import numpy as np
import gymnasium as gym
from random import randint
import torch

from src.lmodel import Lmodel
from src.agent import Agent
from src.policy import Policy
from src.memory import Memory

In [2]:
# Choose the torch backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cpu device


<br>

# Preparation

## Parameters

In [3]:
# --- Training schedule ---
num_epochs = 1_000           # iterations of the outer training loop
max_steps = 2_000            # upper bound on agent steps within one epoch
avg_reward_threshold = 200   # average-reward target (presumably the "solved" score -- confirm)

# --- Learning hyperparameters (passed to Policy / Agent) ---
learning_rate = 1e-3
epsilon = 1.0                # starting epsilon handed to the Policy
epsilon_decay = 0.99         # decay factor handed to the Policy
discount = 0.99              # discount value handed to the Agent

# --- Replay memory ---
memory_size = 32_000         # size argument for Memory
sample_size = 64             # sample size handed to the Agent

# Action ids the Policy may choose from.
available_actions = list(range(4))

## Visualizing Model Structure

In [4]:
# Build a throwaway network on the selected device just to display its layers.
test = Lmodel()
test = test.to(device)
print(test)

Lmodel(
  (l1): Linear(in_features=8, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=64, bias=True)
  (l3): Linear(in_features=64, out_features=4, bias=True)
)


## Creating Environment

In [5]:
# Headless Lunar Lander environment (render_mode=None: nothing is drawn).
env = gym.make(
    id="LunarLander-v2",
    render_mode=None,
)

## Defining Objects

In [6]:
# Memory class for the Agent
me0 = Memory(memory_size)

# The Policy class for the Agent
p0 = Policy(Lmodel().to(device), learning_rate, epsilon, available_actions, epsilon_decay)

# The Agent class
a0 = Agent(env.step, me0, p0, device, sample_size, num_epochs, max_steps, discount)

<br>

## Training

In [7]:
# Per-epoch totals collected over the whole run.
rewards = []
losses = []

# Epochs between running-average reports; also the window the average covers.
report_interval = 100

try:
    for i in range(num_epochs):
        # Begin every epoch from a freshly reset episode, so an epoch that hits
        # `max_steps` without terminating cannot leak into the next one.
        state, info = env.reset(seed=randint(0, 1000))
        epoch_reward = 0
        epoch_loss = 0

        for step in range(max_steps):
            state, reward, loss, terminated, truncated = a0.train(state)
            epoch_reward += reward
            epoch_loss += loss

            if terminated or truncated:
                break

        # Epsilon is decayed once per epoch, not once per step.
        a0.policy.decay()
        rewards.append(epoch_reward)
        losses.append(epoch_loss)

        # ===== Visualization ===== #
        print(f"Epoch {i} | Epoch rewards: {epoch_reward} | Training losses: {epoch_loss} | Epsilon: {a0.policy.epsilon}")

        # Report after every `report_interval` completed epochs (including the
        # final one) instead of relying on a hard-coded list of epoch indices.
        if (i + 1) % report_interval == 0:
            # Average only the most recent window: a mean over the full history
            # is dominated by the early, mostly random epochs.
            run_avg_reward = np.mean(rewards[-report_interval:])
            run_avg_loss = np.mean(losses[-report_interval:])
            print(f"Epoch {i} | Average reward: {run_avg_reward} and Loss: {run_avg_loss} | Epsilon: {a0.policy.epsilon}\n")

            # Stop early once the configured target is reached.
            if run_avg_reward >= avg_reward_threshold:
                break
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()


Epoch 0 | Epoch rewards: 11.696699693236255 | Training losses: 31189.942172540825 | Epsilon: 0.99
Epoch 1 | Epoch rewards: -170.6104836700173 | Training losses: 231306.9749620823 | Epsilon: 0.9801
Epoch 2 | Epoch rewards: -471.4013349848496 | Training losses: 192921.48842534237 | Epsilon: 0.9702989999999999
Epoch 3 | Epoch rewards: -346.9902891404704 | Training losses: 200828.80273651084 | Epsilon: 0.96059601
Epoch 4 | Epoch rewards: -262.45698033601457 | Training losses: 246448.25836025784 | Epsilon: 0.9509900498999999
Epoch 5 | Epoch rewards: -145.30539737772276 | Training losses: 194177.72397519829 | Epsilon: 0.9414801494009999
Epoch 6 | Epoch rewards: -98.73329421631539 | Training losses: 170292.03541759972 | Epsilon: 0.9320653479069899
Epoch 7 | Epoch rewards: -153.54804943346687 | Training losses: 162581.77649735354 | Epsilon: 0.92274469442792
Epoch 8 | Epoch rewards: -332.6737502718716 | Training losses: 160875.97879105527 | Epsilon: 0.9135172474836407
Epoch 9 | Epoch rewards: -

Epoch 73 | Epoch rewards: -556.1094718095037 | Training losses: 139750.6404932277 | Epsilon: 0.475340042005707
Epoch 74 | Epoch rewards: -451.6607062267181 | Training losses: 141333.89342812978 | Epsilon: 0.47058664158564995
Epoch 75 | Epoch rewards: -474.18769858108317 | Training losses: 141581.04969435136 | Epsilon: 0.4658807751697934
Epoch 76 | Epoch rewards: -336.94088015454804 | Training losses: 164791.4580710476 | Epsilon: 0.4612219674180955
Epoch 77 | Epoch rewards: -871.7524813444039 | Training losses: 294015.52077287715 | Epsilon: 0.45660974774391455
Epoch 78 | Epoch rewards: -319.467228087012 | Training losses: 266536.346928328 | Epsilon: 0.4520436502664754
Epoch 79 | Epoch rewards: -514.9211980693451 | Training losses: 247694.38454810335 | Epsilon: 0.44752321376381066
Epoch 80 | Epoch rewards: -455.88736368936804 | Training losses: 116451.037508512 | Epsilon: 0.44304798162617254
Epoch 81 | Epoch rewards: -474.6784079343268 | Training losses: 155572.04932295717 | Epsilon: 0.4


KeyboardInterrupt

